In [11]:
import re
import spacy
from pathlib import Path

In [12]:
# Define file system and settings

# Where the corpus lives: path to all_wikis.txt from the unzipped corpus.
input_path = ""
# True -> read the input from one file; False -> read from multiple files in a directory.
input_is_one_file = True
# Folder that receives the batch_<n>.txt output files.
output_dir = "../data"

# Number of sentences in each output file, and how many such files to produce.
batch_size = 50_000
num_batches = 10

In [13]:
def clean_text(text):
    """Normalize whitespace and punctuation in a piece of text.

    Every run of whitespace is collapsed to a single space, curly quotes are
    converted to straight quotes, the ellipsis character becomes three dots,
    and finally all straight single and double quotes are removed.

    Args:
        text (str): The text to clean.

    Returns:
        str: The cleaned text with leading and trailing whitespace stripped.
    """
    # Raw string on purpose: "\s" inside a plain literal is an invalid escape sequence.
    text = re.sub(r"\s+", ' ', text).strip()
    # Order matters: the last rule also removes the straight quotes that the
    # first two rules produce from curly quotes.
    replacements = [
        (r"[“”]", "\""),
        (r"[‘’]", "'"),
        (r"…", "..."),
        (r"[\"\']", ''),  # This one is for my sanity. Let's pretend quotes don't exist :)
    ]
    for pattern, substitute in replacements:
        text = re.sub(pattern, substitute, text)
    return text.strip()

In [25]:
# Load the Spanish pipeline with every listed component switched off, then turn
# on only the "senter" pipe: the code below reads nothing but `.sents`.
nlp = spacy.load(
    "es_core_news_sm",
    disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer", "ner"],
)
nlp.enable_pipe("senter")
# Raise the pipeline's length limit so the very large joined texts are accepted.
nlp.max_length = 200_000_000

# Build `texts`: a list of large strings, each of which is handed to spaCy as one document.
if input_is_one_file:
    print(f"Reading {input_path}")
    # Explicit UTF-8 so decoding does not depend on the platform's locale default
    # (assumes the corpus is UTF-8 encoded -- TODO confirm).
    with open(input_path, 'r', encoding="utf-8") as f:
        lines = f.readlines()
    # Separate texts into groups of 1,000,000 lines to avoid insane memory usage by spaCy.
    # Stepping over len(lines) covers the whole file and never produces empty
    # trailing groups, whatever num_batches is set to.
    texts = [' '.join(lines[start:start + 1_000_000])
             for start in range(0, len(lines), 1_000_000)]
    print("Done.")
else:
    texts = []
    # In directory mode input_path is the folder to search recursively for .txt
    # files; sorting makes the processing order reproducible.
    for file in sorted(Path(input_path).rglob("*.txt")):
        print(f"Reading {file}...")
        with open(file, 'r', encoding="utf-8") as f:
            texts.append(' '.join(f.readlines()))
    print("Done.")
print("Texts done being created.")

# Sentence-split each text and pack the short sentences into batches of
# `batch_size`, stopping as soon as `num_batches` full batches exist.
batches, cur_batch = [], []
for text in texts:
    print("Sentencizing 1,000,000 lines...")
    print(f"Current batch size: {len(cur_batch)}")
    doc = nlp(text)
    for sent in doc.sents:
        # Keep only sentences whose length is below 64
        # (len() of a spaCy span is its token count, per the spaCy API).
        if len(sent) < 64:
            cur_batch.append(sent.text)
        # A full batch is sealed and a fresh one is started.
        if len(cur_batch) >= batch_size:
            batches.append(cur_batch)
            cur_batch = []
        # Stop reading sentences once enough batches were collected.
        if num_batches == len(batches):
            break
    print(f"Report: {len(batches)} done.")
    # Leave the outer loop as well; a partially filled cur_batch is not kept.
    if num_batches == len(batches):
        break
            
# Write each batch to <output_dir>/batch_<n>.txt, one cleaned sentence per
# paragraph (each sentence is followed by a blank line).
print("Writing batches...")
# Create the output folder up front so a missing directory cannot fail the
# write after the expensive sentencizing step has already finished.
Path(output_dir).mkdir(parents=True, exist_ok=True)
for i, batch in enumerate(batches):
    # Explicit UTF-8 so accented characters are written the same way on every platform.
    with open(Path(output_dir) / f"batch_{i+1}.txt", 'w', encoding="utf-8") as f:
        for sentence in batch:
            f.write(f"{clean_text(sentence)}\n\n")
print("Done.")

Reading /xdisk/josorio1/aconverse/GEC_BETO/spanish-corpora/raw/all_wikis.txt
Done.
Texts done being created.
Sentencizing 1,000,000 lines...
Current batch size: 0
Report: 1 done.
Sentencizing 1,000,000 lines...
Current batch size: 36104
Report: 3 done.
Sentencizing 1,000,000 lines...
Current batch size: 240
Report: 4 done.
Sentencizing 1,000,000 lines...
Current batch size: 36148
Report: 5 done.
Sentencizing 1,000,000 lines...
Current batch size: 13493
Report: 5 done.
Sentencizing 1,000,000 lines...
Current batch size: 69682
Report: 6 done.
Sentencizing 1,000,000 lines...
Current batch size: 30910
Report: 6 done.
Sentencizing 1,000,000 lines...
Current batch size: 92437
Report: 7 done.
Sentencizing 1,000,000 lines...
Current batch size: 57021
Report: 8 done.
Sentencizing 1,000,000 lines...
Current batch size: 25267
Report: 8 done.
Sentencizing 1,000,000 lines...
Current batch size: 92173
Report: 9 done.
Sentencizing 1,000,000 lines...
Current batch size: 60986
Report: 10 done.
Writing 