In [2]:
pip install wikiextractor

Collecting wikiextractor
  Downloading wikiextractor-3.0.6-py3-none-any.whl (46 kB)
[?25l[K     |███████                         | 10 kB 20.8 MB/s eta 0:00:01[K     |██████████████▏                 | 20 kB 24.3 MB/s eta 0:00:01[K     |█████████████████████▏          | 30 kB 18.7 MB/s eta 0:00:01[K     |████████████████████████████▎   | 40 kB 14.9 MB/s eta 0:00:01[K     |████████████████████████████████| 46 kB 2.3 MB/s 
[?25hInstalling collected packages: wikiextractor
Successfully installed wikiextractor-3.0.6


In [64]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [65]:
import os, sys
import time
import pandas as pd

OUT_DIR = "/content/drive/MyDrive/Colab Netbooks/"

# **Task 1**

### Split Wikipedia Text into Sentences with Spacy

In [5]:
!python -m spacy download de_core_news_sm
!pip install -U spacy[cuda92]

Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 12.5 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('de_core_news_sm')


In [66]:
# Example of textprocessing with Spacy

import spacy
# To run on Google Collab: 
import de_core_news_sm
nlp = de_core_news_sm.load()
spacy.prefer_gpu()  # run spacy on GPU, if possible
#nlp = spacy.load('de_core_news_sm',disable=['tagger','parser','ner','textcat'])
#nlp.add_pipe('sentencizer')
# To run the above command on Google collab
from spacy.lang.en import English
nlp = English()  # just the language with no model
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)
sample_text = """Es sind außergewöhnliche Zeiten, die inzwischen schon fast gewöhnlich sind. Seit fast zwei Monaten leben die meisten Deutschen weitgehend in häuslicher Quarantäne. Sie kämpfen mit Belastungen wie Homeoffice und Kinderbetreuung, Hobbys fallen weg, soziale Kontakte sind kaum möglich.
Und die Zukunft ist ungewiss. Zwar leitet die Politik langsam eine Öffnung des gesellschaftlichen Lebens ein, doch gleichzeitig warnen Forscherinnen und Forscher vor einer zweiten noch unkontrollierbareren Welle. Was machen Ungewissheit und Einschränkungen mit uns, wenn sie zum Normalzustand werden?"""
doc = nlp(sample_text)
sentences = list(doc.sents)
for i in range(len(sentences)):
    print(sentences[i].text)
    print("Number of characters:", len(sentences[i].text))
    print(" — — — — — — — — — — — — — — — — — -")

Es sind außergewöhnliche Zeiten, die inzwischen schon fast gewöhnlich sind.
Number of characters: 75
 — — — — — — — — — — — — — — — — — -
Seit fast zwei Monaten leben die meisten Deutschen weitgehend in häuslicher Quarantäne.
Number of characters: 87
 — — — — — — — — — — — — — — — — — -
Sie kämpfen mit Belastungen wie Homeoffice und Kinderbetreuung, Hobbys fallen weg, soziale Kontakte sind kaum möglich.
Number of characters: 118
 — — — — — — — — — — — — — — — — — -

Und die Zukunft ist ungewiss.
Number of characters: 30
 — — — — — — — — — — — — — — — — — -
Zwar leitet die Politik langsam eine Öffnung des gesellschaftlichen Lebens ein, doch gleichzeitig warnen Forscherinnen und Forscher vor einer zweiten noch unkontrollierbareren Welle.
Number of characters: 182
 — — — — — — — — — — — — — — — — — -
Was machen Ungewissheit und Einschränkungen mit uns, wenn sie zum Normalzustand werden?
Number of characters: 87
 — — — — — — — — — — — — — — — — — -


#### Prepare Spacy Parallel Execution
Example of [multiprocessing](https://sebastianraschka.com/Articles/2014_multiprocessing.html).

In [67]:
# create nthread lists of input files
import os
nthread = 8
maxFiles = 100  # actual number ~ 6000 ~ 6GB

nfile = 0
directory = OUT_DIR

inFiles = []
for i in range(nthread):
    inFiles.append([])
i = 0
for root, dirs, files in os.walk(directory):
    for file in files:
        inFile = os.path.join(root, file)
        inFiles[i].append(inFile)
        i += 1
        if i == nthread:
            i = 0
        nfile += 1
        if nfile >= maxFiles:
            break
    if nfile >= maxFiles:
        break

print("len(inFiles)=", len(inFiles), ' num files', sum(len(infi) for infi in inFiles))

len(inFiles)= 8  num files 100


In [68]:
# process a single input file
import spacy
import time
t0 = time.time()
#nlp = spacy.load('de_core_news_sm',disable=['tagger','parser','ner','textcat'])
#nlp.add_pipe(nlp.create_pipe('sentencizer'))
nlp = de_core_news_sm.load()

file = os.path.join(directory, "WikiExtractor.py")

def processFile(file, outf, nlp):
    f = open(file, 'r')
    lines = f.readlines()
    nbreak = 0
    ndoc = 0
    inDoc = False
    idoc = 0
    iline = 0
    nblank = 0  # number of blank lines
    print("len(lines)", len(lines))
    numChar = 0

    for line in lines:
        #print(line)
        line = line.strip()  # remove blanks and \n
        if line[:8] == "<doc id=":
            assert not inDoc
            inDoc = True
            parts = line.split('"')
            assert (parts[4] == ' title=')
            title = parts[5]
            iline = 0
            nblank = 1  # omit the next blank line
            continue
        if line[:6] == "</doc>":
            if iline > 0:
                idoc += 1
            inDoc = False
            continue

        if len(line) > 0:
            iline += 1
            nblank = 0
        else:
            nblank += 1
        if iline <= 1 or nblank > 1:  # skip title line and empty lines after title line
            continue
        #print(idoc,iline,line)
        if len(line) >= 1:
            doc = nlp(line)  # split line into sentences
            sentences = list(doc.sents)
            for i in range(len(sentences)):
                txt = sentences[i].text
                txt = txt.strip()
                #print(idoc,iline,i,txt)

                numChar += len(txt)
                outf.write(txt + "\n")
        else:
            outf.write(line + "\n")
    assert not inDoc
    print("wrote ", idoc, "documents with numChar=", numChar)
    return numChar



outf = open("name.txt", 'w', encoding='utf8')
processFile(file, outf, nlp)
outf.close()
print(time.time() - t0, 'sec')

len(lines) 645
wrote  1 documents with numChar= 17284
5.464847087860107 sec


In [69]:
# function to be executed in parallel
import os
import spacy
os.makedirs(OUT_DIR, exist_ok=True)

def newFile(i, directory, name="wikiOut"):
    val = str(i)
    val = val.rjust(4, '0')
    fileName = directory + name + val + ".txt"  # generate new file name
    print("created file", fileName)
    outf = open(fileName, 'w', encoding='utf8')
    return outf

def process_files(file_list, fid, maxFileSize=10000000):
    t0=time.time()
    nlp = de_core_news_sm.load()
    outDir = "data/german_wiki/processed/"
    i=1
    outf = newFile(i, outDir, name="wikiOut"+fid)
    numChar=0
    for inFile in file_list:
        print(inFile)
        numChar+=processFile(inFile, outf, nlp)
        if numChar > maxFileSize:
            i+=1
            outf.close()
            outf = newFile(i,outDir,name="wikiOut"+fid)
            numChar=0
    outf.close()
    print(fid,'used time',time.time()-t0,'sec')

In [71]:
import os
import spacy
os.makedirs(OUT_DIR, exist_ok=True)

def newFile(i, directory, name="wikiOut"):
    val = str(i)
    val = val.rjust(4, '0')
    fileName = directory + name + val + ".txt"  
    print("created file", fileName)
    outf = open(fileName, 'w', encoding='utf8')
    return outf

def process_files(file_list, fid, maxFileSize=10000000):
    t0=time.time()
    nlp = de_core_news_sm.load()
    outDir = "/content/drive/MyDrive/Colab Netbooks/german_wiki"
    i=1
    outf = newFile(i, outDir, name="wikiOut"+fid)
    numChar=0
    for inFile in file_list:
        print(inFile)
        numChar+=processFile(inFile, outf, nlp)
        if numChar > maxFileSize:
            i+=1
            outf.close()
            outf = newFile(i,outDir,name="wikiOut"+fid)
            numChar=0
    outf.close()
    print(fid,'used time',time.time()-t0,'sec')

In [72]:
# single execution (for test only)
process_files(inFiles[0], '0f', maxFileSize=10000000)

created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0001.txt
/content/drive/MyDrive/Colab Netbooks/WikiExtractor.py
len(lines) 645
wrote  1 documents with numChar= 17284
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_34
len(lines) 4544
wrote  44 documents with numChar= 965153
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_26
len(lines) 4937
wrote  60 documents with numChar= 1013411
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_18
len(lines) 5000
wrote  60 documents with numChar= 1011222
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_14
len(lines) 4684
wrote  78 documents with numChar= 949747
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_04
len(lines) 4806
wrote  52 documents with numChar= 956551
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_91
len(lines) 4845
wrote  77 documents with numChar= 974127
/content/drive/MyDrive/Colab Netbooks/ger

#### Multiprocessing Execution of Spacy

In [73]:
# multiprocessing execution
import multiprocessing as mp
t0 = time.time()

# Define an output queue
output = mp.Queue()

# Setup a list of processes that we want to run
processes = [
    mp.Process(target=process_files, args=(inFiles[j], str(j) + 'f'))
    for j in range(nthread)
]

# Run processes
for p in processes:
    p.start()

# Exit the completed processes
for p in processes:
    p.join()

# Get process results from the output queue
#results = [output.get() for p in processes]

#print(results)
print("used",time.time()-t0,"sec")

created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0001.txt
/content/drive/MyDrive/Colab Netbooks/WikiExtractor.py
len(lines) 645
created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut1f0001.txt
/content/drive/MyDrive/Colab Netbooks/tokenizer.json
created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut2f0001.txt
len(lines) 340
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_40
created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut3f0001.txt
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_38
created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut4f0001.txt
created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut5f0001.txt
created file /content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut7f0001.txt
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_36
/content/drive/MyDrive/Colab Netbooks/german_wiki/extracted/AA/wiki_35
/content/drive/MyD

# **Task 2**

In [78]:
!pip uninstall -y tensorflow
# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
paths = [
    str(x) for x in Path(OUT_DIR).glob("*.txt")
]
print("input data", paths)

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-t6x57yqu
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-t6x57yqu
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
tokenizers                    0.11.5
transformers                  4.17.0.dev0
input data ['/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0001.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0002.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut1f0001.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut2f0001.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut3f0001.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut5f0001.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut4f0001.txt', '/content/

### Train a BPE-Tokenizer

In [74]:
import time
t0 = time.time()
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer(lowercase=False)

vocab_size = 32000  # was 52000
# Customize training
tokenizer.train(files=paths,
                vocab_size=vocab_size,
                min_frequency=3,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])
print("used", time.time() - t0, "sec")

used 0.004160642623901367 sec


In [75]:
tokenizer.save(os.path.join("/content/drive/MyDrive/Colab Netbooks/", "tokenizer.json"))

In [76]:
from tokenizers import Tokenizer
from tokenizers.processors import BertProcessing

#tokenizer = ByteLevelBPETokenizer(
#    "./EsperBERTo/vocab.json",
#    "./EsperBERTo/merges.txt",
#)
tokenizer = Tokenizer.from_file(os.path.join("/content/drive/MyDrive/Colab Netbooks/" "tokenizer.json"))
vocab_size = tokenizer.get_vocab_size()
print("vocab_size=", vocab_size)

vocab_size= 261


In [45]:
enc=tokenizer.encode("Ich heiße Karl Müller und wohne in Bonn.")
print(enc)
print(enc.tokens)

Encoding(num_tokens=42, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])
['I', 'c', 'h', 'Ġ', 'h', 'e', 'i', 'Ã', 'Ł', 'e', 'Ġ', 'K', 'a', 'r', 'l', 'Ġ', 'M', 'Ã', '¼', 'l', 'l', 'e', 'r', 'Ġ', 'u', 'n', 'd', 'Ġ', 'w', 'o', 'h', 'n', 'e', 'Ġ', 'i', 'n', 'Ġ', 'B', 'o', 'n', 'n', '.']


### Prepare Training

In [46]:
# Check that we have a GPU
!nvidia-smi

Fri Feb 18 12:45:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    32W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [47]:
# Check that PyTorch sees it
import torch
torch.cuda.is_available()
print("pytorch version",torch.__version__)

pytorch version 1.10.0+cu111


# **Task 3**

## Roberta Model

In [48]:
from transformers import RobertaConfig

config = RobertaConfig(
    vocab_size=vocab_size,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,    # was 6
    type_vocab_size=1,
)

Now let's re-create our tokenizer in transformers

In [80]:
from transformers import RobertaTokenizerFast

tokenizer = RobertaTokenizerFast.from_pretrained("/content/drive/MyDrive/Colab Netbooks/", max_len=512)

# Model Building

In [81]:
from transformers import RobertaForMaskedLM

model = RobertaForMaskedLM(config=config)

In [82]:
model.num_parameters()
# => 84 million parameters
# 68.7 million parameters  (6 hidden layers)
# 54.5 million (4 hidden layers)
# 47 mill (3 hidden layers)

43717125

In [83]:
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
paths = [str(x) for x in Path(OUT_DIR).glob("*.txt")]
nfiles=6
print(paths[:3])
paths = paths[:nfiles]

['/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0001.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0002.txt', '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut1f0001.txt']


In [84]:
paths

['/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0001.txt',
 '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut0f0002.txt',
 '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut1f0001.txt',
 '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut2f0001.txt',
 '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut3f0001.txt',
 '/content/drive/MyDrive/Colab Netbooks/german_wikiwikiOut5f0001.txt']

Tokenize files and concatenate them

In [85]:
%%time
from transformers import LineByLineTextDataset

dataset = None
for ids in range(len(paths)):
    data = LineByLineTextDataset(
        tokenizer=tokenizer,
        #file_path="./oscar.eo.txt",
        file_path=paths[ids],
        block_size=128,
    )
    print("ids",ids,'len(data.examples)',len(data.examples))
    if ids==0:
        dataset = data
    else:
        dataset.__add__(data)  # see torch.utils.data.dataset.Dataset
print('number of sentences: len(dataset.examples)',len(dataset.examples))



ids 0 len(data.examples) 86922
ids 1 len(data.examples) 8527
ids 2 len(data.examples) 85660
ids 3 len(data.examples) 78476
ids 4 len(data.examples) 86304
ids 5 len(data.examples) 84669
number of sentences: len(dataset.examples) 86922
CPU times: user 1min 16s, sys: 2.56 s, total: 1min 19s
Wall time: 50.6 s


In [86]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Finally, we are all set to initialize our Trainer

In [87]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir=OUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=20,
    per_device_train_batch_size=32,    # was 64
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,    
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

## Save final model 

In [98]:
#trainer.save_model("./EsperBERTo")
trainer.save_model('/content/drive/MyDrive/Colab Netbooks/')

Saving model checkpoint to /content/drive/MyDrive/Colab Netbooks/
Configuration saved in /content/drive/MyDrive/Colab Netbooks/config.json
Model weights saved in /content/drive/MyDrive/Colab Netbooks/pytorch_model.bin


## Check that the LM actually trained

In [99]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    #model="./EsperBERTo",
    #tokenizer="./EsperBERTo"
    model="/content/drive/MyDrive/Colab Netbooks/",
    tokenizer="/content/drive/MyDrive/Colab Netbooks/",
)

loading configuration file /content/drive/MyDrive/Colab Netbooks/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Netbooks/",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.17.0.dev0",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 261
}

loading configuration file /content/drive/MyDrive/Colab Netbooks/config.json
Model config RobertaConfig {
  "_name_or_path": "/content/drive/MyDrive/Colab Netbooks/",
  "architectures": [
    "Ro

In [100]:
# The sun <mask>.
# =>

fill_mask("Der Himmel <mask> blau.")

[{'score': 0.14354261755943298,
  'sequence': 'Der Himmele blau.',
  'token': 73,
  'token_str': 'e'},
 {'score': 0.12452682107686996,
  'sequence': 'Der Himmel  blau.',
  'token': 225,
  'token_str': ' '},
 {'score': 0.0801374614238739,
  'sequence': 'Der Himmelt blau.',
  'token': 88,
  'token_str': 't'},
 {'score': 0.07554151117801666,
  'sequence': 'Der Himmels blau.',
  'token': 87,
  'token_str': 's'},
 {'score': 0.062194038182497025,
  'sequence': 'Der Himmeln blau.',
  'token': 82,
  'token_str': 'n'}]

In [101]:
fill_mask("Das Auto fährt <mask> die Brücke .")

# This is the beginning of a beautiful <mask>.
# =>

[{'score': 0.17558805644512177,
  'sequence': 'Das Auto fährte die Brücke.',
  'token': 73,
  'token_str': 'e'},
 {'score': 0.07003363221883774,
  'sequence': 'Das Auto fährti die Brücke.',
  'token': 77,
  'token_str': 'i'},
 {'score': 0.06368739902973175,
  'sequence': 'Das Auto fährtn die Brücke.',
  'token': 82,
  'token_str': 'n'},
 {'score': 0.055653251707553864,
  'sequence': 'Das Auto fährtd die Brücke.',
  'token': 72,
  'token_str': 'd'},
 {'score': 0.05531834810972214,
  'sequence': 'Das Auto fährth die Brücke.',
  'token': 76,
  'token_str': 'h'}]