In [53]:
import glob
import os
from pathlib import Path

from tokenizers import Tokenizer
from tokenizers import ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast

import numpy as np
import pandas as pd
from tqdm.notebook import trange, tqdm

In [29]:
# Collect the processed-data CSV files as plain string paths.
paths = list(map(str, Path("/home/leonardovida/data-histaware").glob("*.csv")))

In [30]:
# Convert every processed CSV into a space-separated .txt file next to it,
# which is what the tokenizer training cell globs for later.
for path in tqdm(paths, total=len(paths)):
    base = os.path.basename(path)
    name = os.path.splitext(base)[0]
    print(name)
    # Drop rows without article text; build a new frame instead of mutating in place.
    df = pd.read_csv(path).dropna(subset=["text"])
    # mode='w' (not 'a'): re-running this cell must overwrite the export instead of
    # appending a second copy of the same rows and silently duplicating the corpus.
    # NOTE(review): every column of the CSV is written, not only "text" — confirm
    # that is what the training corpus should contain.
    df.to_csv(f'/home/leonardovida/data-histaware/{name}.txt', header=None, index=None, sep=' ', mode='w')

HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))

articles3050000_3100000
articles2300000_2350000
articles850000_900000
articles750000_800000
articles3150000_3200000
articles3700000_3750000
articles650000_700000
articles3000000_3050000
articles250000_300000
articles2100000_2150000
articles1000000_1050000
articles2850000_2900000
articles3250000_3300000
articles800000_850000
articles1100000_1150000
articles1150000_1200000
articles3400000_3450000
articles600000_650000
articles2350000_2400000
articles2650000_2700000
articles3350000_3400000
articles950000_1000000
articles100000_150000
articles1750000_1800000
articles2400000_2450000
articles3650000_3700000
articles3750000_3800000
articles2050000_2100000
articles1650000_1700000
articles2200000_2250000
articles1400000_1450000
articles2500000_2550000
articles300000_350000
articles3950000_3977117
articles3500000_3550000
articles500000_550000
articles700000_750000
articles2750000_2800000
articles3850000_3900000
articles1200000_1250000
articles1850000_1900000
articles3550000_3600000
articles34500

In [31]:
# Display the exported .txt files as string paths.
list(map(str, Path("/home/leonardovida/data-histaware").glob("*.txt")))

['/home/leonardovida/data-histaware/articles650000_700000.txt',
 '/home/leonardovida/data-histaware/articles550000_600000.txt',
 '/home/leonardovida/data-histaware/articles3050000_3100000.txt',
 '/home/leonardovida/data-histaware/articles3350000_3400000.txt',
 '/home/leonardovida/data-histaware/articles3100000_3150000.txt',
 '/home/leonardovida/data-histaware/articles1650000_1700000.txt',
 '/home/leonardovida/data-histaware/articles0_50000.txt',
 '/home/leonardovida/data-histaware/articles3000000_3050000.txt',
 '/home/leonardovida/data-histaware/articles2950000_3000000.txt',
 '/home/leonardovida/data-histaware/articles2300000_2350000.txt',
 '/home/leonardovida/data-histaware/articles2550000_2600000.txt',
 '/home/leonardovida/data-histaware/articles800000_850000.txt',
 '/home/leonardovida/data-histaware/articles700000_750000.txt',
 '/home/leonardovida/data-histaware/articles3800000_3850000.txt',
 '/home/leonardovida/data-histaware/articles2000000_2050000.txt',
 '/home/leonardovida/data-

In [32]:
%%time 

paths = [str(x) for x in Path("/home/leonardovida/data-histaware").glob("*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.pre_tokenizer = Whitespace()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 1h 49min 34s, sys: 1min, total: 1h 50min 34s
Wall time: 14min 35s


### Save tokenizer

In [38]:
# Make sure the output directory exists before writing vocab.json / merges.txt,
# so the cell also works on a fresh checkout.
os.makedirs("/home/leonardovida/dev/hist-aware/notebooks/models/bert-training", exist_ok=True)
tokenizer.save_model("/home/leonardovida/dev/hist-aware/notebooks/models/bert-training")

['/home/leonardovida/dev/hist-aware/notebooks/models/bert-training/vocab.json',
 '/home/leonardovida/dev/hist-aware/notebooks/models/bert-training/merges.txt']

### Load tokenizer

In [44]:
# Rebuild the byte-level BPE tokenizer from the two files written by save_model.
vocab_path = "/home/leonardovida/dev/hist-aware/notebooks/models/bert-training/vocab.json"
merges_path = "/home/leonardovida/dev/hist-aware/notebooks/models/bert-training/merges.txt"
tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)

In [45]:
# Wrap every encoding in <s> ... </s>: look up each special token's id once,
# then hand the (token, id) pairs to BertProcessing in the same order as before.
end_pair = ("</s>", tokenizer.token_to_id("</s>"))
start_pair = ("<s>", tokenizer.token_to_id("<s>"))
tokenizer._tokenizer.post_processor = BertProcessing(end_pair, start_pair)

# Cap encoded sequences at 512 tokens.
tokenizer.enable_truncation(max_length=512)

In [48]:
# Sanity-check the tokenizer on a Dutch sample paragraph and show the resulting tokens.
sample_text = 'Waarom niet met dédain over Schilder, zoals over zoveel anderen? Hij wist zich tegenstander: Barth stond tussen beiden. Maar Miskotte wist zich in dezelfde tijd te staan, in dezelfde storm, die ook Schilder onderging. Soms is er opvallende affiniteit. Hij kende de dichters van Nederland, zoals ook Schilder: Dèr Mouw (Adwaita), Nijhof f en Marsman. Hij stond, denk ik, geestelijk ook aanzienlijk dichter bij hen. Hij had de Nederlandse taal en het Nederlands land lief: „De nederlandse taal en het geboomte van dit land, deze twee gewassen zijn mijn aardse heerlijkheid" (329). Al zou het alléén déze zin zijn - daarvoor is lectuur van meer dan 600 bladzijden geen te hoge prijs! Dit Nederland werd platgedrukt door traditionele kerkelijkheid en bloedde weg in dood geloof. Dit Nederland werd besprongen door duistere machten. Zou het Woord (zoals Barth het verstond) geen nieuwe glans kunnen leggen op het eigen leven, op het volk, waaronder hij werkte?'
tokenizer.encode(sample_text).tokens

['<s>',
 'Waarom',
 'Ġniet',
 'Ġmet',
 'ĠdÃ©',
 'da',
 'in',
 'Ġover',
 'ĠSchilder',
 ',',
 'Ġzoals',
 'Ġover',
 'Ġzoveel',
 'Ġanderen',
 '?',
 'ĠHij',
 'Ġwist',
 'Ġzich',
 'Ġtegenstander',
 ':',
 'ĠBart',
 'h',
 'Ġstond',
 'Ġtussen',
 'Ġbeiden',
 '.',
 'ĠMaar',
 'ĠM',
 'isk',
 'otte',
 'Ġwist',
 'Ġzich',
 'Ġin',
 'Ġdezelfde',
 'Ġtijd',
 'Ġte',
 'Ġstaan',
 ',',
 'Ġin',
 'Ġdezelfde',
 'Ġstorm',
 ',',
 'Ġdie',
 'Ġook',
 'ĠSchilder',
 'Ġonder',
 'ging',
 '.',
 'ĠSoms',
 'Ġis',
 'Ġer',
 'Ġopvallende',
 'Ġaffiniteit',
 '.',
 'ĠHij',
 'Ġkende',
 'Ġde',
 'Ġdichters',
 'Ġvan',
 'ĠNederland',
 ',',
 'Ġzoals',
 'Ġook',
 'ĠSchilder',
 ':',
 'ĠD',
 'Ã¨r',
 'ĠM',
 'ouw',
 'Ġ(',
 'Ad',
 'wa',
 'ita',
 '),',
 'ĠNij',
 'hof',
 'Ġf',
 'Ġen',
 'ĠMar',
 'sman',
 '.',
 'ĠHij',
 'Ġstond',
 ',',
 'Ġdenk',
 'Ġik',
 ',',
 'Ġgeestelijk',
 'Ġook',
 'Ġaanzienlijk',
 'Ġdichter',
 'Ġbij',
 'Ġhen',
 '.',
 'ĠHij',
 'Ġhad',
 'Ġde',
 'ĠNederlandse',
 'Ġtaal',
 'Ġen',
 'Ġhet',
 'ĠNederlands',
 'Ġland',
 'Ġlief',
 ':',


### Train model

In [49]:
# Show the GPUs visible on this machine (driver, memory and utilisation) before training.
!nvidia-smi

Wed Mar 17 17:07:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  On   | 00000000:00:05.0 Off |                  N/A |
| 30%   29C    P8     9W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  On   | 00000000:00:06.0 Off |                  N/A |
| 30%   28C    P8    10W / 250W |      1MiB / 11019MiB |      0%      Default |
|       

In [50]:
import torch
# Displays True when PyTorch can use a CUDA device.
torch.cuda.is_available()

True

In [52]:
# Hyper-parameters for a small RoBERTa model; vocab_size matches the 52k
# vocabulary requested when the tokenizer was trained.
roberta_settings = {
    "vocab_size": 52_000,
    "max_position_embeddings": 514,
    "num_attention_heads": 12,
    "num_hidden_layers": 6,
    "type_vocab_size": 1,  # The vocabulary size of the token_type_ids passed
}
config = RobertaConfig(**roberta_settings)

Reference pre-training settings from the BERTje paper:
--max_predictions_per_seq=20 \
  --train_batch_size=256 \
  --eval_batch_size=32 \
  --learning_rate=1e-4 \
  --num_train_steps=1000000 \
  --num_warmup_steps=10000 \
  --save_checkpoints_steps=10000 \
  --iterations_per_loop=10000 \
  --max_eval_steps=10000 \

In [54]:
# Load the trained vocab/merges as a fast RoBERTa tokenizer for use with transformers.
# `model_max_length` is the documented keyword for the sequence-length cap;
# `max_len` is only a deprecated alias for it.
tokenizer = RobertaTokenizerFast.from_pretrained(
    "/home/leonardovida/dev/hist-aware/notebooks/models/bert-training",
    model_max_length=512,
)

In [55]:
from transformers import RobertaForMaskedLM

# Build a masked-language model with randomly initialised weights from the config above.
model = RobertaForMaskedLM(config)

In [56]:
# Display the model's parameter count (the recorded run showed 83,504,416).
model.num_parameters()

83504416