# Pre-training DelphBERT - A tiny Transformer model based on a large newspaper dataset

* //TODO: Copy description of project from Github
* //TODO: Description of data 

In [33]:
import glob
import os
from pathlib import Path

from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union


from tokenizers import Tokenizer
from tokenizers import ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import PreTrainedTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
from transformers import TrainingArguments
from transformers import RobertaForMaskedLM
from datasets import Dataset, DatasetDict

import torch
from torch import Tensor
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd
from tqdm.notebook import trange, tqdm

## Load Data

First, we need to load the entire "processed" library of Delpher newspaper on the transformer.

* // TODO: even though we are loading clean and unclean text, we are training using the **unclean** text, as Transformers want complete sentences and the clean text does not have stopwords nor lower-upper cases.
    * Think testing the performance training on semi-clean text

### Convert from _.csv_ to _.txt_

In [20]:
# Find path to csv files with processed data
#Path().parent.absolute()
paths = [str(x) for x in Path("/home/leonardovida/data-histaware/1990s/").glob("*.csv")]

In [21]:
# Create txt files for "processed" data
for path in tqdm(paths, total=len(paths)):
    base = os.path.basename(path)
    name = os.path.splitext(base)[0]
    df = pd.read_csv(path)
    df.dropna(subset=["text"], inplace=True)
    df["text"].to_csv(f'/home/leonardovida/data-histaware/{name}.txt', header=None, index=None, sep=' ', mode='a')

HBox(children=(FloatProgress(value=0.0, max=78.0), HTML(value='')))

ParserError: Error tokenizing data. C error: EOF inside string starting at row 63533

In [22]:
%%time 

paths = [str(x) for x in Path("/home/leonardovida/data-histaware").glob("*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.pre_tokenizer = Whitespace()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

CPU times: user 36min 34s, sys: 35.3 s, total: 37min 9s
Wall time: 6min 13s


### Save tokenizer

In [11]:
tokenizer_dir = "/home/leonardovida/dev/dev/hist-aware/notebooks/models/bert-training/tokenizer"
#!mkdir {tokenizer_dir}

In [24]:
tokenizer.save_model(tokenizer_dir)

['/home/leonardovida/dev/dev/hist-aware/notebooks/models/bert-training/tokenizer/vocab.json',
 '/home/leonardovida/dev/dev/hist-aware/notebooks/models/bert-training/tokenizer/merges.txt']

### Load tokenizer

In [25]:
tokenizer = ByteLevelBPETokenizer(
    tokenizer_dir + "/vocab.json",
    tokenizer_dir + "/merges.txt",
)

In [26]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

Test tokenizer performance

In [27]:
tokenizer.encode('Waarom niet met dédain over Schilder, zoals over zoveel anderen? Hij wist zich tegenstander: Barth stond tussen beiden. Maar Miskotte wist zich in dezelfde tijd te staan, in dezelfde storm, die ook Schilder onderging. Soms is er opvallende affiniteit. Hij kende de dichters van Nederland, zoals ook Schilder: Dèr Mouw (Adwaita), Nijhof f en Marsman. Hij stond, denk ik, geestelijk ook aanzienlijk dichter bij hen. Hij had de Nederlandse taal en het Nederlands land lief: „De nederlandse taal en het geboomte van dit land, deze twee gewassen zijn mijn aardse heerlijkheid" (329). Al zou het alléén déze zin zijn - daarvoor is lectuur van meer dan 600 bladzijden geen te hoge prijs! Dit Nederland werd platgedrukt door traditionele kerkelijkheid en bloedde weg in dood geloof. Dit Nederland werd besprongen door duistere machten. Zou het Woord (zoals Barth het verstond) geen nieuwe glans kunnen leggen op het eigen leven, op het volk, waaronder hij werkte?').tokens

['<s>',
 'Waarom',
 'Ġ',
 'niet',
 'Ġ',
 'met',
 'Ġ',
 'd',
 'Ã',
 '©',
 'da',
 'in',
 'Ġ',
 'over',
 'Ġ',
 'Schilder',
 ',',
 'Ġ',
 'zoals',
 'Ġ',
 'over',
 'Ġ',
 'zoveel',
 'Ġ',
 'anderen',
 '?',
 'Ġ',
 'Hij',
 'Ġ',
 'wist',
 'Ġ',
 'zich',
 'Ġ',
 'tegenstander',
 ':',
 'Ġ',
 'Bart',
 'h',
 'Ġ',
 'stond',
 'Ġ',
 'tussen',
 'Ġ',
 'beiden',
 '.',
 'Ġ',
 'Maar',
 'Ġ',
 'Mis',
 'k',
 'otte',
 'Ġ',
 'wist',
 'Ġ',
 'zich',
 'Ġ',
 'in',
 'Ġ',
 'dezelfde',
 'Ġ',
 'tijd',
 'Ġ',
 'te',
 'Ġ',
 'staan',
 ',',
 'Ġ',
 'in',
 'Ġ',
 'dezelfde',
 'Ġ',
 'storm',
 ',',
 'Ġ',
 'die',
 'Ġ',
 'ook',
 'Ġ',
 'Schilder',
 'Ġ',
 'onderging',
 '.',
 'Ġ',
 'Soms',
 'Ġ',
 'is',
 'Ġ',
 'er',
 'Ġ',
 'opvallende',
 'Ġ',
 'affiniteit',
 '.',
 'Ġ',
 'Hij',
 'Ġ',
 'kende',
 'Ġ',
 'de',
 'Ġ',
 'dichters',
 'Ġ',
 'van',
 'Ġ',
 'Nederland',
 ',',
 'Ġ',
 'zoals',
 'Ġ',
 'ook',
 'Ġ',
 'Schilder',
 ':',
 'Ġ',
 'D',
 'Ã',
 '¨',
 'r',
 'Ġ',
 'M',
 'ouw',
 'Ġ',
 '(',
 'Ad',
 'wa',
 'ita',
 '),',
 'Ġ',
 'Nij',
 'hof',
 'Ġ',
 'f'

### Train model

In [2]:
!nvidia-smi

Wed Mar 24 12:34:26 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  On   | 00000000:00:05.0 Off |                  N/A |
| 30%   30C    P8    11W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  On   | 00000000:00:06.0 Off |                  N/A |
| 30%   31C    P8    11W / 250W |      1MiB / 11019MiB |      0%      Default |
|       

Set environment variables

In [12]:
#os.environ["train_path"] = train_path
#os.environ["eval_path"] = eval_path
os.environ["CUDA_LAUNCH_BLOCKING"]='1'  #Makes for easier debugging (just in case)
weights_dir = "/home/leonardovida/dev/dev/hist-aware/notebooks/models/bert-training/weights"
#!mkdir {weights_dir}

In [13]:
torch.cuda.is_available()

True

In [14]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6, 
    type_vocab_size=1, # The vocabulary size of the token_type_ids passed
)

Paper BERTje
--max_predictions_per_seq=20 \
  --train_batch_size=256 \
  --eval_batch_size=32 \
  --learning_rate=1e-4 \
  --num_train_steps=1000000 \
  --num_warmup_steps=10000 \
  --save_checkpoints_steps=10000 \
  --iterations_per_loop=10000 \
  --max_eval_steps=10000 \

In [15]:
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_dir, max_len=512)

In [16]:
model = RobertaForMaskedLM(config=config)

In [18]:
model.num_parameters()
# => 84 million parameters

83504416

### Create Dataset

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)

In [37]:
def prepare_data(dataset_f: str,
                 tokenizer: PreTrainedTokenizer,
                 batch_size: int = 16,
                 num_workers: int = 2) -> Dict[str, DataLoader]:
    """Given an input file, prepare the train, test, validation dataloaders.
       The created datasets will be preprocessed and save to disk.
    :param dataset_f: input file
    :param tokenizer: pretrained tokenizer that will prepare the data, i.e. convert tokens into IDs
    :param batch_size: batch size for the dataloaders
    :param num_workers: number of CPU workers to use during dataloading. On Windows this must be zero
    :return: a dictionary containing train, test, validation dataloaders
    """

    def collate(batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
        """Collates gathered items to form a batch which is then used in training, evaluation, or testing.
        :param batch: a list of samples from the dataset. Each sample is a dictionary with keys "input_ids".
        :return: the created batch with keys "input_ids"
        """
        all_input_ids = pad_sequence([item["input_ids"] for item in batch]).to(torch.long)

        return {"input_ids": all_input_ids}

    def preprocess(sentences: List[str]) -> Dict[str, Union[list, Tensor]]:
        """Preprocess the raw input sentences from the text file.
        :param sentences: a list of sentences (strings)
        :return: a dictionary of "input_ids"
        """
        tokens = [s.split() for s in sentences]

        # The sequences are not padded here. we leave that to the dataloader in collate
        # That means: a bit slower processing, but a smaller saved dataset size
        return tokenizer(tokens,
                         add_special_tokens=False,
                         return_token_type_ids=False,
                         return_attention_mask=False
                        )
    
    dataset = Dataset.from_dict({"text": Path(dataset_f).read_text(encoding="utf-8").splitlines()})

    # Split the dataset into train, test, dev
    # 90% (train), 10% (test + validation)
    train_testvalid = dataset.train_test_split(test_size=0.1)
    # 10% of total (test), 10% of total (validation)
    test_valid = train_testvalid["test"].train_test_split(test_size=0.5)

    dataset = DatasetDict({"train": train_testvalid["train"],
                           "test": test_valid["test"],
                           "valid": test_valid["train"]})

    dataset = dataset.map(preprocess, input_columns=["text"], batched=True)
    dataset.set_format("torch", columns=["input_ids"])

    return {partition: DataLoader(ds,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=collate,
                                  num_workers=num_workers,
                                  pin_memory=True) for partition, ds in dataset.items()}

In [38]:
prepare_data(dataset_f = "/home/leonardovida/data-histaware/articles0_50000.txt",
    tokenizer = tokenizer,
    batch_size = 16,
    num_workers = 2)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

### Create Data Collator

To make backpropagation on

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Initalize Trainer

In [None]:
training_args = TrainingArguments(
    output_dir="home/leonardovida/data-histaware/delphBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
%%time
trainer.train()

#### Save model

In [None]:
trainer.save_model("./EsperBERTo")

### Verify learning

Create a fill-mask pipeline to check whether the model learned anything useful

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./EsperBERTo",
    tokenizer="./EsperBERTo"
)

In [None]:
fill_mask("La suno <mask>.")

### Upload the model to HuggingFace

Write a README.md model card and add it to the repository under `model_cards/`. Your model card should ideally include:
* a model description,
* training params (dataset, preprocessing, hyperparameters), 
* evaluation results,
* intended uses & limitations
* whatever else is helpful! 🤓

In [None]:
#transformers-cli upload