# Pre-training DelphBERT - A tiny Transformer model based on a large newspaper dataset

* //TODO: Copy description of project from Github
* //TODO: Description of data 

In [71]:
import glob
import os
from pathlib import Path

from typing import Any, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union

from tokenizers import Tokenizer
from tokenizers import ByteLevelBPETokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
from transformers import RobertaConfig
from transformers import RobertaTokenizerFast
from transformers import PreTrainedTokenizer
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
from transformers import TrainingArguments
from transformers import RobertaForMaskedLM
from datasets import Dataset, DatasetDict

import torch
from torch import Tensor
from torch.utils.data import DataLoader

import numpy as np
import pandas as pd
from tqdm.notebook import trange, tqdm

## Load Data

First, we need to load the entire "processed" library of Delpher newspaper on the transformer.

* // TODO: even though we are loading clean and unclean text, we are training using the **unclean** text, as Transformers want complete sentences and the clean text does not have stopwords nor lower-upper cases.
    * Think testing the performance training on semi-clean text

In [43]:
PATH_RAW_FILES = "/home/leonardovida/data-histaware/raw/raw_merged/"
PATH_TOKENIZER_DIR = "/home/leonardovida/dev/hist-aware/notebooks/models/bert-training-from-scratch/tokenizer"
PATH_DATASET_DIR = "/home/leonardovida/dev/hist-aware/notebooks/models/bert-training-from-scratch/dataset"
#!mkdir PATH_MODEL_DIR

## Preprocess data

### Preprocess from .csv into .txt

In [2]:
import enchant
broker = enchant.Broker()
broker.describe()
broker.list_languages()

['en_US', 'en', 'en_AU', 'en_CA', 'en_GB']

In [3]:
import re

import enchant
import nltk

class TextCleaner:
    def __init__(self):
        #self.d = enchant.Dict("nl_NL")
        self.stopword_list = nltk.corpus.stopwords.words("dutch")
        self.STOPWORDS = set(self.stopword_list)
        
    def get_words(self):
        self.text = " ".join([c for c in nltk.word_tokenize(self.text)])
        return self

    def lower(self):
        """Transform to lower case."""
        self.text = "".join([t.lower() for t in self.text])
        return self

    def remove_stopwords(self):
        """Remove the stopwords."""
        self.text = "".join([t for t in self.text if t not in self.STOPWORDS])
        return self

    def remove_numeric(self):
        """Remove numbers."""
        self.text = "".join([c for c in self.text if not c.isdigit()])
        return self

    def remove_non_ascii(self):
        """Remove non ASCII chars."""
        self.text = "".join([re.sub(r"[^\x00-\x7f]", r" ", c) for c in self.text])
        return self

    def remove_extra_whitespace_tabs(self):
        """Remove extra whitespaces and tabs."""
        self.text = re.sub(r"^\s*|\s\s*", " ", self.text).strip()
        return self

    def remove_one_char(self):
        self.text = " ".join([w for w in self.text.split() if len(w) > 1])
        return self

    def remove_non_words(self):
        """Remove rare words."""
        self.text = " ".join(
            [word for word in str(self.text).split() if self.d.check(word)]
        )
        return self

    def keep_standard_chars(self):
        self.text = "".join([re.sub(r"[^-0-9\w,. ?!()%/]", r"", c) for c in self.text])
        return self

    def preprocess(self, text):
        self.text = text
        self = self.get_words()
        self = self.lower()
        self = self.remove_stopwords()
        self = self.remove_numeric()
        self = self.remove_extra_whitespace_tabs()
        self = self.remove_one_char()
        self = self.remove_non_words()
        return self.text

    def clean(self, text):
        self.text = text
        self = self.get_words()
        self = self.keep_standard_chars()
        self = self.remove_extra_whitespace_tabs()
        return self.text

In [4]:
from tqdm.notebook import tqdm, tqdm_notebook
from loguru import logger

def process_selected_articles(path):
    tqdm_notebook().pandas()
    csv_temp = []
    # Create preprocessing class
    tc = TextCleaner()

    # Load merged articles for selected topic in nlp_pipeline
    df = pd.read_csv(path)
         
    # Initial clean
    df.reset_index(inplace=True)
    df.drop(
        columns={
            "index",
        },
        inplace=True,
    )

    # Split p into original paragraphs
    logger.debug(
        f"Articles before selecting 'articles': {df.shape[0]}"
    )
    df = df[df["subject"] == "artikel"]
    df["p"] = df.apply(lambda row: repr(row["p"]).split("\\',"), axis=1)
    logger.debug(
        f"Articles after selecting 'articles': {df.shape[0]}"
    )
    df = df.explode("p")
    logger.debug(
        f"Articles after splitting into paragraphs: {df.shape[0]}"
    )

    # Preprocess p to cleaner p for Tokenizer and transformers
    res = df["p"].progress_apply(tc.clean)

    # Eliminate paragraphs that do not contain anything
    res.dropna(inplace=True)
    
    # Save to .txt  
    base = os.path.basename(path)
    name = os.path.splitext(base)[0]
    res.to_csv(f'/home/leonardovida/data-histaware/raw/raw_merged/{name}.txt', header=None, index=None, sep=' ', mode='a')
    
    logger.debug(f"Completed: {path}")

### Convert from _.csv_ to _.txt_ - Do it just once though

In [6]:
# Find path to csv files with processed data
#Path().parent.absolute()
paths = [str(x) for x in Path(PATH_RAW_FILES).glob("*.csv")]
paths = paths[2:]
paths

['/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_40.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_60.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_80.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_120.csv',
 '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_100.csv']

In [None]:
# Create txt files for "processed" data
for path in tqdm(paths, total=len(paths)):
    process_selected_articles(path)

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

2021-04-10 14:31:52.863 | DEBUG    | __main__:process_selected_articles:23 - Articles before selecting 'articles': 957008
2021-04-10 14:32:03.730 | DEBUG    | __main__:process_selected_articles:28 - Articles after selecting 'articles': 567544
2021-04-10 14:32:07.279 | DEBUG    | __main__:process_selected_articles:32 - Articles after splitting into paragraphs: 1225204


HBox(children=(FloatProgress(value=0.0, max=1225204.0), HTML(value='')))

### Load .txt files into one (to be changed)

In [None]:
text_files = [str(x) for x in Path(PATH_RAW_FILES).glob("*.txt")]
df = pd.DataFrame()
for file in text_files:
    temp = pd.read_csv(f"{PATH_RAW_FILES}/merged_1970s_20.txt", delimiter = "\t", header=None)
    df = pd.concat([df, temp], axis=0)

In [None]:
df.shape[0]

## Load data into Dataset directly from .csv

In [73]:
import datasets

csv_paths = [str(x) for x in Path(PATH_RAW_FILES).glob("*.csv")]
dataset = datasets.load_dataset(
    "csv",
    data_files = csv_paths
)

Using custom data configuration default-1765ec5baa25fcb6
Reusing dataset csv (/home/leonardovida/.cache/huggingface/datasets/csv/default-1765ec5baa25fcb6/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


Remove columns that are not necessary in the training

In [74]:
dataset = dataset.remove_columns(['Unnamed: 0', 'article_name', 'date', 'index_article',
                                  'article_filepath', 'dir', 'title', 'access_rights',
                                  'identifier', 'metadata_title', 'index_metadata', 'metadata_filepath',
                                  'newspaper_title', 'newspaper_date', 'newspaper_publisher', 'newspaper_source',
                                  'newspaper_volume', 'newspaper_issuenumber', 'newspaper_recordIdentifier',
                                  'transformedRecordIdentifier'])

In [75]:
dataset = dataset.filter(lambda article: article['subject'] == "artikel")

Loading cached processed dataset at /home/leonardovida/.cache/huggingface/datasets/csv/default-1765ec5baa25fcb6/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0/cache-8a85db9cb5d9cb6e.arrow


In [None]:
dataset["train"]["p"][0]

In [91]:
import nltk
" ".join([c for c in nltk.word_tokenize(dataset["train"]["p"][0])])

"[ 'SNEEK — Wij , hier in Nederland , moeten een les trekken uit wat er nu in Chili is gebeurd , zei de heer Piet Reekman , leider van de Sjaloomgroep in Odijk , gisteravond in de Ichtuskerk in Sneek . Wij moeten uit al die dingen die zijn gebeurd om Allende en zijn volksbeweging tegen te werken , op opzij te zetten en uit te moorden , iets leren hier in Nederland . Want als we niet oppassen worden de democratische krachten die hier streven voor een rechtvaardiger samenleving ook opzij gezet , zei hij . ' , 'De heer Reekman was naar Sneek uitgenodigd om over Chili te praten voor leden van de Rotonde-gespreksgroepen . Aan de hand van zijn inleiding zullen de komende weken discussieavonden worden gehouden . Aan het einde van zijn inleiding trok Piet Reekman gisteravond een parallel tussen de aarzelende ( iiristen-dernocratische partijen , in Chili en de christen-democratische partijen in Nederland , die volgens hem ook nooit een keuze durven maken . ' , 'het volk kan weer in armoede verv

In [None]:
def paragraph_clean(article):
    """Basic cleaning of paragraphs.
    
    More can and will be done in the tokenizing step.
    """
    text = "".join([re.sub(r"[^-0-9\w,. ?!()%/]", r"", c) for c in article])
    text = re.sub(r"^\s*|\s\s*", " ", text).strip()
    return {"p": text}
    
dataset = dataset.map(paragraph_clean)

HBox(children=(FloatProgress(value=0.0, max=4186207.0), HTML(value='')))

Divide each paragraph into sentences

In [None]:
import re
#from nltk.tokenize import sent_tokenize

def sentence_tokenize(article):
    """Tokenize sentences from paragraphs."""
    sents = re.split('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)(\s|[A-Z].*)', article["p"])
    return {"sentences": sents}

dataset = dataset.map(sentence_tokenize, num_proc=10)

Rebuild from sentences to shorter paragraphs

In [None]:
def unite(sentences, n):
    """Unite sentences previously split using nltk.tokenize."""
    count = [0]
    sents = []
    for sent in sentences:
        if sum(count) + len(sent.split()) > 400:
            return sents
        else:
            sents.append(sent)
            count.append(len(sent.split()))

dataset = dataset.map(unite, num_proc=10)

Split into train, test and validation and save

In [None]:
# Split 90% train
train_testvalid = dataset.train_test_split(test_size=0.1)
# Split valid into 50% valid and 50% test
test_valid = train_testvalid["test"].train_test_split(test_size=0.5)
# Gather everything into dataset
datasets = DatasetDict({
    "train": train_testvalid["train"],
    "test": test_valid["test"],
    "valid": test_valid["train"]})

In [None]:
dataset.save_to_disk(PATH_DATASET_DIR)

## Tokenizer

Load dataset

In [None]:
from datasets import load_from_disk
reloaded_encoded_dataset = load_from_disk(PATH_DATASET_DIR)

In [18]:
#!mkdir {tokenizer_dir}

BERT uses WordPiece

Here we normalize a LOT the text

In [22]:
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing

bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])

bert_tokenizer.pre_tokenizer = Whitespace()

bert_tokenizer.post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", 1),
        ("[SEP]", 2),
    ],
)

In [23]:
from tokenizers.trainers import WordPieceTrainer

trainer = WordPieceTrainer(
    vocab_size=50010, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)
bert_tokenizer.train(text_files, trainer)

bert_tokenizer.save(f"{tokenizer_dir}/delphbert.json")

In [None]:
#When dataset is ready
#def batch_iterator(batch_size=1000):
#    for i in range(0, len(dataset), batch_size):
#        yield dataset[i : i + batch_size]["text"]
#tokenizer.train_from_iterator(batch_iterator(), trainer=trainer, length=len(dataset))

### Additional tries

In [17]:
%%time 

paths = [str(x) for x in Path(PATH_RAW_FILES).glob("*.txt")]
print(f"Found {len(print(paths))} text files from which a tokenizer will be trained")

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

tokenizer.pre_tokenizer = Whitespace()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

['/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_20.txt', '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_140.txt', '/home/leonardovida/data-histaware/raw/raw_merged/merged_1970s_40.txt']
CPU times: user 19min 45s, sys: 25.1 s, total: 20min 10s
Wall time: 3min 16s


In [21]:
# Save just json
tokenizer.save_model(tokenizer_dir)
# Save both vocab and merges
tokenizer.save_pretrained(tokenizer_dir)
#tokenizer.save(f"{tokenizer_dir}/vocab.json", f"{tokenizer_dir}/merges.txt")

AttributeError: 'ByteLevelBPETokenizer' object has no attribute 'save_pretrained'

### Load the tokenizer

In [27]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(f"{tokenizer_dir}/delphbert.json")

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


### Test tokenizer

In [None]:
#encoded_dataset = dataset.map(lambda article: tokenizer(article['p']), batched=True)

### Find max length articles

Go through the entire dataset and find the max lengths

In [None]:
token_lens = []

for txt in df.content:
    tokens = tokenizer.encode(txt, max_length=512)
    token_lens.append(len(tokens)

In [3]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

NameError: name 'tokenizer' is not defined

Test tokenizer performance

In [15]:
tokenizer.encode('Waarom niet met dédain over Schilder, zoals over zoveel anderen? Hij wist zich tegenstander: Barth stond tussen beiden. Maar Miskotte wist zich in dezelfde tijd te staan, in dezelfde storm, die ook Schilder onderging. Soms is er opvallende affiniteit. Hij kende de dichters van Nederland, zoals ook Schilder: Dèr Mouw (Adwaita), Nijhof f en Marsman. Hij stond, denk ik, geestelijk ook aanzienlijk dichter bij hen. Hij had de Nederlandse taal en het Nederlands land lief: „De nederlandse taal en het geboomte van dit land, deze twee gewassen zijn mijn aardse heerlijkheid" (329). Al zou het alléén déze zin zijn - daarvoor is lectuur van meer dan 600 bladzijden geen te hoge prijs! Dit Nederland werd platgedrukt door traditionele kerkelijkheid en bloedde weg in dood geloof. Dit Nederland werd besprongen door duistere machten. Zou het Woord (zoals Barth het verstond) geen nieuwe glans kunnen leggen op het eigen leven, op het volk, waaronder hij werkte?').tokens

['<s>',
 'Waarom',
 'Ġ',
 'niet',
 'Ġ',
 'met',
 'Ġ',
 'd',
 'Ã',
 '©',
 'da',
 'in',
 'Ġ',
 'over',
 'Ġ',
 'Schilder',
 ',',
 'Ġ',
 'zoals',
 'Ġ',
 'over',
 'Ġ',
 'zoveel',
 'Ġ',
 'anderen',
 '?',
 'Ġ',
 'Hij',
 'Ġ',
 'wist',
 'Ġ',
 'zich',
 'Ġ',
 'tegenstander',
 ':',
 'Ġ',
 'Barth',
 'Ġ',
 'stond',
 'Ġ',
 'tussen',
 'Ġ',
 'beiden',
 '.',
 'Ġ',
 'Maar',
 'Ġ',
 'Mis',
 'k',
 'otte',
 'Ġ',
 'wist',
 'Ġ',
 'zich',
 'Ġ',
 'in',
 'Ġ',
 'dezelfde',
 'Ġ',
 'tijd',
 'Ġ',
 'te',
 'Ġ',
 'staan',
 ',',
 'Ġ',
 'in',
 'Ġ',
 'dezelfde',
 'Ġ',
 'storm',
 ',',
 'Ġ',
 'die',
 'Ġ',
 'ook',
 'Ġ',
 'Schilder',
 'Ġ',
 'onderging',
 '.',
 'Ġ',
 'Soms',
 'Ġ',
 'is',
 'Ġ',
 'er',
 'Ġ',
 'opvallende',
 'Ġ',
 'affin',
 'iteit',
 '.',
 'Ġ',
 'Hij',
 'Ġ',
 'kende',
 'Ġ',
 'de',
 'Ġ',
 'dichters',
 'Ġ',
 'van',
 'Ġ',
 'Nederland',
 ',',
 'Ġ',
 'zoals',
 'Ġ',
 'ook',
 'Ġ',
 'Schilder',
 ':',
 'Ġ',
 'D',
 'Ã',
 '¨',
 'r',
 'Ġ',
 'M',
 'ouw',
 'Ġ',
 '(',
 'Ad',
 'wa',
 'ita',
 ')',
 ',',
 'Ġ',
 'Nij',
 'hof',
 'Ġ',

### Train model

In [16]:
!nvidia-smi

Sat Apr 10 16:28:52 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 455.32.00    Driver Version: 455.32.00    CUDA Version: 11.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 208...  On   | 00000000:00:05.0 Off |                  N/A |
| 32%   29C    P8     4W / 250W |      1MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  On   | 00000000:00:06.0 Off |                  N/A |
| 31%   28C    P8     1W / 250W |      1MiB / 11019MiB |      0%      Default |
|       

Set environment variables

In [18]:
#os.environ["train_path"] = train_path
#os.environ["eval_path"] = eval_path
os.environ["CUDA_LAUNCH_BLOCKING"]='1'  #Makes for easier debugging (just in case)
weights_dir = "/home/leonardovida/dev/hist-aware/notebooks/models/bert-training-from-scratch/weights"
#!mkdir {weights_dir}

In [19]:
torch.cuda.is_available()

True

In [20]:
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=512,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1, # The vocabulary size of the token_type_ids passed
)

Paper BERTje
--max_predictions_per_seq=20 \
  --train_batch_size=256 \
  --eval_batch_size=32 \
  --learning_rate=1e-4 \
  --num_train_steps=1000000 \
  --num_warmup_steps=10000 \
  --save_checkpoints_steps=10000 \
  --iterations_per_loop=10000 \
  --max_eval_steps=10000 \

In [27]:
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_dir, max_len=512)

In [28]:
model = RobertaForMaskedLM(config=config)

In [23]:
model.num_parameters()
# => 84 million parameters

83502880

### Create Dataset

In [4]:
from torch.utils.data import Dataset

class DelphDataset(Dataset):
    def __init__(self, evaluate: bool = False):
        tokenizer = ByteLevelBPETokenizer(
            tokenizer_dir + "/vocab.json",
            tokenizer_dir + "/merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        tokenizer.enable_truncation(max_length=512)
        # or use the RobertaTokenizer from `transformers` directly.

        self.examples = []

        #src_files = Path(PATH_RAW_FILES).glob("*-eval.txt") if evaluate else Path(PATH_RAW_FILES).glob("*-train.txt")
        src_files = Path(PATH_RAW_FILES).glob("*.txt")
        for src_file in src_files:
            print("🔥", src_file)
            lines = src_file.read_text(encoding="utf-8").splitlines()
            self.examples += [x.ids for x in tokenizer.encode_batch(lines)]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        # We’ll pad at the batch level.
        return torch.tensor(self.examples[i])

In [5]:
dataset = DelphDataset()

NameError: name 'tokenizer_dir' is not defined

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./oscar.eo.txt",
    block_size=128,
)

Test

In [43]:
files = [str(x) for x in Path(PATH_RAW_FILES).glob("*.txt")]

from datasets import load_dataset
from transformers import PreTrainedTokenizerFast

dataset = load_dataset('text', data_files=files, split='train')

# Load it using transformers
tokenizer = PreTrainedTokenizerFast(tokenizer_file=f"{tokenizer_dir}/byte-level-BPE.tokenizer.json")

def encode(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding='max_length',
        # TODO: padding 
        max_length=512,
    )

dataset = dataset.map(encode, batched=True)
dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

dataloader = torch.utils.data.DataLoader(dataset, batch_size=4)
next(iter(dataloader))

Using custom data configuration default-4720c1f9f258d4c1
Reusing dataset text (/home/leonardovida/.cache/huggingface/datasets/text/default-4720c1f9f258d4c1/0.0.0/e16f44aa1b321ece1f87b07977cc5d70be93d69b20486d6dacd62e12cf25c9a5)
Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


HBox(children=(FloatProgress(value=0.0, max=3869.0), HTML(value='')))




RuntimeError: stack expects each tensor to be equal size, but got [202] at entry 0 and [157] at entry 1

In [37]:
def prepare_data(dataset_f: str,
                 tokenizer: PreTrainedTokenizer,
                 batch_size: int = 16,
                 num_workers: int = 2) -> Dict[str, DataLoader]:
    """Given an input file, prepare the train, test, validation dataloaders.
       The created datasets will be preprocessed and save to disk.
    :param dataset_f: input file
    :param tokenizer: pretrained tokenizer that will prepare the data, i.e. convert tokens into IDs
    :param batch_size: batch size for the dataloaders
    :param num_workers: number of CPU workers to use during dataloading. On Windows this must be zero
    :return: a dictionary containing train, test, validation dataloaders
    """

    def collate(batch: List[Dict[str, Tensor]]) -> Dict[str, Tensor]:
        """Collates gathered items to form a batch which is then used in training, evaluation, or testing.
        :param batch: a list of samples from the dataset. Each sample is a dictionary with keys "input_ids".
        :return: the created batch with keys "input_ids"
        """
        all_input_ids = pad_sequence([item["input_ids"] for item in batch]).to(torch.long)

        return {"input_ids": all_input_ids}

    def preprocess(sentences: List[str]) -> Dict[str, Union[list, Tensor]]:
        """Preprocess the raw input sentences from the text file.
        :param sentences: a list of sentences (strings)
        :return: a dictionary of "input_ids"
        """
        tokens = [s.split() for s in sentences]

        # The sequences are not padded here. we leave that to the dataloader in collate
        # That means: a bit slower processing, but a smaller saved dataset size
        return tokenizer(tokens,
                         add_special_tokens=False,
                         return_token_type_ids=False,
                         return_attention_mask=False
                        )
    
    dataset = Dataset.from_dict({"text": Path(dataset_f).read_text(encoding="utf-8").splitlines()})

    # Split the dataset into train, test, dev
    # 90% (train), 10% (test + validation)
    train_testvalid = dataset.train_test_split(test_size=0.1)
    # 10% of total (test), 10% of total (validation)
    test_valid = train_testvalid["test"].train_test_split(test_size=0.5)

    dataset = DatasetDict({"train": train_testvalid["train"],
                           "test": test_valid["test"],
                           "valid": test_valid["train"]})

    dataset = dataset.map(preprocess, input_columns=["text"], batched=True)
    dataset.set_format("torch", columns=["input_ids"])

    return {partition: DataLoader(ds,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  collate_fn=collate,
                                  num_workers=num_workers,
                                  pin_memory=True) for partition, ds in dataset.items()}

In [38]:
prepare_data(dataset_f = "/home/leonardovida/data-histaware/articles0_50000.txt",
    tokenizer = tokenizer,
    batch_size = 16,
    num_workers = 2)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

### Create Data Collator

To make backpropagation on

In [25]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

### Initalize Trainer

In [26]:
training_args = TrainingArguments(
    output_dir="home/leonardovida/data-histaware/delphBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

NameError: name 'dataset' is not defined

In [None]:
%%time
trainer.train()

#### Save model

In [None]:
trainer.save_model("./EsperBERTo")

### Verify learning

Create a fill-mask pipeline to check whether the model learned anything useful

In [None]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model="./EsperBERTo",
    tokenizer="./EsperBERTo"
)

In [None]:
fill_mask("La suno <mask>.")

### Upload the model to HuggingFace

Write a README.md model card and add it to the repository under `model_cards/`. Your model card should ideally include:
* a model description,
* training params (dataset, preprocessing, hyperparameters), 
* evaluation results,
* intended uses & limitations
* whatever else is helpful! 🤓

In [None]:
#transformers-cli upload