In [1]:
# in this notebook we'll only get one of the files (the Oscar one) for the sake of simplicity and performance
!wget -c https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt

--2025-05-18 11:18:25--  https://cdn-datasets.huggingface.co/EsperBERTo/data/oscar.eo.txt
Resolving cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)... 18.155.68.60, 18.155.68.58, 18.155.68.74, ...
Connecting to cdn-datasets.huggingface.co (cdn-datasets.huggingface.co)|18.155.68.60|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 312733741 (298M) [text/plain]
Saving to: ‘oscar.eo.txt’


2025-05-18 11:18:30 (65.3 MB/s) - ‘oscar.eo.txt’ saved [312733741/312733741]



# Train a Tokenizer

In [2]:
# We won't need TensorFlow here
!pip uninstall -y tensorflow
# Install `transformers` from master
!pip install git+https://github.com/huggingface/transformers
!pip list | grep -E 'transformers|tokenizers'
# transformers version at notebook update --- 2.11.0
# tokenizers version at notebook update --- 0.8.0rc1

Found existing installation: tensorflow 2.18.0
Uninstalling tensorflow-2.18.0:
  Successfully uninstalled tensorflow-2.18.0
Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-lk_lufu1
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-lk_lufu1
  Resolved https://github.com/huggingface/transformers to commit 40a493c7ed4f19f08eadb0639cf26d49bfa5e180
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.52.0.dev0-py3-none-any.whl size=11327449 sha256=5112d1498a1c1f5e6790ee1af92a462046c9fec12aa0a1d6dac7f8979ae328a9
  Stored in directory: /tmp/pip-

In [3]:
%%time
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer

paths=[str(x)for x in Path('.').glob('**/*.txt')]

# Initialize the tokenzer
tokenizer=ByteLevelBPETokenizer()

# Customize the training
tokenizer.train(files=paths,vocab_size=52_000,min_frequency=2,special_tokens=['<s>','<pad>','</s>','<unk>','<mask>'])

CPU times: user 15min 52s, sys: 5.74 s, total: 15min 58s
Wall time: 9min 43s


In [4]:
# Save the file to disk
!mkdir EsperBERTo
tokenizer.save_model('EsperBERTo')


['EsperBERTo/vocab.json', 'EsperBERTo/merges.txt']

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer=ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",)

In [6]:
tokenizer._tokenizer.post_processor=BertProcessing(
    ('</s>',tokenizer.token_to_id('</s>')),
    ('<s>',tokenizer.token_to_id('<s>'))
)
tokenizer.enable_truncation(max_length=512)

In [7]:
tokenizer.encode("Mi estas Julien.")

Encoding(num_tokens=7, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [8]:
tokenizer.encode("Mi estas Julien.").tokens

['<s>', 'Mi', 'Ġestas', 'ĠJuli', 'en', '.', '</s>']

In [9]:
# Check that we have a GPU
!nvidia-smi

Sun May 18 11:29:21 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   44C    P8             10W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [10]:
# We will define model config
from transformers import RobertaConfig

config=RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=512,
    num_hidden_layers=6,
    num_attention_heads=12,
    type_vocab_size=1
)

In [11]:
from transformers import RobertaTokenizerFast

tokenizer=RobertaTokenizerFast.from_pretrained('./EsperBERTo',max_len=512)

In [12]:
from transformers import RobertaForMaskedLM
model=RobertaForMaskedLM(config=config)

In [13]:
model.num_parameters()

83502880

In [None]:
%%time
from transformers import LineByLineTextDataset

dataset=LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path='./oscar.eo.txt',
    block_size=128
)



In [None]:
from transformers import DataCollatorForLanguageModeling
data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=True,mlm_probability=0.15)


In [None]:
# Initialize our trainer
from transformers import Trainer,TrainingArguments

training_args=TrainingArguments(
    output_dir='./EsperBERTo',
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_gpu_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True
)
trainer=Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset
)

In [None]:
%%time
trainer.train()