# Set Device

In [1]:
import torch

_device = "cpu"
if torch.cuda.is_available():
    _device = "cuda"
elif torch.backends.mps.is_available():
    assert torch.backends.mps.is_built(), "Found Torch MPS backend, but was not built"
    _device = "mps"

print(f"Using Torch with device '{_device}'.")
device = torch.device(_device)


Using Torch with device 'mps'.


# Load Tokenizer

In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("staten_generaal_19xx_tokenizer")

# Load Dataset

In [3]:
CORPUS_FILE = "staten_generaal_19xx.txt.gz"

In [4]:
from datasets import Value, load_dataset, Features


# TODO: split into train/test?

sg_19xx = load_dataset(
    "text",
    data_files={"train": [CORPUS_FILE]},
    features=Features({"text": Value(dtype="string", id=None)}),
).with_format("torch", device=device)
sg_19xx


Using custom data configuration default-99800a452ce970ea
Found cached dataset text (/Users/carstenschnober/.cache/huggingface/datasets/text/default-99800a452ce970ea/0.0.0/cb1e9bd71a82ad27976be3b12b407850fe2837d80c22c5e03a28949843a8ace2)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 33933639
    })
})

In [5]:
sg_19xx["train"][1091552]

{'text': '21 DECEMBER 1915.X, Departement vun Landbouw, Nijverheid en Handel.)'}

In [6]:
sg_19xx["train"][5041350]

{'text': 'De Nederlandse regering heeft tijdens de voorbereidende maritieme conferen-tie -in mei 1986 -dit voorstel afgewezen.'}

In [7]:
tokenizer(sg_19xx["train"][5041350]["text"])

{'input_ids': [0, 349, 1190, 1079, 419, 2744, 268, 14046, 19414, 545, 3129, 17, 1009, 548, 265, 2689, 2422, 548, 4037, 875, 6552, 18, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

## Tokenize Dataset

In [8]:
import os


dataset = sg_19xx.map(lambda examples: tokenizer(examples["text"]), batched=True, num_proc=os.cpu_count())
#dataset

                    

#0:   0%|          | 0/3394 [00:00<?, ?ba/s]

#1:   0%|          | 0/3394 [00:00<?, ?ba/s]

#2:   0%|          | 0/3394 [00:00<?, ?ba/s]

#4:   0%|          | 0/3394 [00:00<?, ?ba/s]

#6:   0%|          | 0/3394 [00:00<?, ?ba/s]

#3:   0%|          | 0/3394 [00:00<?, ?ba/s]

#9:   0%|          | 0/3394 [00:00<?, ?ba/s]

#7:   0%|          | 0/3394 [00:00<?, ?ba/s]

#8:   0%|          | 0/3394 [00:00<?, ?ba/s]

#5:   0%|          | 0/3394 [00:00<?, ?ba/s]

# Language Model

## Initialize model

In [9]:
BASE_MODEL = "DTAI-KULeuven/robbertje-1-gb-non-shuffled"

In [10]:
from transformers import AutoModelForMaskedLM, AutoConfig

config = AutoConfig.from_pretrained(BASE_MODEL)
config

RobertaConfig {
  "_name_or_path": "DTAI-KULeuven/robbertje-1-gb-non-shuffled",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "output_hidden_states": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 40000
}

In [11]:
model = AutoModelForMaskedLM.from_config(config)
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(40000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNor

## Data Collator

In [12]:
from transformers import DataCollatorForLanguageModeling

collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True)

## Set up Trainer

In [16]:
from transformers import Trainer

trainer = Trainer(
    model,
    train_dataset=dataset["train"],
    tokenizer=tokenizer,
    data_collator=collator,
    #remove_unused_columns=False,
)

trainer


No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


<transformers.trainer.Trainer at 0x7fde197d3cd0>

In [17]:
from transformers import set_seed

set_seed(0)

In [18]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForMaskedLM.forward` and have been ignored: text. If text are not expected by `RobertaForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 33933639
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 12725115
  Number of trainable parameters = 74276416


  0%|          | 0/12725115 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
