In [1]:
import torch

In [2]:
for idx in range(torch.cuda.device_count()):
    print(idx, torch.cuda.get_device_name(idx))
device = torch.device('cuda:0')

0 GeForce RTX 3090


In [4]:
model_path = 'models/politicalHerBERT'

In [4]:
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

paths = [str(x) for x in Path(".").glob("**/*.txt")]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])


tokenizer.save_model(model_path)

['models/politicalBERT\\vocab.json', 'models/politicalBERT\\merges.txt']

In [5]:
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


tokenizer = ByteLevelBPETokenizer(
    model_path+"/vocab.json",
    model_path+"/merges.txt",
)

tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

tokenizer.encode("Jestem konserwatywny, ale lubię gejów.").tokens

['<s>',
 'Jestem',
 'Ġkonserwa',
 'tywny',
 ',',
 'Ġale',
 'ĠlubiÄĻ',
 'ĠgejÃ³w',
 '.',
 '</s>']

In [16]:
from transformers import XLMTokenizer, RobertaForMaskedLM, RobertaConfig

config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

#tokenizer = XLMTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
model = RobertaForMaskedLM(config=config)

from transformers import RobertaTokenizerFast

#tokenizer = RobertaTokenizerFast.from_pretrained(model_path, max_len=512)

In [5]:
from transformers import XLMTokenizer, RobertaModel, RobertaForMaskedLM, RobertaTokenizerFast

tokenizer="allegro/herbert-klej-cased-tokenizer-v1"
embed_model="allegro/herbert-klej-cased-v1"

tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer)
model = RobertaForMaskedLM.from_pretrained(embed_model, return_dict=True)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at allegro/herbert-klej-cased-v1 and are newly initialized: ['lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="twitter_data/data/texts_only.txt",
    block_size=128,
)

In [7]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [8]:
from transformers import Trainer, TrainingArguments

tr_args = TrainingArguments(
    output_dir=model_path,
    overwrite_output_dir=True,
    num_train_epochs=30,
    per_device_train_batch_size=32,
    save_steps=10_000,
    save_total_limit=10,
)

trainer = Trainer(
    model=model,
    args=tr_args,
    data_collator=data_collator,
    train_dataset=dataset,
    #prediction_loss_only=True,
)

In [9]:
trainer.train()

Step,Training Loss
500,4.517534
1000,3.324793
1500,2.973333
2000,2.779016
2500,2.618653
3000,2.539037
3500,2.454619
4000,2.345311
4500,2.291287
5000,2.25948


KeyboardInterrupt: 

In [10]:
trainer.save_model(model_path)

In [15]:
from transformers import pipeline

fill_mask = pipeline(
    "fill-mask",
    model=model_path,
    tokenizer="allegro/herbert-klej-cased-tokenizer-v1"#model_path
)
fill_mask("Czy ja <mask> socjalistą?")

Some weights of RobertaModel were not initialized from the model checkpoint at models/politicalHerBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


[{'sequence': '<s>Czy ja iesocjalistą? </s>',
  'score': 0.1482255458831787,
  'token': 1639,
  'token_str': 'ie'},
 {'sequence': '<s>Czy ja stsocjalistą? </s>',
  'score': 0.12938810884952545,
  'token': 725,
  'token_str': 'st'},
 {'sequence': '<s>Czy ja Äsocjalistą? </s>',
  'score': 0.041076239198446274,
  'token': 49385,
  'token_str': 'Ä'},
 {'sequence': '<s>Czy ja kosocjalistą? </s>',
  'score': 0.03023802489042282,
  'token': 267,
  'token_str': 'ko'},
 {'sequence': '<s>Czy ja osocjalistą? </s>',
  'score': 0.028940103948116302,
  'token': 178,
  'token_str': 'o'}]

In [13]:
from transformers import RobertaTokenizerFast, RobertaModel

tokenizer = RobertaTokenizerFast.from_pretrained("models/politicalBERT")
embed_model = RobertaModel.from_pretrained("models/politicalBERT", return_dict=True)

text = 'Nie lubię konserwatystów'

encoded = tokenizer(text, return_tensors='pt', padding=True)
#encoded = {k: v.to(next(self.parameters()).device) for k, v in encoded.items()}
embeddings = embed_model(**encoded)['pooler_output'].float()
print(embeddings)

Some weights of RobertaModel were not initialized from the model checkpoint at models/politicalBERT and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[ 1.3988e-01, -4.8242e-01, -4.4797e-01,  1.2004e-01,  2.8562e-01,
         -2.7454e-01, -1.0131e-01, -4.6635e-01, -5.8219e-01, -2.0793e-01,
          4.1097e-01, -3.4779e-01,  1.0638e-01, -1.8089e-01, -5.7358e-01,
          1.9686e-01, -3.3328e-01, -8.5908e-01,  1.2199e-01,  1.9801e-01,
          5.2861e-01, -3.0298e-01, -5.6212e-01, -8.1941e-01,  1.6757e-01,
          1.6900e-01, -8.4392e-01,  4.3225e-01,  6.0221e-01, -9.6487e-01,
          4.7043e-01, -5.1503e-01, -2.2288e-01, -7.1512e-01, -3.3719e-01,
         -1.3765e-01,  3.0455e-01,  3.0996e-01,  6.3490e-01, -4.4972e-01,
         -2.0741e-02,  4.6612e-01, -8.7157e-01, -6.4219e-01, -2.1672e-01,
          5.6638e-01,  4.9022e-01, -8.5509e-02, -8.9402e-01,  5.1555e-01,
         -2.6077e-01, -4.7545e-01, -1.9073e-01,  4.5381e-01, -3.2618e-01,
          2.5455e-01, -4.2948e-01,  1.8751e-01,  4.4860e-02, -6.3936e-01,
          7.6363e-01, -3.2402e-01, -5.1184e-01, -2.9143e-01,  3.0719e-01,
         -7.0315e-01,  1.3743e-01, -2.