In [1]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained("readerbench/RoBERT-small")
model_emotional = BertForMaskedLM.from_pretrained("readerbench/RoBERT-small")

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/245k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/77.9M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [2]:
text = """Intr-o dimineata senina, cand soarele inca se ridica deasupra orizontului, o pasare cantatoare si-a inceput 
trilurile sale melodioase. In mijlocul unei paduri dense, animalele isi incepeau ziua in liniste. Un vant usor adia 
printre crengile copacilor, iar frunzele se miscau in dansul lor natural. In adancul padurii, un rau curgea lin, izvorand viata 
in jurul sau. Pe malul acestuia, o familie de iepuri se juca in iarba proaspata, fara griji sau framantari. Totul parea perfect 
in armonia sa simpla si pura. Dar, in spatele acestei scene idilice, se ascundeau si intamplari mai putin linistite."""

inputs = tokenizer.encode_plus(
    text,
    add_special_tokens=True,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_token_type_ids=False,
    return_attention_mask=True,
    return_tensors="pt"
)
inputs["labels"] = inputs.input_ids.detach().clone()

cls_token_id = 3
sep_token_id = 4
mask_token_id = 5

# generate a random number for each token to be associated with
masked_ids = torch.rand(inputs.input_ids.shape)
# the tokens that will be masked should have their random number < 0.15
# and should not be [CLS] or [SEP] tokens
masked_ids = (masked_ids < 0.15) * (inputs.input_ids != cls_token_id) * (inputs.input_ids != sep_token_id) * (inputs.input_ids != 0)
masked_ids = masked_ids.squeeze()
# get the actual ids that need to be transformed into [MASK] tokens
masked_ids = [index for index, is_maskable in enumerate(masked_ids) if is_maskable.item() is True]

inputs.input_ids[0, masked_ids] = mask_token_id

outputs = model_emotional(**inputs)

print(f"Indexes of the IDs that will be transformed into [MASK]: {masked_ids}")
print(outputs.loss)

Indexes of the IDs that will be transformed into [MASK]: [1, 15, 28, 32, 39, 41, 46, 52, 71, 81, 88, 98, 102, 104, 105, 114, 115, 122, 134, 140, 145]
tensor(16.3362, grad_fn=<NllLossBackward0>)


In [21]:
from torch.utils.data import Dataset, DataLoader

class RomanianCorpusDataset(Dataset):
    # ids for the special tokens, should be checked 
    # if they match with the tokenizer used
    cls_token_id = 3
    sep_token_id = 4
    mask_token_id = 5
    
    def __init__(self, texts: list, max_length: int = 512):
        # encode all the text from the texts list
        self.max_length = max_length
        self.inputs = [self.encode(text) for text in texts]
    
    def encode(self, text: str):
        # for training the MLM model, we only need: 
        # 'input_ids', 'token_type_ids', 'attention_mask' and 'labels'
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors="pt"
        )
        
        # set the labels to be equal to the original input_ids,
        # generated from the text that does not contain [MASK] tokens
        encoded_text["labels"] = encoded_text.input_ids.detach().clone()
        
        # get masked ids for the current encoded text
        masked_ids = self.get_masked_ids(encoded_text)
        
        # transform the tokens with the previously generated IDs
        # into [MASK] tokens
        encoded_text.input_ids[0, masked_ids] = self.mask_token_id
        
        return encoded_text
    
    def get_masked_ids(self, encoded_text):
        # generate a random number for each token to be associated with
        masked_ids = torch.rand(encoded_text.input_ids.shape)
        # the tokens that will be masked should have their random number < 0.15
        # and should not be [CLS], [SEP] or empty tokens,
        masked_ids = (masked_ids < 0.15) * (encoded_text.input_ids != self.cls_token_id) * (encoded_text.input_ids != self.sep_token_id) * (inputs.input_ids != 0)
        masked_ids = masked_ids.squeeze()
        # get the actual ids that need to be transformed into [MASK] tokens
        masked_ids = [index for index, is_maskable in enumerate(masked_ids) if is_maskable.item() is True]
        
        return masked_ids
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        return {**self.inputs[index]}
        

In [22]:
# load the romanian emotional dataset, that contains
# different texts from fiction
import pandas as pd

emotional_dataframe = pd.read_json("/kaggle/input/romanianfictionforbert/emotional.json")

In [23]:
max_emotional_texts = 3000
max_emotional_text_size = 100
emotional_texts = []

for text in emotional_dataframe.content:
    # stop condition for the whole extraction process, 
    # because it does not need to extract more texts 
    # if the maximum number of texts its reached
    if len(emotional_texts) >= max_emotional_texts:
        break
    
    splitted_text = text.split()
    
    for index in range(0, len(splitted_text), max_emotional_text_size):
        # stop condition for the whole extraction process, 
        # because it does not need to extract more texts 
        # if the maximum number of texts its reached
        if len(emotional_texts) >= max_emotional_texts:
            break
        
        # the last text might contain less words, 
        # so it is not needed
        if len(splitted_text[index:index + max_emotional_text_size]) < max_emotional_text_size:
            break
        
        mini_text = " ".join(splitted_text[index:index + max_emotional_text_size])  
        emotional_texts.append(mini_text)
        
print(f"Number of texts extracted, of size {max_emotional_text_size}: {len(emotional_texts)} texts")

Number of texts extracted, of size 100: 2979 texts


In [24]:
batch_size = 5

train_dataset = RomanianCorpusDataset(emotional_texts)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [26]:
from transformers import AdamW, get_linear_schedule_with_warmup
import torch

# set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device used for training and testing: {device.type}")

# move the MLM model to the device
model_emotional = model_emotional.to(device)

# set the optimizer
optimizer = AdamW(model_emotional.parameters(), lr=1e-4)

num_epochs = 4

# total number of training steps
num_train_steps = len(train_dataloader) * num_epochs

# scheduler (optional, for learning rate decay)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)

Device used for training and testing: cuda


In [27]:
from tqdm import tqdm

def train(
    model: BertForMaskedLM,
    dataloader: DataLoader,
    optimizer,
    device: torch.device,
    scheduler=None
) -> float:
    """
    ## Returns: epoch_loss
    - `epoch_loss`: float = overall loss for the epoch executed
    """
    model.train()
    epoch_loss = 0.0
    
    for index, batch in tqdm(enumerate(dataloader)):
        input_ids = batch["input_ids"].squeeze(1).to(device) 
        attention_mask = batch["attention_mask"].squeeze(1).to(device)
        labels = batch["labels"].squeeze(1).to(device)
        
        # reset gradients for model's parameters
        model.zero_grad()
        
        # feed forward
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        
        # compute gradients
        loss = outputs.loss
        loss.backward()
        
        # optimize model's parameters
        optimizer.step()
    
        epoch_loss += loss.item()
        
    return epoch_loss / len(dataloader)

In [28]:
num_epochs = 8

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    loss = train(model_emotional, train_dataloader, optimizer, device, scheduler)
    print(f"Loss: {loss}")
    print()

Epoch 1/8


596it [01:39,  6.02it/s]


Loss: 0.20422787600565676

Epoch 2/8


596it [01:39,  6.00it/s]


Loss: 0.16628909411046328

Epoch 3/8


596it [01:39,  5.99it/s]


Loss: 0.14190731558873748

Epoch 4/8


596it [01:39,  5.99it/s]


Loss: 0.1202056413678915

Epoch 5/8


596it [01:39,  5.99it/s]


Loss: 0.10051292147262385

Epoch 6/8


596it [01:39,  6.00it/s]


Loss: 0.08232778170764846

Epoch 7/8


596it [01:39,  6.00it/s]


Loss: 0.06666042410062263

Epoch 8/8


596it [01:39,  5.99it/s]

Loss: 0.053354949552890836






In [29]:
# save the model in its full form
model_emotional_full_path = "/kaggle/working/robert-pretrained-emotional-full.pth"
model_emotional.save_pretrained(model_emotional_full_path)

In [6]:
# delete the emotional model from the GPU
del model_emotional
# reload the model
model_law = BertForMaskedLM.from_pretrained("readerbench/RoBERT-small")

In [7]:
# load the romanian law dataset, that contains
# different legal texts
law_dataframe = pd.read_json("/kaggle/input/romanianlawforbert/legal.json")

In [8]:
import re

max_law_texts = 6000
max_law_text_size = 200
law_texts = []

for text in law_dataframe.content:
    # stop condition for the whole extraction process, 
    # because it does not need to extract more texts 
    # if the maximum number of texts its reached
    if len(law_texts) >= max_law_texts:
        break
    
    text = re.sub(r"nr\s+\d+\s+din\s+\d+\s+\w+\s+\d+", "", text)
    text = re.sub(r"nr\s+\d+", "", text)
    splitted_text = text.split()
    
    for index in range(0, len(splitted_text), max_law_text_size):
        # stop condition for the whole extraction process, 
        # because it does not need to extract more texts 
        # if the maximum number of texts its reached
        if len(law_texts) >= max_law_texts:
            break
        
        # the last text might contain less words, 
        # so it is not needed
        if len(splitted_text[index:index + max_law_text_size]) < max_law_text_size:
            break
        
        mini_text = " ".join(splitted_text[index:index + max_law_text_size])  
        law_texts.append(mini_text)
        
print(f"Number of texts extracted, of size {max_law_text_size}: {len(law_texts)} texts")

Number of texts extracted, of size 200: 6000 texts


In [10]:
batch_size = 5
# move the MLM model to the device
model_law = model_law.to(device)

# set the optimizer
optimizer = AdamW(model_law.parameters(), lr=1e-4)

num_epochs = 4

# total number of training steps
num_train_steps = len(train_dataloader) * num_epochs

# scheduler (optional, for learning rate decay)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps
)
train_dataset = RomanianCorpusDataset(law_texts)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

In [14]:
num_epochs = 8

for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    loss = train(model_law, train_dataloader, optimizer, device, scheduler)
    print(f"Loss: {loss}")
    print()

Epoch 1/8


1200it [03:20,  5.99it/s]


Loss: 0.12100644382337729

Epoch 2/8


1200it [03:21,  5.96it/s]


Loss: 0.07582745673134923

Epoch 3/8


1200it [03:21,  5.95it/s]


Loss: 0.06286146557889878

Epoch 4/8


1200it [03:21,  5.96it/s]


Loss: 0.05289242000319064

Epoch 5/8


1200it [03:21,  5.96it/s]


Loss: 0.04303169764423122

Epoch 6/8


1200it [03:21,  5.95it/s]


Loss: 0.035808922922393925

Epoch 7/8


1200it [03:21,  5.96it/s]


Loss: 0.029350762268683564

Epoch 8/8


1200it [03:21,  5.96it/s]

Loss: 0.024445067738803724






In [20]:
# save the model in its full form
model_law_dict_path = "/kaggle/working/robert-pretrained-law"
model_law.save_pretrained(model_law_dict_path)