In [90]:
# %pip install datasets  # uncomment and run once if the package is missing

In [94]:
# %pip install -U accelerate  # uncomment and run once if the package is missing or outdated

In [21]:
import math
import os
import re
import warnings

import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForMaskedLM
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import get_last_checkpoint

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# The tokenizer and the masked-LM model are both loaded from the same
# checkpoint identifier.
MODEL_CHECKPOINT = "cointegrated/rubert-tiny2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModelForMaskedLM.from_pretrained(MODEL_CHECKPOINT)

In [4]:
# Read the headerless, tab-separated file with every column kept as a generic
# Python object, then name the first two positional columns.
raw_frame = pd.read_csv("STT2.tsv", sep="\t", header=None, dtype="O")
df = raw_frame.rename(columns={0: "text", 1: "target"})
df.shape

(6920, 2)

In [5]:
# Hold out a quarter of the rows for evaluation (the library default, spelled
# out here); the fixed seed keeps the shuffle reproducible.
train, test = train_test_split(df, test_size=0.25, random_state=42)

In [6]:
# Pull the "text" column out of each split as a plain Python list.
train_text = train["text"].tolist()
test_text = test["text"].tolist()

In [10]:
# data_dict = {"train":{"text":train_text}, 
#              "test":{"text":test_text}
#             }

In [11]:
# dataset = Dataset.from_dict(data_dict)

In [9]:
def tokenize_function(examples):
    """Encode the ``"text"`` entry of a batch with the module-level ``tokenizer``.

    Args:
        examples: Mapping whose ``"text"`` entry holds the strings to encode.

    Returns:
        Whatever the tokenizer call returns for those strings; sequences are
        truncated to at most 512 tokens, special tokens are added, and an
        attention mask is requested.
    """
    encode_options = dict(
        max_length=512,
        add_special_tokens=True,
        truncation=True,
        return_attention_mask=True,
    )
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

In [12]:
def group_texts(examples, block_size=512):
    """Concatenate a batch of tokenized columns and re-split it into fixed-size blocks.

    Every column in ``examples`` is flattened into one long sequence, the
    trailing remainder that does not fill a whole block is dropped, and the
    rest is cut into consecutive chunks of ``block_size`` items. A ``"labels"``
    column is added as a copy of the chunked ``"input_ids"``.

    Args:
        examples: Mapping from column name to a list of per-example sequences.
            Must contain an ``"input_ids"`` column.
        block_size: Number of items per output chunk. Defaults to 512.

    Returns:
        dict mapping each input column (plus ``"labels"``) to a list of chunks,
        each exactly ``block_size`` long. If the batch holds fewer than
        ``block_size`` items in total, every column is an empty list.

    Raises:
        ValueError: If ``block_size`` is not positive.
    """
    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size!r}")
    # Flatten each column in a single pass. `sum(list_of_lists, [])` re-copies
    # the accumulated list for every example, which is quadratic in batch size.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder that does not fill a complete block; padding
    # could be used instead if that suits the model better.
    total_length = (total_length // block_size) * block_size
    # Split every column into chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
def make_dataset(text, num_proc=4, batch_size=64):
    """Build a blocked language-modeling dataset from raw strings.

    The strings are wrapped in a ``Dataset`` with a single ``"text"`` column,
    mapped through ``tokenize_function`` (which removes the ``"text"`` column),
    and then mapped through ``group_texts``.

    Args:
        text: List of raw strings.
        num_proc: ``num_proc`` value forwarded to both ``map`` calls.
            Defaults to 4.
        batch_size: ``batch_size`` forwarded to the ``group_texts`` mapping
            step. Each batch is grouped on its own, so the remainder dropped by
            ``group_texts`` is lost once per batch. Defaults to 64.

    Returns:
        The dataset produced by the second ``map`` call.
    """
    dataset = Dataset.from_dict({"text": text})
    token_datasets = dataset.map(
        tokenize_function,
        batched=True,
        num_proc=num_proc,
        remove_columns="text",
    )
    lm_datasets = token_datasets.map(
        group_texts,
        batched=True,
        batch_size=batch_size,
        num_proc=num_proc,
    )
    return lm_datasets

In [14]:
# Despite the name, this holds the dataset returned by make_dataset, not a
# DataLoader.
train_dataloader = make_dataset(text=train_text)

Map (num_proc=4):   0%|          | 0/5190 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/5190 [00:00<?, ? examples/s]

In [15]:
# Despite the name, this holds the dataset returned by make_dataset, not a
# DataLoader.
test_dataloader = make_dataset(text=test_text)

Map (num_proc=4):   0%|          | 0/1730 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1730 [00:00<?, ? examples/s]

In [16]:
# Masked-language-modeling collator with a 15% masking probability.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [35]:
# Hyper-parameters for the fine-tuning run.
model_name = "rubert-tiny-finetuned"
training_args = TrainingArguments(
    # Resolves to "rubert-tiny-finetuned-finetuned".
    output_dir=f"{model_name}-finetuned",
    num_train_epochs=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_steps=30,
    report_to=["tensorboard"],
    push_to_hub=False,
    # NOTE(review): the Transformers docs describe this field as intended for
    # user scripts rather than something Trainer acts on by itself; confirm
    # that resuming is actually requested where training is started.
    resume_from_checkpoint=True,
)

In [36]:
# Wire the model, arguments, datasets and collator into a Trainer.
# `train_dataloader` / `test_dataloader` are the objects returned by
# make_dataset and are passed here as datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataloader,
    eval_dataset=test_dataloader,
)

In [None]:
# Resume from the newest checkpoint in the output directory when one exists,
# otherwise start a fresh run. Setting `resume_from_checkpoint` on
# TrainingArguments alone does not make `Trainer.train()` resume, so the
# checkpoint is passed explicitly here. The directory check avoids an error
# from listing a folder that has not been created yet.
last_checkpoint = (
    get_last_checkpoint(training_args.output_dir)
    if os.path.isdir(training_args.output_dir)
    else None
)
trainer.train(resume_from_checkpoint=last_checkpoint)

Epoch,Training Loss,Validation Loss
1,3.829,4.119725
2,3.9257,4.095912
3,3.9179,4.070326
4,4.037,4.04226
5,3.977,4.055024
6,3.9686,3.97962
7,3.9846,4.038764


In [29]:
# Evaluate on the held-out set and report perplexity = exp(eval loss).
eval_results = trainer.evaluate()
try:
    perplexity = math.exp(eval_results["eval_loss"])
except OverflowError:
    # math.exp raises on a very large loss; report infinity instead of
    # aborting the cell.
    perplexity = float("inf")
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 60.75
