In [1]:
import math
import pandas as pd
from tqdm import tqdm
from datasets import Dataset

import torch
from torch.optim import Adam
from torch.utils.data import DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification




In [2]:
train_val_df = pd.read_csv('data/train.csv')
train_val_df

Unnamed: 0,text,label
0,— Кровь! какую кровь? — встревожилась,1
1,– Под нижнюю подушку.,0
2,— Благодарю-с...,1
3,— Когда же это-с?,1
4,"Старуха помолчала, как бы в раздумье,",1
...,...,...
5904,"– Да, – сказала графиня, после",0
5905,"– Извольте отправляться, – сказал штаб",0
5906,— Какая вы худенькая! Вон какая,1
5907,— Не каждый день получаете-то?,1


In [3]:
train_df, val_df = train_test_split(train_val_df, test_size=0.3, random_state=42)
train_df = train_df.reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

# Обучение distilroberta

In [4]:
tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
first_model = AutoModelForSequenceClassification.from_pretrained('distilroberta-base', num_labels=2)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=256)

train_dataset = Dataset.from_pandas(train_df)
valid_dataset = Dataset.from_pandas(val_df)

train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["text"])
valid_dataset = valid_dataset.remove_columns(["text"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=data_collator)
valid_loader = DataLoader(valid_dataset, batch_size=16, collate_fn=data_collator)

Map:   0%|          | 0/4136 [00:00<?, ? examples/s]

Map:   0%|          | 0/1773 [00:00<?, ? examples/s]

In [6]:
optimizer = Adam(first_model.parameters(), lr=1e-5)

# Обучение
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
first_model.to(device)

for epoch in range(8):  # Количество эпох
    first_model.train()
    epoch_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = first_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} loss: {epoch_loss / len(train_loader)}")

100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.67it/s]


Epoch 1 loss: 0.3346526198750757


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.68it/s]


Epoch 2 loss: 0.22182363470922795


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.65it/s]


Epoch 3 loss: 0.20141463456419206


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.54it/s]


Epoch 4 loss: 0.18081742400684642


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.50it/s]


Epoch 5 loss: 0.16831080721912395


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.57it/s]


Epoch 6 loss: 0.14906608428366475


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.54it/s]


Epoch 7 loss: 0.13406093206702063


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:17<00:00, 14.56it/s]

Epoch 8 loss: 0.12453883046663679





In [7]:
first_model.eval()
predictions, true_labels = [], []
val_loss = 0
with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = first_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        val_loss += loss.item()
        
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        
    print(f"Validation loss: {val_loss / len(valid_loader)}")
        
accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Validation loss: 0.18865264210493937
Accuracy: 0.9035532994923858
F1 Score: 0.8968014484007242
Confusion Matrix:
 [[859  51]
 [120 743]]


# unsupervised masked language modeling

In [8]:
with open('data/train-test.txt', 'r', encoding='utf-8') as f:
    text_data = f.readlines()

text_data = list(map(lambda x: x.split('. '), text_data))[0]    

dataset = Dataset.from_dict({"text": text_data})

def filter_short_texts(example):
    return len(example['text'].split()) >=3

dataset = dataset.filter(filter_short_texts)

dataset = dataset.train_test_split(test_size=0.3)
dataset

Filter:   0%|          | 0/23397 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 15859
    })
    test: Dataset({
        features: ['text'],
        num_rows: 6797
    })
})

In [9]:
# Загрузка токенизатора
tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")

# Токенизация данных
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=512)

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["text"])
tokenized_dataset



Map:   0%|          | 0/15859 [00:00<?, ? examples/s]

Map:   0%|          | 0/6797 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 15859
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 6797
    })
})

In [10]:
block_size = 128


def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    return result

lm_dataset = tokenized_dataset.map(group_texts, batched=True)
lm_dataset

Map:   0%|          | 0/15859 [00:00<?, ? examples/s]

Map:   0%|          | 0/6797 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 63436
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 27188
    })
})

In [11]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [12]:
model2 = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
training_args = TrainingArguments(
    output_dir="/my_awesome_model",
    save_steps=0,
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=15,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    gradient_accumulation_steps=2
    
)

trainer = Trainer(
    model=model2,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2476,0.215922
2,0.2052,0.188451
3,0.1912,0.167219
4,0.1787,0.154037
5,0.1658,0.146032
6,0.1556,0.139523
7,0.1553,0.133471
8,0.1513,0.128888
9,0.1348,0.124792
10,0.1387,0.121192


TrainOutput(global_step=59475, training_loss=0.16388113559820713, metrics={'train_runtime': 10926.2327, 'train_samples_per_second': 87.088, 'train_steps_per_second': 5.443, 'total_flos': 3.154873846113792e+16, 'train_loss': 0.16388113559820713, 'epoch': 15.0})

In [14]:
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 1.12


In [15]:
first_model.save_pretrained("./model1")
model2.save_pretrained("./model2")

# Решение задачи классификации при помощи претреннированных весов

In [18]:
model2 = AutoModelForSequenceClassification.from_pretrained("./model2", num_labels=2)

for param in model2.base_model.parameters():
    param.requires_grad = False

for param in model2.roberta.encoder.layer[-3:].parameters():
    param.requires_grad = True    

model2.to(device)
optimizer = Adam(model2.parameters(), lr=1e-5)

for epoch in range(10):  # Количество эпох
    model2.train()
    epoch_loss = 0
    for batch in tqdm(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model2(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} loss: {epoch_loss / len(train_loader)}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./model2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.45it/s]


Epoch 1 loss: 0.2825694744651382


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.47it/s]


Epoch 2 loss: 0.16598994016348884


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.16it/s]


Epoch 3 loss: 0.13211951900676294


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.05it/s]


Epoch 4 loss: 0.11319539655142906


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.07it/s]


Epoch 5 loss: 0.09891383920600243


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 25.98it/s]


Epoch 6 loss: 0.08514253467307462


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.03it/s]


Epoch 7 loss: 0.07579040238328105


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 25.99it/s]


Epoch 8 loss: 0.06411723680772889


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.21it/s]


Epoch 9 loss: 0.0506965759651116


100%|████████████████████████████████████████████████████████████████████████████████| 259/259 [00:09<00:00, 26.25it/s]

Epoch 10 loss: 0.04812365905483153





In [19]:
model2.eval()
predictions, true_labels = [], []
val_loss = 0
with torch.no_grad():
    for batch in valid_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
    
        outputs = model2(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        val_loss += loss
        
        preds = torch.argmax(outputs.logits, dim=1)
        predictions.extend(preds.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())
        
    print(f"Validation loss: {val_loss / len(valid_loader)}")

accuracy = accuracy_score(true_labels, predictions)
f1 = f1_score(true_labels, predictions)
conf_matrix = confusion_matrix(true_labels, predictions)

print("Accuracy:", accuracy)
print("F1 Score:", f1)
print("Confusion Matrix:\n", conf_matrix)

Validation loss: 0.2156081199645996
Accuracy: 0.9272419627749577
F1 Score: 0.9238038984051979
Confusion Matrix:
 [[862  48]
 [ 81 782]]


In [20]:
model2.save_pretrained("./final_model")

Подход использующий MLM позволил добиться повышения метрики accuracy на 2.7 и повышения метрики f1 на 2.3 