In [1]:
import pandas as pd

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,text,label
0,— Кровь! какую кровь? — встревожилась,1
1,– Под нижнюю подушку.,0
2,— Благодарю-с...,1
3,— Когда же это-с?,1
4,"Старуха помолчала, как бы в раздумье,",1


In [2]:
from sklearn.model_selection import train_test_split

train_texts, valid_texts, train_labels, valid_labels = train_test_split(df["text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"])

In [3]:
from datasets import Dataset

train = Dataset.from_dict({"text": train_texts, "label": train_labels})
valid = Dataset.from_dict({"text": valid_texts, "label": valid_labels})

In [4]:
from transformers import AutoTokenizer

def tokenize(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")

train = train.map(tokenize, batched=True)
valid = valid.map(tokenize, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/4512 [00:00<?, ? examples/s]

Map:   0%|          | 0/1128 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilroberta-base", num_labels=2)

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.6


In [7]:
import numpy as np
import evaluate

metric_acc = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = metric_acc.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = metric_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": acc, "f1": f1}

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="classifier",
    eval_strategy="epoch",
    push_to_hub=False,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    metric_for_best_model="accuracy"
)

In [9]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=valid,
    compute_metrics=compute_metrics,
)

trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33martem-vgk[0m ([33martem-vgk-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3005,0.20862,0.886525,0.881041
2,0.2321,0.219195,0.912234,0.911211
3,0.2253,0.217548,0.913121,0.913732
4,0.1966,0.183602,0.921099,0.924895
5,0.191,0.23745,0.933511,0.935177
6,0.1702,0.272009,0.93617,0.937931
7,0.1795,0.249812,0.933511,0.936494


TrainOutput(global_step=3948, training_loss=0.20850034711812646, metrics={'train_runtime': 1827.228, 'train_samples_per_second': 17.285, 'train_steps_per_second': 2.161, 'total_flos': 4183850319151104.0, 'train_loss': 0.20850034711812646, 'epoch': 7.0})

In [10]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score

pred = trainer.predict(valid)
logits, labels = pred.predictions, pred.label_ids
preds = np.argmax(logits, axis=-1)

acc = accuracy_score(labels, preds)
f1 = f1_score(labels, preds)
cm = confusion_matrix(labels, preds)

print("accuracy_score:", acc)
print("f1_score:", f1)
print("confusion_matrix:", cm)

accuracy_score: 0.9335106382978723
f1_score: 0.9364944961896697
confusion_matrix: [[500  33]
 [ 42 553]]


In [14]:
import re

with open("mlm.txt", "r", encoding="utf-8") as f:
    text = f.read()

text = re.sub(r'[-\u2013\u2014]', '', text)

sent = re.split(r'(?<=[.!?])\s+(?=[А-Я])', text)

sent = [s.strip() for s in sent if len(s) > 10]

dataset = Dataset.from_dict({"text": sent})
dataset = dataset.train_test_split(test_size=0.2)

In [16]:
tokenizer_mlm = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")

def tokenize_mlm(examples):
    return tokenizer_mlm(examples["text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(
    tokenize_mlm,
    batched=True,
    num_proc=4,
    remove_columns=["text"]
)

Map (num_proc=4):   0%|          | 0/19806 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4952 [00:00<?, ? examples/s]

In [17]:
block_size = 128

def group_texts(examples):
    concatenated = {k: sum(examples[k], []) for k in examples.keys()}
    total_len = len(concatenated["input_ids"])
    total_len = (total_len // block_size) * block_size
    return {
        k: [t[i : i + block_size] for i in range(0, total_len, block_size)]
        for k, t in concatenated.items()
    }

lm_dataset = tokenized_dataset.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/19806 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/4952 [00:00<?, ? examples/s]

In [18]:
from transformers import DataCollatorForLanguageModeling

tokenizer_mlm.pad_token = tokenizer_mlm.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer_mlm, mlm_probability=0.15)

In [19]:
from transformers import AutoModelForMaskedLM

model_mlm = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")

Some weights of the model checkpoint at distilbert/distilroberta-base were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer

training_args_mlm = TrainingArguments(
    output_dir="mlm_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=7,
    weight_decay=0.01,
    push_to_hub=False,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8
)

trainer_mlm = Trainer(
    model=model_mlm,
    args=training_args_mlm,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer_mlm,
)


  trainer_mlm = Trainer(


In [24]:
import math

start_eval = trainer_mlm.evaluate()
print("Start loss:", math.exp(start_eval['eval_loss']))

trainer_mlm.train()

final_eval = trainer_mlm.evaluate()
print("Final loss:", math.exp(final_eval['eval_loss']))

Start loss: 2.5812841565279854


Epoch,Training Loss,Validation Loss,Model Preparation Time
1,0.8105,0.865285,0.0133
2,0.8802,0.756029,0.0133
3,0.8181,0.711755,0.0133
4,0.7764,0.6674,0.0133
5,0.7484,0.64727,0.0133
6,0.7173,0.627492,0.0133
7,0.7068,0.626859,0.0133


Final loss: 1.8659185061590653


In [25]:
model_tune = AutoModelForSequenceClassification.from_pretrained("distilbert/distilroberta-base", num_labels=2, ignore_mismatched_sizes=True)

model_tune.load_state_dict(trainer_mlm.model.state_dict(), strict=False)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilbert/distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


_IncompatibleKeys(missing_keys=['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias'], unexpected_keys=['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias'])

In [26]:
training_args_tune = TrainingArguments(
    output_dir="classifier_tune",
    eval_strategy="epoch",
    push_to_hub=False,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    metric_for_best_model="accuracy"
)

trainer_tune = Trainer(
    model=model_tune,
    args=training_args_tune,
    train_dataset=train,
    eval_dataset=valid,
    compute_metrics=compute_metrics,
)

trainer_tune.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.2693,0.179567,0.917553,0.919758
2,0.2015,0.207948,0.926418,0.92912
3,0.19,0.17673,0.935284,0.938188
4,0.1698,0.279604,0.933511,0.937759
5,0.1301,0.329685,0.937943,0.940678
6,0.1126,0.337475,0.93883,0.941674
7,0.0962,0.379887,0.93617,0.939394


TrainOutput(global_step=3948, training_loss=0.1580896928317641, metrics={'train_runtime': 1959.6273, 'train_samples_per_second': 16.117, 'train_steps_per_second': 2.015, 'total_flos': 4183850319151104.0, 'train_loss': 0.1580896928317641, 'epoch': 7.0})

In [27]:
pred_tune = trainer_tune.predict(valid)
logits_tune, labels_tune = pred_tune.predictions, pred_tune.label_ids
preds_tune = np.argmax(logits_tune, axis=-1)

acc_tune = accuracy_score(labels_tune, preds_tune)
f1_tune = f1_score(labels_tune, preds_tune)
cm_tune = confusion_matrix(labels_tune, preds_tune)

print("accuracy_score:", acc_tune)
print("f1_score:", f1_tune)
print("confusion_matrix:", cm_tune)

accuracy_score: 0.9361702127659575
f1_score: 0.9393939393939394
confusion_matrix: [[498  35]
 [ 37 558]]


Итоговый результат получился лучше чем при начальном обучении

In [28]:
submission = pd.read_csv("submission.csv", delimiter=",")

submission_dataset = Dataset.from_dict({"text": submission["text"].tolist()})

submission_dataset = submission_dataset.map(tokenize, batched=True)

preds_output = trainer_tune.predict(submission_dataset)
logits = preds_output.predictions
pred_labels = np.argmax(logits, axis=-1)

submission["label"] = pred_labels

submission.to_csv("submission_pred.csv", index=False)

Map:   0%|          | 0/1440 [00:00<?, ? examples/s]

In [29]:
submission_pred = pd.read_csv("submission_pred.csv", delimiter=",")

submission_pred.head()

Unnamed: 0,text,label
0,"Он говорил ему, указывая на поля,",1
1,"Тревога беспредметная и бесцельная в настоящем,",1
2,"- Я о тебе, третьего дня",1
3,- Что ж вы? - закричал,1
4,Он с мучением задавал себе этот,1
