In [1]:
import os

# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

/kaggle/input/sequence-classification/seqcls_train.csv
/kaggle/input/sequence-classification/seqcls_test.csv


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import os
import random
from tqdm import tqdm, tqdm_notebook

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification

import warnings
# Silence every Python warning for the rest of the notebook.
# NOTE(review): this also hides deprecation notices that may be worth seeing.
warnings.filterwarnings("ignore")
# Disable Weights & Biases logging through environment variables.
os.environ["WANDB_MODE"] = "disabled"
# NOTE(review): the TrainingArguments cell output reports that WANDB_DISABLED is
# deprecated and will be removed in v5 in favour of the `report_to` setting.
os.environ["WANDB_DISABLED"] = "true"

## Load data & EDA

In [3]:
# Read the labelled training split, report its dimensions and preview the first rows.
train = pd.read_csv('/kaggle/input/sequence-classification/seqcls_train.csv')
train_rows, train_cols = train.shape
print(train_rows, train_cols)
train.head()

24000 3


Unnamed: 0,index,review,sentiment
0,521911,"Есть много причин, по которым 'Война и мир' Ль...",1
1,776224,"Напишите 5 предложений. 1. ""Война и мир"" — это...",2
2,141250,[Практикуйте «Другие люди»] Отзыв о «Войне и ...,1
3,678689,Стремитесь к точному и объективному представле...,2
4,175974,"В книге ""Война и мир"" Льва Толстого разворачив...",2


In [4]:
# Read the unlabelled test split, report its dimensions and preview the first rows.
test = pd.read_csv('/kaggle/input/sequence-classification/seqcls_test.csv')
test_rows, test_cols = test.shape
print(test_rows, test_cols)
test.head()

6000 2


Unnamed: 0,index,review
0,690427,## Название: 'Война и мир': Грандиозное полотн...
1,253613,"Продукт не следует ни хвалить, ни критиковать,..."
2,379756,'Война и мир' Льва Толстого — это обширный ром...
3,696940,[Вопрос задан во время обсуждения другого вопр...
4,351121,Вот что я придумал: «Война и мир» Льва Толсто...


In [5]:
# Class balance check: number of training rows per sentiment label.
train['sentiment'].value_counts()

sentiment
1    8000
2    8000
0    8000
Name: count, dtype: int64

In [6]:
# Whitespace-separated word count of every training review, then summary statistics.
review_len_train = pd.Series(
    [len(review.split()) for review in train['review']],
    index=train.index,
    name='review',
)
review_len_train.describe()

count    24000.000000
mean       200.403708
std         34.275763
min         12.000000
25%        196.000000
50%        205.000000
75%        215.000000
max        511.000000
Name: review, dtype: float64

In [7]:
# Whitespace-separated word count of every test review, then summary statistics.
review_len_test = pd.Series(
    [len(review.split()) for review in test['review']],
    index=test.index,
    name='review',
)
review_len_test.describe()

count    6000.000000
mean      199.922000
std        34.374101
min        18.000000
25%       196.000000
50%       205.000000
75%       215.000000
max       410.000000
Name: review, dtype: float64

## Preparing data for training

In [8]:
# Hold out 20% of the labelled reviews for validation (fixed seed for a repeatable split).
texts, labels = train['review'].to_list(), train['sentiment'].to_list()
split_parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
text_train, text_val, label_train, label_val = split_parts
print(len(label_train), len(label_val))

19200 4800


In [9]:
# Hub identifier of the pretrained checkpoint; the same name is used later to load the model.
model_checkpoint = 'DeepPavlov/rubert-base-cased'
# Load the tokenizer that matches this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [10]:
# Sanity check: tokenize two sample sentences and inspect the returned
# input_ids / token_type_ids / attention_mask fields.
tokenizer(["Hello, this one sentence!", "And this sentence goes with it."])

{'input_ids': [[101, 31690, 128, 11043, 10976, 13466, 11193, 9984, 106, 102], [101, 13201, 11043, 13466, 11193, 9984, 12295, 4388, 10681, 10783, 132, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [11]:
# Encode both splits with identical settings: truncate to at most 512 tokens,
# pad, and return PyTorch tensors.
encode_kwargs = dict(truncation=True, padding=True, max_length=512, return_tensors="pt")
tokens_train = tokenizer(text_train, **encode_kwargs)
tokens_val = tokenizer(text_val, **encode_kwargs)

In [12]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer output with sentiment labels.

    Args:
        tokens: Mapping from field name (e.g. ``input_ids``) to an indexable
            collection holding one entry per sample.
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, tokens, labels):
        self.tokens = tokens
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every token field at ``idx`` plus a ``label`` tensor."""
        sample = {}
        for field, values in self.tokens.items():
            sample[field] = values[idx]
        sample["label"] = torch.tensor(self.labels[idx])
        return sample

In [13]:
# Wrap the encoded train/validation splits and their labels as datasets for the Trainer.
train_dataset = SentimentDataset(tokens_train, label_train)
val_dataset = SentimentDataset(tokens_val, label_val)

## Fine-tuning the model

In [15]:
# Number of sentiment classes and the compute device (GPU when available, else CPU).
num_labels = 3
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print('Num labels:', num_labels)
print('Device', device)

Num labels: 3
Device cuda


In [16]:
# Load the pretrained checkpoint with a `num_labels`-way classification head and move it
# to the selected device. The loader output reports that the classifier weights are
# newly initialised, so the head must be trained before use.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels).to(device)
# Bare expression: displays the module tree as the cell output.
model

pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [17]:
# Freeze the pretrained encoder so that only the classification head is trained.
# `base_model` resolves to the backbone (`model.bert` for this checkpoint), so this
# does not depend on the head being the third top-level child of the model.
for param in model.base_model.parameters():
    param.requires_grad = False

In [18]:
# Hyper-parameters for the Trainer run, grouped by concern.
train_args = TrainingArguments(
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=1e-3,
    weight_decay=0.01,
    lr_scheduler_type='linear',
    warmup_steps=250,  # 19200 samples / batch size 16 = 1200 steps per epoch
    # Batching
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate, checkpoint and log once per epoch
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='epoch',
    # Bookkeeping
    run_name='',
    output_dir='/content/output',
    logging_dir='/content/logs',
    disable_tqdm=False,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


*   num_train_epochs - количество эпох
*   learning_rate - скорость обучения
*   per_device_train_batch_size - размер batchа на train
*   per_device_eval_batch_size - размер batchа на валидации
*   weight_decay - коэффициент l2 регуляризации
*   warmup_steps - количество шагов для разогрева, в течение которых скорость обучения будет постепенно увеличиваться с 0 до указанного lr
*   eval_strategy - стратегия валидации (epoch - каждую эпоху)
*   lr_scheduler_type - тип schedulerа (без гибких настроек)
*   run_name - название fine-tuningа
*   output_dir - папка для сохранения весов
*   logging_dir - папка для сохранения логов
*   save_strategy - стратегия сохранения весов (epoch - каждую эпоху)
*   logging_strategy - стратегия сохранения логов (epoch - каждую эпоху)
*   disable_tqdm - отключение прогресс-бара tqdm (False — прогресс-бар включён)
*   load_best_model_at_end - загрузка в конце обучения лучшей по валидационной метрике модели (в конфигурации выше не задан)  
    metric_for_best_model - метрика для определения лучшей модели (например, 'accuracy')  
    greater_is_better - флаг «чем больше значение метрики, тем лучше»

In [19]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        p: Object exposing ``predictions`` (per-class scores; argmax is taken
            over axis 1) and ``label_ids`` (reference class ids).

    Returns:
        dict with the keys ``'accuracy'`` and ``'f1'``.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1_score(y_true, y_pred, average='weighted'),
    }

In [20]:
# Assemble the Trainer from the model, its hyper-parameters, both datasets,
# the tokenizer and the metric callback.
trainer = Trainer(
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    args=train_args,
    model=model,
)

In [21]:
# Run fine-tuning; with the per-epoch strategies configured above, loss and the
# validation metrics are reported once per epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.7739,0.573636,0.85125,0.850883
2,0.5359,0.494238,0.871458,0.871566
3,0.4974,0.478249,0.871458,0.871466


TrainOutput(global_step=3600, training_loss=0.602401869032118, metrics={'train_runtime': 1177.9251, 'train_samples_per_second': 48.9, 'train_steps_per_second': 3.056, 'total_flos': 1.51553328611328e+16, 'train_loss': 0.602401869032118, 'epoch': 3.0})

## Making predictions

In [None]:
# Switch the model to evaluation mode before running inference.
model.eval()

In [29]:
def predict_sample(text):
    """Predict the sentiment class id of a single review with the fine-tuned model.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text: Review text to classify.

    Returns:
        int: Index of the highest-scoring class.
    """
    # Force evaluation mode here so the result does not depend on a separate
    # `model.eval()` cell having been executed beforehand.
    model.eval()
    inputs = tokenizer(text, truncation=True, padding=True, max_length=512, return_tensors='pt')
    inputs = {key: val.to(device) for key, val in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs).logits
    # Softmax is monotonic, so the argmax over raw logits picks the same class.
    return torch.argmax(logits, dim=1).item()

In [32]:
# Classify every test review and write the submission file.
# The standalone `model.eval()` cell may not have been executed, so set eval mode here.
model.eval()
samples = test['review'].to_list()
# Use a dedicated name so the training `labels` list from the split cell is not overwritten.
# `tqdm` replaces the deprecated `tqdm_notebook` alias.
test_preds = [predict_sample(rev) for rev in tqdm(samples)]
# Build the frame in one step: test ids paired with the predicted class ids.
sub = pd.DataFrame({'index': test['index'], 'sentiment': test_preds})
sub.to_csv('sub.csv', index=False)
sub.head()

  0%|          | 0/6000 [00:00<?, ?it/s]

Unnamed: 0,index,sentiment
0,690427,0
1,253613,2
2,379756,2
3,696940,0
4,351121,1
