In [1]:
!pip install -qq transformers

In [2]:
import transformers
from transformers import BertForSequenceClassification
from transformers import  BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [3]:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
MAX_LEN = 512

## Загрузка и подготовка данных

In [4]:
PATH = 'data/'

label2idx= {'M':1, 'М':1, 'H':0, 'Н':0}
data = pd.read_csv(PATH +'train.csv')
data['Class'] = data['Class'].replace(label2idx)

In [5]:
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [6]:
class TextDataset(Dataset):

  def __init__(self, texts, targets, tokenizer, max_len):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len
  
  def __len__(self):
    return len(self.texts)
  
  def __getitem__(self, item):
    text = str(self.texts[item])
    #H=0, M=1
    target = self.targets[item]

    encoding = self.tokenizer.encode_plus(
      text,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    return {
      'text': text,
      'input_ids': encoding['input_ids'].flatten(),
      'segment_ids' : torch.tensor([0] * self.max_len, dtype=torch.long),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [7]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

df_train, df_val = train_test_split(data, test_size=0.1, random_state=RANDOM_SEED, stratify=data['Class'])

In [8]:
def create_data_loader(df, tokenizer, max_len, batch_size):
  ds = TextDataset(
    texts=df.Text.to_numpy(),
    targets=df.Class.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )

  return DataLoader(
    ds,
    batch_size=batch_size,
    num_workers=2
  )

In [9]:
BATCH_SIZE = 12

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

### Функции для обуения и оценки модели

In [10]:
def evaluate(model, dataloader, device="cpu"):
    model.eval()
    
    eval_loss = 0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    model.to(device)
    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        label_ids = batch["targets"].to(device)
        segment_ids = batch['segment_ids'].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask,
                               labels=label_ids, token_type_ids=segment_ids)
        tmp_eval_loss, logits = output[0], output[1]

        outputs = np.argmax(logits.to('cpu'), axis=1)
        label_ids = label_ids.to('cpu').numpy()
        
        predicted_labels += list(outputs)
        correct_labels += list(label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    
    correct_labels = np.array(correct_labels)
    predicted_labels = np.array(predicted_labels)
        
    return eval_loss, correct_labels, predicted_labels

In [11]:
import os
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score, f1_score


def train(model, train_dataloader, dev_dataloader, output_model_file,
          num_train_epochs=10, patience=2, gradient_accumulation_steps=1, max_grad_norm=5,
          warmup_proportion=0.1, batch_size=8, learning_rate=5e-5): 
    
    num_train_steps = int(len(df_train) / batch_size / gradient_accumulation_steps * num_train_epochs)
    num_warmup_steps = int(warmup_proportion * num_train_steps)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, 
                                                num_training_steps=num_train_steps)
    
    train_losses, dev_losses = [],[]
    
    train_accuracies, train_f1_scores, dev_accuracies, dev_f1_scores = [], [], [], []
    no_improvement = 0
    for _ in trange(int(num_train_epochs), desc="Epoch"):
        train_predictions = []
        train_labels = []
        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            label_ids = batch["targets"].to(device)
            segment_ids = batch['segment_ids'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=label_ids,
                           token_type_ids=segment_ids)
            loss = outputs[0]
            logits = outputs[1]
            train_predictions += list(np.argmax(logits.detach().to('cpu'), axis=1))
            train_labels += list(batch["targets"].to('cpu').numpy())

            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            loss.backward()
            tr_loss += loss.mean().item()

            if (step + 1) % gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm) 
                optimizer.step()
                optimizer.zero_grad() 
                scheduler.step()
                
            nb_tr_steps += 1
                
        tr_loss /= nb_tr_steps
        train_accuracies.append(accuracy_score(train_labels, train_predictions))
        train_f1_scores.append(f1_score(train_labels, train_predictions, average='macro'))
        
        dev_loss, correct_labels, predicted_labels = evaluate(model, dev_dataloader, device="cuda")
        dev_accuracies.append(accuracy_score(correct_labels, predicted_labels))
        dev_f1_scores.append(f1_score(correct_labels, predicted_labels, average='macro'))
        
        print("Train loss:", tr_loss, 'Train acc: ', train_accuracies[-1], "Train_f1: ", train_f1_scores[-1])
        print("Dev loss:", dev_loss, 'Dev acc: ', dev_accuracies[-1], "Dev_f1: ", dev_f1_scores[-1])

        if len(dev_losses) == 0 or dev_accuracies[-1] > min(dev_accuracies):
            no_improvement = 0
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            no_improvement += 1
        
        if no_improvement >= patience:
            print("No improvement on development set. Finish training.")
            break

        dev_losses.append(dev_loss)
        train_losses.append(tr_loss)
        
    return train_losses, dev_losses, train_accuracies, train_f1_scores, dev_accuracies, dev_f1_scores

In [12]:
idx2label = {1:'M', 0:'H'}
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = len(idx2label))
model.to(device)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### Обучение 

In [13]:
OUTPUT_FILE=PATH + "models/bert.bin",
EPOCHS=10

train_losses, dev_losses,  train_accuracies, train_f1_scores, dev_accuracies, dev_f1_scores = train(model, 
                                                                                       train_data_loader, val_data_loader, 
                                                                                       gradient_accumulation_steps=4, batch_size=BATCH_SIZE,
                                                                                        output_model_file = OUTPUT_FILE)

Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Training iteration:   0%|          | 0/9680 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/1076 [00:00<?, ?it/s]

Train loss: 0.13706728214944505 Train acc:  0.6704258817655111 Train_f1:  0.6703091197669191
Dev loss: 0.5219017489359724 Dev acc:  0.683737506779267 Dev_f1:  0.6645748678656168


Epoch:  10%|█         | 1/10 [1:49:38<16:26:50, 6578.97s/it]

Training iteration:   0%|          | 0/9680 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/1076 [00:00<?, ?it/s]

Train loss: 0.12405942486606784 Train acc:  0.7158808185332174 Train_f1:  0.7158234795019094
Dev loss: 0.49755632269393557 Dev acc:  0.7120167351049818 Dev_f1:  0.7028079306732792


Epoch:  20%|██        | 2/10 [3:39:15<14:36:58, 6577.35s/it]

Training iteration:   0%|          | 0/9680 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/1076 [00:00<?, ?it/s]

Train loss: 0.11615004600641464 Train acc:  0.739305606969757 Train_f1:  0.7392560743686794
Dev loss: 0.4879895089519622 Dev acc:  0.7260401332610211 Dev_f1:  0.7192379607566612


Epoch:  30%|███       | 3/10 [5:28:51<12:47:18, 6576.99s/it]

Training iteration:   0%|          | 0/9680 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/1076 [00:00<?, ?it/s]

Train loss: 0.1079448224383037 Train acc:  0.7640819910639727 Train_f1:  0.76405718690267
Dev loss: 0.49913804921006516 Dev acc:  0.7337103897110095 Dev_f1:  0.7301234808466088


Epoch:  40%|████      | 4/10 [7:18:29<10:57:42, 6577.14s/it]

Training iteration:   0%|          | 0/9680 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/1076 [00:00<?, ?it/s]

Train loss: 0.0986014811679998 Train acc:  0.7898570063447516 Train_f1:  0.7898370823889493
Dev loss: 0.5249665773864904 Dev acc:  0.7378941659564577 Dev_f1:  0.7371031207081657


Epoch:  50%|█████     | 5/10 [9:08:09<9:08:11, 6578.36s/it] 

Training iteration:   0%|          | 0/9680 [00:00<?, ?it/s]

Evaluation iteration:   0%|          | 0/1076 [00:00<?, ?it/s]

Train loss: 0.08900040994472379 Train acc:  0.8153909727184291 Train_f1:  0.815379416774865
Dev loss: 0.5948227841621715 Dev acc:  0.7362671418610056 Dev_f1:  0.7342841700394438


Epoch:  60%|██████    | 6/10 [10:57:51<7:18:38, 6579.71s/it]

Training iteration:   0%|          | 0/9680 [00:00<?, ?it/s]

### Метрики на тестовых данных

In [None]:
idx2label = {1:'M', 0:'H'}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = len(idx2label))
model.load_state_dict(torch.load('models/bert.bin'))
model.to(device)


PATH = 'data/'

label2idx= {'M':1, 'М':1, 'H':0, 'Н':0}
data = pd.read_csv(PATH +'val.csv')
data['Class'] = data['Class'].replace(label2idx)
BATCH_SIZE=12

test_data_loader = create_data_loader(data, tokenizer, MAX_LEN, BATCH_SIZE)
eval_loss, correct_labels, predicted_labels = evaluate(model, test_data_loader, device='cuda')
print(classification_report(correct_labels, predicted_labels))


In [None]:
#              precision    recall  f1-score   support
#
#           0       0.70      0.82      0.76     10756
#           1       0.78      0.65      0.71     10755
#
#    accuracy                           0.74     21511
#   macro avg       0.74      0.74      0.73     21511
#weighted avg       0.74      0.74      0.73     21511

### Submission

In [None]:
def make_proba_predictions(model, dataloader, device="cpu"):
    model.eval()
    
    predicted_proba, predicted_labels = [], []

    model.to(device)
    for step, batch in enumerate(tqdm(dataloader, desc="Evaluation iteration")):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        segment_ids = batch['segment_ids'].to(device)
        label_ids = batch["targets"].to(device)
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask,
                               labels=label_ids, token_type_ids=segment_ids)
        logits = output[1]

        outputs = np.argmax(logits.to('cpu'), axis=1)
        predicted_labels += list(outputs)
        predicted_proba += list(logits.to('cpu').numpy())
        

    predicted_labels = np.array(predicted_labels)
        
    return predicted_proba, predicted_labels

In [None]:
subm = pd.read_csv('data/test.csv')
subm['Class'] = 0
sumb_data_loader = create_data_loader(subm, tokenizer, MAX_LEN, BATCH_SIZE)
_, predicted_labels = make_proba_predictions(model, sumb_data_loader, device='cuda')

subm['Class'] = predicted_labels
subm['Class'] = subm['Class'].replace(idx2label)
subm[['Id', 'Class']].to_csv('submissions/submission_bert_5_512.csv', index=False)