In [1]:
!pip install -qq transformers

In [2]:
import transformers
from transformers import BertForSequenceClassification
from transformers import  BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [3]:
# Checkpoint identifier shared by the tokenizer and the classifier below.
# NOTE(review): the label mapping further down lists Cyrillic letters, which
# suggests non-English text — confirm this checkpoint is the intended one.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"

# Every text is padded / truncated to this many tokens by TextDataset.
MAX_LEN = 512

## Загрузка и подготовка данных

In [5]:
# Folder that holds the input CSV files.
PATH = '../input/ruatd-data/'

# Class letter -> integer id (H=0, M=1). Each letter appears twice because the
# look-alike Latin and Cyrillic glyphs are both mapped (presumably the raw
# labels mix the two alphabets — verify against the data).
label2idx = {
    'M': 1, 'М': 1,
    'H': 0, 'Н': 0,
}

# Read the training table and turn the letter labels into integer ids.
data = pd.read_csv(f"{PATH}train.csv")
data['Class'] = data['Class'].replace(label2idx)

In [6]:
# Load the tokenizer that belongs to the checkpoint named by PRE_TRAINED_MODEL_NAME,
# so tokenization matches the model instantiated later in the notebook.
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

In [4]:
class TextDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: indexable collection of raw texts; each item is passed through ``str``.
        targets: indexable collection of integer labels aligned with ``texts`` (H=0, M=1).
        tokenizer: object exposing ``encode_plus`` that returns ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: number of tokens each text is padded / truncated to.
    """

    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts, self.targets = texts, targets
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return one sample as a dict of the raw text plus model-ready tensors."""
        raw_text = str(self.texts[item])
        label = self.targets[item]  # H=0, M=1

        encoded = self.tokenizer.encode_plus(
            raw_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        sample = {'text': raw_text}
        # The tokenizer returns a leading batch axis; drop it so that the
        # DataLoader can stack samples itself.
        sample['input_ids'] = encoded['input_ids'].flatten()
        # Single-segment input: every position gets segment id 0.
        sample['segment_ids'] = torch.zeros(self.max_len, dtype=torch.long)
        sample['attention_mask'] = encoded['attention_mask'].flatten()
        sample['targets'] = torch.tensor(label, dtype=torch.long)
        return sample

In [7]:
RANDOM_SEED = 42

# Seed NumPy and PyTorch so that the split and the training run are repeatable.
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the first GPU when one is visible, otherwise stay on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Hold out 10% of the rows for validation, keeping the class ratio in both parts.
df_train, df_val = train_test_split(
    data,
    test_size=0.1,
    stratify=data['Class'],
    random_state=RANDOM_SEED,
)

In [8]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=2):
    """Wrap a data frame in a ``TextDataset`` and return a ``DataLoader`` over it.

    Args:
        df: data frame with a ``Text`` column (inputs) and a ``Class`` column (labels).
        tokenizer: tokenizer forwarded to ``TextDataset``.
        max_len: token length forwarded to ``TextDataset``.
        batch_size: number of samples per batch.
        shuffle: reshuffle the samples on every pass over the loader. Defaults to
            ``False``, which keeps the previous fixed-order behaviour; pass ``True``
            for a training loader.
        num_workers: number of worker processes used to load batches
            (defaults to the previously hard-coded value of 2).

    Returns:
        torch.utils.data.DataLoader yielding the dict batches produced by ``TextDataset``.
    """
    dataset = TextDataset(
        texts=df.Text.to_numpy(),
        targets=df.Class.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [9]:
BATCH_SIZE = 12

# Build the train and validation loaders with identical settings.
# NOTE(review): create_data_loader builds its DataLoader without shuffling, so
# the training batches come in the same order on every epoch.
train_data_loader, val_data_loader = (
    create_data_loader(split_frame, tokenizer, MAX_LEN, BATCH_SIZE)
    for split_frame in (df_train, df_val)
)

### Функции для обучения и оценки модели

In [10]:
def evaluate(model, dataloader, device="cpu"):
    """Run ``model`` over ``dataloader`` without gradient tracking and collect predictions.

    Args:
        model: callable invoked as ``model(input_ids, attention_mask=..., labels=...,
            token_type_ids=...)`` whose output holds the loss at index 0 and the
            logits at index 1. It is switched to eval mode and moved to ``device``.
        dataloader: iterable of dict batches with the keys ``input_ids``,
            ``attention_mask``, ``targets`` and ``segment_ids``.
        device: device that the model and every batch tensor are moved to.

    Returns:
        Tuple ``(eval_loss, correct_labels, predicted_labels)``:
        the mean of the per-batch losses (``nan`` if the dataloader yields no
        batches) and two 1-D int64 NumPy arrays with the gold and the predicted
        class ids.
    """
    model.eval()
    model.to(device)

    loss_sum = 0.0
    nb_eval_steps = 0
    predicted_labels, correct_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluation iteration"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        label_ids = batch["targets"].to(device)
        segment_ids = batch['segment_ids'].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask,
                           labels=label_ids, token_type_ids=segment_ids)
        batch_loss, logits = output[0], output[1]

        # Take the argmax in torch and convert to plain ints, instead of calling
        # np.argmax on a tensor (which produced a list of 0-d tensors).
        predicted_labels.extend(logits.argmax(dim=1).cpu().tolist())
        correct_labels.extend(label_ids.cpu().tolist())

        # .mean() also covers a loss that arrives as a vector (one entry per replica).
        loss_sum += batch_loss.mean().item()
        nb_eval_steps += 1

    # An empty dataloader has no meaningful loss; report nan rather than
    # raising ZeroDivisionError.
    eval_loss = loss_sum / nb_eval_steps if nb_eval_steps else float("nan")

    correct_labels = np.array(correct_labels, dtype=np.int64)
    predicted_labels = np.array(predicted_labels, dtype=np.int64)

    return eval_loss, correct_labels, predicted_labels

In [11]:
import os
from tqdm import trange
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, precision_recall_fscore_support, accuracy_score, f1_score

In [12]:
def train(model, train_dataloader, dev_dataloader, output_model_file= "./bert.bin",
          num_train_epochs=10, patience=2, gradient_accumulation_steps=1, max_grad_norm=5,
          warmup_proportion=0.1, batch_size=8, learning_rate=5e-5):
    """Fine-tune ``model`` and keep the checkpoint with the best dev-set accuracy.

    Args:
        model: classifier invoked as ``model(input_ids, attention_mask=..., labels=...,
            token_type_ids=...)`` whose output holds the loss at index 0 and the
            logits at index 1. It must already be on the device to train on.
        train_dataloader: sized iterable of dict batches with the keys ``input_ids``,
            ``attention_mask``, ``targets`` and ``segment_ids``.
        dev_dataloader: loader of the same batch format, passed to ``evaluate``
            after every epoch.
        output_model_file: path the best ``state_dict`` is written to.
        num_train_epochs: maximum number of epochs.
        patience: stop after this many consecutive epochs without a new best
            dev accuracy.
        gradient_accumulation_steps: number of batches whose gradients are summed
            before one optimizer step.
        max_grad_norm: gradient-norm clipping threshold.
        warmup_proportion: fraction of all optimizer steps used for linear warm-up.
        batch_size: unused; retained so existing calls keep working. The schedule
            length is derived from ``len(train_dataloader)`` instead.
        learning_rate: peak learning rate.

    Returns:
        Tuple ``(train_losses, dev_losses, train_accuracies, train_f1_scores,
        dev_accuracies, dev_f1_scores)`` — six lists with one entry per
        completed epoch (all of equal length).
    """
    # Feed batches to the device the model already lives on, rather than relying
    # on a notebook-level `device` variable or a hard-coded "cuda".
    run_device = next(model.parameters()).device

    # Derive the schedule length from the loader that is actually iterated.
    # Ceil division: the last, possibly shorter, accumulation window of each
    # epoch also ends in an optimizer step (see the training loop).
    batches_per_epoch = len(train_dataloader)
    updates_per_epoch = -(-batches_per_epoch // gradient_accumulation_steps)
    num_train_steps = updates_per_epoch * int(num_train_epochs)
    num_warmup_steps = int(warmup_proportion * num_train_steps)

    # Biases and LayerNorm parameters are excluded from weight decay.
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    decay_params, no_decay_params = [], []
    for name, param in model.named_parameters():
        if any(nd in name for nd in no_decay):
            no_decay_params.append(param)
        else:
            decay_params.append(param)
    optimizer_grouped_parameters = [
        {'params': decay_params, 'weight_decay': 0.01},
        {'params': no_decay_params, 'weight_decay': 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, correct_bias=False)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                                num_training_steps=num_train_steps)

    train_losses, dev_losses = [], []
    train_accuracies, train_f1_scores, dev_accuracies, dev_f1_scores = [], [], [], []

    best_dev_accuracy = float("-inf")
    no_improvement = 0
    optimizer.zero_grad()
    for _ in trange(int(num_train_epochs), desc="Epoch"):
        train_predictions, train_labels = [], []
        model.train()
        epoch_loss_sum = 0.0
        nb_tr_steps = 0
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
            input_ids = batch["input_ids"].to(run_device)
            attention_mask = batch["attention_mask"].to(run_device)
            label_ids = batch["targets"].to(run_device)
            segment_ids = batch['segment_ids'].to(run_device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=label_ids,
                            token_type_ids=segment_ids)
            # .mean() is a no-op for a scalar loss and reduces a per-replica vector.
            loss = outputs[0].mean()
            logits = outputs[1]
            train_predictions.extend(logits.detach().argmax(dim=1).cpu().tolist())
            train_labels.extend(batch["targets"].tolist())

            # Log the unscaled loss so the value is comparable with the dev loss;
            # only the back-propagated loss is divided by the accumulation factor.
            epoch_loss_sum += loss.item()
            (loss / gradient_accumulation_steps).backward()

            # Step on every full accumulation window and on the final batch of
            # the epoch, so no gradients leak into the next epoch.
            window_complete = (step + 1) % gradient_accumulation_steps == 0
            last_batch = (step + 1) == batches_per_epoch
            if window_complete or last_batch:
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            nb_tr_steps += 1

        tr_loss = epoch_loss_sum / nb_tr_steps if nb_tr_steps else float("nan")
        train_accuracies.append(accuracy_score(train_labels, train_predictions))
        train_f1_scores.append(f1_score(train_labels, train_predictions, average='macro'))

        dev_loss, correct_labels, predicted_labels = evaluate(model, dev_dataloader, device=run_device)
        dev_accuracies.append(accuracy_score(correct_labels, predicted_labels))
        dev_f1_scores.append(f1_score(correct_labels, predicted_labels, average='macro'))

        # Record the losses before the early-stopping check so that all six
        # histories have one entry per completed epoch, even when we break.
        train_losses.append(tr_loss)
        dev_losses.append(dev_loss)

        print("Train loss:", tr_loss, 'Train acc: ', train_accuracies[-1], "Train_f1: ", train_f1_scores[-1])
        print("Dev loss:", dev_loss, 'Dev acc: ', dev_accuracies[-1], "Dev_f1: ", dev_f1_scores[-1])

        # Save only when this epoch beats the best accuracy seen so far. The
        # first epoch always qualifies because the running best starts at -inf.
        if dev_accuracies[-1] > best_dev_accuracy:
            best_dev_accuracy = dev_accuracies[-1]
            no_improvement = 0
            # Unwrap a parallel wrapper (if any) so the checkpoint keys carry no prefix.
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), output_model_file)
        else:
            no_improvement += 1

        if no_improvement >= patience:
            print("No improvement on development set. Finish training.")
            break

    return train_losses, dev_losses, train_accuracies, train_f1_scores, dev_accuracies, dev_f1_scores

In [None]:
# Integer class id -> label letter; its size fixes the width of the classifier head.
idx2label = {0: 'H', 1: 'M'}

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Start from the pretrained checkpoint with one output per class.
model = BertForSequenceClassification.from_pretrained(
    PRE_TRAINED_MODEL_NAME,
    num_labels=len(idx2label),
)
model.to(device)

### Обучение 

In [None]:
# Destination of the best checkpoint. The original line ended with a trailing
# comma, which made OUTPUT_FILE a one-element tuple, and neither OUTPUT_FILE nor
# EPOCHS was passed to train(), so both settings were silently ignored.
# NOTE(review): the path now points at the working directory (as the effective
# default "./bert.bin" did) rather than the input data folder — confirm the
# intended location.
OUTPUT_FILE = "./bert512.bin"
EPOCHS = 10

(train_losses, dev_losses,
 train_accuracies, train_f1_scores,
 dev_accuracies, dev_f1_scores) = train(
    model,
    train_data_loader,
    val_data_loader,
    output_model_file=OUTPUT_FILE,
    num_train_epochs=EPOCHS,
    gradient_accumulation_steps=4,
    batch_size=BATCH_SIZE,
)

### Результаты на val датасете

In [13]:
# Integer class id -> label letter; its size fixes the width of the classifier head.
idx2label = {1:'M', 0:'H'}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the architecture, then restore the fine-tuned weights.
model = BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME, num_labels = len(idx2label))
# map_location lets a checkpoint that was saved from GPU tensors load on a
# CPU-only machine too, matching the fallback used for `device` above.
model.load_state_dict(torch.load('../input/bert-5/bert.bin', map_location=device))
model.to(device)

In [14]:
PATH = '../input/ruatd-data/'

# Class letter -> integer id (H=0, M=1); the look-alike Latin and Cyrillic
# glyphs are both mapped.
label2idx= {'M':1, 'М':1, 'H':0, 'Н':0}
# NOTE(review): this rebinds `data`, which earlier held the training table;
# re-running the train/validation split cell after this one would split val.csv.
data = pd.read_csv(PATH +'val.csv')
data['Class'] = data['Class'].replace(label2idx)
BATCH_SIZE=12

test_data_loader = create_data_loader(data, tokenizer, MAX_LEN, BATCH_SIZE)
# Use the device selected when the model was loaded instead of a hard-coded
# 'cuda', so the cell also runs when no GPU is available.
eval_loss, correct_labels, predicted_labels = evaluate(model, test_data_loader, device=device)
print(classification_report(correct_labels, predicted_labels))

### Submission

In [15]:
def make_proba_predictions(model, dataloader, device="cpu"):
    """Predict class probabilities and class ids for every sample in ``dataloader``.

    Args:
        model: callable invoked as ``model(input_ids, attention_mask=..., labels=...,
            token_type_ids=...)`` whose output holds the logits at index 1.
            It is switched to eval mode and moved to ``device``.
        dataloader: iterable of dict batches with the keys ``input_ids``,
            ``attention_mask``, ``segment_ids`` and ``targets``. The targets are
            only forwarded to the model; they do not affect the returned values.
        device: device that the model and every batch tensor are moved to.

    Returns:
        Tuple ``(predicted_proba, predicted_labels)``: a list with one 1-D NumPy
        array of softmax probabilities per sample, and a 1-D int64 NumPy array of
        argmax class ids.
    """
    model.eval()
    model.to(device)

    predicted_proba, predicted_labels = [], []

    for batch in tqdm(dataloader, desc="Evaluation iteration"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        segment_ids = batch['segment_ids'].to(device)
        label_ids = batch["targets"].to(device)

        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask,
                           labels=label_ids, token_type_ids=segment_ids)
            logits = output[1]
            # The function promises probabilities, so normalise the raw logits
            # (previously the logits themselves were returned).
            probabilities = torch.softmax(logits, dim=1)

        # Take the argmax in torch and convert to plain ints, instead of calling
        # np.argmax on a tensor (which produced a list of 0-d tensors).
        predicted_labels.extend(logits.argmax(dim=1).cpu().tolist())
        predicted_proba.extend(probabilities.cpu().numpy())

    predicted_labels = np.array(predicted_labels, dtype=np.int64)

    return predicted_proba, predicted_labels

In [16]:
subm = pd.read_csv(PATH +'test.csv')
# The dataset and the prediction helper both read a label column, so fill in a
# placeholder class; it is overwritten with the real predictions below.
subm['Class'] = 0
sumb_data_loader = create_data_loader(subm, tokenizer, MAX_LEN, BATCH_SIZE)
# Use the device selected when the model was loaded instead of a hard-coded
# 'cuda', so the cell also runs when no GPU is available.
_, predicted_labels = make_proba_predictions(model, sumb_data_loader, device=device)

# Map the predicted class ids back to label letters and write the submission file.
subm['Class'] = predicted_labels
subm['Class'] = subm['Class'].replace(idx2label)
subm[['Id', 'Class']].to_csv('./submission_bert_5_512.csv', index=False)