## 1. Импорт зависимостей

Ноутбук выполнялся в Google Colab с использованием GPU.

In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import time
import torch
import random
import datetime
import numpy as np
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score
    )
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import (
    TensorDataset, random_split, DataLoader,
    RandomSampler, SequentialSampler
    )
from transformers import(
    BertForSequenceClassification, AdamW,
    BertTokenizer, AutoTokenizer,
    get_linear_schedule_with_warmup
    )

Проверим доступность GPU

In [None]:
# Choose the compute device once; later cells move tensors to `device`.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")

if not cuda_ready:
    print('No GPU available, using the CPU instead.')
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## 2. Загрузка данных

Данные, в отличие от других файлов, используются в виде отдельных тренировочной и тестовой выборок, чтобы не хранить одновременно лишние данные в памяти.

Данные были разделены на train и test с помощью sklearn train_test_split с random_state=42.

In [None]:
# Load the training split from the mounted Drive into a pandas dataframe.
df = pd.read_csv("drive/MyDrive/train.csv")

# Report the number of rows in the training data.
print('Number of training sentences: {:,}\n'.format(df.shape[0]))

# Display 10 random rows; the sample is unseeded, so it differs between runs.
df.sample(10)

Number of training sentences: 25,000



Unnamed: 0,id,text,rating,positive
10554,824,When it comes to creating a universe George Lu...,8,1
5907,4067,"Hayao Miyazaki's second feature film, and his ...",8,1
16536,2383,This is a candidate for the single most disapp...,1,0
5729,3907,I find myself comparing all stand-up acts to t...,10,1
17579,3321,"Usually, I know after the first minute of a mo...",4,0
20235,5712,"After sitting through this pile of dung, my hu...",1,0
1486,11338,"I saw this at ""Dances with Films"", and it was ...",10,1
18483,4135,This film was so bad i had to fast forward mos...,1,0
18417,4076,This subject matter deserves a much better scr...,1,0
15593,1534,From the blocky digitised footage to the actin...,1,0


In [None]:
# Names of the dataframe columns holding the input text and the target label.
text_column = 'text'
label_column = 'positive'

In [None]:
# Inspect the distinct label values present in the training data.
df[label_column].unique()

array([1, 0])

In [None]:
# Fit the label encoder on the training labels and overwrite the column with
# the encoded integer ids in one step.
encoder = LabelEncoder()
df[label_column] = encoder.fit_transform(df[label_column])

In [None]:
# Preview the first rows after label encoding.
df.head()

Unnamed: 0,id,text,rating,positive
0,0,Bromwell High is a cartoon comedy. It ran at t...,9,1
1,10000,Homelessness (or Houselessness as George Carli...,8,1
2,10001,Brilliant over-acting by Lesley Ann Warren. Be...,10,1
3,10002,This is easily the most underrated film inn th...,7,1
4,10003,This is not the typical Mel Brooks film. It wa...,8,1


In [None]:
# Extract plain arrays for tokenization: texts cast to str, labels cast to int.
texts = df[text_column].astype(str).values
labels = df[label_column].values.astype(int)

In [None]:
# Peek at the first three texts together with their labels.
texts[:3], labels[:3]

(array(['Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn\'t!',
        'Homelessness (or Houselessness as George Carlin stated) has been an issue for years but never a plan to help those on the street that were once considered human who did 

In [None]:
# Number of distinct classes; passed as `num_labels` when the model is built.
classes_amount = len(df[label_column].unique())
classes_amount

2

In [None]:
# Distinct label values after encoding.
df[label_column].unique()

array([1, 0])

# 3. Токенизация & Форматирование Input

В данной секции преобразуем данные к формату, с которым работает BERT.

## 3.1. BERT Tokenizer

В ходе работы будем использовать предобученную модель архитектуры BERT на английском языке: distilbert/distilbert-base-uncased

Эта модель является легкой и подойдет для небольших вычислительных мощностей.

In [None]:
# Notebook-wide settings.
model_name = 'distilbert/distilbert-base-uncased'  # checkpoint id passed to from_pretrained()
batch_size = 32  # samples per batch for every DataLoader in this notebook
random_state = 42  # default seed of fine_tune_model

In [None]:
# Load the tokenizer that belongs to the checkpoint. AutoTokenizer picks the
# tokenizer class recorded in the checkpoint itself, which avoids the
# "tokenizer class ... is not the same type" warning raised when a DistilBERT
# checkpoint is loaded through BertTokenizer.
print(f'Loading {model_name} tokenizer...')
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading distilbert/distilbert-base-uncased tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


Применим токенайзер к одному из наших текстов и посмотрим, что получится.


In [None]:
# Print the first training text unchanged.
print('Original: ', texts[0])

# Print the same text split into tokens by the tokenizer.
print('Tokenized: ', tokenizer.tokenize(texts[0]))

# Print the same text mapped to integer token ids.
print('Token IDs: ', tokenizer.encode(texts[0]))

Original:  Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!
Tokenized:  ['bro', '##m', '##well', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'lif

## 3.2. Токенизация данных

Перед тем, как закодировать текст, найдем максимальную длину текстов.


В ячейке ниже происходит проход по датасету для определения максимальной длины.

In [None]:
# Token count (special tokens included) of every training text.
text_lens = np.array(
    [len(tokenizer.encode(text, add_special_tokens=True)) for text in texts]
)

# Longest text, measured in tokens.
max_length = text_lens.max()
print('Max sentence length: ', max_length)

Token indices sequence length is longer than the specified maximum sequence length for this model (559 > 512). Running this sequence through the model will result in indexing errors


Max sentence length:  3127


In [None]:
# Summary statistics of the token counts.
mean = text_lens.mean()
variance = text_lens.var()
std = text_lens.std()

# All three quartiles in one call; Q2 is the median.
Q1, Q2, Q3 = np.percentile(text_lens, [25, 50, 75])

# Print results
for caption, value in (
    ("Mean:", mean),
    ("Median:", Q2),
    ("Variance:", variance),
    ("Standard deviation:", std),
):
    print(caption, value)
print(f"Q1 = {Q1}, Q3 = {Q3}")


Mean: 313.87132
Median: 233.0
Variance: 54892.35588145762
Standard deviation: 234.29117755787908
Q1 = 168.0, Q3 = 382.0


За длину возьмем медиану

In [None]:
# Use the median token count as the fixed sequence length: shorter texts are
# padded to it, and longer ones are cut when encoding runs with truncation=True.
max_length = int(Q2)

Формируем токенизированные множества.

In [None]:
def preparing_text_and_labels(texts, labels, max_length, truncation=False):
    """Tokenize texts and pack token ids, attention masks and labels into tensors.

    Uses the notebook-level `tokenizer`. All texts are encoded in a single
    batched tokenizer call rather than one `encode_plus` call per text followed
    by `torch.cat`.

    Args:
        texts: iterable of texts to encode.
        labels: integer class labels aligned with `texts`.
        max_length: every sequence is padded to this many tokens.
        truncation: forwarded to the tokenizer. With False, a text longer than
            `max_length` is not cut, the rows end up with different lengths
            and tensor creation fails.

    Returns:
        Tuple (input_ids, attention_masks, labels) of torch tensors.
    """
    encoded = tokenizer(
        [str(text) for text in texts],
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=truncation,
        return_attention_mask=True,  # 1 for real tokens, 0 for padding
        return_tensors='pt',
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor(labels)
    return input_ids, attention_masks, labels


In [None]:
# Encode the training texts; truncation=True cuts texts longer than max_length.
# Note: `labels` is rebound here from a numpy array to a torch tensor.
input_ids, attention_masks, labels = preparing_text_and_labels(texts, labels, max_length=max_length, truncation=True)

# Show the first text next to its padded/truncated token ids.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Original:  Bromwell High is a cartoon comedy. It ran at the same time as some other programs about school life, such as "Teachers". My 35 years in the teaching profession lead me to believe that Bromwell High's satire is much closer to reality than is "Teachers". The scramble to survive financially, the insightful students who can see right through their pathetic teachers' pomp, the pettiness of the whole situation, all remind me of the schools I knew and their students. When I saw the episode in which a student repeatedly tried to burn down the school, I immediately recalled ......... at .......... High. A classic line: INSPECTOR: I'm here to sack one of your teachers. STUDENT: Welcome to Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched. What a pity that it isn't!
Token IDs: tensor([  101, 22953,  2213,  4381,  2152,  2003,  1037,  9476,  4038,  1012,
         2009,  2743,  2012,  1996,  2168,  2051,  2004,  2070,  2060,  3454,
         2055, 

## 3.3. Training & Validation Split


Разделим данные на train и val в пропорции  90% к 10%.

In [None]:
# Fraction of the dataset used for training; the remainder becomes validation.
train_part = 0.9

In [None]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Sizes of the two subsets; validation takes whatever is left after rounding down.
train_size = int(train_part * len(dataset))
val_size = len(dataset) - train_size

# Split with an explicitly seeded generator so the train/validation partition
# is the same on every run. The global seeds are only set inside
# fine_tune_model, i.e. after this split has already happened.
split_generator = torch.Generator().manual_seed(random_state)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

22,500 training samples
2,500 validation samples


Также создадим итераторы с использованием класса DataLoader из библиотеки torch.

In [None]:
# Training batches are drawn in random order.
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Validation batches are read in their stored order.
validation_sampler = SequentialSampler(val_dataset)
validation_dataloader = DataLoader(val_dataset, sampler=validation_sampler, batch_size=batch_size)

Теперь, когда данные отформатированы, можем обучить модель.

## 4.1. BertForSequenceClassification

In [None]:
# NOTE(review): `model_name` is a DistilBERT checkpoint but is loaded through
# the BERT class. The load-time warnings list the embedding and encoder weights
# as "newly initialized", i.e. the pretrained weights are not actually used.
# Loading with the matching architecture class would fix this, but any forward
# call that passes `token_type_ids` would then have to drop that argument
# (DistilBERT models do not take it) - confirm and change both together.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels = classes_amount, # The number of output labels--2 for binary classification.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

# Move the model to the device selected earlier (GPU when available, CPU
# otherwise). An unconditional .cuda() call fails on a runtime without a GPU.
model.to(device)

You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.ou

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

Ради интереса выведем на печать параметры для нескольких слоев:


1. The embedding layer.
2. Первый из 12 слоёв трансформера.
3. The output layer.




In [None]:
# All model parameters as a list of (name, tensor) pairs.
params = list(model.named_parameters())


def _show_shapes(heading, entries):
    """Print a heading followed by one aligned 'name  shape' row per parameter."""
    print(heading)
    for name, tensor in entries:
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))


print('The BERT model has {:} different named parameters.\n'.format(len(params)))

_show_shapes('==== Embedding Layer ====\n', params[0:5])
_show_shapes('\n==== First Transformer ====\n', params[5:21])
_show_shapes('\n==== Output Layer ====\n', params[-4:])

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (30522, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

## 4.2. Optimizer & Learning Rate Scheduler

Определим optimizer и scheduler.

Наиболее популярные конфигурации параметров:
- **Batch size:** 16, 32  
- **Learning rate (Adam):** 5e-5, 3e-5, 2e-5  
- **Number of epochs:** 2, 3, 4

Мы выберем:
* Batch size: 32 (при инициализации DataLoaders)
* Learning rate: 2e-5
* Epochs: 4

Параметр эпсилон `eps = 1e-8` - это "очень маленькое число", которое позволяет избежать деления на 0

In [None]:
# Use PyTorch's own AdamW: the AdamW class shipped with `transformers` is
# deprecated in favour of it. weight_decay=0.0 is passed explicitly because
# torch defaults to 0.01, whereas the transformers implementation defaulted
# to no weight decay.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-8,
    weight_decay=0.0,
)




In [None]:
# Number of full passes over the training data.
epochs = 4

# The training loop steps the scheduler once per batch, so the schedule spans
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

## 4.3. Training Loop

Ниже приведён цикл обучения (training loop). На каждой эпохе мы проходим по всему обучающему и валидационному датасетам.


**Training:**
- Распаковать обучающие данные и лейблы
- Загрузить данные на GPU для ускорения
- Занулить градиенты с предыдущего шага
- Forward pass (скормить данные в нейросеть и пробросить их вперед)
- Backward pass (backpropagation - посчитать градиенты по всем параметрам с помощью обратного распространения ошибки)
- Обновить параметры с помощью optimizer.step()
- Посчитать статистики, чтобы следить за обучением

**Evaluation:**
- Распаковать валидационные данные и лейблы
- Загрузить данные на GPU для ускорения
- Forward pass (скормить данные в нейросеть и пробросить их вперед)
- Посчитать loss и статистики на валидационных данных, чтобы следить за обучением

##

Выбираем Accuracy как метрику качества модели.

Так как в нашем случае классы имеют идеальный баланс 12500/12500, а также мы не ставим приоритеты в распознавании классов (т.е. нам не важно чаще отлавливать наблюдения какого-то одного класса) и все классы одинаково важны, Accuracy становится хорошим выбором.

In [None]:
def flat_accuracy(preds, labels):
    """Accuracy of arg-max predictions against the true labels.

    Args:
        preds: array of per-class scores; the arg-max over axis 1 is taken as
            the predicted class of each sample.
        labels: array of true class ids.

    Returns:
        Fraction of samples whose predicted class equals the true label.
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # scikit-learn metrics expect (y_true, y_pred). Accuracy is symmetric, so
    # the value is unchanged, but the conventional order keeps this helper
    # safe as a template for asymmetric metrics.
    return accuracy_score(labels_flat, pred_flat)


Вспомогательная функция для отслеживания затраченного времени.


In [None]:
def format_time(elapsed):
    """Render a duration given in seconds as text.

    The value is rounded to whole seconds and formatted by
    `datetime.timedelta`, i.e. as h:mm:ss (with a day prefix for durations of
    24 hours or more).
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


Все готово к обучению! Приступим!

In [None]:
def fine_tune_model(
        train_dataloader, validation_dataloader,
        random_state=random_state,
        metric_func=flat_accuracy,
        metric_name='Metric'
        ):
    """Train the notebook-level `model` and validate it after every epoch.

    Besides its arguments, the function relies on the notebook-level `model`,
    `optimizer`, `scheduler`, `epochs` and `device`.

    Args:
        train_dataloader: yields (input_ids, attention_mask, labels) batches
            used for training.
        validation_dataloader: yields batches with the same layout, used for
            validation.
        random_state: seed applied to `random`, numpy and torch before training.
        metric_func: callable(logits, label_ids) -> float; both arguments are
            numpy arrays covering the whole validation set.
        metric_name: label of the metric in the printed report and key of the
            metric in the returned statistics.

    Returns:
        List with one dict per epoch holding the keys 'epoch', 'Training Loss',
        'Valid. Loss', `metric_name`, 'Training Time' and 'Validation Time'.
    """
    random.seed(random_state)
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Sum of the batch losses of this epoch.
        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Clear the gradients left over from the previous step.
            model.zero_grad()

            # `token_type_ids` is not passed: None is what the model uses by
            # default, and omitting it keeps the call valid for architectures
            # that have no such argument.
            res = model(b_input_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)
            loss = res['loss']

            total_train_loss += loss.item()

            # Backward pass to compute the gradients.
            loss.backward()

            # Clip the gradient norm to 1.0 to guard against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Average loss over all training batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # Validation
        print("")
        print("Running Validation...")

        t0 = time.time()

        # Evaluation mode: dropout layers behave differently than in training.
        model.eval()

        total_eval_loss = 0
        all_logits = []
        all_label_ids = []

        for batch in validation_dataloader:

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # No gradients are needed for validation.
            with torch.no_grad():
                res = model(b_input_ids,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            # Accumulate the validation loss.
            total_eval_loss += res['loss'].item()

            # Keep logits and labels on the CPU for the metric computation.
            all_logits.append(res['logits'].detach().cpu().numpy())
            all_label_ids.append(b_labels.to('cpu').numpy())

        # Compute the metric once over the whole validation set. Averaging
        # per-batch values would give a short final batch the same weight as
        # a full one.
        avg_val_metric = metric_func(
            np.concatenate(all_logits, axis=0),
            np.concatenate(all_label_ids, axis=0),
        )
        # Both fields use automatic numbering; mixing "{}" with "{0:...}"
        # raises ValueError.
        print("  {}: {:.2f}".format(metric_name, avg_val_metric))

        # Average loss over all validation batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                f'{metric_name}': avg_val_metric,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )

    print("")

    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
    return training_stats

In [None]:
# fine_tune_model accepts no `average` argument (flat_accuracy needs no
# averaging mode), so only the metric function and its display name are passed.
training_stats = fine_tune_model(
    train_dataloader, validation_dataloader,
    metric_func=flat_accuracy,
    metric_name='Metric'
    )


Training...
  Batch    40  of    704.    Elapsed: 0:00:48.
  Batch    80  of    704.    Elapsed: 0:01:34.
  Batch   120  of    704.    Elapsed: 0:02:22.
  Batch   160  of    704.    Elapsed: 0:03:08.
  Batch   200  of    704.    Elapsed: 0:03:56.
  Batch   240  of    704.    Elapsed: 0:04:43.
  Batch   280  of    704.    Elapsed: 0:05:30.
  Batch   320  of    704.    Elapsed: 0:06:17.
  Batch   360  of    704.    Elapsed: 0:07:04.
  Batch   400  of    704.    Elapsed: 0:07:51.
  Batch   440  of    704.    Elapsed: 0:08:38.
  Batch   480  of    704.    Elapsed: 0:09:25.
  Batch   520  of    704.    Elapsed: 0:10:12.
  Batch   560  of    704.    Elapsed: 0:10:59.
  Batch   600  of    704.    Elapsed: 0:11:46.
  Batch   640  of    704.    Elapsed: 0:12:33.
  Batch   680  of    704.    Elapsed: 0:13:20.

  Average training loss: 0.59
  Training epcoh took: 0:13:47

Running Validation...
accuracy: 0.81
  Validation Loss: 0.42
  Validation took: 0:00:29

Training...
  Batch    40  of    704

KeyboardInterrupt: 

Посмотрим на саммари обучающего процесса!

In [None]:
# Show up to 100 columns when a dataframe is displayed.
pd.set_option("display.max_columns", 100)

# Create a DataFrame from our training statistics (one row per epoch).
df_stats = pd.DataFrame(data=training_stats)

# Use the 'epoch' as the row index.
df_stats = df_stats.set_index('epoch')

# Display the table.
df_stats

NameError: name 'training_stats' is not defined

Обратите внимание, что loss на обучении непрерывно падает, а на валидации начинает возрастать. Это говорит о том, что модель переобучается.

Loss на валидации - более надежный показатель, чем Accuracy, так как Accuracy не зависит от типа ошибки и уверенности классификатора в ответе.


In [None]:
# Seaborn styling: dark grid with enlarged fonts, applied in one call.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# Plot the learning curves; the x axis is the epoch index of df_stats.
plt.plot(df_stats['Training Loss'], 'b-o', label="Training")
plt.plot(df_stats['Valid. Loss'], 'g-o', label="Validation")

# Label the plot.
plt.title("Training & Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
# One tick per recorded epoch instead of a hard-coded 1..4, so the axis stays
# correct when the number of epochs changes.
plt.xticks(list(df_stats.index))

plt.show()

# 5. Performance On Test Set

Качество на отложенной выборке. Для оценки зафайнтьюненной модели посчитаем на тесте Accuracy, Precision, Recall и F1-score.

### 5.1. Подготовка данных



Применим к тестовым данным те же шаги предобработки данных, которые мы применяли для обучающей и валидационной выборок.

In [None]:
# Load the held-out test split. This rebinds `df`, replacing the training dataframe.
df = pd.read_csv('drive/MyDrive/test.csv')

In [None]:
# Report the number of rows in the test data.
print('Number of test sentences: {:,}\n'.format(df.shape[0]))

In [None]:
# Non-null row counts per class, to check the class balance of the test data.
df.groupby(label_column).count()

In [None]:
# Inspect the raw label column of the test data.
df[label_column]

In [None]:
# Create text and label arrays for the test set.
texts = df[text_column].astype(str).values
# Reuse the encoder fitted on the training labels. Refitting it on the test
# data could yield a different label-to-id mapping than the one used in training.
labels = encoder.transform(df[label_column])

In [None]:
# Peek at the first two test texts.
texts[:2]

In [None]:
# Show the encoded test labels.
labels

In [None]:
# Tokenize all of the test texts and map the tokens to their word IDs.
input_ids, attention_masks, labels = preparing_text_and_labels(texts, labels, max_length, truncation=True) # Truncation cuts texts longer than max_length


In [None]:
# Create the test DataLoader. Batches are read sequentially, so predictions
# come out in the same order as the rows of the test data.
prediction_data = TensorDataset(input_ids, attention_masks, labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

## 5.2. Оценка на тесте



После того, как мы подготовили тестовые данные, сгенерируем предсказания с использованием дообученной модели.

In [None]:
# Prediction on test set
def get_predictions(model, prediction_dataloader):
    """Run `model` over a dataloader and collect raw logits and true labels.

    Args:
        model: module called as `model(input_ids, attention_mask=...)` whose
            output has the logits as its first element.
        prediction_dataloader: yields (input_ids, attention_mask, labels)
            batches.

    Returns:
        Tuple (predictions, true_labels): two lists holding one numpy array
        per batch, in dataloader order.
    """
    # Report the size of the dataset that was actually passed in, not the
    # length of a notebook-level variable.
    print('Predicting labels for {:,} test texts...'.format(len(prediction_dataloader.dataset)))

    # Put model in evaluation mode
    model.eval()

    # Run on whichever device the model's parameters live on, so the function
    # does not depend on a notebook-level `device`.
    model_device = next(model.parameters()).device

    predictions, true_labels = [], []

    for batch in prediction_dataloader:
        b_input_ids, b_input_mask, b_labels = (t.to(model_device) for t in batch)

        with torch.no_grad():
            # `token_type_ids` is not passed: None is the default anyway, and
            # not every architecture accepts that argument.
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        # No labels were given, so the first output element is the logits.
        logits = outputs[0]

        # Store predictions and true labels as CPU numpy arrays.
        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

    print()
    print('DONE.')
    return predictions, true_labels

In [None]:
# Collect per-batch logits and true labels for the whole test set.
predictions, true_labels = get_predictions(model, prediction_dataloader)

В качестве метрики возьмем accuracy.

In [None]:
# Accuracy of every test batch, computed separately.
print('Calculating accuracy Score for each batch...')

# The predicted class of a sample is the arg-max of its logits.
accuracy_set = [
    accuracy_score(batch_labels, np.argmax(predictions[idx], axis=1).flatten())
    for idx, batch_labels in enumerate(true_labels)
]

Финальный скор считается по всему датасету, дополнительно посмотрим на скоры на отдельных батчах.


In [None]:
# Bar plot of the accuracy obtained on each batch of test samples.
batch_numbers = list(range(len(accuracy_set)))
ax = sns.barplot(x=batch_numbers, y=accuracy_set, errorbar=None)

ax.set_title('Accuracy Score per Batch')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Batches')

plt.show()

Объединим результаты и получим общий скор.

In [None]:

def get_results(predictions, true_labels, average='binary'):
    """Print accuracy, precision, recall and F1 computed over all batches.

    Args:
        predictions: list of per-batch logit arrays.
        true_labels: list of per-batch label arrays aligned with `predictions`.
        average: averaging mode forwarded to the precision, recall and F1
            functions.
    """
    # Stack the per-batch logits and take the highest-scoring class per sample.
    predicted = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()

    # Stack the per-batch true labels the same way.
    expected = np.concatenate(true_labels, axis=0)

    f1 = f1_score(expected, predicted, average=average)
    accuracy = accuracy_score(expected, predicted)
    precision = precision_score(expected, predicted, average=average)
    recall = recall_score(expected, predicted, average=average)

    print('Total Accuracy: %.3f' % accuracy)
    print('Total Precision: %.3f' % precision)
    print('Total Recall: %.3f' % recall)
    print('Total F1: %.3f' % f1)

In [None]:
# Print the test-set metrics over all batches combined.
get_results(predictions, true_labels, average='binary')


**Успех!** Спустя небольшое количество времени мы получили хорошую модель с качеством выше чем у моделей, обученных с нуля.



# Заключение

Используя предобученную сеть distilbert/distilbert-base-uncased, с помощью простого файн-тюнинга удается добиться лучшего качества среди всех моделей. Кроме того, модель довольно быстро обучается, что позволит в дальнейшем только улучшать её.

##  Сохранение модели

Код ниже позволяет сохранить дообученную модель и соответствующий ей токенайзер на диск.

Так как код выполнялся на GPU в Google Colab, модель сначала сохраняется в локальную директорию сессии, а затем копируется на Google Drive.

In [None]:
output_dir = './model_save/'

# Create the output directory if needed; exist_ok makes the call a no-op when
# the directory is already there, without a separate existence check.
os.makedirs(output_dir, exist_ok=True)

print("Saving model to %s" % output_dir)

# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
# If the model is wrapped in an object exposing `.module`, save the inner model.
model_to_save = model.module if hasattr(model, 'module') else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Good practice: save your training arguments together with the trained model
# torch.save(args, os.path.join(output_dir, 'training_args.bin'))


Saving model to ./model_save/


('./model_save/tokenizer_config.json',
 './model_save/special_tokens_map.json',
 './model_save/vocab.txt',
 './model_save/added_tokens.json')

Посмотрим размеры файлов.

In [None]:
# List the saved files with sizes in kilobytes.
!ls -l --block-size=K ./model_save/

total 115120K
-rw-r--r-- 1 root root      1K Sep  2 14:50 config.json
-rw-r--r-- 1 root root 114052K Sep  2 14:50 model.safetensors
-rw-r--r-- 1 root root      1K Sep  2 14:50 special_tokens_map.json
-rw-r--r-- 1 root root      2K Sep  2 14:50 tokenizer_config.json
-rw-r--r-- 1 root root   1056K Sep  2 14:50 vocab.txt


In [None]:
# Size of the weights file in megabytes.
!ls -l --block-size=M ./model_save/model.safetensors

-rw-r--r-- 1 root root 112M Sep  2 14:50 ./model_save/model.safetensors


Файл весит всего 112 мб

Код ниже позволяет сохранить модель из колаба на Google Drive.

In [None]:
# Copy the model files to a directory in your Google Drive.
!cp -r ./model_save/ "./drive/bert_model_finetuned"

cp: cannot create directory './drive/myfile_andreykasha': Operation not supported


Код ниже позволяет загрузить сохраненную модель.

In [None]:
# Load the fine-tuned model and its tokenizer back from the save directory.
model = BertForSequenceClassification.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Move the model to the GPU when one is available, otherwise keep it on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(83828, 312, padding_idx=0)
      (position_embeddings): Embedding(2048, 312)
      (token_type_embeddings): Embedding(2, 312)
      (LayerNorm): LayerNorm((312,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-2): 3 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=312, out_features=312, bias=True)
              (key): Linear(in_features=312, out_features=312, bias=True)
              (value): Linear(in_features=312, out_features=312, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=312, out_features=312, bias=True)
              (LayerNorm): LayerNorm((312,), eps=1e-