In [2]:
%%capture
!pip install transformers

In [3]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
import transformers

import pandas as pd
import numpy as np

from torch.utils.data import Dataset
from tqdm import trange
import random
from tabulate import tabulate

In [8]:
# Read the dataset from the mounted Drive path and give the rows a fresh 0..n-1 index.
df = pd.read_csv('/content/drive/MyDrive/data2.csv').reset_index(drop=True)

In [6]:
# Tokenizer of the cased RuBERT checkpoint; lower-casing is switched off explicitly.
tokenizer = BertTokenizer.from_pretrained(
    "DeepPavlov/rubert-base-cased",
    do_lower_case = False,
)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

In [9]:
# Model input (raw text) and target (class label) columns.
text, labels = df['text'], df['text_type']

In [10]:
# Module-level accumulators kept for cells that expect these names to exist.
token_id = []
attention_masks = []

def preprocessing(input_text, tokenizer, max_length = 32):
    '''
    Encodes a single text with `tokenizer.encode_plus`, padding/truncating it to a fixed length.

    Parameters:
      - input_text: the text to encode.
      - tokenizer: object exposing `encode_plus` (e.g. a transformers tokenizer).
      - max_length: fixed sequence length after padding/truncation (default 32).

    Returns <class transformers.tokenization_utils_base.BatchEncoding> with the following fields:
      - input_ids: list of token ids
      - token_type_ids: list of token type ids
      - attention_mask: list of indices (0,1) specifying which tokens should considered by the model (return_attention_mask = True).
    '''
    return tokenizer.encode_plus(
                          input_text,
                          add_special_tokens = True,
                          max_length = max_length,
                          # `pad_to_max_length = True` is deprecated; this is its replacement.
                          padding = 'max_length',
                          truncation = True,
                          return_attention_mask = True,
                          return_tensors = 'pt',
                    )

In [None]:
# Encode every sample once. The encodings live in a cell-local list, so re-running
# this cell rebuilds the tensors from scratch instead of appending to (or calling
# .append on) results left over from a previous run.
encodings = [preprocessing(sample, tokenizer) for sample in text]

# Each encoding holds tensors with a leading batch dimension of 1 (return_tensors = 'pt');
# concatenating along dim 0 stacks them into one row per sample.
token_id = torch.cat([enc['input_ids'] for enc in encodings], dim = 0)
attention_masks = torch.cat([enc['attention_mask'] for enc in encodings], dim = 0)

# Build the label tensor straight from the dataframe column rather than from `labels`
# itself, so a second execution does not wrap an existing tensor.
labels = torch.tensor(df['text_type'].to_numpy())

In [7]:
def print_rand_sentence_encoding():
  '''
  Displays tokens, token IDs and attention mask of a random text sample.

  Reads the module-level `token_id`, `attention_masks` and `tokenizer`, and
  prints a three-column table; returns nothing.
  '''
  index = random.randrange(len(token_id))
  ids = token_id[index].tolist()
  mask = attention_masks[index].tolist()
  # Map ids back to tokens one-to-one. Decoding to a string and tokenizing it
  # again is not guaranteed to yield exactly one token per id, which would
  # misalign the table columns.
  tokens = tokenizer.convert_ids_to_tokens(ids)

  rows = list(zip(tokens, ids, mask))
  print(tabulate(rows,
                 headers = ["Tokens", "Token IDs", "Attention Mask"],
                 tablefmt = "fancy_grid"))

print_rand_sentence_encoding()

╒═════════════╤═════════════╤══════════════════╕
│ Tokens      │   Token IDs │   Attention Mask │
╞═════════════╪═════════════╪══════════════════╡
│ [CLS]       │         101 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ Ност        │       88874 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ ##альг      │       46672 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ ##ия        │        1577 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ по          │        1516 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ СССР        │        5576 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ вызывает    │       21248 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ у           │         875 │                1 │
├─────────────┼─────────────┼──────────────────┤
│ гражданских │       23674 │                1 │
├─────────────┼─────

In [8]:
val_ratio = 0.2

# Indices of the train and validation splits stratified by labels.
# `stratify` keeps the class proportions the same in both splits, and
# `random_state` fixes the shuffle so every run produces the same split.
train_idx, val_idx = train_test_split(
    np.arange(len(labels)),
    test_size = val_ratio,
    stratify = np.asarray(labels),
    random_state = 42,
    )

# Train and validation sets: (token ids, attention mask, label) per sample
train_set = TensorDataset(token_id[train_idx],
                          attention_masks[train_idx],
                          labels[train_idx])

val_set = TensorDataset(token_id[val_idx],
                        attention_masks[val_idx],
                        labels[val_idx])

In [9]:
# Inspect the row indices that were assigned to the training split.
train_idx

array([33502, 29184,  2759, ..., 23700, 24254,  3758])

In [10]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [11]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [12]:
def bert_experiment(batch_size, lr, epochs):
    '''
    Fine-tunes "DeepPavlov/rubert-base-cased" as a 2-class sequence classifier on the
    module-level `train_set` and evaluates it on `val_set` after every epoch.

    Parameters:
      - batch_size: samples per batch for both the training and validation loaders.
      - lr: learning rate passed to AdamW.
      - epochs: number of passes over `train_set`.

    Returns a tuple (mismatched_indices, conf_matrix, model):
      - mismatched_indices: result of np.where over the positions (in `val_set` order)
        where the last epoch's predictions differ from the labels.
      - conf_matrix: confusion matrix of the last epoch, or None when epochs == 0.
      - model: the trained BertForSequenceClassification instance.

    Side effects: prints the train loss, validation accuracy and classification report
    per epoch, and the mismatched indices at the end.
    '''
    # Recommended batch size: 16, 32.
    # Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5.
    # Recommended number of epochs: 2, 3, 4.
    # See: https://arxiv.org/pdf/1810.04805.pdf

    train_dataloader = DataLoader(
        train_set,
        sampler = RandomSampler(train_set),
        batch_size = batch_size,
    )
    validation_dataloader = DataLoader(
        val_set,
        sampler = SequentialSampler(val_set),
        batch_size = batch_size,
    )

    model = BertForSequenceClassification.from_pretrained(
        "DeepPavlov/rubert-base-cased",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
    )
    # Put the model on the same device the batches are sent to. A hard-coded
    # .cuda() would fail on CPU-only machines even though `device` falls back to CPU.
    model.to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr = lr, eps = 1e-08)

    # Defined up front so the function still returns cleanly when epochs == 0.
    val_labels = np.array([], dtype = np.int64)
    val_preds = np.array([], dtype = np.int64)
    conf_matrix = None

    for _ in trange(epochs, desc = "Epoch"):

        # ========== Training ==========
        model.train()
        tr_loss = 0.0
        nb_tr_steps = 0

        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            optimizer.zero_grad()
            # Forward pass; passing labels makes the model return the loss
            train_output = model(b_input_ids,
                                 token_type_ids = None,
                                 attention_mask = b_input_mask,
                                 labels = b_labels)
            # Backward pass
            train_output.loss.backward()
            optimizer.step()
            # Update tracking variables
            tr_loss += train_output.loss.item()
            nb_tr_steps += 1

        # ========== Validation ==========
        model.eval()
        epoch_labels = []
        epoch_preds = []

        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            with torch.no_grad():
                # Forward pass without labels: only the logits are needed
                eval_output = model(b_input_ids,
                                    token_type_ids = None,
                                    attention_mask = b_input_mask)
            # Predicted class = index of the largest logit in each row
            epoch_preds.append(eval_output.logits.argmax(dim = 1).cpu().numpy())
            epoch_labels.append(b_labels.cpu().numpy().flatten())

        # Concatenate once per epoch; the arrays keep their integer dtype, so the
        # report shows classes as 0/1 rather than 0.0/1.0.
        val_preds = np.concatenate(epoch_preds)
        val_labels = np.concatenate(epoch_labels)

        conf_matrix = confusion_matrix(val_labels, val_preds)

        val_accuracy = accuracy_score(val_labels, val_preds)
        print('\n\t - Train loss: {:.4f}'.format(tr_loss / nb_tr_steps))
        print('\n\t - Validation accuracy (epoch): {:.4f}'.format(val_accuracy))
        print(classification_report(val_labels, val_preds))

    # Positions of the validation samples the final model got wrong
    mismatched_indices = np.where(val_preds != val_labels)
    print(mismatched_indices)
    return mismatched_indices, conf_matrix, model

In [16]:
# Single run: batch size 16, learning rate 5e-5, two epochs.
mismatched_indices, conf_matrix, model_haha = bert_experiment(
    batch_size = 16,
    lr = 5e-5,
    epochs = 2,
)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n


	 - Train loss: 0.1038

	 - Validation accuracy (epoch): 0.9766
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98      4159
         1.0       0.99      0.97      0.98      4087

    accuracy                           0.98      8246
   macro avg       0.98      0.98      0.98      8246
weighted avg       0.98      0.98      0.98      8246



Epoch:  50%|█████     | 1/2 [07:28<07:28, 448.57s/it]


KeyboardInterrupt: ignored

In [17]:
import pickle

filename = 'haha-model.sav'
# The context manager flushes and closes the file even if pickling raises;
# a bare open() inside the call leaves the handle open.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
with open(filename, 'wb') as model_file:
    pickle.dump(model_haha, model_file)

In [19]:
# Persist model_haha with save_pretrained into the given directory.
# NOTE(review): this writes into the Drive root; a dedicated sub-folder may be cleaner.
model_haha.save_pretrained('/content/drive/MyDrive/')

In [4]:
# Load a sequence-classification model from the files stored in this directory.
m = BertForSequenceClassification.from_pretrained('/content/drive/MyDrive/')

In [14]:
sample = 'Когда черепашка вырастает она становится черепавлом'
encoding_dict = preprocessing(sample, tokenizer)

# Inference only: switch off dropout and skip building the autograd graph.
m.eval()
with torch.no_grad():
    eval_output = m(encoding_dict['input_ids'], token_type_ids = None, attention_mask = encoding_dict['attention_mask'])

# Despite its name, `logits` holds the argmax class index for each input row.
logits = np.argmax(eval_output.logits.cpu().numpy(), axis = 1).flatten()
logits[0]

1