# Введение в обработку естественного языка
## Урок 13. Модель BERT и GPT
### Задача
Взять датасет  
https://huggingface.co/datasets/merionum/ru_paraphraser  
решить задачу парафраза

In [1]:
!pip install datasets
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.3.2-py3-none-any.whl (362 kB)
[K     |████████████████████████████████| 362 kB 4.3 MB/s 
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 42.8 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 10.8 MB/s 
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.5.0-py3-none-any.whl (140 kB)
[K     |████████████████████████████████| 140 kB 14.5 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 63.7 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0

In [2]:
import numpy as np
import torch

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction

In [3]:
dataset = load_dataset('merionum/ru_paraphraser')
label_list = list(set(dataset['train']['class']))

Using custom data configuration merionum--ru_paraphraser-1a7592429d7be082


Downloading and preparing dataset json/merionum--ru_paraphraser to /root/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-1a7592429d7be082/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.17M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/605k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/merionum___json/merionum--ru_paraphraser-1a7592429d7be082/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
def one_hot_encoding(example):
    example_class = example['class']
    oh1 = example_class =='-1'
    oh2 = example_class =='0'
    oh3 = example_class =='1'
    return {'class_-1': oh1, 'class_0': oh2, 'class_1': oh3}

In [5]:
ohe_dataset = dataset.map(one_hot_encoding)



  0%|          | 0/7227 [00:00<?, ?ex/s]

  0%|          | 0/1924 [00:00<?, ?ex/s]

In [6]:
labels = ['class_-1', 'class_0', 'class_1']
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
id2label

{0: 'class_-1', 1: 'class_0', 2: 'class_1'}

In [7]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
    # take a batch of texts
    text_1 = examples["text_1"]
    text_2 = examples["text_2"]
    # encode them
    encoding = tokenizer(text_1, text_2, padding="max_length", truncation=True, max_length=128)
    # add labels
    labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
    # create numpy array of shape (batch_size, num_labels)
    labels_matrix = np.zeros((len(text_1), len(labels)))
    # fill numpy array
    for idx, label in enumerate(labels):
        labels_matrix[:, idx] = labels_batch[label]
        
    encoding["labels"] = labels_matrix.tolist()
    
    return encoding

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [8]:
encoded_dataset = ohe_dataset.map(preprocess_data, batched=True, remove_columns=ohe_dataset['train'].column_names)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
encoded_dataset.set_format("torch")

In [10]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [11]:
def predict(idx):
    print(f"Text_1: {dataset['test'][idx]['text_1']}")
    print(f"Text_2: {dataset['test'][idx]['text_2']}")
    print(f"Class: {dataset['test'][idx]['class']}")
    print(f"-------------------------------------------")
    encoding = tokenizer(dataset['test'][idx]['text_1'], dataset['test'][idx]['text_2'], return_tensors="pt")
    encoding = {k: v.to(model.device) for k,v in encoding.items()}
    outputs = model(**encoding)
    probs = softmax(outputs.logits.squeeze().cpu()).detach().numpy()
    print(f"Probs: {probs}")
    print(f"Label: {model.config.id2label[probs.argmax(axis=-1)]}")

In [12]:
batch_size = 8
metric_name = "f1"

In [13]:
small_train_dataset = encoded_dataset["train"].shuffle(seed=42).select(range(500))
small_eval_dataset = encoded_dataset["test"].shuffle(seed=42).select(range(100))

In [14]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    #sigmoid = torch.nn.Sigmoid()
    #probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    #y_pred = np.zeros(probs.shape)
    #y_pred[np.where(probs >= threshold)] = 1
    
    # first, apply softmax on predictions which are of shape (batch_size, num_labels)
    softmax = torch.nn.Softmax(dim=-1)
    probs = softmax(torch.Tensor(predictions))
    #print(f'probs:\n{probs}')
    #print(f'probs.argmax:\n{probs.argmax(axis=-1)}')

    # next, use argmax to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    for i in range(len(y_pred)):
        y_pred[i, probs[i].argmax(axis=-1)] = 1
    #print(f'y_pred:\n{y_pred}')
    
    # finally, compute metrics
    y_true = labels
    #print(f'y_true:\n{y_true}')

    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [15]:
args = TrainingArguments(
    f"bert-finetuned",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)



trainer = Trainer(
    model,
    args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [16]:
trainer.train()

***** Running training *****
  Num examples = 500
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 315


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.574906,0.52,0.64,0.52
2,No log,0.579715,0.49,0.6175,0.49
3,No log,0.592837,0.51,0.6325,0.51
4,No log,0.625395,0.46,0.595,0.46
5,No log,0.638772,0.5,0.625,0.5


***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-63
Configuration saved in bert-finetuned/checkpoint-63/config.json
Model weights saved in bert-finetuned/checkpoint-63/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-63/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-63/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-126
Configuration saved in bert-finetuned/checkpoint-126/config.json
Model weights saved in bert-finetuned/checkpoint-126/pytorch_model.bin
tokenizer config file saved in bert-finetuned/checkpoint-126/tokenizer_config.json
Special tokens file saved in bert-finetuned/checkpoint-126/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 100
  Batch size = 8
Saving model checkpoint to bert-finetuned/checkpoint-189
Configuration saved in 

TrainOutput(global_step=315, training_loss=0.4869333902994792, metrics={'train_runtime': 3652.9714, 'train_samples_per_second': 0.684, 'train_steps_per_second': 0.086, 'total_flos': 164445886080000.0, 'train_loss': 0.4869333902994792, 'epoch': 5.0})

In [17]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 100
  Batch size = 8


{'epoch': 5.0,
 'eval_accuracy': 0.52,
 'eval_f1': 0.52,
 'eval_loss': 0.5749059319496155,
 'eval_roc_auc': 0.64,
 'eval_runtime': 43.4973,
 'eval_samples_per_second': 2.299,
 'eval_steps_per_second': 0.299}

In [19]:
softmax = torch.nn.Softmax(dim=-1)
predict(3)

Text_1: День Победы в Москве обещает выдаться облачным
Text_2: Любляна отпразднует День Победы вместе с Москвой
Class: -1
-------------------------------------------
Probs: [0.53006995 0.34291637 0.12701364]
Label: class_-1


In [20]:
predict(9)

Text_1: Суд оправдал Васильеву в хищении акций на два миллиарда рублей
Text_2: Суд оправдал Васильеву в хищении акций на 2 млрд рублей
Class: 1
-------------------------------------------
Probs: [0.14234819 0.6098932  0.24775857]
Label: class_0


In [22]:
predict(13)

Text_1: Троих подростков-убийц поймали в Подмосковье
Text_2: В Подмосковье трое подростков признались в серии убийств 
Class: 0
-------------------------------------------
Probs: [0.43602934 0.3976043  0.16636637]
Label: class_-1
