<a href="https://colab.research.google.com/github/PolinaKudryavtseva/NNmethods/blob/main/hw4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
!pip install datasets

In [3]:
import transformers
import torch
from datasets import load_dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModel, BertForSequenceClassification, AutoModelForSequenceClassification
from torch import nn
from torch.nn import CrossEntropyLoss

## Данные

In [4]:
imdb_dataset = load_dataset('imdb')

Downloading builder script:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text (download: 80.23 MiB, generated: 127.02 MiB, post-processed: Unknown size, total: 207.25 MiB) to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

Данных очень много, возьмем не все. И добавим валидационную выборку.

In [6]:
train_texts = imdb_dataset["train"]["text"][:1500] + imdb_dataset["train"]["text"][-1500:]
train_labels = imdb_dataset["train"]["label"][:1500] + imdb_dataset["train"]["label"][-1500:]
test_texts = imdb_dataset["test"]["text"][:500] + imdb_dataset["test"]["text"][-500:]
test_labels = imdb_dataset["test"]["label"][:500] + imdb_dataset["test"]["label"][-500:]
val_texts = imdb_dataset["test"]["text"][500:1000] + imdb_dataset["test"]["text"][-1000:-500]
val_labels = imdb_dataset["test"]["label"][500:1000] + imdb_dataset["test"]["label"][-1000:-500]

In [7]:
del imdb_dataset

## Токенизация

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased', use_fast=True)

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/972k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.87M [00:00<?, ?B/s]

In [9]:
train_tokenized = tokenizer(train_texts, truncation=True, 
                            padding='max_length', max_length=256
                            )
val_tokenized = tokenizer(val_texts, truncation=True, 
                            padding='max_length', max_length=256)
test_tokenized = tokenizer(test_texts, truncation=True, 
                            padding='max_length', max_length=256)

In [10]:
class IMDb_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDb_Dataset(train_tokenized, train_labels)
test_dataset = IMDb_Dataset(test_tokenized, test_labels)
val_dataset = IMDb_Dataset(val_tokenized, val_labels)

## Для всех моделей

In [11]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [12]:
training_args = TrainingArguments(
    output_dir='./results',
    do_train=True,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_eval=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

In [13]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

## SentimentClassifier

In [None]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super().__init__()
    self.bert = AutoModel.from_pretrained("bert-base-multilingual-cased")
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    self.n_classes = n_classes
  
  def forward(self,        
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None
    ):

    last_hidden_state, pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask, return_dict=False)
    output = self.drop(pooled_output)
    logits = self.out(output)
    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))
    output = (logits,)

    return ((loss,) + output) if loss is not None else output

In [None]:
model = SentimentClassifier(2)
model = model.to(device)

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5067,0.470677,0.77,0.773622,0.761628,0.786
2,0.4511,0.581372,0.803,0.820091,0.754622,0.898


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-375
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.49630421113967893, metrics={'train_runtime': 702.1313, 'train_samples_per_second': 8.545, 'train_steps_per_second': 1.068, 'total_flos': 0.0, 'train_loss': 0.49630421113967893, 'epoch': 2.0})

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'eval_accuracy': 0.803,
 'eval_f1': 0.8200913242009131,
 'eval_loss': 0.5813719630241394,
 'eval_precision': 0.7546218487394958,
 'eval_recall': 0.898,
 'eval_runtime': 31.0177,
 'eval_samples_per_second': 32.24,
 'eval_steps_per_second': 4.03}

In [None]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'test_accuracy': 0.812,
 'test_f1': 0.8259259259259258,
 'test_loss': 0.5288817286491394,
 'test_precision': 0.7689655172413793,
 'test_recall': 0.892,
 'test_runtime': 30.5988,
 'test_samples_per_second': 32.681,
 'test_steps_per_second': 4.085}

Итак, SentimentClassifier test accuracy - 0,812. Посмотрим, что будет дальше.

## SentimentClassifier с CLS

In [None]:
class SentimentClassifier_cls(nn.Module):

  def __init__(self, n_classes):
    super().__init__()
    self.bert = AutoModel.from_pretrained("bert-base-multilingual-cased")
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
    self.out = nn.Linear(self.bert.config.hidden_size*2, n_classes)
    self.n_classes = n_classes
  
  def forward(self,        
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None
    ):

    last_hidden_state, pooled_output = self.bert(
                                  input_ids=input_ids,
                                  attention_mask=attention_mask,
                                  return_dict=False)
    cls = last_hidden_state[:,0,:] 
    stacked_layers = torch.hstack([cls, pooled_output])
    logits = self.out(stacked_layers)
    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))
    output = (logits,)
    return ((loss,) + output) if loss is not None else output

In [None]:
model = SentimentClassifier_cls(2) 
model = model.to(device)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "abs

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4082,0.4616,0.771,0.742407,0.848329,0.66
2,0.42,0.581159,0.819,0.825794,0.795918,0.858


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-375
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.4837514788309733, metrics={'train_runtime': 705.8374, 'train_samples_per_second': 8.501, 'train_steps_per_second': 1.063, 'total_flos': 0.0, 'train_loss': 0.4837514788309733, 'epoch': 2.0})

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'eval_accuracy': 0.819,
 'eval_f1': 0.8257940327237729,
 'eval_loss': 0.5811593532562256,
 'eval_precision': 0.7959183673469388,
 'eval_recall': 0.858,
 'eval_runtime': 31.0526,
 'eval_samples_per_second': 32.203,
 'eval_steps_per_second': 4.025}

In [None]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'test_accuracy': 0.837,
 'test_f1': 0.8409756097560975,
 'test_loss': 0.47776055335998535,
 'test_precision': 0.820952380952381,
 'test_recall': 0.862,
 'test_runtime': 30.9547,
 'test_samples_per_second': 32.305,
 'test_steps_per_second': 4.038}

У этой модели - SentimentClassifier_cls - test accuracy уже больше, чем у предыдущей (у той была 0,816) - 0,837.

## BertForSequenceClassification

In [None]:
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased")
model = model.to(device)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "abs

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6271,0.516086,0.743,0.775546,0.688372,0.888
2,0.4037,0.562927,0.79,0.794521,0.777778,0.812


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-375
Configuration saved in ./results/checkpoint-375/config.json
Model weights saved in ./results/checkpoint-375/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-750
Configuration saved in ./results/checkpoint-750/config.json
Model weights saved in ./results/checkpoint-750/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.5351792074839274, metrics={'train_runtime': 698.7648, 'train_samples_per_second': 8.587, 'train_steps_per_second': 1.073, 'total_flos': 789333166080000.0, 'train_loss': 0.5351792074839274, 'epoch': 2.0})

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'eval_accuracy': 0.79,
 'eval_f1': 0.7945205479452055,
 'eval_loss': 0.56292724609375,
 'eval_precision': 0.7777777777777778,
 'eval_recall': 0.812,
 'eval_runtime': 31.1317,
 'eval_samples_per_second': 32.122,
 'eval_steps_per_second': 4.015}

In [None]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'test_accuracy': 0.815,
 'test_f1': 0.8173741362290227,
 'test_loss': 0.49718427658081055,
 'test_precision': 0.8070175438596491,
 'test_recall': 0.828,
 'test_runtime': 31.1018,
 'test_samples_per_second': 32.152,
 'test_steps_per_second': 4.019}

У этой модели test accuracy ниже, чем у двух предыдущих - всего 0,815 против 0,817 и 0,837 :(

## SentimentClassifier с CLS токенами для нескольких слоев

In [None]:
class SentimentClassifier_CLSpooled(nn.Module):

  def __init__(self, n_classes):
    super().__init__()
    self.bert = AutoModel.from_pretrained("bert-base-multilingual-cased")
    self.drop = nn.Dropout(p=0.3)
    self.linear = nn.Linear(self.bert.config.hidden_size, self.bert.config.hidden_size)
    self.out = nn.Linear(self.bert.config.hidden_size*2, n_classes)
    self.n_classes = n_classes
  
  def forward(self,        
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None
    ):

    last_hidden_state, pooled_output, hidden_states = self.bert(
                                                  input_ids=input_ids,
                                                  attention_mask=attention_mask,
                                                  output_hidden_states=True,
                                                  return_dict=False)
    hidden_states = torch.stack(hidden_states)
    cls = torch.mean(hidden_states[:, :, 0], 0)
    stacked_layers = torch.hstack([cls, pooled_output])
    logits = self.out(stacked_layers)
    loss = None
    if labels is not None:
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.n_classes), labels.view(-1))
    output = (logits,)

    return ((loss,) + output) if loss is not None else output

In [None]:
model = SentimentClassifier_CLSpooled(2)

loading configuration file https://huggingface.co/bert-base-multilingual-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/6c4a5d81a58c9791cdf76a09bce1b5abfb9cf958aebada51200f4515403e5d08.0fe59f3f4f1335dadeb4bce8b8146199d9083512b50d07323c1c319f96df450c
Model config BertConfig {
  "_name_or_path": "bert-base-multilingual-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "abs

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics = compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 3000
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 750


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.568,0.681649,0.607,0.716245,0.560452,0.992
2,0.5127,0.55893,0.824,0.829457,0.804511,0.856


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-375
Trainer.model is not a `PreTrainedModel`, only saving its state dict.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-750
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=750, training_loss=0.49083949693044027, metrics={'train_runtime': 705.3375, 'train_samples_per_second': 8.507, 'train_steps_per_second': 1.063, 'total_flos': 0.0, 'train_loss': 0.49083949693044027, 'epoch': 2.0})

In [None]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'eval_accuracy': 0.824,
 'eval_f1': 0.8294573643410852,
 'eval_loss': 0.5589303970336914,
 'eval_precision': 0.8045112781954887,
 'eval_recall': 0.856,
 'eval_runtime': 31.5686,
 'eval_samples_per_second': 31.677,
 'eval_steps_per_second': 3.96}

In [None]:
trainer.evaluate(eval_dataset=test_dataset, metric_key_prefix="test")

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 8


{'epoch': 2.0,
 'test_accuracy': 0.834,
 'test_f1': 0.837573385518591,
 'test_loss': 0.5178730487823486,
 'test_precision': 0.8199233716475096,
 'test_recall': 0.856,
 'test_runtime': 31.2511,
 'test_samples_per_second': 31.999,
 'test_steps_per_second': 4.0}

Эта модель стала второй по test accuracy, с результатом чуть меньшим, чем SentimentClassifier с CLS - 0,834 против 0,837.

Итак, лучшая модель - SentimentClassifier с CLS!