## Классификация текстов с использованием предобученных языковых моделей.

В данном задании вам предстоит обратиться к задаче классификации текстов и решить ее с использованием предобученной модели BERT.

In [None]:
import json
# do not change the code in the block below
# __________start of block__________
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

%matplotlib inline
# __________end of block__________

In [None]:
# Run on the GPU when PyTorch reports one as available; otherwise use the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)

Обратимся к набору данных SST-2. Holdout часть данных (которая понадобится вам для посылки) доступна по ссылке ниже.

In [None]:
# do not change the code in the block below
# __________start of block__________

!wget https://raw.githubusercontent.com/girafe-ai/ml-course/refs/heads/24f_yandex_ml_trainings/homeworks/hw04_bert_and_co/texts_holdout.json
# __________end of block__________

--2024-11-22 18:10:00--  https://raw.githubusercontent.com/girafe-ai/ml-course/refs/heads/24f_yandex_ml_trainings/homeworks/hw04_bert_and_co/texts_holdout.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51581 (50K) [text/plain]
Saving to: ‘texts_holdout.json’


2024-11-22 18:10:00 (6.14 MB/s) - ‘texts_holdout.json’ saved [51581/51581]



In [None]:
# do not change the code in the block below
# __________start of block__________
df = pd.read_csv(
    "https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv",
    delimiter="\t",
    header=None,
)
texts_train = df[0].values[:5000]
y_train = df[1].values[:5000]
texts_test = df[0].values[5000:]
y_test = df[1].values[5000:]
with open("texts_holdout.json") as iofile:
    texts_holdout = json.load(iofile)
# __________end of block__________

Весь остальной код предстоит написать вам.

Для успешной сдачи на максимальный балл необходимо добиться хотя бы __84.5% accuracy на тестовой части выборки__.

In [None]:
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [None]:
class TextClassificationDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its result
            must expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_lenght: Value forwarded to the tokenizer as ``max_length``. The
            misspelled parameter name is kept so existing callers keep working.
    """

    def __init__(self, texts, labels, tokenizer, max_lenght):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_lenght

    def __len__(self):
        """Return the number of texts in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened encoding and the label tensor for item ``idx``."""
        sample_text, sample_label = self.texts[idx], self.labels[idx]
        encoded = self.tokenizer(
            sample_text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # flatten() turns each tokenizer tensor into a 1-D tensor.
        item = {key: encoded[key].flatten() for key in ('input_ids', 'attention_mask')}
        item['label'] = torch.tensor(sample_label)
        return item

In [None]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: Identifier passed to ``BertModel.from_pretrained``.
        num_classes: Number of logits produced by the linear head.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.1)
        # The head consumes the encoder's pooled output, so its input width is
        # taken from the encoder configuration.
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores for the encoded inputs."""
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        regularized = self.dropout(encoder_out.pooler_output)
        return self.fc(regularized)


In [None]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output is scored with cross-entropy against the labels.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, right after
            the optimizer.
        device: Device every batch tensor is moved to before the forward pass.

    The loss of each batch is printed; nothing is returned.
    """
    model.train()
    criterion = nn.CrossEntropyLoss()
    batch_keys = ('input_ids', 'attention_mask', 'label')
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (batch[key].to(device) for key in batch_keys)
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        print(f"loss = {loss}")

In [None]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

    Args:
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments; the index of its largest output along dim 1 is taken as
            the predicted class.
        data_loader: Iterable of batches, each a mapping with ``'input_ids'``,
            ``'attention_mask'`` and ``'label'`` tensors.
        device: Device every batch tensor is moved to before the forward pass.

    Returns:
        A pair ``(accuracy_score(...), classification_report(...))`` computed
        over all collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits = model(input_ids=input_ids, attention_mask=attention_mask)
            batch_preds = torch.max(logits, dim=1).indices
            predicted += batch_preds.cpu().tolist()
            expected += labels.cpu().tolist()
    accuracy = accuracy_score(expected, predicted)
    report = classification_report(expected, predicted)
    return accuracy, report

In [None]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    """Classify a single text and report the probability of class 1.

    Args:
        text: Input passed to ``tokenizer``.
        model: Module called with ``input_ids`` and ``attention_mask`` keyword
            arguments and returning class scores along dim 1.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Forwarded to the tokenizer as ``max_length``.

    Returns:
        ``(pred, proba)``: ``pred`` is the index of the largest score as a Python
        number, ``proba`` is a flattened tensor holding the softmax value of
        class 1.
    """
    model.eval()

    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask)
        positive_proba = logits.softmax(dim=1)[:, 1].flatten()
        predicted_class = torch.max(logits, dim=1).indices
    return predicted_class.item(), positive_proba

In [None]:
# Experiment configuration.

# Model: the checkpoint name is reused for the tokenizer and the encoder.
bert_model_name = "bert-base-uncased"
num_classes = 2

# Data: sequence length handed to the tokenizer and the DataLoader batch size.
max_length = 128
batch_size = 10

# Optimization: number of passes over the training loader and the optimizer step size.
num_epochs = 4
learning_rate = 2e-5

In [None]:
# Tokenizer matching the pretrained checkpoint named in the configuration.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Training split: reshuffled by the loader on every pass.
train_dataset = TextClassificationDataset(texts_train, y_train, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Evaluation split: iterated in its stored order.
val_dataset = TextClassificationDataset(texts_test, y_test, tokenizer, max_length)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Build the classifier, then move its parameters to the selected device.
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Use PyTorch's own AdamW: the AdamW class exported by `transformers` is
# deprecated in favour of it. eps and weight_decay are pinned to the defaults of
# the deprecated class so the optimization hyperparameters stay the same.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)
# The scheduler is stepped once per batch, so the linear decay has to span
# every batch of every epoch.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)



In [None]:
# Alternate one training pass with one evaluation pass on the held-out split.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(
        model=model,
        data_loader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
    )
    accuracy, report = evaluate(model=model, data_loader=val_dataloader, device=device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4
loss = 0.7702503204345703
loss = 0.7119559645652771
loss = 0.6914695501327515
loss = 0.6910136938095093
loss = 0.6939231157302856
loss = 0.6743553876876831
loss = 0.6757327318191528
loss = 0.6681693196296692
loss = 0.698550283908844
loss = 0.6150995492935181
loss = 0.680782675743103
loss = 0.6605938076972961
loss = 0.7454982995986938
loss = 0.6325723528862
loss = 0.5985842943191528
loss = 0.7156761884689331
loss = 0.6459305286407471
loss = 0.6635984182357788
loss = 0.7424151301383972
loss = 0.6209464073181152
loss = 0.6042762994766235
loss = 0.6260980367660522
loss = 0.7092524766921997
loss = 0.6672481894493103
loss = 0.6498962640762329
loss = 0.6428765058517456
loss = 0.5979549288749695
loss = 0.5908058881759644
loss = 0.5785670876502991
loss = 0.597919225692749
loss = 0.582421064376831
loss = 0.6786468029022217
loss = 0.6710689663887024
loss = 0.7185537815093994
loss = 0.5223114490509033
loss = 0.6848912239074707
loss = 0.6043207049369812
loss = 0.5689333081245422
loss = 0.

In [None]:
# Persist the model's state dict (not the module object itself) next to the notebook.
checkpoint_path = "bert_classifier.pth"
torch.save(model.state_dict(), checkpoint_path)

In [None]:
# Score every training text one at a time with the fine-tuned model.
probas_train = []
predict_train = []

for text in texts_train:
    pred, proba = predict_sentiment(text, model, tokenizer, device)
    predict_train.append(pred)
    probas_train.append(proba)

# ROC AUC is a ranking metric, so it is computed from the positive-class
# probabilities; hard 0/1 predictions would reduce the curve to a single
# operating point. `probas_train` itself keeps the per-text tensors because a
# later cell converts them to floats.
print(roc_auc_score(y_train, torch.cat(probas_train).cpu().numpy()))

0.9987977977578448


In [None]:
# Accuracy of the predicted classes against the training labels.
train_accuracy = accuracy_score(y_train, predict_train)
print(train_accuracy)

0.9988


In [None]:
# Score every test text one at a time with the fine-tuned model.
probas_test = []
predict_test = []

for text in texts_test:
    pred, proba = predict_sentiment(text, model, tokenizer, device)
    predict_test.append(pred)
    probas_test.append(proba)

# ROC AUC is a ranking metric, so it is computed from the positive-class
# probabilities; hard 0/1 predictions would reduce the curve to a single
# operating point. `probas_test` itself keeps the per-text tensors because a
# later cell converts them to floats.
print(roc_auc_score(y_test, torch.cat(probas_test).cpu().numpy()))
print(accuracy_score(y_test, predict_test))

0.912293109765578
0.9119791666666667


In [None]:
# Score every holdout text, then split the (class, probability) pairs into
# two parallel lists.
holdout_results = [
    predict_sentiment(text, model, tokenizer, device) for text in texts_holdout
]
predict_hold = [pred for pred, _ in holdout_results]
probas_hold = [proba for _, proba in holdout_results]

In [None]:
def from_tensor_to_train(ab):
    """Return a list with the ``.item()`` value of each element of ``ab`` as a float."""
    return [float(tensor.item()) for tensor in ab]


probas_train = from_tensor_to_train(probas_train)

In [None]:
# Apply the same tensor-to-float conversion to the test and holdout probabilities.
probas_test, probas_hold = map(from_tensor_to_train, (probas_test, probas_hold))

#### Сдача задания в контест
Сохраните в словарь `out_dict` вероятности принадлежности к первому (положительному) классу

In [None]:
# Submission payload: per-split probabilities keyed by split name.
out_dict = dict(
    train=probas_train,
    test=probas_test,
    holdout=probas_hold,
)

Несколько `assert`'ов для проверки вашей посылки:

In [None]:
# Expected number of predictions per split, checked in train/test/holdout order.
expected_sizes = {"train": 5000, "test": 1920, "holdout": 500}

for split_name, expected_size in expected_sizes.items():
    split_probas = out_dict[split_name]
    assert isinstance(split_probas, list), "Object must be a list of floats"
    assert isinstance(split_probas[0], float), "Object must be a list of floats"
    assert (
        len(split_probas) == expected_size
    ), f"The predicted probas list length does not match the {split_name} set size"

Запустите код ниже для генерации посылки.

In [None]:
# do not change the code in the block below
# __________start of block__________
FILENAME = "submission_dict_hw_text_classification_with_bert.json"

with open(FILENAME, "w") as iofile:
    json.dump(out_dict, iofile)
print(f"File saved to `{FILENAME}`")
# __________end of block__________

File saved to `submission_dict_hw_text_classification_with_bert.json`


На этом задание завершено. Поздравляем!