## Классификация текстов с использованием предобученных языковых моделей.

В данном задании вам предстоит обратиться к задаче классификации текстов и решить ее с использованием предобученной модели BERT.

In [1]:
import json
# do not change the code in the block below
# __________start of block__________
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython.display import clear_output
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

%matplotlib inline
# __________end of block__________

Обратимся к набору данных SST-2. Holdout часть данных (которая понадобится вам для посылки) доступна по ссылке ниже.

In [2]:
# do not change the code in the block below
# __________start of block__________

!wget https://raw.githubusercontent.com/girafe-ai/ml-course/refs/heads/24f_yandex_ml_trainings/homeworks/hw04_bert_and_co/texts_holdout.json
# __________end of block__________

--2024-11-23 07:57:52--  https://raw.githubusercontent.com/girafe-ai/ml-course/refs/heads/24f_yandex_ml_trainings/homeworks/hw04_bert_and_co/texts_holdout.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51581 (50K) [text/plain]
Saving to: ‘texts_holdout.json’


2024-11-23 07:57:52 (851 KB/s) - ‘texts_holdout.json’ saved [51581/51581]



In [3]:
# do not change the code in the block below
# __________start of block__________
df = pd.read_csv(
    "https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv",
    delimiter="\t",
    header=None,
)
texts_train = df[0].values[:5000]
y_train = df[1].values[:5000]
texts_test = df[0].values[5000:]
y_test = df[1].values[5000:]
with open("texts_holdout.json") as iofile:
    texts_holdout = json.load(iofile)
# __________end of block__________

Весь остальной код предстоит написать вам.

Для успешной сдачи на максимальный балл необходимо добиться хотя бы __84.5% accuracy на тестовой части выборки__.

In [43]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
from tqdm import tqdm

In [36]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
            self.texts = texts
            self.labels = labels
            self.tokenizer = tokenizer
            self.max_length = max_length

    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [8]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

In [27]:
def train(model, data_loader, optimizer, scheduler, device, loss_func=nn.CrossEntropyLoss()):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_func(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()

In [10]:
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

In [67]:
def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return preds.item()

def predict_probas(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask) 
        outputs = torch.softmax(outputs, dim=1)
        #print(outputs)
    return outputs[:, 1].item()

In [29]:
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 2
learning_rate = 2e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [37]:
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
ds_train = TextClassificationDataset(texts_train, y_train, tokenizer, max_length)
ds_val = TextClassificationDataset(texts_test, y_test, tokenizer, max_length)
dl_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True)
dl_val = DataLoader(ds_val, batch_size=batch_size)

In [31]:
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(ds_train) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

In [32]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, dl_train, optimizer, scheduler, device)
    accuracy, report = evaluate(model, dl_val, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/2
Validation Accuracy: 0.8990
              precision    recall  f1-score   support

           0       0.91      0.88      0.89       917
           1       0.89      0.92      0.90      1003

    accuracy                           0.90      1920
   macro avg       0.90      0.90      0.90      1920
weighted avg       0.90      0.90      0.90      1920

Epoch 2/2
Validation Accuracy: 0.9089
              precision    recall  f1-score   support

           0       0.91      0.90      0.90       917
           1       0.91      0.92      0.91      1003

    accuracy                           0.91      1920
   macro avg       0.91      0.91      0.91      1920
weighted avg       0.91      0.91      0.91      1920



In [70]:
train_res = []
val_res = []
test_res = []

for text in tqdm(texts_train):
    res = predict_probas(text, model, tokenizer, device, max_length)
    train_res.append(res)

for text in tqdm(texts_test):
    res = predict_probas(text, model, tokenizer, device, max_length)
    val_res.append(res)

for text in tqdm(texts_holdout):
    res = predict_probas(text, model, tokenizer, device, max_length)
    test_res.append(res)

100%|██████████| 5000/5000 [00:58<00:00, 85.02it/s] 
100%|██████████| 1920/1920 [00:26<00:00, 71.60it/s]
100%|██████████| 500/500 [00:05<00:00, 90.41it/s]


In [71]:
print(len(train_res))
print(len(val_res))
print(len(test_res))

5000
1920
500


#### Сдача взадания в контест
Сохраните в словарь `out_dict` вероятности принадлежности к первому (положительному) классу

In [75]:
print(test_res)

[0.05855754017829895, 0.9679608345031738, 0.5833645462989807, 0.0058727082796394825, 0.012927518226206303, 0.24907049536705017, 0.955267608165741, 0.011078527197241783, 0.8589808940887451, 0.9913628101348877, 0.05769398808479309, 0.0042026531882584095, 0.8582097291946411, 0.9718685150146484, 0.01902637630701065, 0.9914076924324036, 0.554171621799469, 0.9869203567504883, 0.060488324612379074, 0.3637818694114685, 0.007221213076263666, 0.01263545174151659, 0.9869203567504883, 0.8103464841842651, 0.9946797490119934, 0.9687501192092896, 0.9424953460693359, 0.9835017323493958, 0.18136875331401825, 0.1997794210910797, 0.6646702885627747, 0.7050139307975769, 0.012012647464871407, 0.9750622510910034, 0.9927587509155273, 0.9914311766624451, 0.006818759720772505, 0.05381226912140846, 0.9462679624557495, 0.9934646487236023, 0.9732042551040649, 0.010739143937826157, 0.782086968421936, 0.12148739397525787, 0.02973640151321888, 0.07334636151790619, 0.8162181973457336, 0.5833645462989807, 0.9899334311

In [72]:
out_dict = {
    'train': train_res,  # list of length 5000 with probas
    'test': val_res,  # list of length 1920 with probas
    'holdout': test_res,  # list of length 500 with probas
}

Несколько `assert`'ов для проверки вашей посылки:

In [76]:
assert isinstance(out_dict["train"], list), "Object must be a list of floats"
assert isinstance(out_dict["train"][0], float), "Object must be a list of floats"
assert (
    len(out_dict["train"]) == 5000
), "The predicted probas list length does not match the train set size"

assert isinstance(out_dict["test"], list), "Object must be a list of floats"
assert isinstance(out_dict["test"][0], float), "Object must be a list of floats"
assert (
    len(out_dict["test"]) == 1920
), "The predicted probas list length does not match the test set size"

assert isinstance(out_dict["holdout"], list), "Object must be a list of floats"
assert isinstance(out_dict["holdout"][0], float), "Object must be a list of floats"
assert (
    len(out_dict["holdout"]) == 500
), "The predicted probas list length does not match the holdout set size"

Запустите код ниже для генерации посылки.

In [81]:
# do not change the code in the block below
# __________start of block__________
FILENAME = "./submission_dict_hw_text_classification_with_bert.json"

with open(FILENAME, "w") as iofile:
    json.dump(out_dict, iofile)
print(f"File saved to `{FILENAME}`")
# __________end of block__________

File saved to `./submission_dict_hw_text_classification_with_bert.json`


На этом задание завершено. Поздравляем!