<a href="https://colab.research.google.com/github/ThrallPraudmur/Transformers/blob/main/BERT'_s_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from collections import Counter
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle

# Load the labelled comments and shuffle the rows.
# The shuffle is seeded: the split below is seeded too, but it picks rows by
# position, so an unseeded shuffle would still yield different splits each run.
comment_data = pd.read_excel('values_df.xlsx')
comment_data = shuffle(comment_data, random_state = 42)

# Data preparation: map the sentiment labels to integer class ids.
encoder = LabelEncoder()
comment_data['class'] = encoder.fit_transform(comment_data['sentiment'])

X = comment_data['text']
y = comment_data['class']
# 60 % train, then the remaining 40 % is halved into validation and test;
# both splits are stratified by class.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42, stratify = y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size = 0.5, random_state = 42, stratify = y_test)

# Compute class frequencies on the training split only.
class_counts = Counter(y_train)
total_samples = len(y_train)

# Inverse-frequency weights: total / (count_of_class * number_of_classes),
# ordered by class id so the tensor can be indexed by an encoded label.
class_weights = torch.tensor([total_samples / (class_counts[i] * len(class_counts)) for i in range(len(class_counts))], dtype = torch.float)

In [None]:
# Inspect how a sample comment is encoded (input ids, token type ids, attention mask).
# The tokenizer is loaded in this cell so that it runs on a fresh kernel
# (Restart & Run All) instead of relying on a variable created further down.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
tokenizer('бк с гд. потребности в кредите.! лизинг нет потребности.')

{'input_ids': [101, 843, 862, 869, 847, 850, 132, 37758, 845, 15123, 842, 132, 106, 61348, 8953, 37758, 132, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [None]:
# Seed torch so the classifier-head initialisation, dropout and the order of
# shuffled training batches are repeatable between runs.
torch.manual_seed(42)

tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')
# Size the classification head from the fitted label encoder instead of a
# hard-coded class count, so it always agrees with the data and class_weights.
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased', num_labels = len(encoder.classes_))

# Tokenize the comments (padded to the longest comment of each split).
train_encodings = tokenizer(list(X_train), truncation = True, padding = True)
test_encodings = tokenizer(list(X_test), truncation = True, padding = True)

# Build PyTorch tensors for the model inputs.
# Labels are explicitly int64 because CrossEntropyLoss requires long targets.
# The fourth training tensor (per-sample class weight) is kept so the dataset
# keeps its shape, but the loop ignores it: the weighted loss already applies
# the same weights per class.
train_dataset = TensorDataset(torch.tensor(train_encodings['input_ids']),
                              torch.tensor(train_encodings['attention_mask']),
                              torch.tensor(y_train.values, dtype = torch.long),
                              class_weights[y_train.values])

test_dataset = TensorDataset(torch.tensor(test_encodings['input_ids']),
                             torch.tensor(test_encodings['attention_mask']),
                             torch.tensor(y_test.values, dtype = torch.long))

# Data loaders for the training and test sets.
train_loader = DataLoader(train_dataset, batch_size = 16, shuffle = True)
test_loader = DataLoader(test_dataset, batch_size = 16, shuffle = False)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Class-weighted cross-entropy; the weight tensor must be on the same device as the logits.
loss_fn = torch.nn.CrossEntropyLoss(weight = class_weights.to(device))
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-5)

# Training loop.
for epoch in range(5):
  model.train()
  for batch in train_loader:
    input_ids, attention_mask, labels, _ = (item.to(device) for item in batch)
    optimizer.zero_grad()
    # Labels are deliberately not passed to the model: its built-in loss is
    # unweighted, so the loss is computed here with the class-weighted loss_fn.
    outputs = model(input_ids, attention_mask = attention_mask)
    loss = loss_fn(outputs.logits, labels)
    loss.backward()
    optimizer.step()

# Evaluation on the held-out test set.
model.eval()
predictions = []
with torch.no_grad():
  for batch in test_loader:
    input_ids, attention_mask, _ = (item.to(device) for item in batch)
    outputs = model(input_ids, attention_mask = attention_mask)
    # The predicted class is the index of the largest logit.
    predicted_labels = outputs.logits.argmax(dim = 1)
    predictions.extend(predicted_labels.cpu().numpy())

accuracy = accuracy_score(y_test, predictions)
accuracy

In [None]:
# Decode the integer class ids back to sentiment names.
# The decoded values get their own names so the encoded `y_test` and
# `predictions` stay intact; this cell (and the training cell) can then be
# re-run without failing on already-decoded string labels.
true_label_names = encoder.inverse_transform(y_test)
predicted_label_names = encoder.inverse_transform(predictions)

# One row per test comment: the text, its true label and the predicted label.
results_df = pd.DataFrame({'Comment': X_test, 'True_Label': true_label_names, 'Predicted_Label': predicted_label_names})
print(classification_report(results_df['True_Label'], results_df['Predicted_Label']))

              precision    recall  f1-score   support

    negative       0.88      0.88      0.88        32
     neutral       0.90      0.50      0.64        18
    opponent       0.89      0.53      0.67        15
    positive       0.71      0.95      0.81        41

    accuracy                           0.79       106
   macro avg       0.84      0.71      0.75       106
weighted avg       0.82      0.79      0.78       106



In [None]:
# Save the fine-tuned model and the tokenizer into separate local directories.
model.save_pretrained(save_directory = 'bert_model')
# Kept as the final expression so the notebook shows the value it returns.
tokenizer.save_pretrained(save_directory = 'bert_tokenizer')

('bert_tokenizer/tokenizer_config.json',
 'bert_tokenizer/special_tokens_map.json',
 'bert_tokenizer/vocab.txt',
 'bert_tokenizer/added_tokens.json')

In [None]:
# Check whether the string 'польз' is present as an entry in the tokenizer's vocabulary.
'польз' in tokenizer.vocab

True

In [None]:
# Lift the column-width limit so long comments are displayed in full rather than truncated.
pd.set_option('display.max_colwidth', None)
# Final expression: render the per-comment results table.
results_df