## Задача «Классификация эмоций в текстовых расшифровках голосовых сообщений»

In [4]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tqdm import tqdm


In [5]:
df = pd.read_csv("C:\\Users\Taya\Desktop\Фразы-эмоции.csv")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1043 entries, 0 to 1042
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Текст фразы  1043 non-null   object
 1   Эмоция       1043 non-null   object
dtypes: object(2)
memory usage: 16.4+ KB


In [7]:
df.head(30)

Unnamed: 0,Текст фразы,Эмоция
0,"Я не могу поверить, что ты снова опоздал на ва...",Недовольство
1,"Врач, к которому меня направили, оказался груб...",Недовольство
2,Как же меня раздражает твоя привычка постоянно...,Недовольство
3,"Мне кажется, что некоторые врачи просто зараба...",Недовольство
4,Мне надоело слушать твои бесконечные жалобы и ...,Недовольство
5,В нашей стране медицина находится в ужасном со...,Недовольство
6,"Ты думаешь, что можешь просто так взять и уйти...",Недовольство
7,"Врач, к которому меня направили, оказался груб...",Недовольство
8,"Почему ты не можешь понять, что твои слова и д...",Недовольство
9,"Я потратил(а) несколько часов, ожидая приёма у...",Недовольство


In [8]:
df.Эмоция.unique()

array(['Недовольство', 'Злость', 'Зависть', 'Сочувствие', 'Радость',
       'Печаль', 'Интерес', 'Нейтрально'], dtype=object)

In [9]:
# Токенизатор для RuBERT
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

# Преобразуем метки в числовой формат
label_encoder = LabelEncoder()
df["Эмоция_код"] = label_encoder.fit_transform(df["Эмоция"])

# Разделение на обучающую и тестовую выборки
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Текст фразы"].values,
    df["Эмоция_код"].values,
    test_size=0.2,
    random_state=42
)

# Токенизация
def tokenize_texts(texts, tokenizer, max_len=128):
    encodings = tokenizer(
        list(texts),
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    return encodings["input_ids"], encodings["attention_mask"]

train_input_ids, train_attention_masks = tokenize_texts(train_texts, tokenizer)
val_input_ids, val_attention_masks = tokenize_texts(val_texts, tokenizer)




In [10]:
class EmotionDataset(Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_masks[idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Создание Dataset и DataLoader
train_dataset = EmotionDataset(train_input_ids, train_attention_masks, train_labels)
val_dataset = EmotionDataset(val_input_ids, val_attention_masks, val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


 ## Блок: Определение модели

In [11]:
class EmotionClassifier(nn.Module):
    def __init__(self, num_labels):
        super(EmotionClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("DeepPavlov/rubert-base-cased")
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        return self.fc(pooled_output)

# Инициализация модели
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_labels = len(label_encoder.classes_)
model = EmotionClassifier(num_labels).to(device)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT e

## Обучение

In [12]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

def train_model(model, train_loader, val_loader, optimizer, criterion, epochs=3):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1} Training Loss: {total_loss / len(train_loader)}")
        evaluate_model(model, val_loader)

def evaluate_model(model, val_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    print(classification_report(all_labels, all_preds, target_names=label_encoder.classes_))

# Запуск обучения
train_model(model, train_loader, val_loader, optimizer, criterion, epochs=3)


Training Epoch 1: 100%|██████████| 53/53 [08:45<00:00,  9.91s/it]


Epoch 1 Training Loss: 1.647058128185992


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     Зависть       0.95      0.86      0.90        22
      Злость       0.70      0.77      0.73        30
     Интерес       0.77      0.43      0.56        23
Недовольство       0.00      0.00      0.00         3
  Нейтрально       0.58      1.00      0.73        44
      Печаль       0.94      0.58      0.71        26
     Радость       0.68      0.59      0.63        29
  Сочувствие       0.85      0.69      0.76        32

    accuracy                           0.72       209
   macro avg       0.68      0.61      0.63       209
weighted avg       0.75      0.72      0.71       209



Training Epoch 2: 100%|██████████| 53/53 [08:09<00:00,  9.23s/it]


Epoch 2 Training Loss: 0.557092342736586


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

     Зависть       1.00      0.77      0.87        22
      Злость       0.79      0.77      0.78        30
     Интерес       0.89      0.74      0.81        23
Недовольство       0.00      0.00      0.00         3
  Нейтрально       0.90      0.82      0.86        44
      Печаль       0.70      0.62      0.65        26
     Радость       0.70      0.97      0.81        29
  Сочувствие       0.73      0.94      0.82        32

    accuracy                           0.80       209
   macro avg       0.71      0.70      0.70       209
weighted avg       0.80      0.80      0.79       209



Training Epoch 3: 100%|██████████| 53/53 [08:07<00:00,  9.20s/it]


Epoch 3 Training Loss: 0.2556485989886635
              precision    recall  f1-score   support

     Зависть       0.74      0.91      0.82        22
      Злость       0.86      0.80      0.83        30
     Интерес       0.72      0.78      0.75        23
Недовольство       0.00      0.00      0.00         3
  Нейтрально       0.85      0.75      0.80        44
      Печаль       0.64      0.88      0.74        26
     Радость       0.88      0.72      0.79        29
  Сочувствие       0.90      0.84      0.87        32

    accuracy                           0.79       209
   macro avg       0.70      0.71      0.70       209
weighted avg       0.80      0.79      0.79       209



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Блок: Сохранение модели

In [13]:
torch.save(model.state_dict(), "emotion_classifier.pth")


## Блок: Загрузка и предсказание

In [17]:
# Загрузка модели
loaded_model = EmotionClassifier(num_labels)
loaded_model.load_state_dict(torch.load("emotion_classifier.pth"))
loaded_model.to(device)

# Функция предсказания
def predict_emotion(text, model, tokenizer):
    model.eval()
    encoding = tokenizer(
        text,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask)
        prediction = torch.argmax(outputs, dim=1)

    return label_encoder.inverse_transform([prediction.cpu().item()])[0]

# Пример использования
text_example = "я сейчас буду плакать"
print(predict_emotion(text_example, loaded_model, tokenizer))


  return torch.load(checkpoint_file, map_location="cpu")
Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  loaded_model.load_state_dict(torch.load("emotion_clas

Печаль
