# Model training: LSTM (emotion)

---

## Table of Contents

1. [Imports](#imports)
2. [Data loading and splitting](#data-loading-and-splitting)
3. [Setting training parameters](#setting-training-parameters)
4. [Model training](#model-training)
5. [Model evaluation](#model-evaluation)
6. [Summary](#summary)
7. [Model serialization](#model-serialization)

# Imports

In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, Dataset

# Data loading and splitting

In [2]:
# Directory two levels above the current working directory; the cached
# train/val/test CSV splits are stored (and looked up) there.
base_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..'))

train_file = os.path.join(base_dir, 'train_emotion.csv')
val_file = os.path.join(base_dir, 'val_emotion.csv')
test_file = os.path.join(base_dir, 'test_emotion.csv')

if all(os.path.exists(path) for path in (train_file, val_file, test_file)):
    # All three splits were written by an earlier run: reuse them.
    train_data = pd.read_csv(train_file)
    val_data = pd.read_csv(val_file)
    test_data = pd.read_csv(test_file)
else:
    emotion_df = pd.read_parquet('../../data/emotion_without_outliers/emotion_without_outliers.parquet')
    emotion_df = emotion_df.drop(columns=['text_length'])

    target_samples_per_class = 16_667  # 100k / 6 classes of emotions

    # Down-sample every class to at most `target_samples_per_class` rows.
    # The groups are iterated explicitly instead of going through
    # `groupby(...).apply(...)`: applying a function to the grouping column is
    # deprecated in recent pandas releases, and the stratified splits below
    # need the 'label' column to still be present in `balanced_data`.
    balanced_data = pd.concat([
        label_rows.sample(n=min(len(label_rows), target_samples_per_class), random_state=42)
        for _, label_rows in emotion_df.groupby('label')
    ])

    # 70% train, then the remaining 30% is halved into validation and test
    # (15% each); both splits are stratified on the label.
    train_data, temp_data = train_test_split(balanced_data, test_size=0.3, stratify=balanced_data['label'], random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

    # Cache the splits so later runs load exactly the same data.
    train_data.to_csv(train_file, index=False)
    val_data.to_csv(val_file, index=False)
    test_data.to_csv(test_file, index=False)

# Setting training parameters

Because the class distribution in the dataset is uneven, the loss function is weighted per class, using balanced class weights computed from the training labels.

In [3]:
# Value passed to the tokenizer as `num_words` (vocabulary cap).
MAX_NUM_WORDS = 20000
# Length that every encoded sequence is padded or truncated to.
MAX_SEQ_LENGTH = 256

# The vocabulary is fitted on the training texts only.
training_texts = train_data['text']
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(training_texts)

In [4]:
def encode_texts(texts):
    """Encode raw texts as fixed-length integer sequences.

    The module-level ``tokenizer`` maps each text to a sequence of word
    indices; the sequences are then padded and truncated at the end
    (``'post'``) so that each one has ``MAX_SEQ_LENGTH`` entries.

    Args:
        texts: Iterable of strings to encode.

    Returns:
        The array produced by ``pad_sequences``, one row per input text.
    """
    padding_options = {
        'maxlen': MAX_SEQ_LENGTH,
        'padding': 'post',
        'truncating': 'post',
    }
    return pad_sequences(tokenizer.texts_to_sequences(texts), **padding_options)

In [5]:
# Encoded token-id matrices for the train / validation / test splits.
X_train, X_val, X_test = (
    encode_texts(split['text']) for split in (train_data, val_data, test_data)
)

# Label arrays taken from the matching splits, in the same order.
y_train, y_val, y_test = (
    split['label'].values for split in (train_data, val_data, test_data)
)

In [6]:
# 'balanced' class weights computed from the training labels, stored as a
# float tensor so they can be handed to the loss function.
class_weights = torch.tensor(
    compute_class_weight('balanced', classes=np.unique(y_train), y=y_train),
    dtype=torch.float,
)

In [7]:
class TokenizedTextDataset(Dataset):
    """Dataset pairing encoded token-id sequences with integer labels."""

    def __init__(self, X, y):
        """Store the sequences ``X`` and the labels ``y`` as int64 tensors."""
        self.X, self.y = (
            torch.tensor(values, dtype=torch.long) for values in (X, y)
        )

    def __len__(self):
        """Return the number of labels held by the dataset."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as ``{'input_ids': ..., 'label': ...}``."""
        return dict(input_ids=self.X[idx], label=self.y[idx])

In [8]:
train_dataset = TokenizedTextDataset(X_train, y_train)
val_dataset = TokenizedTextDataset(X_val, y_val)
test_dataset = TokenizedTextDataset(X_test, y_test)

# One batch size shared by all three loaders.
BATCH_SIZE = 16

# Only the training data is shuffled. Accuracy does not depend on sample
# order, so shuffling the validation/test loaders adds nothing and only
# draws extra random numbers between training epochs.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Model training

In [9]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [10]:
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM classifier with mean pooling over the sequence.

    Pipeline: token ids -> embedding -> bidirectional LSTM -> mean over
    dim 1 -> dropout (p=0.5) -> ReLU -> linear layer producing one logit
    per class.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        """Build the layers.

        Args:
            vocab_size: Number of rows in the embedding table.
            embed_dim: Size of each embedding vector (LSTM input size).
            hidden_dim: LSTM hidden size per direction.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.relu = nn.ReLU()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(p=0.5)
        # Both LSTM directions are concatenated, hence 2 * hidden_dim inputs.
        self.fc = nn.Linear(in_features=2 * hidden_dim, out_features=num_classes)

    def forward(self, input_ids):
        """Return the class logits for a batch of token-id sequences."""
        # The LSTM also returns its final (hidden, cell) state; only the
        # per-step outputs are used.
        sequence_states = self.lstm(self.embedding(input_ids))[0]
        # batch_first=True, so dim 1 is the sequence dimension being averaged.
        features = self.dropout(sequence_states.mean(dim=1))
        return self.fc(self.relu(features))

In [11]:
# The embedding table is sized to the tokenizer's word cap; six output logits.
model = LSTMClassifier(
    vocab_size=MAX_NUM_WORDS,
    embed_dim=256,
    hidden_dim=256,
    num_classes=6,
)
model = model.to(device)

In [12]:
# Cross-entropy weighted per class; the weights must live on the same device
# as the model outputs.
weights_on_device = class_weights.to(device)
loss_fn = nn.CrossEntropyLoss(weight=weights_on_device)

# Adam over all model parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [13]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, epoch):
    """Train ``model`` for one pass over ``data_loader``.

    Every batch must be a mapping with ``'input_ids'`` and ``'label'``
    tensors. Running metrics are printed on every 10th batch.

    Args:
        model: Module mapping ``input_ids`` to class logits.
        data_loader: Sized iterable of batches.
        loss_fn: Callable ``(logits, labels) -> loss tensor``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batch tensors are moved to.
        epoch: Epoch number, used only in the progress message.

    Returns:
        ``(avg_loss, accuracy)``: the sum of the per-batch losses divided by
        the number of batches, and the percentage of correct predictions.
    """
    model.train()
    running_loss = 0
    hits = 0
    seen = 0

    for batch_idx, batch in enumerate(data_loader):
        labels = batch['label'].to(device)
        inputs = batch['input_ids'].to(device)

        optimizer.zero_grad()
        logits = model(inputs)
        batch_loss = loss_fn(logits, labels)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
        # The predicted class is the index of the largest logit in each row.
        hits += (logits.argmax(dim=1) == labels).sum().item()
        seen += labels.size(0)

        # Progress is reported only on every 10th batch.
        if batch_idx % 10 != 0:
            continue
        avg_loss = running_loss / (batch_idx + 1)
        accuracy = 100. * hits / seen
        print(f"Epoch: {epoch}. Batch {batch_idx}/{len(data_loader)} - Avg Loss: {avg_loss:.4f} - Accuracy: {accuracy:.2f}%")

    return running_loss / len(data_loader), 100. * hits / seen

In [14]:
def eval_model(model, data_loader, device):
    """Return the accuracy (in percent) of ``model`` on ``data_loader``.

    The model is put into evaluation mode and gradient tracking is disabled
    for the pass. Every batch must be a mapping with ``'input_ids'`` and
    ``'label'`` tensors.

    Args:
        model: Module mapping ``input_ids`` to class logits.
        data_loader: Iterable of batches.
        device: Device the batch tensors are moved to.

    Returns:
        Percentage of samples whose highest-scoring class equals the label.
    """
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for batch in data_loader:
            labels = batch['label'].to(device)
            inputs = batch['input_ids'].to(device)

            # The predicted class is the index of the largest logit per row.
            predictions = model(inputs).argmax(dim=1)
            hits += predictions.eq(labels).sum().item()
            seen += labels.shape[0]

    return 100. * hits / seen

In [15]:
# Number of full passes over the training data.
NUM_EPOCHS = 2

for epoch in range(NUM_EPOCHS):
    train_loss, train_accuracy = train_epoch(model, train_loader, loss_fn, optimizer, device, epoch)
    print(f"Train loss: {train_loss:.4f} - Train accuracy: {train_accuracy:.2f}%")
    # eval_model returns a percentage, so the value is printed with a '%'
    # unit like the train and test accuracies.
    val_accuracy = eval_model(model, val_loader, device)
    print(f"Validation accuracy: {val_accuracy:.4f}%")

Epoch: 0. Batch 0/4290 - Avg Loss: 1.8370 - Accuracy: 12.50%
Epoch: 0. Batch 10/4290 - Avg Loss: 1.8297 - Accuracy: 19.32%
Epoch: 0. Batch 20/4290 - Avg Loss: 1.8351 - Accuracy: 19.35%
Epoch: 0. Batch 30/4290 - Avg Loss: 1.8288 - Accuracy: 19.96%
Epoch: 0. Batch 40/4290 - Avg Loss: 1.8209 - Accuracy: 19.82%
Epoch: 0. Batch 50/4290 - Avg Loss: 1.8257 - Accuracy: 18.26%
Epoch: 0. Batch 60/4290 - Avg Loss: 1.8218 - Accuracy: 18.55%
Epoch: 0. Batch 70/4290 - Avg Loss: 1.8218 - Accuracy: 18.13%
Epoch: 0. Batch 80/4290 - Avg Loss: 1.8201 - Accuracy: 18.06%
Epoch: 0. Batch 90/4290 - Avg Loss: 1.8182 - Accuracy: 18.06%
Epoch: 0. Batch 100/4290 - Avg Loss: 1.8159 - Accuracy: 18.07%
Epoch: 0. Batch 110/4290 - Avg Loss: 1.8152 - Accuracy: 17.74%
Epoch: 0. Batch 120/4290 - Avg Loss: 1.8126 - Accuracy: 18.03%
Epoch: 0. Batch 130/4290 - Avg Loss: 1.8118 - Accuracy: 17.56%
Epoch: 0. Batch 140/4290 - Avg Loss: 1.8114 - Accuracy: 17.46%
Epoch: 0. Batch 150/4290 - Avg Loss: 1.8114 - Accuracy: 17.34%
Epo

# Model evaluation

In [16]:
# Final accuracy (in percent) on the held-out test split.
test_accuracy = eval_model(
    model=model,
    data_loader=test_loader,
    device=device,
)
print(f"Test Accuracy: {test_accuracy:.4f}%")

Test Accuracy: 94.6620%


# Summary

| Epoch        | Train Accuracy | Validation Accuracy |
|-------------|---------------|---------------------|
| **Epoch 1** | 67.27%        | 92.06%              |
| **Epoch 2** | 93.34%        | 94.76%              |

### Observation
- Both the **train accuracy** and the **validation accuracy** increase from epoch 1 to epoch 2.

# Model serialization

In [17]:
# Persist only the learned parameters (the state dict), not the full module.
model_path = './LSTM_emotion_model/lstm_emotion_model.pth'
# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)