# Model training: LSTM (multitasking)

---

## Table of Contents

1. [Imports](#imports)
2. [Data loading and splitting](#data-loading-and-splitting)
3. [Setting training parameters](#setting-training-parameters)
4. [Model training](#model-training)
5. [Model evaluation](#model-evaluation)
6. [Summary](#summary)
7. [Model serialization](#model-serialization)

# Imports

In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, Dataset

# Data loading and splitting

In [2]:
base_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..'))

In [3]:
train_file_sentiment = os.path.join(base_dir, 'train_sentiment.csv')
val_file_sentiment = os.path.join(base_dir, 'val_sentiment.csv')
test_file_sentiment = os.path.join(base_dir, 'test_sentiment.csv')

if not all([os.path.exists(train_file_sentiment), os.path.exists(val_file_sentiment), os.path.exists(test_file_sentiment)]):
    sentiment_df = pd.read_parquet('../../data/sentiment_without_outliers/sentiment_without_outliers.parquet')
    sentiment_df = sentiment_df.drop(columns=['text_length'])
    
    train_data_sentiment, temp_data = train_test_split(sentiment_df, test_size=0.3, stratify=sentiment_df['label'], random_state=42)
    val_data_sentiment, test_data_sentiment = train_test_split(temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

    train_data_sentiment.to_csv(train_file_sentiment, index=False)
    val_data_sentiment.to_csv(val_file_sentiment, index=False)
    test_data_sentiment.to_csv(test_file_sentiment, index=False)
else:
    train_data_sentiment = pd.read_csv(train_file_sentiment)
    val_data_sentiment = pd.read_csv(val_file_sentiment)
    test_data_sentiment = pd.read_csv(test_file_sentiment)

In [4]:
train_file_emotion = os.path.join(base_dir, 'train_emotion.csv')
val_file_emotion = os.path.join(base_dir, 'val_emotion.csv')
test_file_emotion = os.path.join(base_dir, 'test_emotion.csv')

if not all([os.path.exists(train_file_emotion), os.path.exists(val_file_emotion), os.path.exists(test_file_emotion)]):
    emotion_df = pd.read_parquet('../../data/emotion_without_outliers/emotion_without_outliers.parquet')
    emotion_df = emotion_df.drop(columns=['text_length'])
    
    target_samples_per_class = 16_667  # 100k / 6 classes of emotions
    
    balanced_data = emotion_df.groupby('label', group_keys=False).apply(
        lambda x: x.sample(n=min(len(x), target_samples_per_class), random_state=42)
    )
    
    train_data_emotion, temp_data = train_test_split(balanced_data, test_size=0.3, stratify=balanced_data['label'], random_state=42)
    val_data_emotion, test_data_emotion = train_test_split(temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

    train_data_emotion.to_csv(train_file_emotion, index=False)
    val_data_emotion.to_csv(val_file_emotion, index=False)
    test_data_emotion.to_csv(test_file_emotion, index=False)
else:
    train_data_emotion = pd.read_csv(train_file_emotion)
    val_data_emotion = pd.read_csv(val_file_emotion)
    test_data_emotion = pd.read_csv(test_file_emotion)

# Setting training parameters

Due to the uneven distribution of classes in the dataset, the classes will be weighted.

In [5]:
MAX_NUM_WORDS = 20000
MAX_SEQ_LENGTH = 256

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data_sentiment['text'].tolist() + train_data_emotion['text'].tolist())

In [6]:
def encode_texts(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')

In [7]:
X_train_sentiment = encode_texts(train_data_sentiment['text'])
X_val_sentiment = encode_texts(val_data_sentiment['text'])
X_test_sentiment = encode_texts(test_data_sentiment['text'])

y_train_sentiment = train_data_sentiment['label'].values
y_val_sentiment = val_data_sentiment['label'].values
y_test_sentiment = test_data_sentiment['label'].values

In [8]:
X_train_emotion = encode_texts(train_data_emotion['text'])
X_val_emotion = encode_texts(val_data_emotion['text'])
X_test_emotion = encode_texts(test_data_emotion['text'])

y_train_emotion = train_data_emotion['label'].values
y_val_emotion = val_data_emotion['label'].values
y_test_emotion = test_data_emotion['label'].values

In [9]:
class_weights_sentiment = torch.tensor(compute_class_weight('balanced', classes=np.unique(y_train_sentiment), y=y_train_sentiment), dtype=torch.float)
class_weights_emotion = torch.tensor(compute_class_weight('balanced', classes=np.unique(y_train_emotion), y=y_train_emotion), dtype=torch.float)

In [10]:
class MultiTaskDataset(Dataset):
    def __init__(self, X_sent, y_sent, X_emot, y_emot):
        self.X_sent = torch.tensor(X_sent, dtype=torch.long)
        self.y_sent = torch.tensor(y_sent, dtype=torch.long)
        self.X_emot = torch.tensor(X_emot, dtype=torch.long)
        self.y_emot = torch.tensor(y_emot, dtype=torch.long)

    def __len__(self):
        return min(len(self.y_sent), len(self.y_emot))

    def __getitem__(self, idx):
        return {
            'input_ids_sent': self.X_sent[idx],
            'label_sent': self.y_sent[idx],
            'input_ids_emot': self.X_emot[idx],
            'label_emot': self.y_emot[idx]
        }

In [11]:
train_dataset = MultiTaskDataset(X_train_sentiment, y_train_sentiment, X_train_emotion, y_train_emotion)
val_dataset = MultiTaskDataset(X_val_sentiment, y_val_sentiment, X_val_emotion, y_val_emotion)
test_dataset = MultiTaskDataset(X_test_sentiment, y_test_sentiment, X_test_emotion, y_test_emotion)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Model training

In [12]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [13]:
class MultiTaskLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_labels_sentiment, num_labels_emotion):
        super(MultiTaskLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc_sentiment = nn.Linear(hidden_dim, num_labels_sentiment)
        self.fc_emotion = nn.Linear(hidden_dim, num_labels_emotion)
    
    def forward(self, input_ids_sent, input_ids_emot):
        embedded_sent = self.embedding(input_ids_sent)
        embedded_emot = self.embedding(input_ids_emot)

        lstm_out_sent, _ = self.lstm(embedded_sent)
        lstm_out_emot, _ = self.lstm(embedded_emot)

        out_sentiment = self.fc_sentiment(lstm_out_sent[:, -1, :])
        out_emotion = self.fc_emotion(lstm_out_emot[:, -1, :])

        return out_sentiment, out_emotion

In [14]:
model = MultiTaskLSTM(vocab_size=MAX_NUM_WORDS, embed_dim=256, hidden_dim=256, num_labels_sentiment=3, num_labels_emotion=6).to(device)

In [15]:
loss_fn_sentiment = nn.CrossEntropyLoss(weight=class_weights_sentiment.to(device))
loss_fn_emotion = nn.CrossEntropyLoss(weight=class_weights_emotion.to(device))
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [16]:
def train_epoch(model, data_loader, loss_fn_sent, loss_fn_emot, optimizer, device, epoch):
    model.train()
    total_loss = 0
    correct_sentiment = 0
    correct_emotion = 0
    total_sentiment_samples = 0
    total_emotion_samples = 0

    for batch_idx, batch in enumerate(data_loader):
        input_ids_sent = batch['input_ids_sent'].to(device)
        labels_sent = batch['label_sent'].to(device)
        input_ids_emot = batch['input_ids_emot'].to(device)
        labels_emot = batch['label_emot'].to(device)

        optimizer.zero_grad()

        logits_sent, logits_emot = model(input_ids_sent, input_ids_emot)

        loss_sent = loss_fn_sent(logits_sent, labels_sent)
        loss_emot = loss_fn_emot(logits_emot, labels_emot)

        loss = loss_sent + loss_emot
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        _, preds_sent = torch.max(logits_sent, dim=1)
        _, preds_emot = torch.max(logits_emot, dim=1)

        correct_sentiment += (preds_sent == labels_sent).sum().item()
        correct_emotion += (preds_emot == labels_emot).sum().item()
        total_sentiment_samples += labels_sent.size(0)
        total_emotion_samples += labels_emot.size(0)
        
        if batch_idx % 10 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            acc_sent = 100. * correct_sentiment / total_sentiment_samples
            acc_emot = 100. * correct_emotion / total_emotion_samples
            print(f"Epoch {epoch}. Batch {batch_idx}/{len(data_loader)}: "
                  f"AvgLoss: {avg_loss:.2f}, S.Acc: {acc_sent:.2f}%, E.Acc: {acc_emot:.2f}%")

    avg_loss = total_loss / len(data_loader)
    accuracy_sentiment = 100. * correct_sentiment / total_sentiment_samples
    accuracy_emotion = 100. * correct_emotion / total_emotion_samples
    return avg_loss, accuracy_sentiment, accuracy_emotion

In [17]:
def eval_model(model, data_loader, device):
    model.eval()
    correct_sentiment = 0
    correct_emotion = 0
    total_sentiment_samples = 0
    total_emotion_samples = 0

    with torch.no_grad():
        for batch in data_loader:
            input_ids_sent = batch['input_ids_sent'].to(device)
            labels_sent = batch['label_sent'].to(device)
            input_ids_emot = batch['input_ids_emot'].to(device)
            labels_emot = batch['label_emot'].to(device)

            logits_sent, logits_emot = model(input_ids_sent, input_ids_emot)

            _, preds_sent = torch.max(logits_sent, dim=1)
            _, preds_emot = torch.max(logits_emot, dim=1)

            correct_sentiment += (preds_sent == labels_sent).sum().item()
            correct_emotion += (preds_emot == labels_emot).sum().item()
            total_sentiment_samples += labels_sent.size(0)
            total_emotion_samples += labels_emot.size(0)

    accuracy_sentiment = 100. * correct_sentiment / total_sentiment_samples
    accuracy_emotion = 100. * correct_emotion / total_emotion_samples
    return accuracy_sentiment, accuracy_emotion

In [18]:
for epoch in range(2):
    train_loss, train_acc_sent, train_acc_emot = train_epoch(
        model, train_loader, loss_fn_sentiment, loss_fn_emotion, optimizer, device, epoch
    )
    print(f"Train Loss: {train_loss:.4f}, "
          f"Train Sentiment Accuracy: {train_acc_sent:.2f}%, "
          f"Train Emotion Accuracy: {train_acc_emot:.2f}%")
    val_acc_sent, val_acc_emot = eval_model(model, val_loader, device)
    print(f"Validation Sentiment Accuracy: {val_acc_sent:.2f}%, Validation Emotion Accuracy: {val_acc_emot:.2f}%")

Epoch 0. Batch 0/4290: AvgLoss: 2.86, S.Acc: 50.00%, E.Acc: 18.75%
Epoch 0. Batch 10/4290: AvgLoss: 3.04, S.Acc: 26.14%, E.Acc: 18.75%
Epoch 0. Batch 20/4290: AvgLoss: 2.98, S.Acc: 26.19%, E.Acc: 17.26%
Epoch 0. Batch 30/4290: AvgLoss: 2.95, S.Acc: 29.03%, E.Acc: 17.94%
Epoch 0. Batch 40/4290: AvgLoss: 2.94, S.Acc: 30.79%, E.Acc: 17.53%
Epoch 0. Batch 50/4290: AvgLoss: 2.93, S.Acc: 32.48%, E.Acc: 18.63%
Epoch 0. Batch 60/4290: AvgLoss: 2.92, S.Acc: 34.43%, E.Acc: 18.65%
Epoch 0. Batch 70/4290: AvgLoss: 2.92, S.Acc: 33.98%, E.Acc: 18.93%
Epoch 0. Batch 80/4290: AvgLoss: 2.92, S.Acc: 32.33%, E.Acc: 18.44%
Epoch 0. Batch 90/4290: AvgLoss: 2.92, S.Acc: 33.38%, E.Acc: 17.99%
Epoch 0. Batch 100/4290: AvgLoss: 2.91, S.Acc: 34.16%, E.Acc: 17.82%
Epoch 0. Batch 110/4290: AvgLoss: 2.91, S.Acc: 34.68%, E.Acc: 17.79%
Epoch 0. Batch 120/4290: AvgLoss: 2.91, S.Acc: 34.14%, E.Acc: 17.30%
Epoch 0. Batch 130/4290: AvgLoss: 2.91, S.Acc: 33.11%, E.Acc: 17.51%
Epoch 0. Batch 140/4290: AvgLoss: 2.91, S.Acc

# Model evaluation

In [19]:
test_acc_sent, test_acc_emot = eval_model(model, test_loader, device)

print(f"Test Sentiment Accuracy: {test_acc_sent:.2f}%")
print(f"Test Emotion Accuracy: {test_acc_emot:.2f}%")

Test Sentiment Accuracy: 42.69%
Test Emotion Accuracy: 17.00%


# Summary

| Epoch        | Train Accuracy Sentiment | Validation Accuracy Sentiment | Train Accuracy Emotion | Validation Accuracy Emotion |
|--------------|--------------------------|-------------------------------|------------------------|-----------------------------|
| **Epoch 1**  | 38.65%                   | 34.34%                        | 16.94%                 | 17.00%                      |
| **Epoch 2**  | 38.92%                   | 42.72%                        | 16.63%                 | 17.00%                      |

### Observation
* The training accuracy for both sentiment and emotion classification remains virtually unchanged across epochs.

# Model serialization

In [20]:
torch.save(model.state_dict(), './LSTM_multitask_model/lstm_multitask_model.pth')