# Model training: LSTM (sentiment)

---

## Table of Contents

1. [Imports](#imports)
2. [Data loading and splitting](#data-loading-and-splitting)
3. [Setting training parameters](#setting-training-parameters)
4. [Model training](#model-training)
5. [Model evaluation](#model-evaluation)
6. [Summary](#summary)
7. [Model serialization](#model-serialization)

# Imports

In [18]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, Dataset

# Data loading and splitting

In [19]:
base_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..'))

train_file = os.path.join(base_dir, 'train_sentiment.csv')
val_file = os.path.join(base_dir, 'val_sentiment.csv')
test_file = os.path.join(base_dir, 'test_sentiment.csv')

if not all([os.path.exists(train_file), os.path.exists(val_file), os.path.exists(test_file)]):
    sentiment_df = pd.read_parquet('../../data/sentiment_without_outliers/sentiment_without_outliers.parquet')
    sentiment_df = sentiment_df.drop(columns=['text_length'])
    
    train_data, temp_data = train_test_split(sentiment_df, test_size=0.3, stratify=sentiment_df['label'], random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

    train_data.to_csv(train_file, index=False)
    val_data.to_csv(val_file, index=False)
    test_data.to_csv(test_file, index=False)
else:
    train_data = pd.read_csv(train_file)
    val_data = pd.read_csv(val_file)
    test_data = pd.read_csv(test_file)

# Setting training parameters

Due to the uneven distribution of classes in the dataset, the classes will be weighted.

In [20]:
MAX_NUM_WORDS = 20000
MAX_SEQ_LENGTH = 256

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['text'])

In [21]:
def encode_texts(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')

In [22]:
X_train = encode_texts(train_data['text'])
X_val = encode_texts(val_data['text'])
X_test = encode_texts(test_data['text'])

y_train = train_data['label'].values
y_val = val_data['label'].values
y_test = test_data['label'].values

In [23]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [24]:
class TokenizedTextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {'input_ids': self.X[idx], 'label': self.y[idx]}

In [25]:
train_dataset = TokenizedTextDataset(X_train, y_train)
val_dataset = TokenizedTextDataset(X_val, y_val)
test_dataset = TokenizedTextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Model training

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [27]:
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(LSTMClassifier, self).__init__()
        self.relu = nn.ReLU()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_dim * 2, num_classes)
    
    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        lstm_out, _ = self.lstm(embedded)
        pooled = torch.mean(lstm_out, dim=1)
        dropped = self.dropout(pooled)
        output = self.fc(self.relu(dropped))
        return output

In [28]:
model = LSTMClassifier(vocab_size=MAX_NUM_WORDS, embed_dim=256, hidden_dim=256, num_classes=3).to(device)

In [29]:
loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [30]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, epoch):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, preds = torch.max(logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)
        
        if batch_idx % 10 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            accuracy = 100. * correct_predictions / total_samples
            print(f"Epoch: {epoch}. Batch {batch_idx}/{len(data_loader)} - Avg Loss: {avg_loss:.4f} - Accuracy: {accuracy:.2f}%")

    avg_loss = total_loss / len(data_loader)
    accuracy = 100. * correct_predictions / total_samples
    return avg_loss, accuracy

In [31]:
def eval_model(model, data_loader, device):
    model.eval()
    correct_predictions = 0
    total_samples = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids)
            _, preds = torch.max(logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_samples += labels.size(0)

    return 100. * correct_predictions / total_samples

In [32]:
for epoch in range(2):
    train_loss, train_accuracy = train_epoch(model, train_loader, loss_fn, optimizer, device, epoch)
    print(f"Train loss: {train_loss:.4f} - Train accuracy: {train_accuracy:.2f}%")
    val_accuracy = eval_model(model, val_loader, device)
    print(f"Validation accuracy: {val_accuracy:.4f}")

Epoch: 0. Batch 0/4317 - Avg Loss: 1.1358 - Accuracy: 6.25%
Epoch: 0. Batch 10/4317 - Avg Loss: 1.1217 - Accuracy: 36.93%
Epoch: 0. Batch 20/4317 - Avg Loss: 1.1139 - Accuracy: 39.88%
Epoch: 0. Batch 30/4317 - Avg Loss: 1.1159 - Accuracy: 36.09%
Epoch: 0. Batch 40/4317 - Avg Loss: 1.1124 - Accuracy: 34.15%
Epoch: 0. Batch 50/4317 - Avg Loss: 1.1172 - Accuracy: 32.11%
Epoch: 0. Batch 60/4317 - Avg Loss: 1.1155 - Accuracy: 32.68%
Epoch: 0. Batch 70/4317 - Avg Loss: 1.1135 - Accuracy: 33.63%
Epoch: 0. Batch 80/4317 - Avg Loss: 1.1120 - Accuracy: 34.41%
Epoch: 0. Batch 90/4317 - Avg Loss: 1.1112 - Accuracy: 34.96%
Epoch: 0. Batch 100/4317 - Avg Loss: 1.1091 - Accuracy: 35.40%
Epoch: 0. Batch 110/4317 - Avg Loss: 1.1071 - Accuracy: 35.47%
Epoch: 0. Batch 120/4317 - Avg Loss: 1.1094 - Accuracy: 34.92%
Epoch: 0. Batch 130/4317 - Avg Loss: 1.1099 - Accuracy: 34.21%
Epoch: 0. Batch 140/4317 - Avg Loss: 1.1091 - Accuracy: 34.44%
Epoch: 0. Batch 150/4317 - Avg Loss: 1.1091 - Accuracy: 34.48%
Epoc

# Model evaluation

In [33]:
test_accuracy = eval_model(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.4f}%")

Test Accuracy: 67.0405%


# Summary

| Epoch        | Train Accuracy | Validation Accuracy |
|-------------|---------------|---------------------|
| **Epoch 1** | 55.27%        | 63.04%              |
| **Epoch 2** | 68.24%        | 66.68%              |

### Observation
- The **train accuracy** increases.
- The **validation accuracy** increases.

# Model serialization

In [None]:
torch.save(model.state_dict(), './LSTM_sentiment_model/lstm_sentiment_model.pth')