# Model training: GRU (sentiment)

---

## Table of Contents

1. [Imports](#imports)
2. [Data loading and splitting](#data-loading-and-splitting)
3. [Setting training parameters](#setting-training-parameters)
4. [Model training](#model-training)
5. [Model evaluation](#model-evaluation)
6. [Summary](#summary)
7. [Model serialization](#model-serialization)

# Imports

In [1]:
import pandas as pd
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from torch.utils.data import DataLoader, Dataset

# Data loading and splitting

In [2]:
base_dir = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()), '..'))

train_file = os.path.join(base_dir, 'train_sentiment.csv')
val_file = os.path.join(base_dir, 'val_sentiment.csv')
test_file = os.path.join(base_dir, 'test_sentiment.csv')

if not all([os.path.exists(train_file), os.path.exists(val_file), os.path.exists(test_file)]):
    sentiment_df = pd.read_parquet('../../data/sentiment_without_outliers/sentiment_without_outliers.parquet')
    sentiment_df = sentiment_df.drop(columns=['text_length'])
    
    train_data, temp_data = train_test_split(sentiment_df, test_size=0.3, stratify=sentiment_df['label'], random_state=42)
    val_data, test_data = train_test_split(temp_data, test_size=0.5, stratify=temp_data['label'], random_state=42)

    train_data.to_csv(train_file, index=False)
    val_data.to_csv(val_file, index=False)
    test_data.to_csv(test_file, index=False)
else:
    train_data = pd.read_csv(train_file)
    val_data = pd.read_csv(val_file)
    test_data = pd.read_csv(test_file)

# Setting training parameters

Due to the uneven distribution of classes in the dataset, the classes will be weighted.

In [3]:
MAX_NUM_WORDS = 20000
MAX_SEQ_LENGTH = 256

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(train_data['text'])

In [4]:
def encode_texts(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQ_LENGTH, padding='post', truncating='post')

In [5]:
X_train = encode_texts(train_data['text'])
X_val = encode_texts(val_data['text'])
X_test = encode_texts(test_data['text'])

y_train = train_data['label'].values
y_val = val_data['label'].values
y_test = test_data['label'].values

In [6]:
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [7]:
class TokenizedTextDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return {'input_ids': self.X[idx], 'label': self.y[idx]}

In [8]:
train_dataset = TokenizedTextDataset(X_train, y_train)
val_dataset = TokenizedTextDataset(X_val, y_val)
test_dataset = TokenizedTextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

# Model training

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
class GRUClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(GRUClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.gru = nn.GRU(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
    
    def forward(self, input_ids):
        embedded = self.embedding(input_ids)
        gru_out, _ = self.gru(embedded)
        output = self.fc(gru_out[:, -1, :])
        return output

In [11]:
model = GRUClassifier(vocab_size=MAX_NUM_WORDS, embed_dim=256, hidden_dim=256, num_classes=3).to(device)

In [12]:
loss_fn = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [13]:
def train_epoch(model, data_loader, loss_fn, optimizer, device, epoch):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0

    for batch_idx, batch in enumerate(data_loader):
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        logits = model(input_ids)
        loss = loss_fn(logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        _, preds = torch.max(logits, dim=1)
        correct_predictions += (preds == labels).sum().item()
        total_samples += labels.size(0)
        
        if batch_idx % 10 == 0:
            avg_loss = total_loss / (batch_idx + 1)
            accuracy = 100. * correct_predictions / total_samples
            print(f"Epoch: {epoch}. Batch {batch_idx}/{len(data_loader)} - Avg Loss: {avg_loss:.4f} - Accuracy: {accuracy:.2f}%")

    avg_loss = total_loss / len(data_loader)
    accuracy = 100. * correct_predictions / total_samples
    return avg_loss, accuracy

In [14]:
def eval_model(model, data_loader, device):
    model.eval()
    correct_predictions = 0
    total_samples = 0
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            logits = model(input_ids)
            _, preds = torch.max(logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_samples += labels.size(0)

    return 100. * correct_predictions / total_samples

In [15]:
for epoch in range(2):
    train_loss, train_accuracy = train_epoch(model, train_loader, loss_fn, optimizer, device, epoch)
    print(f"Train loss: {train_loss:.4f} - Train accuracy: {train_accuracy:.2f}%")
    val_accuracy = eval_model(model, val_loader, device)
    print(f"Validation accuracy: {val_accuracy:.4f}")

Epoch: 0. Batch 0/4317 - Avg Loss: 1.0957 - Accuracy: 37.50%
Epoch: 0. Batch 10/4317 - Avg Loss: 1.4136 - Accuracy: 35.23%
Epoch: 0. Batch 20/4317 - Avg Loss: 1.3101 - Accuracy: 31.25%
Epoch: 0. Batch 30/4317 - Avg Loss: 1.2568 - Accuracy: 32.26%
Epoch: 0. Batch 40/4317 - Avg Loss: 1.2238 - Accuracy: 30.34%
Epoch: 0. Batch 50/4317 - Avg Loss: 1.2012 - Accuracy: 31.50%
Epoch: 0. Batch 60/4317 - Avg Loss: 1.1889 - Accuracy: 31.97%
Epoch: 0. Batch 70/4317 - Avg Loss: 1.1801 - Accuracy: 30.81%
Epoch: 0. Batch 80/4317 - Avg Loss: 1.1720 - Accuracy: 31.71%
Epoch: 0. Batch 90/4317 - Avg Loss: 1.1705 - Accuracy: 31.46%
Epoch: 0. Batch 100/4317 - Avg Loss: 1.1693 - Accuracy: 31.56%
Epoch: 0. Batch 110/4317 - Avg Loss: 1.1681 - Accuracy: 32.38%
Epoch: 0. Batch 120/4317 - Avg Loss: 1.1644 - Accuracy: 31.56%
Epoch: 0. Batch 130/4317 - Avg Loss: 1.1603 - Accuracy: 32.35%
Epoch: 0. Batch 140/4317 - Avg Loss: 1.1584 - Accuracy: 32.54%
Epoch: 0. Batch 150/4317 - Avg Loss: 1.1566 - Accuracy: 32.12%
Epo

# Model evaluation

In [16]:
test_accuracy = eval_model(model, test_loader, device)
print(f"Test Accuracy: {test_accuracy:.4f}%")

Test Accuracy: 65.6284%


# Summary

| Epoch        | Train Accuracy | Validation Accuracy |
|-------------|---------------|---------------------|
| **Epoch 1** | 36.50%        | 58.51%              |
| **Epoch 2** | 62.71%        | 65.05%              |

### Observation
- The **train accuracy** increases.
- The **validation accuracy** increases.

# Model serialization

In [17]:
torch.save(model.state_dict(), './GRU_sentiment_model/gru_sentiment_model.pth')