In [14]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.optim as optim
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize

In [15]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/marcuskrarup/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Load data

In [16]:
# Import Ekman emotions to DataFrame
df_train_ekman = pd.read_csv('data/ekman_train.csv')
df_val_ekman = pd.read_csv('data/ekman_val.csv')
df_test_ekman = pd.read_csv('data/ekman_test.csv')



In [17]:
# Count occurrences for each emotion in train, test, and val datasets
train_counts = df_train_ekman.iloc[:, 1:].sum()
test_counts = df_test_ekman.iloc[:, 1:].sum()
val_counts = df_val_ekman.iloc[:, 1:].sum()

print("Train Counts:\n", train_counts)
print("\nTest Counts:\n", test_counts)
print("\nValidation Counts:\n", val_counts)

Train Counts:
 anger        4824
disgust       818
fear          695
joy         13877
neutral      9290
sadness      2967
surprise     4783
dtype: int64

Test Counts:
 anger       1635
disgust      281
fear         257
joy         4622
neutral     3046
sadness     1042
surprise    1638
dtype: int64

Validation Counts:
 anger       1568
disgust      250
fear         258
joy         4599
neutral     3152
sadness      977
surprise    1636
dtype: int64


# Text preprocessing

In [18]:
def preprocess(text):
    return word_tokenize(text.lower())

# Build vocab

In [21]:
tokenized_texts = df_train_ekman['text'].apply(preprocess)
word_counts = Counter(token for tokens in tokenized_texts for token in tokens)
vocab = {word: i + 2 for i, (word, _) in enumerate(word_counts.items())}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def encode(text):
    return [vocab.get(t, vocab['<UNK>']) for t in preprocess(text)]

# Dataset

In [22]:
class EmotionDataset(Dataset):
    def __init__(self, df):
        self.X = [torch.tensor(encode(text)) for text in df['text']]
        self.y = torch.tensor(df.iloc[:, 1:].values, dtype=torch.float)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

def collate_fn(batch):
    sequences, labels = zip(*batch)
    padded_seqs = pad_sequence(sequences, batch_first=True, padding_value=0)
    return padded_seqs, torch.stack(labels)

train_loader = DataLoader(EmotionDataset(df_train_ekman), batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(EmotionDataset(df_val_ekman), batch_size=32, collate_fn=collate_fn)
test_loader = DataLoader(EmotionDataset(df_test_ekman), batch_size=32, collate_fn=collate_fn)

# Model

In [23]:
class EmotionLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)
        out = self.fc(h_n[-1])
        return self.sigmoid(out)

# Initialize

In [24]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = EmotionLSTM(len(vocab), embed_dim=100, hidden_dim=128, output_dim=7).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop

In [25]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 402.1999
Epoch 2, Loss: 371.5362
Epoch 3, Loss: 335.3612
Epoch 4, Loss: 311.0528
Epoch 5, Loss: 285.8978


# Evaluation (Validation or Test)

In [26]:
for epoch in range(5):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        preds = model(X_batch)
        loss = criterion(preds, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 258.3643
Epoch 2, Loss: 228.1770
Epoch 3, Loss: 198.7505
Epoch 4, Loss: 172.0502
Epoch 5, Loss: 147.9996


In [28]:
def evaluate(loader):
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            preds = (preds > 0.5).float()
            correct += (preds == y_batch).sum().item()
            total += torch.numel(y_batch)
    print(f"Accuracy: {correct / total:.4f}")

In [33]:
evaluate(
    model = model.to(device),
    data_loader = val_loader,  
    criterion = criterion,
    device = device,
)

TypeError: evaluate() got an unexpected keyword argument 'model'

In [30]:
print("Test:")
evaluate(test_loader)

Test:
Accuracy: 0.8517


In [34]:
from itertools import product

# Grid search hyperparameters
EPOCHS_LIST = [3, 5]
HIDDEN_DIMS = [64, 128]
BATCH_SIZES = [16, 32]

best_config = None
best_val_loss = float('inf')

def run_training(hidden_dim, batch_size, epochs):
    # Create loaders with current batch size
    train_loader = DataLoader(EmotionDataset(df_train_ekman), batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(EmotionDataset(df_val_ekman), batch_size=batch_size, collate_fn=collate_fn)
    
    # Model
    model = EmotionLSTM(len(vocab), embed_dim=100, hidden_dim=hidden_dim, output_dim=7).to(device)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        model.train()
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            optimizer.zero_grad()
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            loss.backward()
            optimizer.step()

    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            preds = model(X_batch)
            loss = criterion(preds, y_batch)
            val_loss += loss.item()
    
    avg_val_loss = val_loss / len(val_loader)
    return avg_val_loss

# Grid search loop
for hidden_dim, batch_size, epochs in product(HIDDEN_DIMS, BATCH_SIZES, EPOCHS_LIST):
    print(f"Training with hidden_dim={hidden_dim}, batch_size={batch_size}, epochs={epochs}")
    val_loss = run_training(hidden_dim, batch_size, epochs)
    print(f"Validation Loss: {val_loss:.4f}")
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_config = (hidden_dim, batch_size, epochs)

print("\n✅ Best Configuration:")
print(f"Hidden Dim: {best_config[0]}, Batch Size: {best_config[1]}, Epochs: {best_config[2]}")
print(f"Validation Loss: {best_val_loss:.4f}")


Training with hidden_dim=64, batch_size=16, epochs=3
Validation Loss: 0.3142
Training with hidden_dim=64, batch_size=16, epochs=5
Validation Loss: 0.3197
Training with hidden_dim=64, batch_size=32, epochs=3
Validation Loss: 0.3216
Training with hidden_dim=64, batch_size=32, epochs=5
Validation Loss: 0.3185
Training with hidden_dim=128, batch_size=16, epochs=3
Validation Loss: 0.3082
Training with hidden_dim=128, batch_size=16, epochs=5
Validation Loss: 0.3181
Training with hidden_dim=128, batch_size=32, epochs=3
Validation Loss: 0.3125
Training with hidden_dim=128, batch_size=32, epochs=5
Validation Loss: 0.3153

✅ Best Configuration:
Hidden Dim: 128, Batch Size: 16, Epochs: 3
Validation Loss: 0.3082
