## <img src="https://img.icons8.com/bubbles/50/000000/checklist.png" style="height:50px;display:inline"> Table of Contents
---

* [Data Preprocessing](#data-preprocessing)
    * [Data Loading](#data-loading)
    * [Data Augmentation](#data-augmentation)
    * [Data Tokenization and Vectorization](#data-tokenization-and-vectorization)
    * [DataLoaders](#dataloaders)
    * [Statistics](#statistics)
* [Training Functions](#training-functions)
* [Hyper Parameters](#hyper-parameters)
* [Models](#models)
    * [B-LSTM Model](#b-lstm-model)
    * [xLSTM Model](#xlstm-model)
    * [GRU Model](#gru-model)
* [Optuna](#optuna)
    * [Framework](#framework)
    * [LSTM Study](#lstm-study)
    * [GRU Study](#gru-study)
* [Training](#training)
    * [LSTM Train](#lstm-train)
    * [GRU Train](#gru-train)
* [Comparison](#comparison)

## <img src="https://img.icons8.com/?size=100&id=uwzWDmqxwaFo&format=png&color=000000" style="height:50px;display:inline"> Imports
---

In [None]:
#!pip install datasets
import os

from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import time
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
# from torchviz import make_dot

from textattack.augmentation import EasyDataAugmenter
from transformers import BertModel, BertTokenizer

import datasets
from datasets import load_dataset, concatenate_datasets

In [None]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f'{device=}')

## <img src="https://img.icons8.com/?size=100&id=D7OBPFcT8dSK&format=png&color=000000" style="height:50px;display:inline"> Data Preprocessing
---

### Data Loading

In [None]:
# Load GoEmotions dataset
dataset = load_dataset("go_emotions")

In [None]:
# Define the classes
classes = ['admiration', 'amusement', 'anger', 'annoyance', 'approval',
           'caring', 'confusion', 'curiosity', 'desire', 'disappointment',
           'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear',
           'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism',
           'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise',
           'neutral']

# Ekman Mapping
primary_emotion_to_sub_emotions = {
    "anger": ["anger", "annoyance", "disapproval"],
    "disgust": ["disgust"],
    "fear": ["fear", "nervousness"],
    "joy": ["joy", "amusement", "approval", "excitement", "gratitude",  "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],
    "sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
    "surprise": ["surprise", "realization", "confusion", "curiosity"],
    "neutral": ["neutral"]
}

sub_emotion_to_primary_emotion = {sub: primary for primary, subs in primary_emotion_to_sub_emotions.items() for sub in subs}

In [None]:
# Flatten the per-sample label lists of every split into one flat list of label indices
raw_train_labels = [label for labels in dataset['train']['labels'] for label in labels]
raw_validation_labels = [label for labels in dataset['validation']['labels'] for label in labels]
raw_test_labels = [label for labels in dataset['test']['labels'] for label in labels]

In [None]:
def disp_labels_distribution(labels, split, classes=classes):
    """Print label frequencies and draw them as a horizontal bar chart.

    Args:
        labels: Iterable of class indices, one entry per label occurrence.
        split: Name of the data split; only used in the plot title.
        classes: Class names; position ``i`` names the label index ``i``.

    Returns:
        The ``values()`` view of the ``Counter`` built from ``labels``.
    """
    label_counts = Counter(labels)
    print(f'{label_counts.most_common()=}')

    total_labels = sum(label_counts.values())
    print(f'{total_labels=}')

    # One count per class index; classes that never occur get 0
    counts = [label_counts.get(class_idx, 0) for class_idx, _ in enumerate(classes)]

    fig, ax = plt.subplots(figsize=(8, 8))
    bars = ax.barh(classes, counts)
    ax.set_xlabel('Count')
    ax.set_title(f'Class Distribution in GoEmotions {split} Data')

    # Annotate each bar with its share of all labels, slightly right of the bar end
    for bar in bars:
        bar_length = bar.get_width()
        y_center = bar.get_y() + bar.get_height()/2
        share_text = f'{(bar_length/total_labels)*100:.2f}%'
        ax.text(bar_length + total_labels * 0.005, y_center, share_text, va='center')

    plt.show()
    return label_counts.values()

In [None]:
disp_labels_distribution(raw_train_labels, 'Train')
disp_labels_distribution(raw_validation_labels, 'Validation')
disp_labels_distribution(raw_test_labels, 'Test')

In [None]:
print(f"Train Size: {len(dataset['train'])} | Valid Size: {len(dataset['validation'])} | Test Size: {len(dataset['test'])}")

### Data Augmentation

In [None]:
def primary_label_pipeline(labels):
    """Collapse one sample's fine-grained label indices into a single primary-emotion index.

    Every index in ``labels`` is resolved to its class name, mapped to its primary
    (Ekman) emotion, and the most frequent primary emotion is returned as its
    position in ``primary_emotion_to_sub_emotions``.
    """
    primary_names = list(primary_emotion_to_sub_emotions)
    votes = Counter(
        primary_names.index(sub_emotion_to_primary_emotion[classes[label]])
        for label in labels
    )
    winner, _ = votes.most_common(1)[0]
    return winner

In [None]:
eda_augmenter = EasyDataAugmenter()

def balanced_augment(dataset):
    """Augment under-represented primary-emotion classes up to a quarter of the largest class.

    Args:
        dataset: A ``datasets.Dataset`` with ``'text'``, ``'labels'`` and ``'id'`` columns.

    Returns:
        ``(merged_dataset, dataset)``: the original samples followed by the augmented
        ones, and the untouched input dataset.

    Notes:
        ``eda_augmenter.augment`` returns a list of variants per call, so the number of
        variants kept per weak class is capped at ``target_count - count``. Without the
        cap the weak classes overshoot the target.
    """
    primary_names = list(primary_emotion_to_sub_emotions.keys())

    # Resolve every sample's primary label once; reused for counting and for filtering
    texts = dataset['text']
    sample_labels = [primary_label_pipeline(label) for label in dataset['labels']]

    unbalanced_train_counter = Counter(sample_labels)
    print(f'{unbalanced_train_counter=}')
    if not unbalanced_train_counter:
        # Empty dataset: nothing to balance
        return dataset, dataset
    target_count = max(unbalanced_train_counter.values()) // 4

    augmented_items = []

    for label, count in unbalanced_train_counter.items():
        if count >= target_count:
            continue

        print(f'Augmenting {label=}: {count}/{target_count}')
        # Texts of the weak class
        weak_class_samples = [text for text, sample_label in zip(texts, sample_labels) if sample_label == label]

        # Number of new samples needed to reach the target
        num_augmentations = target_count - count

        # Augmented samples are labelled with the sub-emotion named like the primary emotion
        fine_label = classes.index(primary_names[label])

        class_items = []
        for step in tqdm(range(num_augmentations)):
            remaining = num_augmentations - len(class_items)
            if remaining <= 0:
                break
            sample_to_augment = weak_class_samples[step % len(weak_class_samples)]
            # Keep only as many variants as are still needed
            augmented_samples = eda_augmenter.augment(sample_to_augment)[:remaining]
            class_items.extend(
                {'text': aug_text, 'labels': [fine_label], 'id': None}
                for aug_text in augmented_samples
            )
        augmented_items.extend(class_items)

    if not augmented_items:
        # No class is below the target; an empty augmented dataset cannot be cast, so skip merging
        balanced_train_counter = unbalanced_train_counter
        print(f'{balanced_train_counter=}')
        return dataset, dataset

    # Create a new dataset from the augmented items
    augmented_dataset = datasets.Dataset.from_list(augmented_items).cast(dataset.features)

    # Concatenate the original dataset with the augmented dataset
    merged_dataset = concatenate_datasets([dataset, augmented_dataset])

    # Check the new label distribution
    balanced_train_counter = Counter([primary_label_pipeline(label) for label in merged_dataset['labels']])
    print(f'{balanced_train_counter=}')

    return merged_dataset, dataset

In [None]:
dataset = load_dataset("go_emotions")

print("========= Augmenting Train Dataset =========")
aug_train, train = balanced_augment(dataset['train'])
dataset['train'] = aug_train

# print("========= Augmenting Validation Dataset =========")
# aug_validation, validation = balanced_augment(dataset['validation'])
# dataset['validation'] = aug_validation

# print("========= Augmenting Test Dataset =========")
# aug_test, test = balanced_augment(dataset['test'])
# dataset['test'] = aug_test

# if not os.path.isdir('datasets'):
#     os.mkdir('datasets')
# dataset.save_to_disk('./datasets/')

### Data Tokenization and Vectorization

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def yield_tokens(data_iter):
    """Lazily yield the tokenizer's token list for the ``'text'`` field of each sample."""
    yield from (tokenizer.tokenize(sample['text']) for sample in data_iter)

# Vocabulary over the training tokens; unknown tokens resolve to the "<unk>" index
vocab = build_vocab_from_iterator(
    yield_tokens(iter(dataset['train'])),
    specials=["<unk>", "<pad>"],
)
vocab.set_default_index(vocab["<unk>"])

In [None]:
bert_model = BertModel.from_pretrained('bert-base-uncased').to(device)

# Freeze BERT parameters
for bert_param in bert_model.parameters():
    bert_param.requires_grad = False

# One row per vocabulary entry, one column per BERT hidden unit
embedding_dim = bert_model.config.hidden_size
vocab_size = len(vocab)
embedding_matrix = torch.zeros(vocab_size, embedding_dim)

# Encode every vocabulary token on its own and store the mean of its output vectors
for token, idx in tqdm(vocab.get_stoi().items()):
    inputs = tokenizer(token, return_tensors='pt').to(device)
    embeddings = bert_model(**inputs).last_hidden_state
    # Average over the sequence axis, then drop the batch axis
    token_vector = embeddings.mean(dim=1).squeeze()
    embedding_matrix[idx] = token_vector

### DataLoaders

In [None]:
def text_pipeline(x):
    """Tokenize a raw string and map its tokens to vocabulary indices."""
    return vocab(tokenizer.tokenize(x))

# Every split reduces a sample's label list to one primary-emotion index
label_pipeline = {
    split_name: (lambda x: primary_label_pipeline(x))
    for split_name in ('train', 'validation', 'test')
}

In [None]:
max_seq_len = 30

def collate_batch(batch, split):
    """Turn a list of dataset rows into padded token-id and label tensors.

    Args:
        batch: List of row dicts providing at least the ``'text'`` and ``'labels'`` keys.
        split: Key into ``label_pipeline`` selecting the label transform.

    Returns:
        ``(labels, token_ids)`` on ``device``: an int64 tensor with one label per row and
        an int64 tensor of shape ``(len(batch), max_seq_len)``.
    """
    # Looked up once per batch instead of once per sample
    pad_index = vocab['<pad>']

    label_list, text_tokenized_list = [], []
    for data in batch:
        # Access fields by name so column order or extra columns cannot break unpacking
        label_list.append(label_pipeline[split](data['labels']))

        # Truncate to max_seq_len, then right-pad with the pad index
        token_ids = text_pipeline(data['text'])[:max_seq_len]
        token_ids = token_ids + [pad_index] * (max_seq_len - len(token_ids))
        text_tokenized_list.append(torch.tensor(token_ids, dtype=torch.int64))

    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_tokenized_list = torch.stack(text_tokenized_list, dim=0)
    return label_list.to(device), text_tokenized_list.to(device)

In [None]:
batch_size = 16

def _make_dataloader(split, shuffle):
    """Build the DataLoader of one dataset split, collating with that split's label pipeline."""
    return DataLoader(
        dataset[split],
        batch_size=batch_size,
        shuffle=shuffle,
        collate_fn=lambda batch: collate_batch(batch, split))

# Only the training split is shuffled
train_dataloader = _make_dataloader('train', shuffle=True)
valid_dataloader = _make_dataloader('validation', shuffle=False)
test_dataloader = _make_dataloader('test', shuffle=False)

### Statistics

In [None]:
primary_emotions = list(primary_emotion_to_sub_emotions.keys())

def calc_primary_emotions_distribution(dataloader, split):
    """Collect every label a dataloader yields and plot the primary-emotion distribution.

    Args:
        dataloader: Iterable of ``(labels, text)`` batches, ``labels`` being a tensor.
        split: Split name forwarded to ``disp_labels_distribution`` for the plot title.

    Returns:
        Whatever ``disp_labels_distribution`` returns for the collected labels.
    """
    # A plain list avoids re-copying the array for every batch (np.append) and keeps
    # the labels as integers instead of casting them to float.
    total_labels = []
    for labels, _ in dataloader:
        total_labels.extend(labels.cpu().tolist())

    return disp_labels_distribution(total_labels, split, classes=primary_emotions)

total_sum_train = calc_primary_emotions_distribution(train_dataloader, 'train')
total_sum_validation = calc_primary_emotions_distribution(valid_dataloader, 'validation')
total_sum_test = calc_primary_emotions_distribution(test_dataloader, 'test')


## <img src="https://img.icons8.com/?size=100&id=114910&format=png&color=000000" style="height:50px;display:inline"> Training Functions
---

In [None]:
def clip_gradient(model, clip_value):
    """Clamp, in place, every existing parameter gradient of ``model`` to ``[-clip_value, clip_value]``.

    Parameters whose ``grad`` is ``None`` are skipped.
    """
    for param in model.parameters():
        if param.grad is None:
            continue
        param.grad.data.clamp_(-clip_value, clip_value)

In [None]:
def count_layers_and_parameters(dummy_model):
    """Print the number of leaf modules and of trainable parameters of ``dummy_model``."""
    def count_layers(module):
        # A module without children is a leaf and counts as one layer
        children = list(module.children())
        if not children:
            return 1
        return sum(map(count_layers, children))

    num_layers = count_layers(dummy_model)

    # Only parameters that receive gradients are counted
    num_params = sum(param.numel() for param in dummy_model.parameters() if param.requires_grad)
    print(f"{type(dummy_model).__name__}: {num_layers=} {num_params=}")

In [None]:
def calc_model_size(dummy_model):
    """Print the combined size of the parameters and buffers of ``dummy_model`` in MB."""
    def total_bytes(tensors):
        # element count times bytes per element, summed over all tensors
        return sum(tensor.nelement() * tensor.element_size() for tensor in tensors)

    param_size = total_bytes(dummy_model.parameters())
    buffer_size = total_bytes(dummy_model.buffers())
    size_all_mb = (param_size + buffer_size) / 1024 ** 2
    print(f"{type(dummy_model).__name__} size: {size_all_mb:.2f} MB")

In [None]:
def calculate_accuracy(model, dataloader):
    """Evaluate ``model`` on ``dataloader``.

    Returns:
        ``(accuracy_percent, confusion_matrix)`` where the confusion matrix is indexed
        as ``[true label, predicted label]`` over the primary emotions.
    """
    model.eval()
    num_emotions = len(primary_emotions)
    confusion_matrix = np.zeros([num_emotions, num_emotions], int)
    total_correct, total_images = 0, 0

    with torch.no_grad():
        for labels, text in dataloader:
            outputs = model(text)
            predicted = torch.max(outputs.data, 1).indices
            total_images += labels.size(0)
            total_correct += (predicted == labels).sum().item()
            for true_idx, pred_idx in zip(labels.tolist(), predicted.tolist()):
                confusion_matrix[true_idx, pred_idx] += 1

    model_accuracy = total_correct / total_images * 100
    return model_accuracy, confusion_matrix


def train_model(model, train_dataloader, valid_dataloader, learning_rate, step_size, num_epochs):
    """Train ``model`` with RMSprop, cross-entropy loss and a StepLR schedule.

    Args:
        model: Network mapping a batch of token ids to class logits.
        train_dataloader: Yields ``(labels, text)`` training batches.
        valid_dataloader: Yields ``(labels, text)`` validation batches.
        learning_rate: Initial learning rate of the optimizer.
        step_size: ``step_size`` of the ``StepLR`` scheduler, in epochs.
        num_epochs: Number of epochs to run.

    Returns:
        ``(train_losses, train_accuracies, validation_accuracies, train_cm, valid_cm)``.
        The three lists hold one entry per epoch. The confusion matrices are those of
        the last epoch, or ``None`` when ``num_epochs`` is 0.

    Side effects:
        Saves a checkpoint to ``./checkpoints/<ModelClass>_ckpt.pth`` every 10th epoch.
    """
    optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size)

    train_losses = []
    train_accuracies = []
    validation_accuracies = []
    # Defined up front so the return statement is valid even when no epoch runs
    train_cm, valid_cm = None, None

    for epoch in range(1, num_epochs + 1):
        model.train()
        running_loss = 0.0
        epoch_time = time.time()

        for labels, text in train_dataloader:
            outputs = model(text)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            # Element-wise gradient clamp to [-0.1, 0.1] before the update
            clip_gradient(model, 1e-1)
            optimizer.step()

            running_loss += loss.item()

        # Mean loss per batch
        running_loss /= len(train_dataloader)

        train_accuracy, train_cm = calculate_accuracy(model, train_dataloader)
        validation_accuracy, valid_cm = calculate_accuracy(model, valid_dataloader)
        print(f'Epoch [{epoch:2}/{num_epochs}] | Loss: {running_loss:.6f} | Training Accuracy: {train_accuracy:.4f}% | Validation Accuracy: {validation_accuracy:.4f}% | Time: {time.time() - epoch_time:.2f}s | Learning Rate: {scheduler.get_last_lr()}')

        scheduler.step()

        train_losses.append(running_loss)
        train_accuracies.append(train_accuracy)
        validation_accuracies.append(validation_accuracy)

        if epoch % 10 == 0:
            print('==> Saving model ...')
            state = {
                'net': model.state_dict(),
                'epoch': epoch,
            }
            # exist_ok avoids the check-then-create race of isdir() + mkdir()
            os.makedirs('checkpoints', exist_ok=True)
            torch.save(state, f'./checkpoints/{type(model).__name__}_ckpt.pth')

    return train_losses, train_accuracies, validation_accuracies, train_cm, valid_cm


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

def plot_statistics(train_losses, train_accuracies, validation_accuracies, train_cm, valid_cm):
    """Plot the training curves and the train/validation confusion matrices.

    Args:
        train_losses: Training loss per epoch.
        train_accuracies: Training accuracy per epoch.
        validation_accuracies: Validation accuracy per epoch.
        train_cm: Confusion matrix of the training data.
        valid_cm: Confusion matrix of the validation data.
    """
    epochs = range(1, len(train_losses) + 1)

    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: loss curve
    loss_ax.plot(epochs, train_losses, label='Training Loss')
    loss_ax.set_xlabel('Epochs')
    loss_ax.set_ylabel('Loss')
    loss_ax.set_title('Training Loss Over Epochs')
    loss_ax.legend()

    # Right panel: accuracy curves
    acc_ax.plot(epochs, train_accuracies, label='Training Accuracy', color='orange')
    acc_ax.plot(epochs, validation_accuracies, label='Validation Accuracy', color='green')
    acc_ax.set_xlabel('Epochs')
    acc_ax.set_ylabel('Accuracy')
    acc_ax.set_title('Training Accuracy Over Epochs')
    acc_ax.legend()

    fig.tight_layout()
    plt.show()

    # One confusion-matrix figure per split, labelled with the primary emotions
    matrices = (
        (train_cm, 'Training Confusion Matrix'),
        (valid_cm, 'Validation Confusion Matrix'),
    )
    for cm, title in matrices:
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=primary_emotions)
        disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical')
        plt.title(title)
        plt.show()

## <img src="https://img.icons8.com/cute-clipart/64/000000/horizontal-settings-mixer.png" style="height:50px;display:inline"> Hyper Parameters
---

In [None]:
# These are parameters not tuned by Optuna later

# Constants
# NOTE(review): this is the size of the BERT tokenizer's vocabulary, whereas the token
# ids fed to the models come from the torchtext `vocab` built on the training split.
# Confirm that the two sizes are meant to differ.
vocab_dim = len(tokenizer.get_vocab())
# One output class per primary (Ekman) emotion
num_classes = len(primary_emotion_to_sub_emotions.keys())
# Epochs used for the final training runs and by the Optuna objective
num_epochs = 20

# Scheduler
# Passed to train_model, which forwards it to StepLR as its step_size
step_size = 10

## <img src="https://img.icons8.com/?size=100&id=Y6kSC37ALOtM&format=png&color=000000" style="height:50px;display:inline"> Models
---

### B-LSTM Model

In [None]:
class GoEmotions_LSTM(nn.Module):
    """(Bi-)LSTM classifier over frozen pretrained token embeddings.

    Pipeline: embedding lookup -> optional linear projection to ``embedding_dim`` ->
    LayerNorm -> multi-layer LSTM -> final hidden state of the last layer -> dense head.

    Args:
        vocab_dim: Number of embedding rows; only used when ``embedding_weights`` is None.
        embedding_dim: Feature size fed to the LayerNorm and the LSTM.
        embedding_weights: Pretrained ``(num_tokens, pretrained_dim)`` matrix, loaded
            frozen. Token ids must be smaller than ``num_tokens``. When
            ``pretrained_dim`` differs from ``embedding_dim`` a trainable linear
            projection bridges the two. ``None`` gives a randomly initialised,
            trainable embedding.
        lstm_hidden_dim: Hidden size of the LSTM.
        lstm_num_layers: Number of stacked LSTM layers.
        lstm_dropout: Dropout between LSTM layers.
        bi_directional: Whether the LSTM is bidirectional.
        dense_hidden_dims: Sizes of the hidden dense layers (may be empty).
        dense_dropouts: Dropout probability after each hidden dense layer.
        num_classes: Number of output logits.
    """

    def __init__(self,
                 # Vocab
                 vocab_dim,
                 # Embedding
                 embedding_dim,
                 embedding_weights,
                 # LSTM
                 lstm_hidden_dim,
                 lstm_num_layers,
                 lstm_dropout,
                 bi_directional,
                 # Dense
                 dense_hidden_dims,
                 dense_dropouts,
                 # Output
                 num_classes):
        super(GoEmotions_LSTM, self).__init__()

        # Vocab
        self.vocab_dim = vocab_dim
        # Embedding
        self.embedding_dim = embedding_dim
        self.embedding_weights = embedding_weights
        # LSTM
        self.lstm_hidden_dim = lstm_hidden_dim
        self.lstm_num_layers = lstm_num_layers
        self.lstm_dropout = lstm_dropout
        self.bi_directional = bi_directional
        self.num_directions = 2 if bi_directional else 1
        # Dense
        self.dense_hidden_dims = dense_hidden_dims
        self.dense_dropouts = dense_dropouts
        self.dense_input_dim = self.num_directions * self.lstm_hidden_dim
        # Output
        self.num_classes = num_classes

        # Embedding: load the pretrained matrix into the real `weight` parameter.
        # Assigning to a differently named attribute would leave the lookup table random.
        if embedding_weights is not None:
            pretrained = torch.as_tensor(embedding_weights, dtype=torch.float).detach().clone()
            self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
            pretrained_dim = pretrained.size(1)
        else:
            self.embedding = nn.Embedding(self.vocab_dim, self.embedding_dim)
            pretrained_dim = self.embedding_dim

        # Bridge the pretrained width to the configured embedding_dim when they differ
        if pretrained_dim != self.embedding_dim:
            self.embedding_projection = nn.Linear(pretrained_dim, self.embedding_dim)
        else:
            self.embedding_projection = nn.Identity()

        self.LayerNorm = nn.LayerNorm(embedding_dim, eps=1e-12, elementwise_affine=True)

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_hidden_dim,
            num_layers=self.lstm_num_layers,
            batch_first=True,
            dropout=self.lstm_dropout,
            bidirectional=self.bi_directional)

        # Dense head: (Linear -> ReLU -> Dropout) per hidden size, then the output layer
        in_features = self.dense_input_dim
        layers = []
        for out_features, p in zip(self.dense_hidden_dims, self.dense_dropouts):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.Dropout(p))
            in_features = out_features

        # in_features is valid even when there are no hidden dense layers
        layers.append(nn.Linear(in_features, self.num_classes))
        self.dense_layer = nn.Sequential(*layers)

        # Kept for backward compatibility; forward() follows the device of its input
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.init_weights()

    def init_weights(self):
        """Xavier-normal initialise every Linear weight and zero every Linear bias."""
        # pick initialization: https://pytorch.org/docs/stable/nn.init.html
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.xavier_normal_(m.weight, gain=1.0)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, num_classes)`` logits."""
        batch = x.size(0)
        state_shape = (self.lstm_num_layers * self.num_directions, batch, self.lstm_hidden_dim)
        h0 = torch.zeros(state_shape, device=x.device)
        c0 = torch.zeros(state_shape, device=x.device)

        # Random initial states are used while training only, so that evaluation is
        # deterministic; in eval mode the states stay at zero.
        if self.training:
            torch.nn.init.xavier_normal_(h0)
            torch.nn.init.xavier_normal_(c0)

        x = self.embedding(x)
        x = self.embedding_projection(x)
        x = self.LayerNorm(x)

        # From: https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        #
        # For bidirectional LSTMs, h_n is not equivalent to the last element of output; the former
        # contains the final forward and reverse hidden states, while the latter contains the final
        # forward hidden state and the initial reverse hidden state.
        _, (h, _) = self.lstm(x, (h0, c0))
        if self.bi_directional:
            # (layers * 2, batch, hidden) -> last layer's (2, batch, hidden) -> (batch, 2 * hidden)
            h = h.view(self.lstm_num_layers, 2, batch, self.lstm_hidden_dim)[-1]
            h = h.transpose(0, 1).reshape(batch, 2 * self.lstm_hidden_dim)
        else:
            h = h[-1]

        return self.dense_layer(h)

### xLSTM Model

### GRU Model

In [None]:
class GoEmotions_GRU(nn.Module):
    """(Bi-)GRU classifier over frozen pretrained token embeddings.

    Pipeline: embedding lookup -> optional linear projection to ``embedding_dim`` ->
    LayerNorm -> multi-layer GRU -> final hidden state of the last layer -> dense head.

    Args:
        vocab_dim: Number of embedding rows; only used when ``embedding_weights`` is None.
        embedding_dim: Feature size fed to the LayerNorm and the GRU.
        embedding_weights: Pretrained ``(num_tokens, pretrained_dim)`` matrix, loaded
            frozen. Token ids must be smaller than ``num_tokens``. When
            ``pretrained_dim`` differs from ``embedding_dim`` a trainable linear
            projection bridges the two. ``None`` gives a randomly initialised,
            trainable embedding.
        gru_hidden_dim: Hidden size of the GRU.
        gru_num_layers: Number of stacked GRU layers.
        gru_dropout: Dropout between GRU layers.
        bi_directional: Whether the GRU is bidirectional.
        dense_hidden_dims: Sizes of the hidden dense layers (may be empty).
        dense_dropouts: Dropout probability after each hidden dense layer.
        num_classes: Number of output logits.
    """

    def __init__(self,
                 # Vocab
                 vocab_dim,
                 # Embedding
                 embedding_dim,
                 embedding_weights,
                 # GRU
                 gru_hidden_dim,
                 gru_num_layers,
                 gru_dropout,
                 bi_directional,
                 # Dense
                 dense_hidden_dims,
                 dense_dropouts,
                 # Output
                 num_classes):
        super(GoEmotions_GRU, self).__init__()

        # Vocab
        self.vocab_dim = vocab_dim
        # Embedding
        self.embedding_dim = embedding_dim
        self.embedding_weights = embedding_weights
        # GRU
        self.gru_hidden_dim = gru_hidden_dim
        self.gru_num_layers = gru_num_layers
        self.gru_dropout = gru_dropout
        self.bi_directional = bi_directional
        self.num_directions = 2 if bi_directional else 1
        # Dense
        self.dense_hidden_dims = dense_hidden_dims
        self.dense_dropouts = dense_dropouts
        self.dense_input_dim = self.num_directions * self.gru_hidden_dim
        # Output
        self.num_classes = num_classes

        # Embedding: load the pretrained matrix into the real `weight` parameter.
        # Assigning to a differently named attribute would leave the lookup table random.
        if embedding_weights is not None:
            pretrained = torch.as_tensor(embedding_weights, dtype=torch.float).detach().clone()
            self.embedding = nn.Embedding.from_pretrained(pretrained, freeze=True)
            pretrained_dim = pretrained.size(1)
        else:
            self.embedding = nn.Embedding(self.vocab_dim, self.embedding_dim)
            pretrained_dim = self.embedding_dim

        # Bridge the pretrained width to the configured embedding_dim when they differ
        if pretrained_dim != self.embedding_dim:
            self.embedding_projection = nn.Linear(pretrained_dim, self.embedding_dim)
        else:
            self.embedding_projection = nn.Identity()

        self.LayerNorm = nn.LayerNorm(embedding_dim, eps=1e-12, elementwise_affine=True)

        self.gru = nn.GRU(
            input_size=self.embedding_dim,
            hidden_size=self.gru_hidden_dim,
            num_layers=self.gru_num_layers,
            batch_first=True,
            dropout=self.gru_dropout,
            bidirectional=self.bi_directional)

        # Dense head: (Linear -> ReLU -> Dropout) per hidden size, then the output layer
        in_features = self.dense_input_dim
        layers = []
        for out_features, p in zip(self.dense_hidden_dims, self.dense_dropouts):
            layers.append(nn.Linear(in_features, out_features))
            layers.append(nn.ReLU(inplace=True))
            layers.append(nn.Dropout(p))
            in_features = out_features

        # in_features is valid even when there are no hidden dense layers
        layers.append(nn.Linear(in_features, self.num_classes))
        self.dense_layer = nn.Sequential(*layers)

        # Kept for backward compatibility; forward() follows the device of its input
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.init_weights()

    def init_weights(self):
        """Xavier-normal initialise every Linear weight and zero every Linear bias."""
        # pick initialization: https://pytorch.org/docs/stable/nn.init.html
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.xavier_normal_(m.weight, gain=1.0)
                if m.bias is not None:
                    torch.nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map a ``(batch, seq_len)`` tensor of token ids to ``(batch, num_classes)`` logits."""
        batch = x.size(0)
        h0 = torch.zeros(
            (self.gru_num_layers * self.num_directions, batch, self.gru_hidden_dim),
            device=x.device)

        # Random initial state is used while training only, so that evaluation is
        # deterministic; in eval mode the state stays at zero.
        if self.training:
            torch.nn.init.xavier_normal_(h0)

        x = self.embedding(x)
        x = self.embedding_projection(x)
        x = self.LayerNorm(x)

        # The final hidden state (h) is used rather than the output sequence: for a
        # bidirectional GRU the last output step holds the reverse direction's first
        # step, not its final state.
        _, h = self.gru(x, h0)
        if self.bi_directional:
            # (layers * 2, batch, hidden) -> last layer's (2, batch, hidden) -> (batch, 2 * hidden)
            h = h.view(self.gru_num_layers, 2, batch, self.gru_hidden_dim)[-1]
            h = h.transpose(0, 1).reshape(batch, 2 * self.gru_hidden_dim)
        else:
            h = h[-1]

        return self.dense_layer(h)

## <img src="https://img.icons8.com/color/96/000000/pie-chart--v1.png" style="height:50px;display:inline"> Optuna
---

### Framework

In [None]:
import optuna

# NOTE(review): `objective` loops over `num_epochs`, not `epochs`, and `log_interval`
# is not read anywhere in the visible notebook — confirm whether both are still needed.
epochs = 20
log_interval = 10
# Per-epoch sample budgets that `objective` uses to cut training/validation short
n_train_examples = batch_size * 300
n_valid_examples = batch_size * 100


def define_model(trial, model_type):
    """Sample an architecture from ``trial`` and build the matching bidirectional model.

    Args:
        trial: Optuna trial used to suggest the hyper-parameters.
        model_type: ``'LSTM'`` or ``'GRU'``.

    Returns:
        The constructed model, moved to ``device``.

    Raises:
        ValueError: If ``model_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """
    # Fail before sampling anything; an unknown type used to surface as an UnboundLocalError
    if model_type not in ('LSTM', 'GRU'):
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'LSTM' or 'GRU'")

    embedding_dim = trial.suggest_int("embedding_dim", 32, 256)
    rnn_hidden_dim = trial.suggest_int("rnn_hidden_dim", 32, 256)
    rnn_num_layers = trial.suggest_int("rnn_num_layers", 2, 5)
    rnn_dropout = trial.suggest_float("rnn_dropout", 0.1, 0.5)
    dense_num_layers = trial.suggest_int("n_layers", 1, 4)
    dense_hidden_dims = [trial.suggest_int(f"n_units_l{l}", 4, 128) for l in range(dense_num_layers)]
    dense_dropouts = [trial.suggest_float(f"dropout_l{l}", 0.1, 0.5) for l in range(dense_num_layers)]

    if model_type == 'LSTM':
        model = GoEmotions_LSTM(
            vocab_dim=vocab_dim,
            embedding_dim=embedding_dim,
            embedding_weights=embedding_matrix,
            lstm_hidden_dim=rnn_hidden_dim,
            lstm_num_layers=rnn_num_layers,
            lstm_dropout=rnn_dropout,
            bi_directional=True,
            dense_hidden_dims=dense_hidden_dims,
            dense_dropouts=dense_dropouts,
            num_classes=num_classes
        ).to(device)
    else:
        model = GoEmotions_GRU(
            vocab_dim=vocab_dim,
            embedding_dim=embedding_dim,
            embedding_weights=embedding_matrix,
            gru_hidden_dim=rnn_hidden_dim,
            gru_num_layers=rnn_num_layers,
            gru_dropout=rnn_dropout,
            bi_directional=True,
            dense_hidden_dims=dense_hidden_dims,
            dense_dropouts=dense_dropouts,
            num_classes=num_classes
        ).to(device)

    return model


def objective(trial, model_type):
    """Optuna objective: train a sampled model on a data subset and return validation accuracy.

    The accuracy (in percent) is reported to the trial after every epoch so that
    unpromising trials can be pruned.

    Raises:
        optuna.exceptions.TrialPruned: When the trial asks to be pruned.
    """
    model = define_model(trial, model_type)

    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])
    optimizer_cls = getattr(torch.optim, optimizer_name)
    optimizer = optimizer_cls(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, num_epochs + 1):
        # Training pass, cut off once the sample budget is used up
        model.train()
        for batch_idx, (labels, text) in enumerate(train_dataloader):
            if batch_idx * batch_size >= n_train_examples:
                break

            loss = criterion(model(text), labels)
            optimizer.zero_grad()
            loss.backward()
            clip_gradient(model, 1e-1)
            optimizer.step()

        # Validation pass on a limited number of samples
        model.eval()
        total_correct, total_labels = 0, 0
        with torch.no_grad():
            for batch_idx, (labels, text) in enumerate(valid_dataloader):
                if batch_idx * batch_size >= n_valid_examples:
                    break

                predicted = torch.max(model(text).data, 1).indices
                total_labels += labels.size(0)
                total_correct += (predicted == labels).sum().item()

        model_accuracy = total_correct / total_labels * 100
        trial.report(model_accuracy, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return model_accuracy

### LSTM Study

In [None]:
# Hyper-parameter search for the LSTM model: at most 100 trials or 90 minutes
sampler = optuna.samplers.TPESampler()
study = optuna.create_study(study_name="goemotions-lstm", direction="maximize", sampler=sampler)
study.optimize(lambda optuna_trial: objective(optuna_trial, 'LSTM'), n_trials=100, timeout=60 * 90)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print(f"  Number of finished trials:  {len(study.trials)}")
print(f"  Number of pruned trials:  {len(pruned_trials)}")
print(f"  Number of complete trials:  {len(complete_trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value:  {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_contour(study, params=["embedding_dim", "rnn_dropout", "lr"])

In [None]:
best_lstm_trial = trial

### GRU Study

In [None]:
# Hyper-parameter search for the GRU model: at most 100 trials or 90 minutes
sampler = optuna.samplers.TPESampler()
study = optuna.create_study(study_name="goemotions-gru", direction="maximize", sampler=sampler)
study.optimize(lambda optuna_trial: objective(optuna_trial, 'GRU'), n_trials=100, timeout=60 * 90)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print(f"  Number of finished trials:  {len(study.trials)}")
print(f"  Number of pruned trials:  {len(pruned_trials)}")
print(f"  Number of complete trials:  {len(complete_trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value:  {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
optuna.visualization.plot_param_importances(study)

In [None]:
optuna.visualization.plot_contour(study, params=["embedding_dim", "rnn_dropout", "lr"])

In [None]:
best_gru_trial = trial

## <img src="https://img.icons8.com/?size=100&id=s8cTlBs8lfX0&format=png&color=000000" style="height:50px;display:inline"> Training
---

### LSTM Train

In [None]:
seed = 211
np.random.seed(seed)
torch.manual_seed(seed)

# Rebuild the best architecture found by the LSTM study
lstm_params = best_lstm_trial.params
model_blstm = GoEmotions_LSTM(
    vocab_dim=vocab_dim,
    embedding_dim=lstm_params['embedding_dim'],
    embedding_weights=embedding_matrix,
    lstm_hidden_dim=lstm_params['rnn_hidden_dim'],
    lstm_num_layers=lstm_params['rnn_num_layers'],
    lstm_dropout=lstm_params['rnn_dropout'],
    bi_directional=True,
    dense_hidden_dims=[dim for key, dim in lstm_params.items() if key.startswith('n_units_l')],
    dense_dropouts=[p for key, p in lstm_params.items() if key.startswith('dropout_l')],
    num_classes=num_classes
).to(device)
print(model_blstm)

count_layers_and_parameters(model_blstm)
calc_model_size(model_blstm)

# Save the untrained weights as the epoch-0 checkpoint
state = {
    'net': model_blstm.state_dict(),
    'epoch': 0,
}
os.makedirs('checkpoints', exist_ok=True)
torch.save(state, f'./checkpoints/{type(model_blstm).__name__}_initial_ckpt.pth')

In [None]:
train_losses_lstm, train_accuracies_lstm, validation_accuracies_lstm, train_cm_lstm, valid_cm_lstm = train_model(model_blstm, train_dataloader, valid_dataloader, best_lstm_trial.params['lr'], step_size, num_epochs)

In [None]:
plot_statistics(train_losses_lstm, train_accuracies_lstm, validation_accuracies_lstm, train_cm_lstm, valid_cm_lstm)

### GRU Train

In [None]:
seed = 211
np.random.seed(seed)
torch.manual_seed(seed)

# Rebuild the best architecture found by the GRU study
gru_params = best_gru_trial.params
model_gru = GoEmotions_GRU(
    vocab_dim=vocab_dim,
    embedding_dim=gru_params['embedding_dim'],
    embedding_weights=embedding_matrix,
    gru_hidden_dim=gru_params['rnn_hidden_dim'],
    gru_num_layers=gru_params['rnn_num_layers'],
    gru_dropout=gru_params['rnn_dropout'],
    bi_directional=True,
    dense_hidden_dims=[dim for key, dim in gru_params.items() if key.startswith('n_units_l')],
    dense_dropouts=[p for key, p in gru_params.items() if key.startswith('dropout_l')],
    num_classes=num_classes
).to(device)
print(model_gru)

count_layers_and_parameters(model_gru)
calc_model_size(model_gru)

# Save the untrained weights as the epoch-0 checkpoint
state = {
    'net': model_gru.state_dict(),
    'epoch': 0,
}
os.makedirs('checkpoints', exist_ok=True)
torch.save(state, f'./checkpoints/{type(model_gru).__name__}_initial_ckpt.pth')

In [None]:
train_losses_gru, train_accuracies_gru, validation_accuracies_gru, train_cm_gru, valid_cm_gru = train_model(model_gru, train_dataloader, valid_dataloader, best_gru_trial.params['lr'], step_size, num_epochs)

In [None]:
plot_statistics(train_losses_gru, train_accuracies_gru, validation_accuracies_gru, train_cm_gru, valid_cm_gru)

## <img src="https://img.icons8.com/?size=100&id=HkG28tSEJLgP&format=png&color=000000" style="height:50px;display:inline"> Comparison
---

In [None]:
epochs = range(1, len(train_losses_lstm) + 1)

fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: training loss of both models
for losses, name in ((train_losses_lstm, 'LSTM'), (train_losses_gru, 'GRU')):
    loss_ax.plot(epochs, losses, label=name)
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Loss Over Epochs')
loss_ax.legend()

# Right panel: training and validation accuracy of both models
accuracy_curves = (
    (train_accuracies_lstm, 'LSTM Train'),
    (train_accuracies_gru, 'GRU Train'),
    (validation_accuracies_lstm, 'LSTM Validation'),
    (validation_accuracies_gru, 'GRU Validation'),
)
for accuracies, name in accuracy_curves:
    acc_ax.plot(epochs, accuracies, label=name)
acc_ax.set_xlabel('Epochs')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training Accuracy Over Epochs')
acc_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
import numpy as np
import torch
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_metrics(model, dataloader, primary_emotions):
    """Evaluate ``model`` and report accuracy, confusion matrix and per-class metrics.

    Args:
        model: Network mapping a batch of token ids to class logits.
        dataloader: Yields ``(labels, text)`` batches.
        primary_emotions: Sequence of class names; position ``i`` names label ``i``.

    Returns:
        ``(accuracy_percent, confusion_matrix, df)``. The confusion matrix is indexed
        as ``[true label, predicted label]``. ``df`` holds precision, recall and F1
        per class, followed by a ``macro-average`` and a ``std`` row.

    Raises:
        ValueError: If the dataloader yields no samples.
    """
    model.eval()
    # Accept any sequence (list, tuple, ...) of class names
    emotion_names = list(primary_emotions)
    class_ids = list(range(len(emotion_names)))

    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for labels, text in dataloader:
            outputs = model(text)
            _, predicted = torch.max(outputs.data, 1)
            all_labels.extend(labels.cpu().tolist())
            all_predictions.extend(predicted.cpu().tolist())

    total_images = len(all_labels)
    if total_images == 0:
        raise ValueError("dataloader yielded no samples; cannot compute metrics")

    confusion_matrix = np.zeros([len(emotion_names), len(emotion_names)], int)
    for true_idx, pred_idx in zip(all_labels, all_predictions):
        confusion_matrix[true_idx, pred_idx] += 1

    # Correct predictions sit on the diagonal of the confusion matrix
    total_correct = int(np.trace(confusion_matrix))
    model_accuracy = total_correct / total_images * 100

    # Per-class scores; zero_division=0 keeps the 0.0 result for classes that are never
    # predicted or never present, without emitting a warning per call
    metric_kwargs = dict(average=None, labels=class_ids, zero_division=0)
    precision = precision_score(all_labels, all_predictions, **metric_kwargs)
    recall = recall_score(all_labels, all_predictions, **metric_kwargs)
    f1 = f1_score(all_labels, all_predictions, **metric_kwargs)

    # Each column: per-class values, then their mean (macro-average) and standard deviation
    def with_summary(scores):
        return np.append(scores, [np.mean(scores), np.std(scores)])

    data = {
        'Ekman Emotion': emotion_names + ['macro-average', 'std'],
        'Precision': with_summary(precision),
        'Recall': with_summary(recall),
        'F1': with_summary(f1)
    }

    df = pd.DataFrame(data)

    return model_accuracy, confusion_matrix, df

In [None]:
model_accuracy_blstm, confusion_matrix_blstm, metrics_df_blstm = calculate_metrics(model_blstm, test_dataloader, primary_emotions)
print(f'{model_accuracy_blstm=}')
print(metrics_df_blstm)

In [None]:
model_accuracy_gru, confusion_matrix_gru, metrics_df_gru = calculate_metrics(model_gru, test_dataloader, primary_emotions)
print(f'{model_accuracy_gru=}')
print(metrics_df_gru)