In [None]:
!pip install datasets

import time
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
from datasets import load_dataset
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    multilabel_confusion_matrix,
)
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
from transformers import BertTokenizer

In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Report which device was selected (printed as `device=...`).
print(f'{device=}')

In [None]:
# Load the GoEmotions dataset through the `datasets` library. No configuration
# name is passed, so whatever the library treats as the default is used.
# The rest of the notebook reads the 'train', 'validation' and 'test' splits
# and the 'text' and 'labels' fields of each entry.
dataset = load_dataset("go_emotions")

In [None]:
# Fine-grained emotion names. The position in this list is used as the integer
# label id when decoding dataset labels (`classes[label]`).
classes = [
    'admiration', 'amusement', 'anger', 'annoyance',
    'approval', 'caring', 'confusion', 'curiosity',
    'desire', 'disappointment', 'disapproval', 'disgust',
    'embarrassment', 'excitement', 'fear', 'gratitude',
    'grief', 'joy', 'love', 'nervousness',
    'optimism', 'pride', 'realization', 'relief',
    'remorse', 'sadness', 'surprise', 'neutral',
]

# Coarse (primary) emotion -> fine-grained emotions it groups together.
# Key order matters: it defines the index of each primary emotion in the
# multi-hot label vectors built later in the notebook.
primary_emotion_to_sub_emotions = {
    "anger": ["anger", "annoyance", "disapproval"],
    "disgust": ["disgust"],
    "fear": ["fear", "nervousness"],
    "joy": [
        "joy", "amusement", "approval", "excitement", "gratitude", "love",
        "optimism", "relief", "pride", "admiration", "desire", "caring",
    ],
    "sadness": ["sadness", "disappointment", "embarrassment", "grief", "remorse"],
    "surprise": ["surprise", "realization", "confusion", "curiosity"],
    "neutral": ["neutral"],
}

# Inverse lookup: fine-grained emotion name -> its primary emotion.
sub_emotion_to_primary_emotion = dict(
    (sub_emotion, primary_emotion)
    for primary_emotion, sub_emotions in primary_emotion_to_sub_emotions.items()
    for sub_emotion in sub_emotions
)

In [None]:
# Flatten the per-entry label lists of each split into one list of label ids
# (an entry with several labels contributes several items).
raw_train_labels, raw_validation_labels, raw_test_labels = (
    [label for entry in dataset[split_name] for label in entry['labels']]
    for split_name in ('train', 'validation', 'test')
)

In [None]:
def calc_primary_class_weights(labels):
    """Compute inverse-frequency weights per primary emotion.

    Args:
        labels: Iterable of fine-grained label ids (indices into `classes`).

    Returns:
        Dict mapping each primary emotion present in `labels` to
        ``total_label_count / count_for_that_primary_emotion``.
    """
    # Fold fine-grained counts into their primary emotion.
    primary_counts = Counter()
    for sub_emotion_idx, occurrences in Counter(labels).items():
        primary_name = sub_emotion_to_primary_emotion[classes[sub_emotion_idx]]
        primary_counts[primary_name] += occurrences

    # Rarer primary emotions get proportionally larger weights.
    total_primary_labels = sum(primary_counts.values())
    primary_class_weights = {}
    for primary_name, occurrences in primary_counts.items():
        primary_class_weights[primary_name] = total_primary_labels / occurrences
    return primary_class_weights

# Inverse-frequency primary-emotion weights, computed separately per split.
primary_class_weights = {
    split_name: calc_primary_class_weights(split_labels)
    for split_name, split_labels in (
        ('train', raw_train_labels),
        ('validation', raw_validation_labels),
        ('test', raw_test_labels),
    )
}

In [None]:
def disp_labels_distribution(labels, split, classes=classes):
    """Print label counts and draw a horizontal bar chart of the class distribution.

    Args:
        labels: Flat iterable of integer label ids (indices into `classes`).
        split: Split name, used only in the plot title.
        classes: Class names; position `i` names label id `i`.
    """
    label_counts = Counter(labels)
    print(f'{label_counts.most_common()=}')

    total_labels = sum(label_counts.values())
    print(f'{total_labels=}')

    # One count per class, in class-index order (0 for classes that never occur).
    counts = []
    for class_idx in range(len(classes)):
        counts.append(label_counts[class_idx])

    fig, ax = plt.subplots(figsize=(8, 8))
    bars = ax.barh(classes, counts)
    ax.set_title(f'Class Distribution in GoEmotions {split} Data')
    ax.set_xlabel('Count')

    # Annotate each bar with its share of all labels, placed just past the bar end.
    text_offset = total_labels * 0.005
    for bar in bars:
        width = bar.get_width()
        y_center = bar.get_y() + bar.get_height()/2
        ax.text(width + text_offset, y_center, f'{(width/total_labels)*100:.2f}%', va='center')

    plt.show()

In [None]:
# Print the counts and plot the fine-grained class distribution of each split.
disp_labels_distribution(raw_train_labels, 'Train')
disp_labels_distribution(raw_validation_labels, 'Validation')
disp_labels_distribution(raw_test_labels, 'Test')

In [None]:
# Number of entries in each split.
print(f"Train Size: {len(dataset['train'])} | Valid Size: {len(dataset['validation'])} | Test Size: {len(dataset['test'])}")

In [None]:
# Pretrained 'bert-base-uncased' tokenizer. In this notebook it is only used to
# split text into tokens (`tokenizer.tokenize`); token ids come from the
# separately built `vocab`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Quick sanity check of the tokenizer output.
tokenizer.tokenize("here is an example")

In [None]:
def yield_tokens(data_iter):
    """Yield the token list of each entry's 'text' field.

    Args:
        data_iter: Iterable of mapping-like entries that each have a 'text' key.

    Yields:
        The result of `tokenizer.tokenize` for each entry, in input order.
    """
    yield from (tokenizer.tokenize(entry['text']) for entry in data_iter)


# Build the token -> id vocabulary from the training split only, with two
# special tokens. NOTE(review): the model hard-codes padding_idx=1, which
# presumably relies on the specials being inserted first in the order given
# here ("<unk>" -> 0, "<pad>" -> 1) — confirm against torchtext's behaviour.
vocab = build_vocab_from_iterator(yield_tokens(iter(dataset['train'])), specials=["<unk>", "<pad>"])
# Tokens that are not in the vocabulary map to the "<unk>" id.
vocab.set_default_index(vocab["<unk>"])
# Sanity check: ids for a few example tokens.
print(vocab(['here', 'is', 'an', 'example']))

In [None]:
def text_pipeline(x):
    """Tokenize the raw text `x` and map each token to its id in `vocab`."""
    tokens = tokenizer.tokenize(x)
    return vocab(tokens)

def gen_probability_vector_for_primary_labels(labels, split):
    """Build a multi-hot vector over primary emotions for one entry.

    Args:
        labels: Fine-grained label ids (indices into `classes`) of the entry.
        split: Split name; accepted for interface symmetry but not used.

    Returns:
        NumPy float vector with one slot per primary emotion (in the key order
        of `primary_emotion_to_sub_emotions`); a slot is 1 when at least one of
        the entry's labels maps to that primary emotion, else 0.
    """
    primary_emotions = list(primary_emotion_to_sub_emotions.keys())
    label_prob_vec = np.zeros(len(primary_emotions))

    for label in labels:
        primary_name = sub_emotion_to_primary_emotion[classes[label]]
        # Repeated primaries simply set the same slot again.
        label_prob_vec[primary_emotions.index(primary_name)] = 1

    return label_prob_vec

# Per-split label encoders: each maps an entry's fine-grained label ids to a
# multi-hot primary-emotion vector. The split name is forwarded to
# gen_probability_vector_for_primary_labels, which does not use it, so all
# three entries currently behave identically.
label_pipeline = {
    'train': lambda x: gen_probability_vector_for_primary_labels(x, 'train'),
    'validation': lambda x: gen_probability_vector_for_primary_labels(x, 'validation'),
    'test': lambda x: gen_probability_vector_for_primary_labels(x, 'test')
}

In [None]:
# Every text is truncated or right-padded to exactly this many token ids.
max_seq_len = 200

def collect_batch(batch, split):
    """Collate dataset entries into fixed-length tensors on `device`.

    Args:
        batch: Sequence of dataset entries; each must provide 'text' and
            'labels' fields.
        split: Key into `label_pipeline` ('train', 'validation' or 'test').

    Returns:
        Tuple ``(labels, texts)``:
        - labels: int64 tensor, one multi-hot primary-emotion row per entry.
        - texts: int64 tensor of shape (len(batch), max_seq_len) holding token
          ids, truncated to `max_seq_len` and right-padded with the "<pad>" id.
    """
    # Look these up once per batch instead of once per sample.
    pad_index = vocab['<pad>']
    encode_labels = label_pipeline[split]

    label_rows, text_rows = [], []
    for data in batch:
        # Access fields by name so the result does not depend on column order.
        label_rows.append(encode_labels(data['labels']))
        token_ids = text_pipeline(data['text'])[:max_seq_len]
        token_ids = token_ids + [pad_index] * (max_seq_len - len(token_ids))
        text_rows.append(token_ids)

    # Stack in NumPy first: building a tensor from a list of ndarrays is slow.
    label_list = torch.from_numpy(np.stack(label_rows)).to(torch.int64)
    text_list = torch.tensor(text_rows, dtype=torch.int64)
    return label_list.to(device), text_list.to(device)

In [None]:
# Mini-batch size shared by all three loaders.
batch_size = 32
# Only the training loader shuffles. Each collate_fn binds its split name and
# delegates to collect_batch, which also moves the tensors to `device`.
train_dataloader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=True, collate_fn=lambda batch: collect_batch(batch, 'train'))
valid_dataloader = DataLoader(dataset['validation'], batch_size=batch_size, shuffle=False, collate_fn=lambda batch: collect_batch(batch, 'validation'))
test_dataloader = DataLoader(dataset['test'], batch_size=batch_size, shuffle=False, collate_fn=lambda batch: collect_batch(batch, 'test'))

In [None]:
# Primary emotion names, in the index order used by the multi-hot label vectors.
primary_emotions = list(primary_emotion_to_sub_emotions)

def calc_primary_emotions_distribution(dataloader, split):
    """Count primary-emotion labels over a dataloader and plot their shares.

    Args:
        dataloader: Iterable yielding ``(labels, text)`` batches, where
            `labels` holds one multi-hot row per sample.
        split: Split name, used only in the plot title.

    Returns:
        Tensor on `device` with the number of positive labels per primary emotion.
    """
    total_sum = torch.zeros(len(primary_emotions)).to(device)
    for labels, _text in dataloader:
        # Column sums give the number of positives per primary emotion in the batch.
        total_sum += torch.sum(labels, dim=0)

    counts_np = total_sum.cpu().numpy()
    percentages = (counts_np / counts_np.sum()) * 100

    plt.figure(figsize=(10, 6))
    plt.barh(primary_emotions, percentages, color='blue')
    plt.title(f'Percentage of Each Primary Emotion for {split} data')
    plt.xlabel('Percentage')
    plt.ylabel('Primary Emotions')
    plt.xticks(rotation=45)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()
    return total_sum

# Per-split counts of positive primary-emotion labels (each call also shows a plot).
total_sum_train = calc_primary_emotions_distribution(train_dataloader, 'train')
total_sum_validation = calc_primary_emotions_distribution(valid_dataloader, 'validation')
total_sum_test = calc_primary_emotions_distribution(test_dataloader, 'test')

In [None]:
print(f'{total_sum_train=}\n{total_sum_validation=}\n{total_sum_test=}')

# Overall positive counts across all splits (kept for reporting only).
class_count = total_sum_train + total_sum_validation + total_sum_test

# Loss weights must come from the training split alone; using validation/test
# counts would leak held-out statistics into training.
# For BCEWithLogitsLoss, pos_weight for a class is (#negative samples) /
# (#positive samples). The labels are multi-hot, so the number of negatives is
# the number of training samples minus the positives of that class, not the
# total label count minus the positives.
num_train_samples = len(dataset['train'])
# The small epsilon guards against division by zero for a class with no positives.
pos_weight = ((num_train_samples - total_sum_train) / (total_sum_train + 1e-5)).to(device)
print(f"{pos_weight=}")

In [None]:
class GoEmotions_BLSTM(nn.Module):
    """Bidirectional, multi-layer LSTM classifier over token-id sequences.

    Data flow: token ids -> embedding -> LayerNorm -> bidirectional LSTM ->
    dropout -> two-layer dense head applied to the LSTM output at the last
    time step. The head returns raw logits (no sigmoid is applied here).

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        lstm_hidden_size: Hidden size of each LSTM direction.
        lstm_layers: Number of stacked LSTM layers.
        dense_hidden_size: Width of the hidden layer in the dense head.
        output_dim: Number of output logits.
    """

    def __init__(self, vocab_size, embedding_dim=64, lstm_hidden_size=64, lstm_layers=10, dense_hidden_size=64, output_dim=len(primary_emotions)):
        super(GoEmotions_BLSTM, self).__init__()

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_hidden_size = lstm_hidden_size
        self.lstm_layers = lstm_layers
        self.dense_hidden_size = dense_hidden_size
        self.output_dim = output_dim

        # Index 1 is treated as the padding token id.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)
        self.LayerNorm = nn.LayerNorm(embedding_dim, eps=1e-12)

        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True)

        self.dropout = nn.Dropout(0.1)

        # The LSTM is bidirectional, so its per-step output has 2 * hidden features.
        self.dense_layer = nn.Sequential(
            nn.Linear(2 * lstm_hidden_size, dense_hidden_size),
            nn.ReLU(inplace=True),
            nn.Dropout(0.1),
            nn.Linear(dense_hidden_size, output_dim),
        )

        # Kept for backward compatibility; forward() derives the device from
        # its input so the module also works after being moved elsewhere.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.init_weights()


    def init_weights(self):
        """Initialise Linear/Embedding weights from N(0, 0.02) and reset LayerNorm."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                if m.bias is not None:
                    torch.nn.init.zeros_(m.bias)
            elif isinstance(m, nn.Embedding):
                torch.nn.init.normal_(m.weight, mean=0.0, std=0.02)
                # Re-initialising the whole table also overwrites the padding
                # row; restore it to zeros so padding stays a null vector.
                if m.padding_idx is not None:
                    with torch.no_grad():
                        m.weight[m.padding_idx].fill_(0)
            elif isinstance(m, nn.LayerNorm):
                torch.nn.init.zeros_(m.bias)
                torch.nn.init.ones_(m.weight)


    def forward(self, x):
        """Compute logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, batch-first (batch, seq_len).

        Returns:
            Tensor of shape (batch, output_dim) with raw logits.
        """
        embedded = self.LayerNorm(self.embedding(x))

        # Initial states are (num_layers * num_directions, batch, lstm_hidden_size).
        # They must be sized by the LSTM hidden size (not the dense head width)
        # and live on the same device/dtype as the LSTM input.
        state_shape = (self.lstm_layers * 2, x.size(0), self.lstm_hidden_size)
        h = torch.zeros(state_shape, device=embedded.device, dtype=embedded.dtype)
        c = torch.zeros(state_shape, device=embedded.device, dtype=embedded.dtype)

        # Random initial states act as noise during training only; in eval
        # mode the states stay at zero so predictions are deterministic.
        if self.training:
            torch.nn.init.xavier_normal_(h)
            torch.nn.init.xavier_normal_(c)

        outputs, _ = self.lstm(embedded, (h, c))
        outputs = self.dropout(outputs)
        # NOTE(review): the last time step is used as the sequence summary. If
        # inputs are right-padded to a fixed length, that step is a padding
        # position for short texts — consider using the final hidden states or
        # packed sequences instead.
        return self.dense_layer(outputs[:, -1, :])

In [None]:
def count_layers_and_parameters(dummy_model):
    """Print the number of leaf modules and trainable parameters of a model.

    A "layer" here is a module without child modules. Only parameters with
    ``requires_grad=True`` are counted.

    Args:
        dummy_model: The `nn.Module` to inspect.
    """
    # Walk the module tree with an explicit stack and count the leaves.
    num_layers = 0
    pending = [dummy_model]
    while pending:
        children = list(pending.pop().children())
        if children:
            pending.extend(children)
        else:
            num_layers += 1

    # Total element count over trainable parameters.
    num_params = 0
    for parameter in dummy_model.parameters():
        if parameter.requires_grad:
            num_params += parameter.numel()

    print(f"{type(dummy_model).__name__}: {num_layers=} {num_params=}")

In [None]:
def calc_model_size(dummy_model):
    """Print the in-memory size of a model's parameters and buffers in MB.

    Args:
        dummy_model: The `nn.Module` to measure.
    """
    def total_bytes(tensors):
        # Bytes occupied by a collection of tensors: element count * element size.
        return sum(t.nelement() * t.element_size() for t in tensors)

    param_size = total_bytes(dummy_model.parameters())
    buffer_size = total_bytes(dummy_model.buffers())
    size_all_mb = (param_size + buffer_size) / 1024 ** 2
    print(f"{type(dummy_model).__name__} size: {size_all_mb:.2f} MB")

In [None]:
# Size the embedding table from `vocab`, the vocabulary that actually produces
# the token ids fed to the model (see text_pipeline), rather than from the BERT
# tokenizer's own vocabulary, whose ids are never used here.
model = GoEmotions_BLSTM(vocab_size=len(vocab)).to(device)
print(model)

# Report the model's layer/parameter counts and memory footprint.
count_layers_and_parameters(model)
calc_model_size(model)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

def calculate_accuracy(model, dataloader):
    """Evaluate a multi-label model on a dataloader.

    The model is switched to eval mode; callers that continue training must
    call ``model.train()`` again.

    Args:
        model: Module mapping a batch of token ids to per-class logits.
        dataloader: Iterable yielding ``(labels, text)`` batches, where
            `labels` holds one multi-hot row per sample.

    Returns:
        Tuple ``(model_accuracy, conf_matrix)``:
        - model_accuracy: subset accuracy, i.e. the fraction of samples whose
          entire predicted label vector matches the target.
        - conf_matrix: array of shape (n_classes, 2, 2) with one binary
          confusion matrix per class.
    """
    model.eval()
    y_true_accumulate = []
    y_pred_accumulate = []

    with torch.no_grad():
        for labels, text in dataloader:
            probabilities = torch.sigmoid(model(text))
            # Threshold each class probability independently at 0.5.
            predictions = (probabilities > 0.5).long()

            y_true_accumulate.append(labels.cpu().numpy())
            y_pred_accumulate.append(predictions.cpu().numpy())

    y_true = np.concatenate(y_true_accumulate, axis=0)
    y_pred = np.concatenate(y_pred_accumulate, axis=0)

    model_accuracy = accuracy_score(y_true, y_pred)
    # sklearn's confusion_matrix rejects multilabel-indicator targets with a
    # ValueError; the multilabel variant returns one 2x2 matrix per class.
    conf_matrix = multilabel_confusion_matrix(y_true, y_pred)

    return model_accuracy, conf_matrix


def train_model(model, train_dataloader, valid_dataloader, test_dataloader, num_epochs=500):
    """Train `model` with Adam and a class-weighted BCE-with-logits loss.

    Training accuracy is measured after every epoch; validation accuracy is
    additionally measured and logged every 10th epoch.

    Args:
        model: Module mapping a batch of token ids to per-class logits.
        train_dataloader: Yields ``(labels, text)`` training batches.
        valid_dataloader: Yields ``(labels, text)`` validation batches.
        test_dataloader: Accepted but not used by this function.
        num_epochs: Number of passes over the training data.

    Returns:
        Tuple ``(train_losses, train_accuracies)`` with one entry per epoch:
        the mean per-batch loss and the training accuracy.
    """
    learning_rate = 1e-4
    clip_value = 5

    # BCEWithLogitsLoss applies the sigmoid itself; the module-level
    # `pos_weight` up-weights positives to counter class imbalance.
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_losses, train_accuracies = [], []
    for epoch in range(1, num_epochs + 1):
        model.train()
        epoch_time = time.time()

        loss_total = 0.0
        for labels, text in train_dataloader:
            optimizer.zero_grad()
            loss = criterion(model(text), labels.float())
            loss.backward()
            # Cap the global gradient norm before the parameter update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
            optimizer.step()
            loss_total += loss.item()

        running_loss = loss_total / len(train_dataloader)

        train_accuracy, _ = calculate_accuracy(model, train_dataloader)
        if epoch % 10 != 0:
            print(f'Epoch [{epoch:2}/{num_epochs}] | Loss: {running_loss:.6f} | Training Accuracy: {train_accuracy:.4f} | Time: {time.time() - epoch_time:.2f}s')
        else:
            validation_accuracy, _ = calculate_accuracy(model, valid_dataloader)
            print(f'Epoch [{epoch:2}/{num_epochs}] | Loss: {running_loss:.6f} | Training Accuracy: {train_accuracy:.4f} | Validation Accuracy: {validation_accuracy:.4f} | Time: {time.time() - epoch_time:.2f}s')

        train_losses.append(running_loss)
        train_accuracies.append(train_accuracy)

    return train_losses, train_accuracies


In [None]:
# Train for 20 epochs (overriding train_model's default of 500) and keep the
# per-epoch training loss and accuracy for plotting.
train_losses, train_accuracies = train_model(model, train_dataloader, valid_dataloader, test_dataloader, 20)

In [None]:
# Training loss curve: one point per epoch (mean per-batch loss), with labelled
# axes so the figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(range(1, len(train_losses) + 1), train_losses)
ax.set_xlabel('Epoch')
ax.set_ylabel('Mean training loss')
ax.set_title('Training loss per epoch')
plt.show()

In [None]:
# Training accuracy curve: one point per epoch, with labelled axes so the
# figure is readable on its own.
fig, ax = plt.subplots()
ax.plot(range(1, len(train_accuracies) + 1), train_accuracies)
ax.set_xlabel('Epoch')
ax.set_ylabel('Training accuracy')
ax.set_title('Training accuracy per epoch')
plt.show()