# Emotional Classification Pipeline

In [1]:
!pip install -U datasets fasttext

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32

In [None]:
# Get fastText pretrained vectors
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.af.zip
!unzip wiki.af.zip

!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ha.zip
!unzip wiki.ha.zip

!wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.sw.zip
!unzip wiki.sw.zip

--2025-06-15 17:00:19--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.af.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.15, 13.226.210.111, 13.226.210.25, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.15|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2567394808 (2.4G) [application/zip]
Saving to: ‘wiki.af.zip’


2025-06-15 17:00:38 (130 MB/s) - ‘wiki.af.zip’ saved [2567394808/2567394808]

Archive:  wiki.af.zip
  inflating: wiki.af.vec             
  inflating: wiki.af.bin             
--2025-06-15 17:01:30--  https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.ha.zip
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.226.210.111, 13.226.210.15, 13.226.210.78, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.226.210.111|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2233056492 (2.1G) [application/zip]
Saving to: ‘wiki.ha.zi

## Install Packages

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, concatenate_datasets, Dataset
from peft import LoraConfig, get_peft_model
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split
import torch
import random
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import fasttext
from numpy import dot
from numpy.linalg import norm
import math

In [None]:
def set_all_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # set_seed(seed)  # HuggingFace transformers seed


## Define Pipeline Parameters

In [None]:
model_name = "Davlan/afro-xlmr-small"  # Base model name
run_baseline = False # Set to True to run the baseline model (skip data augmentation)
languages = ["afr", "hau", "swa"] # List of languages to include, can be run on all languages/specific ones
# languages = ["afr"] # List of languages to include, can be run on all languages/specific ones
seed = 42 # Set a seed for reproducibility

emotion_labels = {
    "joy": 0,
    "anger": 1,
    "fear": 2,
    "sadness": 3,
    "disgust": 4,
    "surprise": 5,
    "neutral": 6,
}

set_all_seeds(seed)

## Load Datasets

In [None]:
def load_datasets(languages):
    # Load the datasets for the specified languages
    # datasets are in the form:
    # {
    #     "lang1": dataset1,
    #     "lang2": dataset2,
    #     ...
    # }

    datasets = {}
    for lang in languages:
        print(f"Loading dataset for {lang}...")
        hf_dataset = load_dataset("brighter-dataset/BRIGHTER-emotion-categories", lang)

        combined_hf_dataset = concatenate_datasets([
            hf_dataset["train"],
            hf_dataset["test"],
            hf_dataset["dev"]
        ])
        datasets[lang] = combined_hf_dataset
        print(f"Loaded {len(combined_hf_dataset)} samples for {lang}.")


    return datasets


datasets = load_datasets(languages)

## Load FastText Models

In [None]:
swahili_model = fasttext.load_model('wiki.sw.bin')
hausa_model = fasttext.load_model('wiki.ha.bin')
afrikaans_model = fasttext.load_model('wiki.af.bin')

language_models = {
    "swa": swahili_model,
    "hau": hausa_model,
    "afr": afrikaans_model
}

## Preprocess Dataset
- Map empty emotions to neutral
- Split in to train, validation and test sets using stratified sampling by emotion label
- Add multi-hot encoded label column

In [None]:
def preprocess_datasets(datasets, emotion_to_id_dict):
    # datasets are in form
    # {
    #     "lang1": {
    #         "train": <train_dataset>,
    #         "test": <test_dataset>,
    #         "validation": <validation_dataset>
    #     },
    #     "lang2": {
    #         "train": <train_dataset>,
    #         "test": <test_dataset>,
    #         "validation": <validation_dataset>
    #     },
    #     ...
    # }

    preprocessed_datasets = {}
    for lang, dataset in datasets.items():
        print(f"Preprocessing dataset for {lang}...")

        # Replace empty labels with "neutral"
        dataset = dataset.map(
            lambda example: {
                **example,
                "emotions": example["emotions"] if len(example["emotions"]) > 0 else ["neutral"]
            }
        )

        # Convert the list of emotions to a string representation for stratification
        def categorise_emotions(emotions_list):
            if not emotions_list:
                return "none"
            elif len(emotions_list) == 1:
                return emotions_list[0]
            else:
                return "multiple"

        # Add a new column for stratification
        dataset = dataset.map(
            lambda example: {"emotion_key": categorise_emotions(example["emotions"])}
        )

        # Now split using train_test_split from scikit-learn
        train_size = 0.8
        test_size = 0.1
        dev_size = 0.1

        # Convert to pandas for easier handling with sklearn
        df = dataset.to_pandas()

        print(f"Stratifying by emotion...")
        # First split to get train and the rest
        train_df, temp_df = train_test_split(
            df,
            train_size=train_size,
            stratify=df["emotion_key"],
            random_state=seed
        )

        # Then split the rest into test and dev
        test_df, dev_df = train_test_split(
            temp_df,
            train_size=test_size/(test_size + dev_size),
            stratify=temp_df["emotion_key"],
            random_state=seed
        )

        small_train, left_over = train_test_split(
            train_df,
            train_size=0.2,
            stratify=train_df["emotion_key"],
            random_state=seed
        )

        small_train_dataset = Dataset.from_pandas(small_train)
        train_dataset = Dataset.from_pandas(train_df)
        test_dataset = Dataset.from_pandas(test_df)
        dev_dataset = Dataset.from_pandas(dev_df)


        num_labels = len(emotion_to_id_dict)

        # Create a new column 'labels' containing multi-hot encoded labels
        def map_emotions_to_labels(example):
            # Initialise a zero array for all emotions
            labels = [0.0] * num_labels  # Use floats instead of integers

            # Handle both string format and list format
            emotions_list = example['emotions']

            # Set 1 for each emotion present in the example
            for emotion in emotions_list:
                labels[emotion_to_id_dict[emotion]] = 1.0

            example['labels'] = labels
            return example

        # Apply the mapping function to the train, test, and dev datasets
        small_train_dataset = small_train_dataset.map(map_emotions_to_labels)
        train_dataset = train_dataset.map(map_emotions_to_labels)
        test_dataset = test_dataset.map(map_emotions_to_labels)
        dev_dataset = dev_dataset.map(map_emotions_to_labels)


        print(f"Dataset for {lang} split with {len(train_dataset)} train, {len(test_dataset)} test, and {len(dev_dataset)} validation samples.")

        # Store the datasets in the dictionary
        preprocessed_datasets[lang] = {
            # "train": small_train_dataset,
            "train": train_dataset,
            "test": test_dataset,
            "validation": dev_dataset
        }
    return preprocessed_datasets


preprocessed_datasets = preprocess_datasets(datasets.copy(), emotion_labels)

# Apply data augmentation techique
> Skips if running baseline model

In [None]:
def sentence_vector(sentence, model):
    words = sentence.split()
    word_vectors = [model.get_word_vector(w) for w in words if w in model]
    if len(word_vectors) == 0:
        return np.zeros(model.get_dimension())
    return np.mean(word_vectors, axis=0)

def cosine_similarity(v1, v2):
    return dot(v1, v2) / (norm(v1) * norm(v2))

def synonym_insert(sentence, model, num_to_augment=5, min_similarity=0.75):
    """
    Replaces a random word in the sentence with similar word using the provided FastText model.

    Args:
        sentence (str): The input sentence to augment.
        model (fasttext.FastText._FastText): The pre-trained fastText model.
        num_to_augment (int): Number of augmented sentences to create to choose from
        min_similarity (float): Minimum cosine similarity to consider a word a synonym.

    Returns:
        str: The augmented sentences
    """

    clean_sentence = _clean_text(sentence)
    words = re.findall(r'\b\w+\b', clean_sentence)
    if not words:
        return None

    # Choose a word that exists in the model's vocabulary
    candidate_words = [word for word in words if word.lower() in model.words]
    if not candidate_words:
        print("No words in model")
        return None

    augmentation_choices = []

    for i in range(num_to_augment):
      word_to_replace = random.choice(candidate_words)
      neighbors = model.get_nearest_neighbors(word_to_replace)

      to_insert = random.choice(neighbors)
      if isinstance(to_insert, tuple):
        to_insert = to_insert[1]

      insert_index = random.randint(0, len(words))

      words.insert(insert_index, to_insert)

      augmented_sentence = ' '.join(words)
      augmentation_choices.append(augmented_sentence)

    best_cosine = -5
    best_augmentation = None
    original_vector = sentence_vector(sentence, model)

    for augmentation in augmentation_choices:
      augmented_vector = sentence_vector(augmentation, model)
      similarity = cosine_similarity(original_vector, augmented_vector)

      if similarity > best_cosine:
        best_cosine = similarity
        best_augmentation = augmentation

    if best_cosine < min_similarity:
      return sentence

    return best_augmentation

def get_label_counts(dataset):
  # Count each individual emotion
  emotion_counter = Counter()
  df = dataset.to_pandas()

  # Iterate through each row in the DataFrame
  for emotions_list in df['emotions']:
      # Add each emotion to the counter
      for emotion in emotions_list:
          emotion_counter[emotion] += 1

  # Display results
  return emotion_counter

def get_under_represented_labels(label_counts):
  max_count = max(label_counts.values())
  threshold = 0.7 * max_count

  underrepresented = [label for label, count in label_counts.items() if count < threshold]
  return underrepresented, max_count, threshold

def _clean_text(text):
  """
  Basic text cleaning - language agnostic
  """
  # Remove extra whitespace
  text = re.sub(r'\s+', ' ', text.strip())
  return text

def augment_sentence(instance, emotion_label, num_to_generate, model):
  """
  Apply both synonym replacement and random insertion
  Following the paper's approach
  """

  sentence = instance["text"]

  augmentations = []
  for i in range(num_to_generate):
    augmented = synonym_insert(sentence, model)
    if augmented == sentence:
      continue
    for a in augmentations:
      if augmented == a["text"]:
        continue
    if augmented is not None:
      new_instance = instance.copy()
      new_instance["text"] = augmented
      augmentations.append(new_instance)

  # print(f"Original: {sentence}")
  # print(f"Augmented: {augmentations}")
  return augmentations

def get_label_aug_percentages(train_dataset, lang):

  aug_per_label = []

  print(f"Total number of training instances before augmentation: {len(train_dataset)}")
  print(f"Training distribution before augmentation:")
  label_counts = get_label_counts(train_dataset)
  number_of_labels = len(label_counts)
  ideal_distribution = 1 / number_of_labels

  for emotion, count in label_counts.items():
    percentage = count/len(train_dataset)
    print(f"{emotion}: {count} - {round(percentage*100)}%")
    if percentage >= 1.5 * ideal_distribution:
      print(f"Label {emotion} is over represented")
      aug_per_label.append({"label": emotion, "to_augment": 1.0})
    if percentage >= ideal_distribution:
      print(f"Label {emotion} is well represented")
      aug_per_label.append({"label": emotion, "to_augment": 1.3})
    elif (ideal_distribution - percentage) <= 0.05:
      print(f"Label {emotion} is moderately represented")
      aug_per_label.append({"label": emotion, "to_augment": 2.0})
    elif (ideal_distribution - percentage) <= 0.10:
      print(f"Label {emotion} is underrepresented")
      aug_per_label.append({"label": emotion, "to_augment": 3.0})
    else:
      print(f"Label {emotion} is extremely underrepresented")
      aug_per_label.append({"label": emotion, "to_augment": 3.0})

  return aug_per_label, label_counts

def augment_datasets(datasets, language_models):
  if run_baseline:
      print("Running baseline model, skipping data augmentation.")
      return datasets

  augmented_datasets = {} # Use a new dictionary to store augmented datasets

  for lang, dataset in datasets.items():
      print(f"\n-------------- Applying data augmentation for {lang} --------------")
      training_set = dataset["train"]
      validation_set = dataset["validation"] # Keep original validation and test sets
      test_set = dataset["test"]

      augmentation_per_label, label_counts = get_label_aug_percentages(training_set, lang)

      augmented_training_list = training_set.to_list()

      for apl in augmentation_per_label:
          label = apl["label"]
          aug_amount = apl["to_augment"]
          target_count = label_counts[label] * aug_amount
          current_count = label_counts[label]

          print(f"Augmenting label: '{label}'...")
          if current_count >= target_count:
              print(f"Label '{label}' already has {current_count} samples, no augmentation needed.")
              continue

          num_to_generate = int(target_count - current_count)
          augmentations_per_instance = min(math.ceil(num_to_generate/current_count), 4)
          print(f"Generating approx. {num_to_generate} samples for '{label}'...")

          original_samples_for_label = []
          for sample in training_set:
              if label in sample['emotions']:
                  # Store the sample dictionary as is, we'll modify text later
                  original_samples_for_label.append(sample)

          if not original_samples_for_label:
              print(f"Warning: No samples found with label '{label}' for augmentation.")
              continue

          # Generate augmented samples
          augmented_samples_list = []
          num_augmented = 0

          for instance in original_samples_for_label:
            if num_augmented >= num_to_generate:
              break

            augmentations = augment_sentence(instance, label, augmentations_per_instance, language_models[lang])
            augmented_samples_list.extend(augmentations)
            num_augmented += len(augmentations)

          # Add generated samples to the list of training samples
          augmented_training_list.extend(augmented_samples_list)

      for i, item in enumerate(augmented_training_list):
        if not isinstance(item, dict):
            print(f"Item {i} is not a dict:", item)
      # Convert the list back to a Hugging Face Dataset

      augmented_train_dataset = Dataset.from_list(augmented_training_list)

      print(f"Total number of training instances after augmentation: {len(augmented_training_list)}")
      print(f"\nTraining distribution after augmentation for {lang}")
      augmented_label_counts = get_label_counts(augmented_train_dataset)
      for emotion, count in augmented_label_counts.items():
        percentage = count/len(augmented_train_dataset)
        print(f"{emotion}: {count} - {round(percentage*100)}")


      # Store the augmented train dataset and original validation/test sets
      augmented_datasets[lang] = {
          "train": augmented_train_dataset,
          "test": test_set,
          "validation": validation_set
      }

  return augmented_datasets


augmented_datasets = augment_datasets(preprocessed_datasets.copy(), language_models)


-------------- Applying data augmentation for afr --------------
Total number of training instances before augmentation: 2838
Training distribution before augmentation:
fear: 301 - 11%
Label fear is underrepresented
anger: 177 - 6%
Label anger is extremely underrepresented
disgust: 133 - 5%
Label disgust is extremely underrepresented
neutral: 1075 - 38%
Label neutral is over represented
Label neutral is well represented
joy: 1112 - 39%
Label joy is over represented
Label joy is well represented
sadness: 439 - 15%
Label sadness is moderately represented
Augmenting label: 'fear'...
Generating approx. 602 samples for 'fear'...
Augmenting label: 'anger'...
Generating approx. 354 samples for 'anger'...
Augmenting label: 'disgust'...
Generating approx. 266 samples for 'disgust'...
Augmenting label: 'neutral'...
Label 'neutral' already has 1075 samples, no augmentation needed.
Augmenting label: 'neutral'...
Generating approx. 322 samples for 'neutral'...
Augmenting label: 'joy'...
Label 'joy

  return dot(v1, v2) / (norm(v1) * norm(v2))


Augmenting label: 'disgust'...
Generating approx. 186 samples for 'disgust'...
Augmenting label: 'joy'...
Generating approx. 179 samples for 'joy'...
Augmenting label: 'surprise'...
Generating approx. 195 samples for 'surprise'...
Total number of training instances after augmentation: 5863

Training distribution after augmentation for hau
neutral: 1028 - 18
fear: 923 - 16
sadness: 1829 - 31
anger: 1123 - 19
disgust: 888 - 15
joy: 805 - 14
surprise: 923 - 16

-------------- Applying data augmentation for swa --------------
Total number of training instances before augmentation: 6176
Training distribution before augmentation:
neutral: 2749 - 45%
Label neutral is over represented
Label neutral is well represented
joy: 830 - 13%
Label joy is moderately represented
surprise: 1000 - 16%
Label surprise is well represented
sadness: 647 - 10%
Label sadness is moderately represented
fear: 171 - 3%
Label fear is extremely underrepresented
anger: 586 - 9%
Label anger is moderately represented
disg

# Tokenize the datasets

In [None]:
xmlr_tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    return xmlr_tokenizer(batch['text'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = {}
for lang, dataset in augmented_datasets.items():
    print(f"Tokenizing dataset for {lang}...")
    tokenized_datasets[lang] = {}
    tokenized_datasets[lang]['train'] = dataset['train'].map(preprocess, batched=True)
    tokenized_datasets[lang]['test'] = dataset['test'].map(preprocess, batched=True)
    tokenized_datasets[lang]['validation'] = dataset['validation'].map(preprocess, batched=True)
    # Set the format for PyTorch
    tokenized_datasets[lang]['train'].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    tokenized_datasets[lang]['test'].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    tokenized_datasets[lang]['validation'].set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    print(f"Tokenized dataset for {lang}")



tokenizer_config.json:   0%|          | 0.00/536 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.71M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/238 [00:00<?, ?B/s]

Tokenizing dataset for afr...


Map:   0%|          | 0/5154 [00:00<?, ? examples/s]

Map:   0%|          | 0/355 [00:00<?, ? examples/s]

Map:   0%|          | 0/355 [00:00<?, ? examples/s]

Tokenized dataset for afr
Tokenizing dataset for hau...


Map:   0%|          | 0/5863 [00:00<?, ? examples/s]

Map:   0%|          | 0/502 [00:00<?, ? examples/s]

Map:   0%|          | 0/502 [00:00<?, ? examples/s]

Tokenized dataset for hau
Tokenizing dataset for swa...


Map:   0%|          | 0/10522 [00:00<?, ? examples/s]

Map:   0%|          | 0/772 [00:00<?, ? examples/s]

Map:   0%|          | 0/773 [00:00<?, ? examples/s]

Tokenized dataset for swa


# Train Model

In [None]:
# Mignon added
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    hamming_loss,
    multilabel_confusion_matrix
)
import numpy as np

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    sigmoid = lambda x: 1 / (1 + np.exp(-x))
    probs = sigmoid(logits)
    preds = (probs >= 0.3).astype(int)

    # Get multilabel confusion matrix: shape (num_labels, 2, 2)
    mcm = multilabel_confusion_matrix(labels, preds)

    # Convert to dict for logging/inspection
    confusion_dict = {
        f"label_{i}": {
            "tn": int(cm[0][0]),
            "fp": int(cm[0][1]),
            "fn": int(cm[1][0]),
            "tp": int(cm[1][1]),
        }
        for i, cm in enumerate(mcm)
    }

    return {
        "subset_accuracy": accuracy_score(labels, preds),
        "hamming_loss": hamming_loss(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
        "micro_f1": f1_score(labels, preds, average="micro", zero_division=0),
        "macro_precision": precision_score(labels, preds, average="macro", zero_division=0),
        "micro_precision": precision_score(labels, preds, average="micro", zero_division=0),
        "macro_recall": recall_score(labels, preds, average="macro", zero_division=0),
        "micro_recall": recall_score(labels, preds, average="micro", zero_division=0),
        "confusion_matrix": confusion_dict
    }

def train_finetune(train_data, val_data, num_labels, lang):
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="multi_label_classification"  # Set problem type for multi-label
    )

    args = TrainingArguments(
        output_dir=f"{model_name}-{lang}-ft",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        save_strategy="no",
        report_to="none",
        logging_dir=None,
        seed=seed,  # Add seed to training arguments
        data_seed=seed,  # Add data seed for data loading
        dataloader_num_workers=0,  # Ensure deterministic data loading
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_data,
        eval_dataset=val_data,
        compute_metrics=compute_metrics
    )
    trainer.train()
    return trainer


trainers = {}
for lang, datasets in augmented_datasets.items():
    print(f"\nTraining on {lang} dataset...")

    num_labels = len(emotion_labels)
    set_all_seeds(seed)
    trainers[lang] = train_finetune(tokenized_datasets[lang]["train"],tokenized_datasets[lang]["validation"], num_labels, lang)
    results = trainers[lang].evaluate(tokenized_datasets[lang]["validation"], metric_key_prefix="eval")
    print("Training Fine-tuning Accuracy:", results['eval_subset_accuracy'])
    print("Training Hamming Loss:", results['eval_hamming_loss'])
    print("Macro F1 Score", results['eval_macro_f1'])
    print("Micro F1 Score", results['eval_micro_f1'])
    print("Micro Precision", results['eval_micro_precision'])
    print("Micro Recall", results['eval_micro_recall'])
    print("Macro Precision", results['eval_macro_precision'])
    print("Macro Recall", results['eval_macro_recall'])
    print("Confusion Matrix", results['eval_confusion_matrix'])

# Test Model

In [None]:
def test_model(trainer, test_data):
    # Reuse the already loaded tokenizer
    test_results = trainer.evaluate(
        test_data,
        metric_key_prefix="test"
    )

    print("Test Accuracy:", test_results['test_subset_accuracy'])
    print("Test Hamming Loss:", test_results['test_hamming_loss'])
    print("Test Macro F1 Score:", test_results['test_macro_f1'])
    print("Test Micro F1 Score:", test_results['test_micro_f1'])
    print("Test Micro Precision:", test_results['test_micro_precision'])
    print("Test Micro Recall:", test_results['test_micro_recall'])
    print("Test Macro Precision:", test_results['test_macro_precision'])
    print("Test Macro Recall:", test_results['test_macro_recall'])

    return test_results

for lang, datasets in augmented_datasets.items():
    print(f"\nTesting {lang} model on test {lang} dataset...")
    test_results = test_model(trainers[lang], tokenized_datasets[lang]["test"])


In [None]:
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

def plot_multilabel_roc(trainer, dataset, emotion_labels, lang):
    # Get raw predictions and labels
    outputs = trainer.predict(dataset)
    logits = outputs.predictions
    labels = outputs.label_ids

    sigmoid = lambda x: 1 / (1 + np.exp(-x))
    probs = sigmoid(logits)  # Shape: [num_samples, num_labels]

    num_labels = len(emotion_labels)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()

    for i in range(num_labels):
        fpr[i], tpr[i], _ = roc_curve(labels[:, i], probs[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])

    label_names = [None] * len(emotion_labels)
    for k, v in emotion_labels.items():
        label_names[v] = k

    # Plot
    plt.figure(figsize=(10, 8))
    for i in range(num_labels):
        plt.plot(fpr[i], tpr[i], label=f"{label_names[i]} (AUC = {roc_auc[i]:.2f})")

    plt.plot([0, 1], [0, 1], "k--")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(f"ROC Curves for Language: {lang}")
    plt.legend(loc="lower right")
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
for lang in trainers:
    print(f"Plotting ROC curve for {lang}")
    plot_multilabel_roc(trainers[lang], tokenized_datasets[lang]["test"], emotion_labels, lang)

# Parameter Tuning

In [None]:
# param_grid = {
#     "learning_rate": [2e-5, 3e-5, 5e-5],
#     "batch_size": [8, 16],
#     "epochs": [3, 5],
#     "threshold": [0.3, 0.5, 0.7]
# }


In [None]:
# def train_finetune(model_ckpt, train_data, test_data, num_labels, lr, batch_size, epochs, threshold):
#     model = AutoModelForSequenceClassification.from_pretrained(
#         model_ckpt,
#         num_labels=num_labels,
#         problem_type="multi_label_classification"
#     )


#     args = TrainingArguments(
#         output_dir=f"{model_ckpt}-finetuned",
#         per_device_train_batch_size=batch_size,
#         per_device_eval_batch_size=batch_size,
#         learning_rate=lr,
#         num_train_epochs=epochs,
#         save_strategy="no",
#         report_to="none",
#         logging_dir=None,
#         seed=seed,  # Add seed to training arguments
#         data_seed=seed,  # Add data seed for data loading
#         dataloader_num_workers=0,  # Ensure deterministic data loading
#     )

#     def compute_metrics(eval_pred):
#         logits, labels = eval_pred
#         sigmoid = lambda x: 1 / (1 + np.exp(-x))
#         probs = sigmoid(logits)
#         preds = (probs >= threshold).astype(int)

#         return {
#             "subset_accuracy": accuracy_score(labels, preds),
#             "hamming_loss": hamming_loss(labels, preds),
#             "macro_f1": f1_score(labels, preds, average="macro", zero_division=0),
#             "micro_f1": f1_score(labels, preds, average="micro", zero_division=0),
#         }

#     trainer = Trainer(
#         model=model,
#         args=args,
#         train_dataset=train_data,
#         eval_dataset=test_data,
#         compute_metrics=compute_metrics
#     )

#     trainer.train()
#     results = trainer.evaluate()
#     return results


In [None]:
# import itertools
# import numpy as np
# from collections import defaultdict

# # Dictionary to store results for each language
# language_results = {}
# # Dictionary to track average performance across languages for each parameter combination
# param_avg_scores = defaultdict(float)
# # To count how many languages we test on (for averaging)
# num_languages = len(languages)

# # Create a compact parameter key format for readability
# def param_key(lr, batch_size, epochs, threshold):
#     return f"lr={lr}_bs={batch_size}_ep={epochs}_th={threshold}"

# print(f"Parameter tuning across {num_languages} languages: {', '.join(languages)}")

# num_labels = len(emotion_labels)
# # Generate all parameter combinations once
# param_combinations = list(itertools.product(
#     param_grid["learning_rate"],
#     param_grid["batch_size"],
#     param_grid["epochs"],
#     param_grid["threshold"]
# ))

# # First, train and evaluate on each language separately
# for lang in languages:
#     print(f"\n{'='*50}")
#     print(f"TUNING ON LANGUAGE: {lang}")
#     print(f"{'='*50}")

#     language_results[lang] = []
#     best_lang_score = 0
#     best_lang_params = {}

#     for lr, batch_size, epochs, threshold in param_combinations:
#         print(f"\nTesting on {lang}: lr={lr}, batch_size={batch_size}, epochs={epochs}, threshold={threshold}")

#         set_all_seeds(seed)
#         # Train and evaluate on this language with these parameters
#         results = train_finetune(
#             "Davlan/afro-xlmr-small",
#             tokenized_datasets[lang]["train"],
#             tokenized_datasets[lang]["validation"],
#             num_labels, lr, batch_size, epochs, threshold
#         )

#         # Extract the score (using micro_f1 as our metric)
#         score = results["eval_micro_f1"]
#         print(f"{lang} Micro-F1 Score: {score:.4f}")

#         # Store results for this language
#         language_results[lang].append((score, lr, batch_size, epochs, threshold))

#         # Update best parameters for this language
#         if score > best_lang_score:
#             best_lang_score = score
#             best_lang_params = {
#                 "learning_rate": lr,
#                 "batch_size": batch_size,
#                 "epochs": epochs,
#                 "threshold": threshold
#             }

#         # Add to our running average across languages
#         param_key_str = param_key(lr, batch_size, epochs, threshold)
#         param_avg_scores[param_key_str] += score / num_languages

#     # Print the best parameters for this language
#     print(f"\n✅ Best parameters for {lang}:")
#     print(best_lang_params)
#     print(f"Best {lang} Micro-F1 Score: {best_lang_score:.4f}")

# # Find the parameter combination with the best average performance across languages
# best_avg_score = 0
# best_avg_params = None

# for params, avg_score in param_avg_scores.items():
#     if avg_score > best_avg_score:
#         best_avg_score = avg_score
#         best_avg_params = params

# # Parse the parameter key back into a dictionary
# lr, batch_size, epochs, threshold = best_avg_params.split("_")
# best_params = {
#     "learning_rate": float(lr.split('=')[1]),
#     "batch_size": int(batch_size.split('=')[1]),
#     "epochs": int(epochs.split('=')[1]),
#     "threshold": float(threshold.split('=')[1])
# }

# print("\n" + "="*70)
# print("FINAL RESULTS ACROSS ALL LANGUAGES")
# print("="*70)
# print("\n✅ Best overall parameters (averaged across all languages):")
# print(best_params)
# print(f"Average Micro-F1 Score: {best_avg_score:.4f}")

# # Create a performance matrix to visualize results
# print("\nPerformance across languages:")
# print(f"{'Parameters':<40} | {'Average':<10} | " + " | ".join([f"{lang:<10}" for lang in languages]))
# print("-" * (50 + 12 * num_languages))

# # Sort parameter combinations by average score for better readability
# sorted_params = sorted(param_avg_scores.items(), key=lambda x: x[1], reverse=True)

# # Print top 5 parameter combinations
# for i, (params, avg_score) in enumerate(sorted_params[:5]):
#     # Get individual language scores for this parameter combination
#     lang_scores = []
#     for lang in languages:
#         # Find the score for this parameter combination in this language
#         for result in language_results[lang]:
#             score, lr, bs, ep, th = result
#             if param_key(lr, bs, ep, th) == params:
#                 lang_scores.append(score)
#                 break

#     # Print the row
#     print(f"{params:<40} | {avg_score:.4f}      | " + " | ".join([f"{score:.4f}      " for score in lang_scores]))
