# COS 760 Research Project: Analysing Sentiments for Low-resource African Languages

## Group Members: Mihir Arjun, Troy Clark, Hamza Mokiwa

### Establishing Baselines with Monolingual Long Short-Term Memory networks(LSTMs) and pre-trained Multilingual transformers

#### First we need to install the datasets

In [5]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [1]:
import os
from datasets import load_from_disk, load_dataset

def load_local_datasets():
    swa_path = "./datasets/afrisenti/swa"
    por_path = "./datasets/afrisenti/por"
    sot_path = "./datasets/news"

    if not all(os.path.exists(path) for path in [swa_path, por_path,sot_path]):
        print("One or more dataset directories not found. Please check the paths.")
        return None, None, None

    print("Loading Swahili (swa) dataset from disk...")
    swa_dataset = load_from_disk(swa_path)
    print("Swahili dataset loaded!")

    print("Loading Portuguese (por) dataset from disk...")
    por_dataset = load_from_disk(por_path)
    print("Portuguese dataset loaded!")

    print("Loading Sesotho (sot) dataset from disk...")
    sot_dataset = load_dataset("csv",  data_files="datasets/sotho-news/sotho_news_dataset.csv")
    print("Sesotho dataset loaded!")

    return swa_dataset, por_dataset, sot_dataset

if __name__ == "__main__":
    swa, por, sot = load_local_datasets()

    if swa is not None:
        print(f"Swahili dataset size: {len(swa['train'])} examples")
    if por is not None:
        print(f"Portuguese dataset size: {len(por['train'])} examples")
    if sot is not None:
        print(f"Sesotho dataset size: {len(sot['train'])} examples")

  from .autonotebook import tqdm as notebook_tqdm


Loading Swahili (swa) dataset from disk...
Swahili dataset loaded!
Loading Portuguese (por) dataset from disk...
Portuguese dataset loaded!
Loading Sesotho (sot) dataset from disk...
Sesotho dataset loaded!
Swahili dataset size: 1810 examples
Portuguese dataset size: 3063 examples
Sesotho dataset size: 2177 examples


## Now that the datasets have been loaded, we can start creating our LSTM baseline models below:

### First we will build an LSTM model for Swahili

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm


torch.manual_seed(42)
np.random.seed(42)


EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
BATCH_SIZE = 32
NUM_EPOCHS = 10

class SwahiliSentimentDataset(Dataset):
    def __init__(self, tweets, labels, vocab, label_map):
        self.tweets = tweets
        self.labels = labels
        self.vocab = vocab
        self.label_map = label_map

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        label = self.labels[idx]


        tokenized = [self.vocab.get(word, self.vocab['<UNK>']) for word in tweet.split()]
        return torch.tensor(tokenized, dtype=torch.long), torch.tensor(self.label_map[label], dtype=torch.long)

def collate_fn(batch):
    tweets, labels = zip(*batch)

    tweets_padded = pad_sequence(tweets, batch_first=True, padding_value=0)
    return tweets_padded, torch.stack(labels)

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        embedded = self.dropout(self.embedding(text))


        output, (hidden, cell) = self.lstm(embedded)



        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)


        hidden = self.dropout(hidden)
        return self.fc(hidden)

def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs):
    best_val_loss = float('inf')

    for epoch in range(epochs):

        model.train()
        running_loss = 0.0

        for tweets, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            tweets, labels = tweets.to(device), labels.to(device)

            optimizer.zero_grad()


            outputs = model(tweets)


            loss = criterion(outputs, labels)


            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        train_loss = running_loss / len(train_loader)


        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for tweets, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                tweets, labels = tweets.to(device), labels.to(device)

                outputs = model(tweets)
                loss = criterion(outputs, labels)

                val_loss += loss.item()

                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")


        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_swahili_lstm_model.pt")
            print("  Saved new best model!")

        print("-" * 60)

    return model

def evaluate_model(model, test_loader, criterion, device, label_list):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for tweets, labels in tqdm(test_loader, desc="Testing"):
            tweets, labels = tweets.to(device), labels.to(device)

            outputs = model(tweets)
            loss = criterion(outputs, labels)

            test_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    test_loss /= len(test_loader)
    test_accuracy = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")


    class_names = [label_list[i] for i in range(len(label_list))]
    report = classification_report(all_labels, all_preds, target_names=class_names)
    print("\nClassification Report:")
    print(report)

    return test_loss, test_accuracy, test_f1

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")


    try:
        swa_dataset = load_from_disk("./datasets/afrisenti/swa")
        print("Swahili dataset loaded successfully!")

        print(f"Available columns in train split: {swa_dataset['train'].column_names}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    print(f"Dataset structure: {swa_dataset}")
    print(f"Train set size: {len(swa_dataset['train'])}")
    print(f"Test set size: {len(swa_dataset['test'])}")
    print(f"Validation set size: {len(swa_dataset['validation'])}")

    train_tweets = swa_dataset['train']['tweet']
    train_labels = swa_dataset['train']['label']
    val_tweets = swa_dataset['validation']['tweet']
    val_labels = swa_dataset['validation']['label']
    test_tweets = swa_dataset['test']['tweet']
    test_labels = swa_dataset['test']['label']

    unique_labels = set(train_labels + val_labels + test_labels)
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    print(f"Label mapping: {label_to_idx}")

    word_counts = Counter()
    for tweet in train_tweets:
        word_counts.update(tweet.split())

    min_freq = 2
    vocabulary = {'<PAD>': 0, '<UNK>': 1}
    vocab_idx = 2

    for word, count in word_counts.items():
        if count >= min_freq:
            vocabulary[word] = vocab_idx
            vocab_idx += 1

    print(f"Vocabulary size: {len(vocabulary)}")

    train_dataset = SwahiliSentimentDataset(train_tweets, train_labels, vocabulary, label_to_idx)
    val_dataset = SwahiliSentimentDataset(val_tweets, val_labels, vocabulary, label_to_idx)
    test_dataset = SwahiliSentimentDataset(test_tweets, test_labels, vocabulary, label_to_idx)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    model = LSTMSentiment(
        vocab_size=len(vocabulary),
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=len(unique_labels),
        n_layers=NUM_LAYERS,
        dropout=DROPOUT,
        pad_idx=vocabulary['<PAD>']
    ).to(device)

    print(f"Model architecture:\n{model}")

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    print("Starting training...")
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=NUM_EPOCHS
    )

    model.load_state_dict(torch.load("best_swahili_lstm_model.pt"))

    print("\nEvaluating on test set...")
    test_loss, test_accuracy, test_f1 = evaluate_model(
        model=model,
        test_loader=test_loader,
        criterion=criterion,
        device=device,
        label_list=list(idx_to_label.values())
    )

    results = {
        "test_loss": test_loss,
        "test_accuracy": test_accuracy,
        "test_f1": test_f1,
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "vocab_size": len(vocabulary)
    }


    pd.DataFrame([results]).to_csv("swahili_lstm_results.csv", index=False)
    print(f"Results saved to swahili_lstm_results.csv")

if __name__ == "__main__":
    main()

Using device: cpu
Swahili dataset loaded successfully!
Available columns in train split: ['tweet', 'label']
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 1810
    })
    validation: Dataset({
        features: ['tweet', 'label'],
        num_rows: 453
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 748
    })
})
Train set size: 1810
Test set size: 748
Validation set size: 453
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Vocabulary size: 3055
Model architecture:
LSTMSentiment(
  (embedding): Embedding(3055, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Starting training...


Epoch 1/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.55it/s]
Epoch 1/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 36.87it/s]


Epoch 1/10:
  Train Loss: 0.9255
  Val Loss: 0.8964, Val Accuracy: 0.5872, Val F1: 0.4377
  Saved new best model!
------------------------------------------------------------


Epoch 2/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.84it/s]
Epoch 2/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 42.44it/s]


Epoch 2/10:
  Train Loss: 0.8834
  Val Loss: 0.9085, Val Accuracy: 0.5872, Val F1: 0.4720
------------------------------------------------------------


Epoch 3/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.59it/s]
Epoch 3/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 32.75it/s]


Epoch 3/10:
  Train Loss: 0.8355
  Val Loss: 0.9070, Val Accuracy: 0.5960, Val F1: 0.5168
------------------------------------------------------------


Epoch 4/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.93it/s]
Epoch 4/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 32.35it/s]


Epoch 4/10:
  Train Loss: 0.7643
  Val Loss: 0.9697, Val Accuracy: 0.5762, Val F1: 0.4923
------------------------------------------------------------


Epoch 5/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.20it/s]
Epoch 5/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 27.65it/s]


Epoch 5/10:
  Train Loss: 0.6388
  Val Loss: 1.0302, Val Accuracy: 0.5497, Val F1: 0.5318
------------------------------------------------------------


Epoch 6/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.23it/s]
Epoch 6/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 29.00it/s]


Epoch 6/10:
  Train Loss: 0.5488
  Val Loss: 1.2454, Val Accuracy: 0.5055, Val F1: 0.5120
------------------------------------------------------------


Epoch 7/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.92it/s]
Epoch 7/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 28.39it/s]


Epoch 7/10:
  Train Loss: 0.4460
  Val Loss: 1.2486, Val Accuracy: 0.5585, Val F1: 0.5315
------------------------------------------------------------


Epoch 8/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.64it/s]
Epoch 8/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 26.75it/s]


Epoch 8/10:
  Train Loss: 0.3874
  Val Loss: 1.5211, Val Accuracy: 0.4857, Val F1: 0.4953
------------------------------------------------------------


Epoch 9/10 - Training: 100%|██████████| 57/57 [00:07<00:00,  7.96it/s]
Epoch 9/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 32.68it/s]


Epoch 9/10:
  Train Loss: 0.2739
  Val Loss: 1.5772, Val Accuracy: 0.5055, Val F1: 0.5088
------------------------------------------------------------


Epoch 10/10 - Training: 100%|██████████| 57/57 [00:06<00:00,  8.60it/s]
Epoch 10/10 - Validation: 100%|██████████| 15/15 [00:00<00:00, 35.11it/s]


Epoch 10/10:
  Train Loss: 0.2220
  Val Loss: 1.5910, Val Accuracy: 0.5541, Val F1: 0.5337
------------------------------------------------------------

Evaluating on test set...


Testing: 100%|██████████| 24/24 [00:00<00:00, 27.83it/s]

Test Loss: 0.8965
Test Accuracy: 0.5936
Test F1 Score: 0.4446

Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        80
     neutral       0.59      1.00      0.74       444
    positive       0.50      0.00      0.01       224

    accuracy                           0.59       748
   macro avg       0.36      0.33      0.25       748
weighted avg       0.50      0.59      0.44       748

Results saved to swahili_lstm_results.csv



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Next, we build an LSTM model for Mozambican Portuguese:

In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm

torch.manual_seed(42)
np.random.seed(42)

EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
BATCH_SIZE = 32
NUM_EPOCHS = 10

class PorSentimentDataset(Dataset):
    def __init__(self, tweets, labels, vocab, label_map):
        self.tweets = tweets
        self.labels = labels
        self.vocab = vocab
        self.label_map = label_map

    def __len__(self):
        return len(self.tweets)

    def __getitem__(self, idx):
        tweet = self.tweets[idx]
        label = self.labels[idx]

        tokenized = [self.vocab.get(word, self.vocab['<UNK>']) for word in tweet.split()]
        return torch.tensor(tokenized, dtype=torch.long), torch.tensor(self.label_map[label], dtype=torch.long)

def collate_fn(batch):
    tweets, labels = zip(*batch)
    tweets_padded = pad_sequence(tweets, batch_first=True, padding_value=0)
    return tweets_padded, torch.stack(labels)

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))

        output, (hidden, cell) = self.lstm(embedded)

        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)

        hidden = self.dropout(hidden)
        return self.fc(hidden)

def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs):
    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        running_loss = 0.0

        for tweets, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            tweets, labels = tweets.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(tweets)

            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        train_loss = running_loss / len(train_loader)

        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for tweets, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                tweets, labels = tweets.to(device), labels.to(device)

                outputs = model(tweets)
                loss = criterion(outputs, labels)

                val_loss += loss.item()

                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_portuguese_lstm_model.pt")
            print("  Saved new best model!")

        print("-" * 60)

    return model

def evaluate_model(model, test_loader, criterion, device, label_list):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for tweets, labels in tqdm(test_loader, desc="Testing"):
            tweets, labels = tweets.to(device), labels.to(device)

            outputs = model(tweets)
            loss = criterion(outputs, labels)

            test_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    test_loss /= len(test_loader)
    test_accuracy = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")

    class_names = [label_list[i] for i in range(len(label_list))]
    report = classification_report(all_labels, all_preds, target_names=class_names)
    print("\nClassification Report:")
    print(report)

    return test_loss, test_accuracy, test_f1

def main():
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")

    try:
        por_dataset = load_from_disk("./datasets/afrisenti/por")
        print("Mozambican Portuguese dataset loaded successfully!")

        print(f"Available columns in train split: {por_dataset['train'].column_names}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return

    print(f"Dataset structure: {por_dataset}")
    print(f"Train set size: {len(por_dataset['train'])}")
    print(f"Test set size: {len(por_dataset['test'])}")
    print(f"Validation set size: {len(por_dataset['validation'])}")

    train_tweets = por_dataset['train']['tweet']
    train_labels = por_dataset['train']['label']
    val_tweets = por_dataset['validation']['tweet']
    val_labels = por_dataset['validation']['label']
    test_tweets = por_dataset['test']['tweet']
    test_labels = por_dataset['test']['label']

    unique_labels = set(train_labels + val_labels + test_labels)
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    print(f"Label mapping: {label_to_idx}")

    word_counts = Counter()
    for tweet in train_tweets:
        word_counts.update(tweet.split())

    min_freq = 2
    vocabulary = {'<PAD>': 0, '<UNK>': 1}
    vocab_idx = 2

    for word, count in word_counts.items():
        if count >= min_freq:
            vocabulary[word] = vocab_idx
            vocab_idx += 1

    print(f"Vocabulary size: {len(vocabulary)}")

    train_dataset = PorSentimentDataset(train_tweets, train_labels, vocabulary, label_to_idx)
    val_dataset = PorSentimentDataset(val_tweets, val_labels, vocabulary, label_to_idx)
    test_dataset = PorSentimentDataset(test_tweets, test_labels, vocabulary, label_to_idx)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)

    model = LSTMSentiment(
        vocab_size=len(vocabulary),
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=len(unique_labels),
        n_layers=NUM_LAYERS,
        dropout=DROPOUT,
        pad_idx=vocabulary['<PAD>']
    ).to(device)

    print(f"Model architecture:\n{model}")

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    print("Starting training...")
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=NUM_EPOCHS
    )

    model.load_state_dict(torch.load("best_portuguese_lstm_model.pt"))

    print("\nEvaluating on test set...")
    test_loss, test_accuracy, test_f1 = evaluate_model(
        model=model,
        test_loader=test_loader,
        criterion=criterion,
        device=device,
        label_list=list(idx_to_label.values())
    )

    results = {
        "test_loss": test_loss,
        "test_accuracy": test_accuracy,
        "test_f1": test_f1,
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "vocab_size": len(vocabulary)
    }


    pd.DataFrame([results]).to_csv("portuguese_lstm_results.csv", index=False)
    print(f"Results saved to portuguese_lstm_results.csv")

if __name__ == "__main__":
    main()

Using device: cpu
Mozambican Portuguese dataset loaded successfully!
Available columns in train split: ['tweet', 'label']
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['tweet', 'label'],
        num_rows: 3063
    })
    validation: Dataset({
        features: ['tweet', 'label'],
        num_rows: 767
    })
    test: Dataset({
        features: ['tweet', 'label'],
        num_rows: 3662
    })
})
Train set size: 3063
Test set size: 3662
Validation set size: 767
Label mapping: {'negative': 0, 'neutral': 1, 'positive': 2}
Vocabulary size: 4075
Model architecture:
LSTMSentiment(
  (embedding): Embedding(4075, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Starting training...


Epoch 1/10 - Training: 100%|██████████| 96/96 [00:12<00:00,  7.80it/s]
Epoch 1/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 19.98it/s]


Epoch 1/10:
  Train Loss: 1.0226
  Val Loss: 0.9994, Val Accuracy: 0.5215, Val F1: 0.3575
  Saved new best model!
------------------------------------------------------------


Epoch 2/10 - Training: 100%|██████████| 96/96 [00:15<00:00,  6.37it/s]
Epoch 2/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 24.36it/s]


Epoch 2/10:
  Train Loss: 0.9703
  Val Loss: 0.9651, Val Accuracy: 0.5450, Val F1: 0.4652
  Saved new best model!
------------------------------------------------------------


Epoch 3/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.98it/s]
Epoch 3/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 24.23it/s]


Epoch 3/10:
  Train Loss: 0.8965
  Val Loss: 0.9932, Val Accuracy: 0.5567, Val F1: 0.4923
------------------------------------------------------------


Epoch 4/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.95it/s]
Epoch 4/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 20.20it/s]


Epoch 4/10:
  Train Loss: 0.7983
  Val Loss: 0.9902, Val Accuracy: 0.5606, Val F1: 0.5459
------------------------------------------------------------


Epoch 5/10 - Training: 100%|██████████| 96/96 [00:14<00:00,  6.67it/s]
Epoch 5/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 23.91it/s]


Epoch 5/10:
  Train Loss: 0.6935
  Val Loss: 1.0411, Val Accuracy: 0.5528, Val F1: 0.5396
------------------------------------------------------------


Epoch 6/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.92it/s]
Epoch 6/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 25.49it/s]


Epoch 6/10:
  Train Loss: 0.5816
  Val Loss: 1.1400, Val Accuracy: 0.5424, Val F1: 0.5358
------------------------------------------------------------


Epoch 7/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  7.09it/s]
Epoch 7/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 21.56it/s]


Epoch 7/10:
  Train Loss: 0.4691
  Val Loss: 1.2334, Val Accuracy: 0.5593, Val F1: 0.5388
------------------------------------------------------------


Epoch 8/10 - Training: 100%|██████████| 96/96 [00:14<00:00,  6.71it/s]
Epoch 8/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 23.63it/s]


Epoch 8/10:
  Train Loss: 0.3591
  Val Loss: 1.4979, Val Accuracy: 0.5332, Val F1: 0.5243
------------------------------------------------------------


Epoch 9/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  7.02it/s]
Epoch 9/10 - Validation: 100%|██████████| 24/24 [00:00<00:00, 28.58it/s]


Epoch 9/10:
  Train Loss: 0.3022
  Val Loss: 1.4727, Val Accuracy: 0.5111, Val F1: 0.5080
------------------------------------------------------------


Epoch 10/10 - Training: 100%|██████████| 96/96 [00:13<00:00,  6.88it/s]
Epoch 10/10 - Validation: 100%|██████████| 24/24 [00:01<00:00, 21.51it/s]


Epoch 10/10:
  Train Loss: 0.2282
  Val Loss: 1.6321, Val Accuracy: 0.5332, Val F1: 0.5234
------------------------------------------------------------

Evaluating on test set...


Testing: 100%|██████████| 115/115 [00:04<00:00, 25.81it/s]

Test Loss: 0.8757
Test Accuracy: 0.6393
Test F1 Score: 0.5655

Classification Report:
              precision    recall  f1-score   support

    negative       0.31      0.15      0.21       655
     neutral       0.67      0.92      0.78      2379
    positive       0.62      0.08      0.14       628

    accuracy                           0.64      3662
   macro avg       0.54      0.38      0.37      3662
weighted avg       0.60      0.64      0.57      3662

Results saved to portuguese_lstm_results.csv





### Lastly, we build an LSTM model for Sesotho


In [None]:
import os
import torch
import numpy as np
import pandas as pd
from datasets import load_from_disk
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm


torch.manual_seed(42)
np.random.seed(42)


EMBEDDING_DIM = 100
HIDDEN_DIM = 128
NUM_LAYERS = 2
DROPOUT = 0.2
LEARNING_RATE = 0.001
BATCH_SIZE = 32
NUM_EPOCHS = 10

class SotSentimentDataset(Dataset):
    def __init__(self, headlines, labels, vocab, label_map):
        self.headlines = headlines
        self.labels = labels
        self.vocab = vocab
        self.label_map = label_map

    def __len__(self):
        return len(self.headlines)

    def __getitem__(self, idx):
        tweet = self.headlines[idx]
        label = self.labels[idx]


        tokenized = [self.vocab.get(word, self.vocab['<UNK>']) for word in tweet.split()]
        return torch.tensor(tokenized, dtype=torch.long), torch.tensor(self.label_map[label], dtype=torch.long)

def collate_fn(batch):
    headlines, labels = zip(*batch)

    headlines_padded = pad_sequence(headlines, batch_first=True, padding_value=0)
    return headlines_padded, torch.stack(labels)

class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers,
                           bidirectional=True, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):

        embedded = self.dropout(self.embedding(text))


        output, (hidden, cell) = self.lstm(embedded)



        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)


        hidden = self.dropout(hidden)
        return self.fc(hidden)

def train_model(model, train_loader, val_loader, optimizer, criterion, device, epochs):
    best_val_loss = float('inf')

    for epoch in range(epochs):

        model.train()
        running_loss = 0.0

        for headlines, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs} - Training"):
            headlines, labels = headlines.to(device), labels.to(device)

            optimizer.zero_grad()


            outputs = model(headlines)


            loss = criterion(outputs, labels)


            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        train_loss = running_loss / len(train_loader)


        model.eval()
        val_loss = 0.0
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for headlines, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{epochs} - Validation"):
                headlines, labels = headlines.to(device), labels.to(device)

                outputs = model(headlines)
                loss = criterion(outputs, labels)

                val_loss += loss.item()

                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())

        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(all_labels, all_preds)
        val_f1 = f1_score(all_labels, all_preds, average='weighted')

        print(f"Epoch {epoch+1}/{epochs}:")
        print(f"  Train Loss: {train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}")


        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), "best_portuguese_lstm_model.pt")
            print("  Saved new best model!")

        print("-" * 60)

    return model

def evaluate_model(model, test_loader, criterion, device, label_list):
    model.eval()
    test_loss = 0.0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for headlines, labels in tqdm(test_loader, desc="Testing"):
            headlines, labels = headlines.to(device), labels.to(device)

            outputs = model(headlines)
            loss = criterion(outputs, labels)

            test_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    test_loss /= len(test_loader)
    test_accuracy = accuracy_score(all_labels, all_preds)
    test_f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")


    class_names = [label_list[i] for i in range(len(label_list))]
    report = classification_report(all_labels, all_preds, target_names=class_names)
    print("\nClassification Report:")
    print(report)

    return test_loss, test_accuracy, test_f1

def main():

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {device}")


    try:
        sot_dataset = load_from_disk("./datasets/sesotho_news_dataset")
        print("Sesotho dataset loaded successfully!")


        print(f"Available columns in train split: {sot_dataset['train'].column_names}")
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return


    print(f"Dataset structure: {sot_dataset}")
    print(f"Train set size: {len(sot_dataset['train'])}")
    print(f"Test set size: {len(sot_dataset['test'])}")
    print(f"Validation set size: {len(sot_dataset['validation'])}")


    train_headlines = sot_dataset['train']['headline']
    train_labels = sot_dataset['train']['label']
    val_headlines = sot_dataset['validation']['headline']
    val_labels = sot_dataset['validation']['label']
    test_headlines = sot_dataset['test']['headline']
    test_labels = sot_dataset['test']['label']



    unique_labels = set(train_labels + val_labels + test_labels)
    label_to_idx = {label: idx for idx, label in enumerate(unique_labels)}
    idx_to_label = {idx: label for label, idx in label_to_idx.items()}
    print(f"Label mapping: {label_to_idx}")


    word_counts = Counter()
    for headline in train_headlines:
        word_counts.update(headline.split())


    min_freq = 2
    vocabulary = {'<PAD>': 0, '<UNK>': 1}
    vocab_idx = 2

    for word, count in word_counts.items():
        if count >= min_freq:
            vocabulary[word] = vocab_idx
            vocab_idx += 1

    print(f"Vocabulary size: {len(vocabulary)}")


    train_dataset = SotSentimentDataset(train_headlines, train_labels, vocabulary, label_to_idx)
    val_dataset = SotSentimentDataset(val_headlines, val_labels, vocabulary, label_to_idx)
    test_dataset = SotSentimentDataset(test_headlines, test_labels, vocabulary, label_to_idx)


    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn)


    model = LSTMSentiment(
        vocab_size=len(vocabulary),
        embedding_dim=EMBEDDING_DIM,
        hidden_dim=HIDDEN_DIM,
        output_dim=len(unique_labels),
        n_layers=NUM_LAYERS,
        dropout=DROPOUT,
        pad_idx=vocabulary['<PAD>']
    ).to(device)


    print(f"Model architecture:\n{model}")


    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()


    print("Starting training...")
    model = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=criterion,
        device=device,
        epochs=NUM_EPOCHS
    )


    torch.save(model.state_dict(), "best_sesotho_lstm_model.pt");


    model.load_state_dict(torch.load("best_sesotho_lstm_model.pt"))


    print("\nEvaluating on test set...")
    test_loss, test_accuracy, test_f1 = evaluate_model(
        model=model,
        test_loader=test_loader,
        criterion=criterion,
        device=device,
        label_list=list(idx_to_label.values())
    )


    results = {
        "test_loss": test_loss,
        "test_accuracy": test_accuracy,
        "test_f1": test_f1,
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "num_layers": NUM_LAYERS,
        "dropout": DROPOUT,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "epochs": NUM_EPOCHS,
        "vocab_size": len(vocabulary)
    }


    pd.DataFrame([results]).to_csv("sesotho_lstm_results.csv", index=False)
    print(f"Results saved to sesotho_lstm_results.csv")

if __name__ == "__main__":
    main()

Using device: cpu
Sesotho dataset loaded successfully!
Available columns in train split: ['headline', 'label', '__index_level_0__']
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 1305
    })
    validation: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 436
    })
    test: Dataset({
        features: ['headline', 'label', '__index_level_0__'],
        num_rows: 436
    })
})
Train set size: 1305
Test set size: 436
Validation set size: 436
Label mapping: {'neutral': 0, 'negative': 1, 'positive': 2}
Vocabulary size: 876
Model architecture:
LSTMSentiment(
  (embedding): Embedding(876, 100, padding_idx=0)
  (lstm): LSTM(100, 128, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=256, out_features=3, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)
Starting training...


Epoch 1/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 50.00it/s]
Epoch 1/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 212.12it/s]


Epoch 1/10:
  Train Loss: 0.7751
  Val Loss: 0.7289, Val Accuracy: 0.6950, Val F1: 0.5699
  Saved new best model!
------------------------------------------------------------


Epoch 2/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 51.51it/s]
Epoch 2/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 202.90it/s]


Epoch 2/10:
  Train Loss: 0.6275
  Val Loss: 0.7368, Val Accuracy: 0.7179, Val F1: 0.6419
------------------------------------------------------------


Epoch 3/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 52.90it/s]
Epoch 3/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 215.37it/s]


Epoch 3/10:
  Train Loss: 0.5516
  Val Loss: 0.7012, Val Accuracy: 0.7317, Val F1: 0.6920
  Saved new best model!
------------------------------------------------------------


Epoch 4/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 56.79it/s]
Epoch 4/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 200.00it/s]


Epoch 4/10:
  Train Loss: 0.5039
  Val Loss: 0.7370, Val Accuracy: 0.7385, Val F1: 0.6711
------------------------------------------------------------


Epoch 5/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 57.91it/s]
Epoch 5/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 212.13it/s]


Epoch 5/10:
  Train Loss: 0.4261
  Val Loss: 0.7007, Val Accuracy: 0.7271, Val F1: 0.7154
  Saved new best model!
------------------------------------------------------------


Epoch 6/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 55.86it/s]
Epoch 6/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 215.39it/s]


Epoch 6/10:
  Train Loss: 0.3852
  Val Loss: 0.7299, Val Accuracy: 0.7638, Val F1: 0.7379
------------------------------------------------------------


Epoch 7/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 59.33it/s]
Epoch 7/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 218.75it/s]


Epoch 7/10:
  Train Loss: 0.2882
  Val Loss: 0.7523, Val Accuracy: 0.7454, Val F1: 0.7287
------------------------------------------------------------


Epoch 8/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 58.32it/s]
Epoch 8/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 218.74it/s]


Epoch 8/10:
  Train Loss: 0.2573
  Val Loss: 0.7572, Val Accuracy: 0.7156, Val F1: 0.7123
------------------------------------------------------------


Epoch 9/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 57.66it/s]
Epoch 9/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 199.99it/s]


Epoch 9/10:
  Train Loss: 0.1974
  Val Loss: 0.8882, Val Accuracy: 0.7271, Val F1: 0.7143
------------------------------------------------------------


Epoch 10/10 - Training: 100%|██████████| 41/41 [00:00<00:00, 53.81it/s]
Epoch 10/10 - Validation: 100%|██████████| 14/14 [00:00<00:00, 222.22it/s]


Epoch 10/10:
  Train Loss: 0.1676
  Val Loss: 0.9858, Val Accuracy: 0.7385, Val F1: 0.7134
------------------------------------------------------------

Evaluating on test set...


Testing: 100%|██████████| 14/14 [00:00<00:00, 237.29it/s]

Test Loss: 0.9303
Test Accuracy: 0.7202
Test F1 Score: 0.7024

Classification Report:
              precision    recall  f1-score   support

     neutral       0.83      0.25      0.38        20
    negative       0.77      0.87      0.82       308
    positive       0.49      0.38      0.43       108

    accuracy                           0.72       436
   macro avg       0.70      0.50      0.54       436
weighted avg       0.71      0.72      0.70       436

Results saved to sesotho_lstm_results.csv





Saving the dataset (1/1 shards): 100%|██████████| 1305/1305 [00:00<00:00, 326273.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 436/436 [00:00<00:00, 109014.40 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 436/436 [00:00<00:00, 145297.68 examples/s]


# We now create Baselines with pre-trained Multilingual transformers

## AfroXLMR


In [None]:
!pip install -U transformers datasets peft evaluate plotly --quiet

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
import numpy as np
from sklearn.manifold import TSNE
import pandas as pd

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

# por_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "pt-MZ")
# swa_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "swa")
# sot_dataset=load_dataset("hamza-student-123/nlp-assignment-news-data",'sot')
#
# for ds in [por_dataset,swa_dataset,sot_dataset]: # Change to all three later
#     for lbl in ["train","validation","test"]:
#         if ds[lbl].column_names[0]== "tweet":
#             ds[lbl] = ds[lbl].rename_column("tweet","text")
#         else:
#             ds[lbl] = ds[lbl].rename_column("headline","text")
#
# por_df = por_dataset["train"].to_pandas()
# swa_df = swa_dataset["train"].to_pandas()
# sot_df = sot_dataset["train"].to_pandas()



[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Using device: cpu


#### Baseline with [AfroXLMR](https://huggingface.co/Davlan/afro-xlmr-large)


In [1]:
!pip install -U transformers datasets peft evaluate plotly sentencepiece --quiet


import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from datasets import load_dataset
# import torch_optimizer as optim
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from tqdm import tqdm
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def load_data(language):
    try:
        if language=='por':
            chosen_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "por",trust_remote_code=True)
        elif language=='swa':
            chosen_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "swa",trust_remote_code=True)
        elif language=='sot':
            chosen_dataset=load_dataset("hamza-student-123/nlp-assignment-news-data",'sot')
        else:
            raise Exception

        for ds in [chosen_dataset]: # Change to all three later
            for lbl in ["train","validation","test"]:
                if ds[lbl].column_names[0]== "tweet":
                    ds[lbl] = ds[lbl].rename_column("tweet","text")
                else:
                    ds[lbl] = ds[lbl].rename_column("headline","text")

        train_df  = chosen_dataset["train"].to_pandas()
        val_df  = chosen_dataset["validation"].to_pandas()
        test_df  = chosen_dataset["test"].to_pandas()
        logger.info(f"Data loaded successfully: {len(train_df)} training, {len(val_df)} validation, {len(test_df)} test examples")
        return train_df, val_df, test_df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise e

# Function to create DataLoaders
def create_data_loaders(train_df, val_df, test_df, tokenizer, batch_size=16, text_column='text', label_column='label'):
    train_dataset = SentimentDataset(
        texts=train_df[text_column].tolist(),
        labels=train_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    val_dataset = SentimentDataset(
        texts=val_df[text_column].tolist(),
        labels=val_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    test_dataset = SentimentDataset(
        texts=test_df[text_column].tolist(),
        labels=test_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """
    Returns:
        Tuple of (loss, accuracy, precision, recall, f1)
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted'
    )

    return total_loss / len(dataloader), accuracy, precision, recall, f1, all_preds, all_labels

def plot_confusion_matrix(true_labels, predictions, class_names):
    cm = confusion_matrix(true_labels, predictions)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()

def evaluate_afro_xlmr_for_lang(language):
    config = {
        'model_name': 'Davlan/afro-xlmr-base',
        'num_labels': 3,
        'batch_size': 16,
        'learning_rate': 2e-5,
        'epochs': 3,
        'warmup_steps': 0,
        'max_grad_norm': 1.0,
        'text_column': 'text',
        'label_column': 'label',
        'class_names': ['negative', 'neutral', 'positive']
    }

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    logger.info(f"Loading model: {config['model_name']}...")
    tokenizer = XLMRobertaTokenizer.from_pretrained(config['model_name'])
    model = XLMRobertaForSequenceClassification.from_pretrained(
        config['model_name'],
        num_labels=config['num_labels']
    )
    model.to(device)

    # Load data
    logger.info("Loading data...")
    train_df, val_df, test_df = load_data(f'{language}')

    # Create data loaders
    logger.info("Creating data loaders...")
    train_loader, val_loader, test_loader = create_data_loaders(
        train_df, val_df, test_df,
        tokenizer,
        batch_size=config['batch_size'],
        text_column=config['text_column'],
        label_column=config['label_column']
    )


    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
    total_steps = len(train_loader) * config['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=total_steps
    )

    logger.info("Starting training...")
    best_val_f1 = 0
    best_model_state = None

    for epoch in range(config['epochs']):
        logger.info(f"Epoch {epoch + 1}/{config['epochs']}")

        # Train
        start_time = time.time()
        train_loss = train_epoch(model, train_loader,optimizer,scheduler, device) # train_epoch(model, train_loader, scheduler, device)
        train_time = time.time() - start_time

        # Validate
        val_loss, val_accuracy, val_precision, val_recall, val_f1, _, _ = evaluate(model, val_loader, device)

        logger.info(f"Epoch {epoch + 1} results:")
        logger.info(f"Train Loss: {train_loss:.4f}, Time: {train_time:.2f}s")
        logger.info(f"Val Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_state = model.state_dict().copy()
            logger.info(f"New best model with F1: {best_val_f1:.4f}")

    # Load best model for testing
    if best_model_state:
        logger.info("Loading best model for testing...")
        model.load_state_dict(best_model_state)

    # Test evaluation
    logger.info("Evaluating on test set...")
    test_loss, test_accuracy, test_precision, test_recall, test_f1, test_preds, test_labels = evaluate(model, test_loader, device)

    logger.info(f"Test Results:")
    logger.info(f"Loss: {test_loss:.4f}")
    logger.info(f"Accuracy: {test_accuracy:.4f}")
    logger.info(f"Precision: {test_precision:.4f}")
    logger.info(f"Recall: {test_recall:.4f}")
    logger.info(f"F1 Score: {test_f1:.4f}")

    results_df_afro = {
    "test_loss": test_loss,
    "test_accuracy": test_accuracy,
    "test_f1": test_f1,
    "test_precision": test_precision,
    "test_recall": test_recall,
    "epochs": config['epochs'],
    "learning_rate": config['learning_rate'],
    "batch_size": config['batch_size'],
}

    path = f"afroxlmr_results_{language}.csv"
    pd.DataFrame([results_df_afro]).to_csv(path, index=False)
    print(f"Results saved to {path}")

    # Save model
    logger.info("Saving model...")
    model_save_path = f'./xmlr_sentiment_model_{language}'
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    logger.info(f"Model saved to {model_save_path}")

# evaluate_afro_xlmr()
langs = ['por','swa']
# langs = ['sot']
for l in langs:
    evaluate_afro_xlmr_for_lang(l)


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  from .autonotebook import tqdm as notebook_tqdm
2025-05-20 09:13:11,166 - INFO - Using device: cpu
2025-05-20 09:13:11,167 - INFO - Loading model: Davlan/afro-xlmr-base...
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-20 09:13:13,444 - INFO - Loading data...
2025-05-20 09:13:17,088 - INFO - Data loaded successfully: 3063 training, 767 validation, 3662 test examples
2025-05-20 09:13:17,089 - INFO - Creating data loaders...
2025-05-20 09:13:17,093 - INFO - Starting training...
2025-05-20 09:13:17,093 - INFO - Epoch 1/10
Training: 10

Results saved to afroxlmr_results_por.csv


2025-05-20 12:11:17,558 - INFO - Model saved to ./xmlr_sentiment_model_por
2025-05-20 12:11:17,675 - INFO - Using device: cpu
2025-05-20 12:11:17,676 - INFO - Loading model: Davlan/afro-xlmr-base...
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-20 12:11:20,054 - INFO - Loading data...
2025-05-20 12:11:21,866 - INFO - Data loaded successfully: 1810 training, 453 validation, 748 test examples
2025-05-20 12:11:21,867 - INFO - Creating data loaders...
2025-05-20 12:11:21,868 - INFO - Starting training...
2025-05-20 12:11:21,869 - INFO - Epoch 1/10
Training: 100%|██████████| 114/114 [10:14<00:00,  5.39s/it, loss=0.9087]
Evaluating: 100%|██████████| 29/29 [00:27<

KeyboardInterrupt: 

##### Evaluation for Sesotho

In [None]:
!pip install -U transformers datasets peft evaluate plotly sentencepiece --quiet


import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from datasets import load_dataset
# import torch_optimizer as optim
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_mapping = {"negative": 0, "neutral": 1, "positive": 2}  # Label conversion

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.label_mapping[self.labels[idx]]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def load_data(language):
    try:
        if language=='por':
            chosen_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "por",trust_remote_code=True)
        elif language=='swa':
            chosen_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "swa",trust_remote_code=True)
        elif language=='sot':
            chosen_dataset=load_dataset("hamza-student-123/nlp-assignment-news-data",'sot')
        else:
            raise Exception

        for ds in [chosen_dataset]: # Change to all three later
            for lbl in ["train","validation","test"]:
                if ds[lbl].column_names[0]== "tweet":
                    ds[lbl] = ds[lbl].rename_column("tweet","text")
                else:
                    ds[lbl] = ds[lbl].rename_column("headline","text")

        train_df  = chosen_dataset["train"].to_pandas()
        val_df  = chosen_dataset["validation"].to_pandas()
        test_df  = chosen_dataset["test"].to_pandas()
        logger.info(f"Data loaded successfully: {len(train_df)} training, {len(val_df)} validation, {len(test_df)} test examples")
        return train_df, val_df, test_df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise e

# Function to create DataLoaders
def create_data_loaders(train_df, val_df, test_df, tokenizer, batch_size=16, text_column='text', label_column='label'):
    train_dataset = SentimentDataset(
        texts=train_df[text_column].tolist(),
        labels=train_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    val_dataset = SentimentDataset(
        texts=val_df[text_column].tolist(),
        labels=val_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    test_dataset = SentimentDataset(
        texts=test_df[text_column].tolist(),
        labels=test_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """
    Returns:
        Tuple of (loss, accuracy, precision, recall, f1)
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted'
    )

    return total_loss / len(dataloader), accuracy, precision, recall, f1, all_preds, all_labels

def plot_confusion_matrix(true_labels, predictions, class_names):
    cm = confusion_matrix(true_labels, predictions)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()

def evaluate_afro_xlmr_for_lang(language):
    config = {
        'model_name': 'Davlan/afro-xlmr-base',
        'num_labels': 3,
        'batch_size': 16,
        'learning_rate': 2e-5,
        'epochs': 3,
        'warmup_steps': 0,
        'max_grad_norm': 1.0,
        'text_column': 'text',
        'label_column': 'label',
        'class_names': ['negative', 'neutral', 'positive']
    }

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    logger.info(f"Loading model: {config['model_name']}...")
    tokenizer = XLMRobertaTokenizer.from_pretrained(config['model_name'])
    model = XLMRobertaForSequenceClassification.from_pretrained(
        config['model_name'],
        num_labels=config['num_labels']
    )
    model.to(device)

    # Load data
    logger.info("Loading data...")
    train_df, val_df, test_df = load_data(f'{language}')

    # Create data loaders
    logger.info("Creating data loaders...")
    train_loader, val_loader, test_loader = create_data_loaders(
        train_df, val_df, test_df,
        tokenizer,
        batch_size=config['batch_size'],
        text_column=config['text_column'],
        label_column=config['label_column']
    )


    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
    total_steps = len(train_loader) * config['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=total_steps
    )

    logger.info("Starting training...")
    best_val_f1 = 0
    best_model_state = None

    for epoch in range(config['epochs']):
        logger.info(f"Epoch {epoch + 1}/{config['epochs']}")

        # Train
        start_time = time.time()
        train_loss = train_epoch(model, train_loader,optimizer,scheduler, device) # train_epoch(model, train_loader, scheduler, device)
        train_time = time.time() - start_time

        # Validate
        val_loss, val_accuracy, val_precision, val_recall, val_f1, _, _ = evaluate(model, val_loader, device)

        logger.info(f"Epoch {epoch + 1} results:")
        logger.info(f"Train Loss: {train_loss:.4f}, Time: {train_time:.2f}s")
        logger.info(f"Val Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_state = model.state_dict().copy()
            logger.info(f"New best model with F1: {best_val_f1:.4f}")

    # Load best model for testing
    if best_model_state:
        logger.info("Loading best model for testing...")
        model.load_state_dict(best_model_state)

    # Test evaluation
    logger.info("Evaluating on test set...")
    test_loss, test_accuracy, test_precision, test_recall, test_f1, test_preds, test_labels = evaluate(model, test_loader, device)

    logger.info(f"Test Results:")
    logger.info(f"Loss: {test_loss:.4f}")
    logger.info(f"Accuracy: {test_accuracy:.4f}")
    logger.info(f"Precision: {test_precision:.4f}")
    logger.info(f"Recall: {test_recall:.4f}")
    logger.info(f"F1 Score: {test_f1:.4f}")

    results_df_afro = {
    "test_loss": test_loss,
    "test_accuracy": test_accuracy,
    "test_f1": test_f1,
    "test_precision": test_precision,
    "test_recall": test_recall,
    "epochs": config['epochs'],
    "learning_rate": config['learning_rate'],
    "batch_size": config['batch_size'],
}

    path = f"afroxlmr_results_{language}.csv"
    pd.DataFrame([results_df_afro]).to_csv(path, index=False)
    print(f"Results saved to {path}")

    # Save model
    logger.info("Saving model...")
    model_save_path = f'./xmlr_sentiment_model_{language}'
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    logger.info(f"Model saved to {model_save_path}")

langs = ['sot']
for l in langs:
    evaluate_afro_xlmr_for_lang(l)


This will be through using Cross-Lingual transfer learning (CLTL) with Swahili data with the aim of improving model performance on Sesotho.
This decision was made because...(sesotho has less resources for example)



Furthermore, we explore how linguistic nuances within our affect classification across languages. There is a particular focus on Sesotho models because....


Link: [SHAP docs](https://shap.readthedocs.io/en/latest/)

## mBERT

### Baseline with [mBERT](https://huggingface.co/google-bert/bert-base-multilingual-cased)


/bin/python3



Link: [LIME docs](https://uc-r.github.io/lime)

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import time
import json
import random
from lime.lime_text import LimeTextExplainer
import torch.nn.functional as F

run_timestamp = time.strftime("%Y%m%d_%H%M%S")

os.makedirs("./benchmark_results", exist_ok=True)
os.makedirs("./lime_explanations", exist_ok=True)

output_csv = os.path.abspath(f"./benchmark_results/mbert_benchmark_results_{run_timestamp}.csv")
explanations_dir = os.path.abspath(f"./lime_explanations/explanations_{run_timestamp}")
os.makedirs(explanations_dir, exist_ok=True)

results_df = pd.DataFrame(columns=[
    "Dataset", "Model", "Loss", "Accuracy", "F1", "Precision", "Recall"
])

try:
    swa_dataset = load_dataset("masakhane/afrisenti", "swa")
except Exception as e:
    swa_dataset = None

try:
    por_dataset = load_dataset("masakhane/afrisenti", "por")
except Exception as e:
    por_dataset = None

try:
    sot_dataset = load_dataset("csv", data_files={"train": "./datasets/sotho-news/sotho_news_dataset.csv"})
except Exception as e:
    sot_dataset = None

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

class ModelPredictor:
    """Wrapper class for LIME explanations"""
    def __init__(self, model, tokenizer, device, num_labels):
        self.model = model
        self.tokenizer = tokenizer
        self.device = device
        self.num_labels = num_labels
        self.model.eval()
    
    def predict_proba(self, texts):
        """Predict probabilities for LIME"""
        if isinstance(texts, str):
            texts = [texts]
        
        inputs = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        ).to(self.device)
        
        with torch.no_grad():
            outputs = self.model(**inputs)
            probas = F.softmax(outputs.logits, dim=-1)
        
        return probas.cpu().numpy()

def generate_lime_explanations(model_predictor, test_texts, test_labels, dataset_name, 
                              label_names=None, num_samples=10):
    """Generate LIME explanations for sample predictions"""
    
    explainer = LimeTextExplainer(class_names=label_names or [f"Class_{i}" for i in range(3)])
    
    sample_indices = random.sample(range(len(test_texts)), min(num_samples, len(test_texts)))
    explanations_data = []
    
    print(f"\nGenerating LIME explanations for {dataset_name}...")
    
    for i, idx in enumerate(sample_indices):
        try:
            text = test_texts[idx]
            true_label = test_labels[idx]
            
            pred_proba = model_predictor.predict_proba([text])[0]
            pred_label = np.argmax(pred_proba)
            
            exp = explainer.explain_instance(
                text, 
                model_predictor.predict_proba, 
                num_features=10,
                num_samples=1000
            )
            
            explanation_data = {
                'sample_id': idx,
                'text': text,
                'true_label': int(true_label),
                'predicted_label': int(pred_label),
                'prediction_probability': float(pred_proba[pred_label]),
                'all_probabilities': pred_proba.tolist(),
                'lime_explanation': []
            }
            
            for feature, importance in exp.as_list():
                explanation_data['lime_explanation'].append({
                    'feature': feature,
                    'importance': float(importance)
                })
            
            explanations_data.append(explanation_data)
            
            html_file = os.path.join(explanations_dir, f"{dataset_name}_sample_{idx}_explanation.html")
            exp.save_to_file(html_file)
            
            print(f"  Generated explanation {i+1}/{len(sample_indices)} for sample {idx}")
            
        except Exception as e:
            print(f"  Error generating explanation for sample {idx}: {str(e)}")
            continue
    
    # Save explanations as JSON
    json_file = os.path.join(explanations_dir, f"{dataset_name}_lime_explanations.json")
    with open(json_file, 'w', encoding='utf-8') as f:
        json.dump(explanations_data, f, indent=2, ensure_ascii=False)
    
    return explanations_data

def get_benchmark_metrics(dataset_name, dataset, num_labels):

    if dataset is None:
        return None, None

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=num_labels
    ).to(device)

    text_column = None
    label_column = None

    text_candidates = ["text", "content", "tweet", "sentence", "document"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in text_candidates:
            if candidate in columns:
                text_column = candidate
                break
        if text_column:
            break

    label_candidates = ["label", "sentiment", "class", "category"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in label_candidates:
            if candidate in columns:
                label_column = candidate
                break
        if label_column:
            break

    if text_column is None:
        for split in dataset:
            for col in dataset[split].column_names:
                if isinstance(dataset[split][col][0], str):
                    text_column = col
                    break
            if text_column:
                break

    if text_column is None:
        return None, None

    if label_column is None:
        return None, None

    required_splits = ["train", "validation", "test"]
    missing_splits = [split for split in required_splits if split not in dataset]

    if missing_splits:
        if "train" in dataset:
            train_valid_test = {}

            for split in dataset:
                if split in required_splits:
                    train_valid_test[split] = dataset[split]

            if "train" in dataset and ("validation" not in train_valid_test or "test" not in train_valid_test):
                if "validation" not in train_valid_test:
                    if "test" not in train_valid_test:
                        split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)
                        test_valid_split = split_datasets["test"].train_test_split(test_size=0.5, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = test_valid_split["train"]
                        train_valid_test["test"] = test_valid_split["test"]
                    else:
                        split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = split_datasets["test"]
                else:
                    split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                    train_valid_test["test"] = split_datasets["test"]

            dataset = DatasetDict(train_valid_test)
        else:
            return None, None

    label_mapping = None
    label_names = None
    for split in dataset:
        if isinstance(dataset[split][label_column][0], str):
            all_labels = set()
            for example in dataset[split][label_column]:
                all_labels.add(example)

            sorted_labels = sorted(all_labels)
            label_mapping = {label: i for i, label in enumerate(sorted_labels)}
            label_names = sorted_labels
            break

    processed_dataset = DatasetDict()
    for split_name, split_dataset in dataset.items():
        texts = split_dataset[text_column]

        if label_mapping:
            labels = [label_mapping[label] for label in split_dataset[label_column]]
        else:
            labels = []
            for label in split_dataset[label_column]:
                if isinstance(label, (int, np.integer)):
                    labels.append(int(label))
                elif isinstance(label, str):
                    try:
                        labels.append(int(label))
                    except ValueError:
                        labels.append(0)
                else:
                    labels.append(0)

        processed_dataset[split_name] = Dataset.from_dict({
            "text": texts,
            "label": labels
        })

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

    tokenized_dataset = DatasetDict()
    for split_name, split_dataset in processed_dataset.items():
        tokenized_split = split_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"]
        )
        tokenized_dataset[split_name] = tokenized_split

        for required_col in ["input_ids", "attention_mask", "label"]:
            if required_col not in tokenized_split.column_names:
                return None, None

    eval_args = TrainingArguments(
        output_dir=f"./benchmark_results/{dataset_name}_{run_timestamp}",
        per_device_eval_batch_size=16,
        logging_dir=f"./benchmark_results/{dataset_name}_{run_timestamp}/logs",
        report_to="none",
        remove_unused_columns=True
    )

    trainer = Trainer(
        model=model,
        args=eval_args,
        compute_metrics=compute_metrics,
    )

    try:
        benchmark_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
        
        model_predictor = ModelPredictor(model, tokenizer, device, num_labels)
        test_texts = processed_dataset["test"]["text"]
        test_labels = processed_dataset["test"]["label"]
        
        lime_explanations = generate_lime_explanations(
            model_predictor, test_texts, test_labels, dataset_name, 
            label_names=label_names, num_samples=10
        )

        return benchmark_results, lime_explanations
    except Exception as e:
        print(f"Error in benchmarking {dataset_name}: {str(e)}")
        return None, None

available_datasets = []
if swa_dataset:
    available_datasets.append(("Swahili", swa_dataset, 3))
if por_dataset:
    available_datasets.append(("Portuguese", por_dataset, 3))
if sot_dataset:
    available_datasets.append(("Sesotho", sot_dataset, 3))

all_results = []
all_explanations = {}

print("Starting benchmarking with LIME explanations...")

for dataset_name, dataset, num_labels in available_datasets:
    try:
        print(f"\nProcessing {dataset_name} dataset...")
        results, explanations = get_benchmark_metrics(dataset_name, dataset, num_labels)

        if results:
            results_df = results_df._append({
                "Dataset": dataset_name,
                "Model": "mBERT",
                "Loss": results.get("eval_loss"),
                "Accuracy": results.get("eval_accuracy"),
                "F1": results.get("eval_f1"),
                "Precision": results.get("eval_precision"),
                "Recall": results.get("eval_recall")
            }, ignore_index=True)

            all_results.append({"Dataset": dataset_name, "Results": results})
            if explanations:
                all_explanations[dataset_name] = explanations
        else:
            print(f"Failed to process {dataset_name} dataset")
    except Exception as e:
        print(f"Error processing {dataset_name}: {str(e)}")

try:
    results_df.to_csv(output_csv, index=False)
    print(f"\nResults saved to: {output_csv}")
except Exception as e:
    emergency_path = f"./emergency_results_{run_timestamp}.csv"
    results_df.to_csv(emergency_path, index=False)
    print(f"\nEmergency results saved to: {emergency_path}")

if all_explanations:
    summary_file = os.path.join(explanations_dir, "explanations_summary.json")
    explanation_summary = {}
    
    for dataset_name, explanations in all_explanations.items():
        summary_stats = {
            'total_explanations': len(explanations),
            'average_prediction_confidence': np.mean([exp['prediction_probability'] for exp in explanations]),
            'correct_predictions': sum(1 for exp in explanations if exp['true_label'] == exp['predicted_label']),
            'top_important_features': {}
        }
        
        feature_importance = {}
        for exp in explanations:
            for feature_data in exp['lime_explanation']:
                feature = feature_data['feature']
                importance = abs(feature_data['importance'])
                if feature in feature_importance:
                    feature_importance[feature].append(importance)
                else:
                    feature_importance[feature] = [importance]
        
        avg_feature_importance = {
            feature: np.mean(importances) 
            for feature, importances in feature_importance.items()
        }
        
        sorted_features = sorted(avg_feature_importance.items(), key=lambda x: x[1], reverse=True)
        summary_stats['top_important_features'] = dict(sorted_features[:10])
        
        explanation_summary[dataset_name] = summary_stats
    
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(explanation_summary, f, indent=2, ensure_ascii=False)
    


print("\nResults:")
print(results_df)

  from .autonotebook import tqdm as notebook_tqdm


Starting benchmarking with LIME explanations...

Processing Swahili dataset...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1810/1810 [00:00<00:00, 14853.57 examples/s]
Map: 100%|██████████| 453/453 [00:00<00:00, 12564.27 examples/s]
Map: 100%|██████████| 748/748 [00:00<00:00, 13980.89 examples/s]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Generating LIME explanations for Swahili...
  Generated explanation 1/10 for sample 654
  Generated explanation 2/10 for sample 114
  Generated explanation 3/10 for sample 25
  Generated explanation 4/10 for sample 281
  Generated explanation 5/10 for sample 250
  Generated explanation 6/10 for sample 228
  Generated explanation 7/10 for sample 142
  Generated explanation 8/10 for sample 104
  Generated explanation 9/10 for sample 692
  Generated explanation 10/10 for sample 558

Processing Portuguese dataset...


  results_df = results_df._append({
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3063/3063 [00:00<00:00, 14149.98 examples/s]
Map: 100%|██████████| 767/767 [00:00<00:00, 12529.87 examples/s]
Map: 100%|██████████| 3662/3662 [00:00<00:00, 14905.68 examples/s]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Generating LIME explanations for Portuguese...
  Generated explanation 1/10 for sample 2619
  Generated explanation 2/10 for sample 456
  Generated explanation 3/10 for sample 102
  Generated explanation 4/10 for sample 3037
  Generated explanation 5/10 for sample 1126
  Generated explanation 6/10 for sample 1003
  Generated explanation 7/10 for sample 914
  Generated explanation 8/10 for sample 571
  Generated explanation 9/10 for sample 3016
  Generated explanation 10/10 for sample 419

Processing Sesotho dataset...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1741/1741 [00:00<00:00, 8214.19 examples/s]
Map: 100%|██████████| 218/218 [00:00<00:00, 12935.68 examples/s]
Map: 100%|██████████| 218/218 [00:00<00:00, 13669.38 examples/s]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Generating LIME explanations for Sesotho...
  Generated explanation 1/10 for sample 163
  Generated explanation 2/10 for sample 28
  Generated explanation 3/10 for sample 6
  Generated explanation 4/10 for sample 189
  Generated explanation 5/10 for sample 70
  Generated explanation 6/10 for sample 62
  Generated explanation 7/10 for sample 57
  Generated explanation 8/10 for sample 35
  Generated explanation 9/10 for sample 188
  Generated explanation 10/10 for sample 26

Results saved to: /home/troy/Documents/nlp-research/benchmark_results/mbert_benchmark_results_20250612_225610.csv
LIME explanations summary saved to: /home/troy/Documents/nlp-research/lime_explanations/explanations_20250612_225610/explanations_summary.json

==== BENCHMARKING WITH LIME EXPLANATIONS COMPLETE ====

Results Summary:
      Dataset  Model      Loss  Accuracy        F1  Precision    Recall
0     Swahili  mBERT  1.035671  0.582888  0.440901   0.373173  0.582888
1  Portuguese  mBERT  1.078578  0.176952  0.07

### Fine-Tuning BERT


In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import logging
import time
import sys
import json
import shutil

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

try:
    swa_dataset = load_dataset("masakhane/afrisenti", "swa")
    logger.info("Successfully loaded Swahili dataset from HuggingFace Hub")
except Exception as e:
    logger.error(f"Failed to load Swahili dataset: {e}")
    swa_dataset = None

try:
    por_dataset = load_dataset("masakhane/afrisenti", "por")
    logger.info("Successfully loaded Portuguese dataset from HuggingFace Hub")
except Exception as e:
    logger.error(f"Failed to load Portuguese dataset: {e}")
    por_dataset = None

try:
    sot_dataset = load_dataset("csv", data_files={"train": "./datasets/sotho-news/sotho_news_dataset.csv"})
    logger.info("Successfully loaded Sesotho dataset from CSV")
except Exception as e:
    logger.error(f"Failed to load Sesotho dataset: {e}")
    sot_dataset = None

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def finetune_and_evaluate(dataset_name, dataset, num_labels):
    logger.info(f"Starting fine-tuning for {dataset_name}")

    if dataset is None:
        logger.error(f"Dataset {dataset_name} is None, cannot proceed with fine-tuning")
        return None

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    model_save_dir = f"./models/{dataset_name.lower()}"
    os.makedirs(model_save_dir, exist_ok=True)

    tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-multilingual-cased",
        num_labels=num_labels
    ).to(device)

    logger.info(f"Dataset {dataset_name} structure:")
    for split in dataset:
        logger.info(f"  - Split: {split}, Examples: {len(dataset[split])}")
        logger.info(f"  - Features: {dataset[split].features}")
        logger.info(f"  - Columns: {dataset[split].column_names}")

    text_column = None
    label_column = None

    text_candidates = ["text", "content", "tweet", "sentence", "document", "headline"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in text_candidates:
            if candidate in columns:
                text_column = candidate
                break
        if text_column:
            break

    label_candidates = ["label", "sentiment", "class", "category"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in label_candidates:
            if candidate in columns:
                label_column = candidate
                break
        if label_column:
            break

    logger.info(f"For {dataset_name}, using text_column={text_column}, label_column={label_column}")

    if text_column is None:
        for split in dataset:
            for col in dataset[split].column_names:
                if isinstance(dataset[split][col][0], str):
                    text_column = col
                    logger.info(f"Using '{col}' as text column based on string data")
                    break
            if text_column:
                break

    if text_column is None:
        logger.error(f"Could not identify a text column for {dataset_name}")
        return None

    if label_column is None:
        logger.error(f"Could not identify a label column for {dataset_name}")
        return None

    required_splits = ["train", "validation", "test"]
    missing_splits = [split for split in required_splits if split not in dataset]

    if missing_splits:
        logger.info(f"Creating missing splits: {missing_splits} for {dataset_name}")
        if "train" in dataset:
            train_valid_test = {}

            for split in dataset:
                if split in required_splits:
                    train_valid_test[split] = dataset[split]

            if "train" in dataset and ("validation" not in train_valid_test or "test" not in train_valid_test):
                if "validation" not in train_valid_test:
                    if "test" not in train_valid_test:
                        split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)
                        test_valid_split = split_datasets["test"].train_test_split(test_size=0.5, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = test_valid_split["train"]
                        train_valid_test["test"] = test_valid_split["test"]
                    else:
                        split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = split_datasets["test"]
                else:
                    split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                    train_valid_test["test"] = split_datasets["test"]

            dataset = DatasetDict(train_valid_test)
        else:
            logger.error(f"Dataset {dataset_name} has no train split and cannot create splits")
            return None

    label_mapping = None
    for split in dataset:
        if isinstance(dataset[split][label_column][0], str):
            all_labels = set()
            for example in dataset[split][label_column]:
                all_labels.add(example)

            label_mapping = {label: i for i, label in enumerate(sorted(all_labels))}
            logger.info(f"Created label mapping for {dataset_name}: {label_mapping}")
            break

    processed_dataset = DatasetDict()
    for split_name, split_dataset in dataset.items():
        texts = split_dataset[text_column]

        if label_mapping:
            labels = [label_mapping[label] for label in split_dataset[label_column]]
        else:
            labels = []
            for label in split_dataset[label_column]:
                if isinstance(label, (int, np.integer)):
                    labels.append(int(label))
                elif isinstance(label, str):
                    try:
                        labels.append(int(label))
                    except ValueError:
                        logger.warning(f"Unexpected string label found in {split_name}: {label}")
                        labels.append(0)
                else:
                    logger.warning(f"Unexpected label type in {split_name}: {type(label)}")
                    labels.append(0)

        processed_dataset[split_name] = Dataset.from_dict({
            "text": texts,
            "label": labels
        })

        logger.info(f"Processed {split_name} split: {len(processed_dataset[split_name])} examples")

    def tokenize_function(examples):
        return tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    tokenized_dataset = DatasetDict()
    for split_name, split_dataset in processed_dataset.items():
        tokenized_split = split_dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"]
        )
        tokenized_dataset[split_name] = tokenized_split

        logger.info(f"Tokenized {split_name} split: {len(tokenized_split)} examples")
        logger.info(f"Columns after tokenization: {tokenized_split.column_names}")

        for required_col in ["input_ids", "attention_mask", "label"]:
            if required_col not in tokenized_split.column_names:
                logger.error(f"Required column {required_col} missing after tokenization")
                return None

    model_output_dir = f"./tmp_model_dir_{dataset_name}"
    if os.path.exists(model_output_dir):
        shutil.rmtree(model_output_dir)
    os.makedirs(model_output_dir, exist_ok=True)

    training_args = TrainingArguments(
        output_dir=model_output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./tmp_logs",
        logging_steps=10,
        save_strategy="no",
        save_steps=1000000,
        eval_steps=100,
        do_eval=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )

    logger.info(f"Starting fine-tuning {dataset_name} with {len(tokenized_dataset['train'])} examples")
    start_time = time.time()

    try:
        try:
            train_output = trainer.train()

            training_loss = None
            try:
                if hasattr(train_output, "metrics") and "loss" in train_output.metrics:
                    training_loss = train_output.metrics["loss"]
                elif isinstance(train_output, dict) and "loss" in train_output:
                    training_loss = train_output["loss"]

                if training_loss is None and hasattr(trainer, "state"):
                    if hasattr(trainer.state, "log_history") and trainer.state.log_history:
                        for log in reversed(trainer.state.log_history):
                            if "loss" in log:
                                training_loss = log["loss"]
                                break
            except Exception as e:
                logger.warning(f"Could not extract training loss: {e}")
        except RuntimeError as e:
            if "PytorchStreamWriter failed writing file" in str(e):
                logger.warning("Caught PyTorch serialization error during training. Will proceed with model saving.")

            else:

                raise

        try:
            model_path = os.path.join(model_save_dir, "model.pt")
            torch.save(model.state_dict(), model_path)
            logger.info(f"Model saved to {model_path}")
        except RuntimeError as e:
            if "PytorchStreamWriter failed writing file" in str(e):
                logger.warning(f"PyTorch serialization error when saving model state dict. Trying alternative method.")

                try:

                    with open(model_path, 'wb') as f:
                        torch.save(model.state_dict(), f, _use_new_zipfile_serialization=False)
                    logger.info(f"Model saved to {model_path} using legacy serialization")
                except Exception as e2:
                    logger.error(f"Failed to save model with alternative method: {e2}")
            else:

                raise

        tokenizer.save_pretrained(model_save_dir)
        logger.info(f"Tokenizer saved to {model_save_dir}")

        if label_mapping:
            label_mapping_path = os.path.join(model_save_dir, "label_mapping.json")
            with open(label_mapping_path, "w") as f:

                json_mapping = {str(k): int(v) for k, v in label_mapping.items()}
                json.dump(json_mapping, f, indent=2)
            logger.info(f"Label mapping saved to {label_mapping_path}")

        model_config = {
            "base_model": "bert-base-multilingual-cased",
            "num_labels": num_labels,
            "text_column": text_column,
            "label_column": label_column,
            "max_length": 128,

            "dataset_name": dataset_name,
        }

        config_path = os.path.join(model_save_dir, "config.json")
        with open(config_path, "w") as f:
            json.dump(model_config, f, indent=2)
        logger.info(f"Model config saved to {config_path}")

        training_time = time.time() - start_time
        logger.info(f"Training completed in {training_time:.2f} seconds")

        logger.info(f"Evaluating on validation set with {len(tokenized_dataset['validation'])} examples")
        validation_results = trainer.evaluate(eval_dataset=tokenized_dataset["validation"])

        logger.info(f"Evaluating on test set with {len(tokenized_dataset['test'])} examples")
        test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])


        processed_validation = {}
        for k, v in validation_results.items():
            if isinstance(v, (int, float, str, bool)) or v is None:
                processed_validation[k] = v
            else:
                processed_validation[k] = float(v)

        processed_test = {}
        for k, v in test_results.items():
            if isinstance(v, (int, float, str, bool)) or v is None:
                processed_test[k] = v
            else:
                processed_test[k] = float(v)

        metrics = {
            "training_time": float(training_time),
            "training_loss": float(training_loss) if training_loss is not None else None,
            "validation_results": processed_validation,
            "test_results": processed_test,
        }

        metrics_path = os.path.join(model_save_dir, "metrics.json")
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)
        logger.info(f"Model metrics saved to {metrics_path}")

        logger.info(f"============== RESULTS FOR {dataset_name} ==============")
        logger.info(f"Training loss: {training_loss}")
        logger.info(f"Validation results: {validation_results}")
        logger.info(f"Test results: {test_results}")

        return {
            "dataset": dataset_name,
            "training_time": training_time,
            "training_loss": training_loss,
            "validation_results": validation_results,
            "test_results": test_results,
            "model_path": model_path,
        }
    except Exception as e:
        logger.error(f"Fine-tuning failed for {dataset_name}: {e}", exc_info=True)
        return None

available_datasets = []
if swa_dataset:
    available_datasets.append(("Swahili", swa_dataset, 3))
if por_dataset:
    available_datasets.append(("Portuguese", por_dataset, 3))
if sot_dataset:
    available_datasets.append(("Sesotho", sot_dataset, 3))

logger.info(f"Available datasets: {[name for name, _, _ in available_datasets]}")

all_results = []
for dataset_name, dataset, num_labels in available_datasets:
    logger.info(f"\n==== FINE-TUNING ON {dataset_name.upper()} DATASET ====")
    try:
        results = finetune_and_evaluate(dataset_name, dataset, num_labels)

        if results:
            all_results.append(results)
        else:
            logger.warning(f"No results returned for {dataset_name}")
    except Exception as e:
        logger.error(f"Unexpected error during fine-tuning {dataset_name}: {e}", exc_info=True)

logger.info("\n==== FINE-TUNING COMPLETE ====")

print(f"{'Dataset':<12} | {'Acc (val)':<10} | {'F1 (val)':<10} | {'Acc (test)':<10} | {'F1 (test)':<10} | {'Model Dir':<20}")

for result in all_results:
    dataset = result["dataset"]
    val_acc = result["validation_results"].get("eval_accuracy", float('nan'))
    val_f1 = result["validation_results"].get("eval_f1", float('nan'))
    test_acc = result["test_results"].get("eval_accuracy", float('nan'))
    test_f1 = result["test_results"].get("eval_f1", float('nan'))
    model_dir = f"./models/{dataset.lower()}"

    print(f"{dataset:<12} | {val_acc:<10.4f} | {val_f1:<10.4f} | {test_acc:<10.4f} | {test_f1:<10.4f} | {model_dir}")

2025-05-10 00:40:55,321 - INFO - Successfully loaded Swahili dataset from HuggingFace Hub
2025-05-10 00:40:57,013 - INFO - Successfully loaded Portuguese dataset from HuggingFace Hub
2025-05-10 00:40:57,013 - INFO - Successfully loaded Portuguese dataset from HuggingFace Hub
2025-05-10 00:40:57,269 - INFO - Successfully loaded Sesotho dataset from CSV
2025-05-10 00:40:57,271 - INFO - Available datasets: ['Swahili', 'Portuguese', 'Sesotho']
2025-05-10 00:40:57,272 - INFO - 
==== FINE-TUNING ON SWAHILI DATASET ====
2025-05-10 00:40:57,272 - INFO - Starting fine-tuning for Swahili
2025-05-10 00:40:57,273 - INFO - Using device: cuda
2025-05-10 00:40:57,269 - INFO - Successfully loaded Sesotho dataset from CSV
2025-05-10 00:40:57,271 - INFO - Available datasets: ['Swahili', 'Portuguese', 'Sesotho']
2025-05-10 00:40:57,272 - INFO - 
==== FINE-TUNING ON SWAHILI DATASET ====
2025-05-10 00:40:57,272 - INFO - Starting fine-tuning for Swahili
2025-05-10 00:40:57,273 - INFO - Using device: cuda
So

Step,Training Loss
10,1.0513
20,0.8407
30,0.8912
40,0.9893
50,0.9284
60,0.9097
70,0.9544
80,0.9856
90,0.9255
100,0.8498


2025-05-10 00:42:41,517 - INFO - Model saved to ./models/swahili/model.pt
2025-05-10 00:42:41,567 - INFO - Tokenizer saved to ./models/swahili
2025-05-10 00:42:41,567 - INFO - Label mapping saved to ./models/swahili/label_mapping.json
2025-05-10 00:42:41,568 - INFO - Model config saved to ./models/swahili/config.json
2025-05-10 00:42:41,568 - INFO - Training completed in 102.97 seconds
2025-05-10 00:42:41,569 - INFO - Evaluating on validation set with 453 examples
2025-05-10 00:42:41,567 - INFO - Tokenizer saved to ./models/swahili
2025-05-10 00:42:41,567 - INFO - Label mapping saved to ./models/swahili/label_mapping.json
2025-05-10 00:42:41,568 - INFO - Model config saved to ./models/swahili/config.json
2025-05-10 00:42:41,568 - INFO - Training completed in 102.97 seconds
2025-05-10 00:42:41,569 - INFO - Evaluating on validation set with 453 examples


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025-05-10 00:42:44,114 - INFO - Evaluating on test set with 748 examples
2025-05-10 00:42:48,308 - INFO - Model metrics saved to ./models/swahili/metrics.json
2025-05-10 00:42:48,309 - INFO - Training loss: 0.8556
2025-05-10 00:42:48,310 - INFO - Validation results: {'eval_loss': 0.9941654205322266, 'eval_accuracy': 0.5209713024282561, 'eval_f1': 0.48724343479019466, 'eval_precision': 0.458357269134707, 'eval_recall': 0.5209713024282561, 'eval_runtime': 2.5441, 'eval_samples_per_second': 178.059, 'eval_steps_per_second': 11.399, 'epoch': 3.0}
2025-05-10 00:42:48,310 - INFO - Test results: {'eval_loss': 0.9500337243080139, 'eval_accuracy': 0.5347593582887701, 'eval_f1': 0.504784295364511, 'eval_precision': 0.5145401534051629, 'eval_recall': 0.5347593582887701, 'eval_runtime': 4.1913, 'eval_samples_per_second': 178.465, 'eval_steps_per_second': 11.214, 'epoch': 3.0}
2025-05-10 00:42:48,308 - INFO - Model metrics sa

Step,Training Loss
10,1.0257
20,1.1048
30,1.0352
40,0.99
50,1.0691
60,1.0036
70,1.052
80,1.0491
90,0.9734
100,0.9353


2025-05-10 00:45:48,836 - INFO - Model saved to ./models/portuguese/model.pt
2025-05-10 00:45:48,903 - INFO - Tokenizer saved to ./models/portuguese
2025-05-10 00:45:48,904 - INFO - Label mapping saved to ./models/portuguese/label_mapping.json
2025-05-10 00:45:48,905 - INFO - Model config saved to ./models/portuguese/config.json
2025-05-10 00:45:48,905 - INFO - Training completed in 179.09 seconds
2025-05-10 00:45:48,906 - INFO - Evaluating on validation set with 767 examples
2025-05-10 00:45:48,903 - INFO - Tokenizer saved to ./models/portuguese
2025-05-10 00:45:48,904 - INFO - Label mapping saved to ./models/portuguese/label_mapping.json
2025-05-10 00:45:48,905 - INFO - Model config saved to ./models/portuguese/config.json
2025-05-10 00:45:48,905 - INFO - Training completed in 179.09 seconds
2025-05-10 00:45:48,906 - INFO - Evaluating on validation set with 767 examples


2025-05-10 00:45:53,405 - INFO - Evaluating on test set with 3662 examples
2025-05-10 00:46:15,012 - INFO - Model metrics saved to ./models/portuguese/metrics.json
2025-05-10 00:46:15,014 - INFO - Training loss: 0.5003
2025-05-10 00:46:15,014 - INFO - Validation results: {'eval_loss': 0.8650572299957275, 'eval_accuracy': 0.6192959582790091, 'eval_f1': 0.6183814552471274, 'eval_precision': 0.6188338690391852, 'eval_recall': 0.6192959582790091, 'eval_runtime': 4.4974, 'eval_samples_per_second': 170.542, 'eval_steps_per_second': 10.673, 'epoch': 3.0}
2025-05-10 00:46:15,014 - INFO - Test results: {'eval_loss': 0.8389994502067566, 'eval_accuracy': 0.6318951392681594, 'eval_f1': 0.6440944722079657, 'eval_precision': 0.668456862074707, 'eval_recall': 0.6318951392681594, 'eval_runtime': 21.6037, 'eval_samples_per_second': 169.508, 'eval_steps_per_second': 10.6, 'epoch': 3.0}
2025-05-10 00:46:15,012 - INFO - Model metrics saved to ./models/portuguese/metrics.json
2025-05-10 00:46:15,014 - INFO

Step,Training Loss
10,0.8599
20,0.791
30,0.7293
40,0.726
50,0.7252
60,0.6236
70,0.7718
80,0.7042
90,0.7501
100,0.7576


2025-05-10 00:48:00,473 - INFO - Model saved to ./models/sesotho/model.pt
2025-05-10 00:48:00,544 - INFO - Tokenizer saved to ./models/sesotho
2025-05-10 00:48:00,545 - INFO - Label mapping saved to ./models/sesotho/label_mapping.json
2025-05-10 00:48:00,546 - INFO - Model config saved to ./models/sesotho/config.json
2025-05-10 00:48:00,546 - INFO - Training completed in 103.94 seconds
2025-05-10 00:48:00,547 - INFO - Evaluating on validation set with 218 examples
2025-05-10 00:48:00,544 - INFO - Tokenizer saved to ./models/sesotho
2025-05-10 00:48:00,545 - INFO - Label mapping saved to ./models/sesotho/label_mapping.json
2025-05-10 00:48:00,546 - INFO - Model config saved to ./models/sesotho/config.json
2025-05-10 00:48:00,546 - INFO - Training completed in 103.94 seconds
2025-05-10 00:48:00,547 - INFO - Evaluating on validation set with 218 examples


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025-05-10 00:48:01,850 - INFO - Evaluating on test set with 218 examples
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
2025-05-10 00:48:03,137 - INFO - Model metrics saved to ./models/sesotho/metrics.json
2025-05-10 00:48:03,139 - INFO - Training loss: 0.5543
2025-05-10 00:48:03,139 - INFO - Validation results: {'eval_loss': 0.5815077424049377, 'eval_accuracy': 0.7706422018348624, 'eval_f1': 0.758784555256037, 'eval_precision': 0.7473307497627334, 'eval_recall': 0.7706422018348624, 'eval_runtime': 1.3009, 'eval_samples_per_second': 167.578, 'eval_steps_per_second': 10.762, 'epoch': 3.0}
2025-05-10 00:48:03,139 - INFO - Test results: {'eval_loss': 0.6644694209098816, 'eval_accuracy': 0.7064220183486238, 'eval_f1': 0.6912700646963953, 'eval_precision': 0.6767571962526091, 'eval_recall': 0.7064220183486238, 'eval_runtime': 1.2845, 'eval_samples_per_second': 169.71, 'eval_steps_per_second':

Dataset      | Acc (val)  | F1 (val)   | Acc (test) | F1 (test)  | Model Dir           
Swahili      | 0.5210     | 0.4872     | 0.5348     | 0.5048     | ./models/swahili
Portuguese   | 0.6193     | 0.6184     | 0.6319     | 0.6441     | ./models/portuguese
Sesotho      | 0.7706     | 0.7588     | 0.7064     | 0.6913     | ./models/sesotho


## XLM-RoBERTa
### Baseline with [XLM-RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/xlm-roberta)


In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import time

run_timestamp = time.strftime("%Y%m%d_%H%M%S")

os.makedirs("./benchmark_results", exist_ok=True)

output_csv = os.path.abspath(f"./benchmark_results/xlmroberta_benchmark_results_{run_timestamp}.csv")

results_df = pd.DataFrame(columns=[
    "Dataset", "Model", "Loss", "Accuracy", "F1", "Precision", "Recall"
])

try:
    swa_dataset = load_dataset("masakhane/afrisenti", "swa")
except Exception as e:
    swa_dataset = None

try:
    por_dataset = load_dataset("masakhane/afrisenti", "por")
except Exception as e:
    por_dataset = None

try:
    sot_dataset = load_dataset("csv", data_files={"train": "./datasets/sotho-news/sotho_news_dataset.csv"})
except Exception as e:
    sot_dataset = None
    
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def get_benchmark_metrics(dataset_name, dataset, num_labels):
    
    if dataset is None:
        return None
        
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    model = AutoModelForSequenceClassification.from_pretrained(
        "xlm-roberta-base", 
        num_labels=num_labels
    ).to(device)
    
    text_column = None
    label_column = None
    
    text_candidates = ["text", "content", "tweet", "sentence", "document"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in text_candidates:
            if candidate in columns:
                text_column = candidate
                break
        if text_column:
            break
    
    label_candidates = ["label", "sentiment", "class", "category"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in label_candidates:
            if candidate in columns:
                label_column = candidate
                break
        if label_column:
            break
    
    if text_column is None:
        for split in dataset:
            for col in dataset[split].column_names:
                if isinstance(dataset[split][col][0], str):
                    text_column = col
                    break
            if text_column:
                break
    
    if text_column is None:
        return None
    
    if label_column is None:
        return None
    
    required_splits = ["train", "validation", "test"]
    missing_splits = [split for split in required_splits if split not in dataset]
    
    if missing_splits:
        if "train" in dataset:
            train_valid_test = {}
            
            for split in dataset:
                if split in required_splits:
                    train_valid_test[split] = dataset[split]
            
            if "train" in dataset and ("validation" not in train_valid_test or "test" not in train_valid_test):
                if "validation" not in train_valid_test:
                    if "test" not in train_valid_test:
                        split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)
                        test_valid_split = split_datasets["test"].train_test_split(test_size=0.5, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = test_valid_split["train"]
                        train_valid_test["test"] = test_valid_split["test"]
                    else:
                        split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = split_datasets["test"]
                else:
                    split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                    train_valid_test["test"] = split_datasets["test"]
            
            dataset = DatasetDict(train_valid_test)
        else:
            return None
    
    label_mapping = None
    for split in dataset:
        if isinstance(dataset[split][label_column][0], str):
            all_labels = set()
            for example in dataset[split][label_column]:
                all_labels.add(example)
            
            label_mapping = {label: i for i, label in enumerate(sorted(all_labels))}
            break
    
    processed_dataset = DatasetDict()
    for split_name, split_dataset in dataset.items():
        texts = split_dataset[text_column]
        
        if label_mapping:
            labels = [label_mapping[label] for label in split_dataset[label_column]]
        else:
            labels = []
            for label in split_dataset[label_column]:
                if isinstance(label, (int, np.integer)):
                    labels.append(int(label))
                elif isinstance(label, str):
                    try:
                        labels.append(int(label))
                    except ValueError:
                        labels.append(0)
                else:
                    labels.append(0)
        
        processed_dataset[split_name] = Dataset.from_dict({
            "text": texts,
            "label": labels
        })
        
    def tokenize_function(examples):
        return tokenizer(
            examples["text"], 
            padding="max_length", 
            truncation=True, 
            max_length=128,
            return_tensors="pt"
        )
    
    tokenized_dataset = DatasetDict()
    for split_name, split_dataset in processed_dataset.items():
        tokenized_split = split_dataset.map(
            tokenize_function, 
            batched=True, 
            remove_columns=["text"]
        )
        tokenized_dataset[split_name] = tokenized_split
        
        for required_col in ["input_ids", "attention_mask", "label"]:
            if required_col not in tokenized_split.column_names:
                return None
    
    eval_args = TrainingArguments(
        output_dir=f"./benchmark_results/{dataset_name}_{run_timestamp}",
        per_device_eval_batch_size=16,
        logging_dir=f"./benchmark_results/{dataset_name}_{run_timestamp}/logs",
        report_to="none",
        remove_unused_columns=True
    )
    
    trainer = Trainer(
        model=model,
        args=eval_args,
        compute_metrics=compute_metrics,
    )
    
    try:
        benchmark_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
        
        return benchmark_results
    except Exception as e:
        return None

available_datasets = []
if swa_dataset:
    available_datasets.append(("Swahili", swa_dataset, 3))
if por_dataset:
    available_datasets.append(("Portuguese", por_dataset, 3))
if sot_dataset:
    available_datasets.append(("Sesotho", sot_dataset, 3))

all_results = []
for dataset_name, dataset, num_labels in available_datasets:
    try:
        results = get_benchmark_metrics(dataset_name, dataset, num_labels)
        
        if results:
            results_df = results_df._append({
                "Dataset": dataset_name,
                "Model": "XLM-RoBERTa",
                "Loss": results.get("eval_loss"),
                "Accuracy": results.get("eval_accuracy"),
                "F1": results.get("eval_f1"),
                "Precision": results.get("eval_precision"),
                "Recall": results.get("eval_recall")
            }, ignore_index=True)
            
            all_results.append({"Dataset": dataset_name, "Results": results})
        else:
            pass
    except Exception as e:
        pass

try:
    results_df.to_csv(output_csv, index=False)
except Exception as e:
    emergency_path = f"./emergency_results_{run_timestamp}.csv"
    results_df.to_csv(emergency_path, index=False)

print("\n==== BENCHMARKING COMPLETE ====")
print("\nResults Summary:")
print(results_df)

  from .autonotebook import tqdm as notebook_tqdm
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  results_df = results_df._append({
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 3063/3063 [00:00<00:00, 17953.28 examples/s]
Map:   0%|          | 0/767 [00:00<?, ? examples/s]
Map: 100%|██████████| 767/767 [00:00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1741/1741 [00:00<00:00, 20828.20 examples/s]
Map:   0%|          | 0/218 [00:00<?, ? examples/s]
Map: 100%|██████████| 218/218 [00:00<00:00, 12974.04 examples/s]
Map:   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



==== BENCHMARKING COMPLETE ====

Results Summary:
      Dataset        Model      Loss  Accuracy        F1  Precision    Recall
0     Swahili  XLM-RoBERTa  1.185747  0.106952  0.020667   0.011439  0.106952
1  Portuguese  XLM-RoBERTa  1.232671  0.171491  0.050220   0.029417  0.171491
2     Sesotho  XLM-RoBERTa  1.040040  0.247706  0.098354   0.061358  0.247706


In [None]:
#run this on google colab, my gpu doesnt have enough vram - Troy
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset, DatasetDict
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import os
import logging
import time
import sys
import json
import shutil

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)
logger = logging.getLogger(__name__)

try:
    swa_dataset = load_dataset("masakhane/afrisenti", "swa")
    logger.info("Successfully loaded Swahili dataset from HuggingFace Hub")
except Exception as e:
    logger.error(f"Failed to load Swahili dataset: {e}")
    swa_dataset = None

try:
    por_dataset = load_dataset("masakhane/afrisenti", "por") 
    logger.info("Successfully loaded Portuguese dataset from HuggingFace Hub")
except Exception as e:
    logger.error(f"Failed to load Portuguese dataset: {e}")
    por_dataset = None

try:
    sot_dataset = load_dataset("csv", data_files={"train": "./datasets/sotho-news/sotho_news_dataset.csv"})
    logger.info("Successfully loaded Sesotho dataset from CSV")
except Exception as e:
    logger.error(f"Failed to load Sesotho dataset: {e}")
    sot_dataset = None
    
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

def finetune_and_evaluate(dataset_name, dataset, num_labels):
    logger.info(f"Starting fine-tuning for {dataset_name}")
    
    if dataset is None:
        logger.error(f"Dataset {dataset_name} is None, cannot proceed with fine-tuning")
        return None
        
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")
    
    model_save_dir = f"./models/{dataset_name.lower()}"
    os.makedirs(model_save_dir, exist_ok=True)
    
    tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
    model = AutoModelForSequenceClassification.from_pretrained(
        "xlm-roberta-base", 
        num_labels=num_labels
    ).to(device)
    
    logger.info(f"Dataset {dataset_name} structure:")
    for split in dataset:
        logger.info(f"  - Split: {split}, Examples: {len(dataset[split])}")
        logger.info(f"  - Features: {dataset[split].features}")
        logger.info(f"  - Columns: {dataset[split].column_names}")
    
    text_column = None
    label_column = None
    
    text_candidates = ["text", "content", "tweet", "sentence", "document", "headline"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in text_candidates:
            if candidate in columns:
                text_column = candidate
                break
        if text_column:
            break
    
    label_candidates = ["label", "sentiment", "class", "category"]
    for split in dataset:
        columns = dataset[split].column_names
        for candidate in label_candidates:
            if candidate in columns:
                label_column = candidate
                break
        if label_column:
            break
    
    logger.info(f"For {dataset_name}, using text_column={text_column}, label_column={label_column}")
    
    if text_column is None:
        for split in dataset:
            for col in dataset[split].column_names:
                if isinstance(dataset[split][col][0], str):
                    text_column = col
                    logger.info(f"Using '{col}' as text column based on string data")
                    break
            if text_column:
                break
    
    if text_column is None:
        logger.error(f"Could not identify a text column for {dataset_name}")
        return None
    
    if label_column is None:
        logger.error(f"Could not identify a label column for {dataset_name}")
        return None
    
    required_splits = ["train", "validation", "test"]
    missing_splits = [split for split in required_splits if split not in dataset]
    
    if missing_splits:
        logger.info(f"Creating missing splits: {missing_splits} for {dataset_name}")
        if "train" in dataset:
            train_valid_test = {}
            
            for split in dataset:
                if split in required_splits:
                    train_valid_test[split] = dataset[split]
            
            if "train" in dataset and ("validation" not in train_valid_test or "test" not in train_valid_test):
                if "validation" not in train_valid_test:
                    if "test" not in train_valid_test:
                        split_datasets = dataset["train"].train_test_split(test_size=0.2, seed=42)
                        test_valid_split = split_datasets["test"].train_test_split(test_size=0.5, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = test_valid_split["train"]
                        train_valid_test["test"] = test_valid_split["test"]
                    else:
                        split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                        train_valid_test["train"] = split_datasets["train"]
                        train_valid_test["validation"] = split_datasets["test"]
                else:
                    split_datasets = dataset["train"].train_test_split(test_size=0.1, seed=42)
                    train_valid_test["test"] = split_datasets["test"]
            
            dataset = DatasetDict(train_valid_test)
        else:
            logger.error(f"Dataset {dataset_name} has no train split and cannot create splits")
            return None
    
    label_mapping = None
    for split in dataset:
        if isinstance(dataset[split][label_column][0], str):
            all_labels = set()
            for example in dataset[split][label_column]:
                all_labels.add(example)
            
            label_mapping = {label: i for i, label in enumerate(sorted(all_labels))}
            logger.info(f"Created label mapping for {dataset_name}: {label_mapping}")
            break
    
    processed_dataset = DatasetDict()
    for split_name, split_dataset in dataset.items():
        texts = split_dataset[text_column]
        
        if label_mapping:
            labels = [label_mapping[label] for label in split_dataset[label_column]]
        else:
            labels = []
            for label in split_dataset[label_column]:
                if isinstance(label, (int, np.integer)):
                    labels.append(int(label))
                elif isinstance(label, str):
                    try:
                        labels.append(int(label))
                    except ValueError:
                        logger.warning(f"Unexpected string label found in {split_name}: {label}")
                        labels.append(0)
                else:
                    logger.warning(f"Unexpected label type in {split_name}: {type(label)}")
                    labels.append(0)
        
        processed_dataset[split_name] = Dataset.from_dict({
            "text": texts,
            "label": labels
        })
        
        logger.info(f"Processed {split_name} split: {len(processed_dataset[split_name])} examples")
    
    def tokenize_function(examples):
        return tokenizer(
            examples["text"], 
            padding="max_length", 
            truncation=True, 
            max_length=128,
        )
    
    tokenized_dataset = DatasetDict()
    for split_name, split_dataset in processed_dataset.items():
        tokenized_split = split_dataset.map(
            tokenize_function, 
            batched=True, 
            remove_columns=["text"]
        )
        tokenized_dataset[split_name] = tokenized_split
        
        logger.info(f"Tokenized {split_name} split: {len(tokenized_split)} examples")
        logger.info(f"Columns after tokenization: {tokenized_split.column_names}")
        
        for required_col in ["input_ids", "attention_mask", "label"]:
            if required_col not in tokenized_split.column_names:
                logger.error(f"Required column {required_col} missing after tokenization")
                return None
    
    model_output_dir = f"./tmp_model_dir_{dataset_name}"
    if os.path.exists(model_output_dir):
        shutil.rmtree(model_output_dir)
    os.makedirs(model_output_dir, exist_ok=True)
    
    training_args = TrainingArguments(
        output_dir=model_output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir="./tmp_logs",
        logging_steps=10,
        save_strategy="no",
        save_steps=1000000,
        eval_steps=100,
        do_eval=True,
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        compute_metrics=compute_metrics,
    )
    
    logger.info(f"Starting fine-tuning {dataset_name} with {len(tokenized_dataset['train'])} examples")
    start_time = time.time()
    
    try:
        try:
            train_output = trainer.train()
            
            training_loss = None
            try:
                if hasattr(train_output, "metrics") and "loss" in train_output.metrics:
                    training_loss = train_output.metrics["loss"]
                elif isinstance(train_output, dict) and "loss" in train_output:
                    training_loss = train_output["loss"]
                
                if training_loss is None and hasattr(trainer, "state"):
                    if hasattr(trainer.state, "log_history") and trainer.state.log_history:
                        for log in reversed(trainer.state.log_history):
                            if "loss" in log:
                                training_loss = log["loss"]
                                break
            except Exception as e:
                logger.warning(f"Could not extract training loss: {e}")
        except RuntimeError as e:
            if "PytorchStreamWriter failed writing file" in str(e):
                logger.warning("Caught PyTorch serialization error during training. Will proceed with model saving.")
                
            else:
                
                raise
        
        try:
            model_path = os.path.join(model_save_dir, "model.pt")
            torch.save(model.state_dict(), model_path)
            logger.info(f"Model saved to {model_path}")
        except RuntimeError as e:
            if "PytorchStreamWriter failed writing file" in str(e):
                logger.warning(f"PyTorch serialization error when saving model state dict. Trying alternative method.")
                
                try:
                    
                    with open(model_path, 'wb') as f:
                        torch.save(model.state_dict(), f, _use_new_zipfile_serialization=False)
                    logger.info(f"Model saved to {model_path} using legacy serialization")
                except Exception as e2:
                    logger.error(f"Failed to save model with alternative method: {e2}")
            else:
                
                raise
        
        tokenizer.save_pretrained(model_save_dir)
        logger.info(f"Tokenizer saved to {model_save_dir}")
        
        if label_mapping:
            label_mapping_path = os.path.join(model_save_dir, "label_mapping.json")
            with open(label_mapping_path, "w") as f:
                
                json_mapping = {str(k): int(v) for k, v in label_mapping.items()}
                json.dump(json_mapping, f, indent=2)
            logger.info(f"Label mapping saved to {label_mapping_path}")
            
        model_config = {
            "base_model": "xlm-roberta-base",
            "num_labels": num_labels,
            "text_column": text_column,
            "label_column": label_column,
            "max_length": 128,
            
            "dataset_name": dataset_name,
        }
        
        config_path = os.path.join(model_save_dir, "config.json")
        with open(config_path, "w") as f:
            json.dump(model_config, f, indent=2)
        logger.info(f"Model config saved to {config_path}")
        
        training_time = time.time() - start_time
        logger.info(f"Training completed in {training_time:.2f} seconds")
        
        logger.info(f"Evaluating on validation set with {len(tokenized_dataset['validation'])} examples")
        validation_results = trainer.evaluate(eval_dataset=tokenized_dataset["validation"])
        
        logger.info(f"Evaluating on test set with {len(tokenized_dataset['test'])} examples")
        test_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
        
        
        processed_validation = {}
        for k, v in validation_results.items():
            if isinstance(v, (int, float, str, bool)) or v is None:
                processed_validation[k] = v
            else:
                processed_validation[k] = float(v)
                
        processed_test = {}
        for k, v in test_results.items():
            if isinstance(v, (int, float, str, bool)) or v is None:
                processed_test[k] = v
            else:
                processed_test[k] = float(v)
        
        metrics = {
            "training_time": float(training_time),
            "training_loss": float(training_loss) if training_loss is not None else None,
            "validation_results": processed_validation,
            "test_results": processed_test,
        }
        
        metrics_path = os.path.join(model_save_dir, "metrics.json")
        with open(metrics_path, "w") as f:
            json.dump(metrics, f, indent=2)
        logger.info(f"Model metrics saved to {metrics_path}")
        
        logger.info(f"============== RESULTS FOR {dataset_name} ==============")
        logger.info(f"Training loss: {training_loss}")
        logger.info(f"Validation results: {validation_results}")
        logger.info(f"Test results: {test_results}")
        
        return {
            "dataset": dataset_name,
            "training_time": training_time,
            "training_loss": training_loss,
            "validation_results": validation_results,
            "test_results": test_results,
            "model_path": model_path,
        }
    except Exception as e:
        logger.error(f"Fine-tuning failed for {dataset_name}: {e}", exc_info=True)
        return None

available_datasets = []
if swa_dataset:
    available_datasets.append(("Swahili", swa_dataset, 3))
if por_dataset:
    available_datasets.append(("Portuguese", por_dataset, 3))
if sot_dataset:
    available_datasets.append(("Sesotho", sot_dataset, 3))

logger.info(f"Available datasets: {[name for name, _, _ in available_datasets]}")

all_results = []
for dataset_name, dataset, num_labels in available_datasets:
    logger.info(f"\n==== FINE-TUNING ON {dataset_name.upper()} DATASET ====")a
    try:
        results = finetune_and_evaluate(dataset_name, dataset, num_labels)
        
        if results:
            all_results.append(results)
        else:
            logger.warning(f"No results returned for {dataset_name}")
    except Exception as e:
        logger.error(f"Unexpected error during fine-tuning {dataset_name}: {e}", exc_info=True)

logger.info("\n==== FINE-TUNING COMPLETE ====")

print(f"{'Dataset':<12} | {'Acc (val)':<10} | {'F1 (val)':<10} | {'Acc (test)':<10} | {'F1 (test)':<10} | {'Model Dir':<20}")

for result in all_results:
    dataset = result["dataset"]
    val_acc = result["validation_results"].get("eval_accuracy", float('nan'))
    val_f1 = result["validation_results"].get("eval_f1", float('nan'))
    test_acc = result["test_results"].get("eval_accuracy", float('nan'))
    test_f1 = result["test_results"].get("eval_f1", float('nan'))
    model_dir = f"./models/{dataset.lower()}"
    
    print(f"{dataset:<12} | {val_acc:<10.4f} | {val_f1:<10.4f} | {test_acc:<10.4f} | {test_f1:<10.4f} | {model_dir}")

  from .autonotebook import tqdm as notebook_tqdm


2025-05-12 08:11:45,859 - INFO - Successfully loaded Swahili dataset from HuggingFace Hub
2025-05-12 08:11:47,817 - INFO - Successfully loaded Portuguese dataset from HuggingFace Hub
2025-05-12 08:11:48,086 - INFO - Successfully loaded Sesotho dataset from CSV
2025-05-12 08:11:48,088 - INFO - Available datasets: ['Swahili', 'Portuguese', 'Sesotho']
2025-05-12 08:11:48,088 - INFO - 
==== FINE-TUNING ON SWAHILI DATASET ====
2025-05-12 08:11:48,089 - INFO - Starting fine-tuning for Swahili
2025-05-12 08:11:48,209 - INFO - Using device: cuda


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2025-05-12 08:11:50,147 - INFO - Dataset Swahili structure:
2025-05-12 08:11:50,148 - INFO -   - Split: train, Examples: 1810
2025-05-12 08:11:50,149 - INFO -   - Features: {'tweet': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
2025-05-12 08:11:50,149 - INFO -   - Columns: ['tweet', 'label']
2025-05-12 08:11:50,150 - INFO -   - Split: validation, Examples: 453
2025-05-12 08:11:50,150 - INFO -   - Features: {'tweet': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
2025-05-12 08:11:50,151 - INFO -   - Columns: ['tweet', 'label']
2025-05-12 08:11:50,151 - INFO -   - Split: test, Examples: 748
2025-05-12 08:11:50,151 - INFO -   - Features: {'tweet': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
2025-05-12 08:11:50,152 - INFO -   - Columns: ['tweet', 'label']
2025-05-12 08:11:50,152 - INFO - For Swahili, using text_column=tweet, label_column=label
2025-05-12 08:11:50,156 - INFO - Created label mapping for Swahil

Map: 100%|██████████| 1810/1810 [00:00<00:00, 21445.33 examples/s]

2025-05-12 08:11:50,285 - INFO - Tokenized train split: 1810 examples
2025-05-12 08:11:50,285 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']



Map: 100%|██████████| 453/453 [00:00<00:00, 21110.39 examples/s]

2025-05-12 08:11:50,328 - INFO - Tokenized validation split: 453 examples
2025-05-12 08:11:50,328 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']



Map: 100%|██████████| 748/748 [00:00<00:00, 21557.73 examples/s]

2025-05-12 08:11:50,383 - INFO - Tokenized test split: 748 examples
2025-05-12 08:11:50,383 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']





2025-05-12 08:11:50,408 - INFO - Starting fine-tuning Swahili with 1810 examples


Step,Training Loss


2025-05-12 08:11:51,406 - ERROR - Fine-tuning failed for Swahili: CUDA out of memory. Tried to allocate 734.00 MiB. GPU 0 has a total capacity of 5.68 GiB of which 550.44 MiB is free. Including non-PyTorch memory, this process has 5.08 GiB memory in use. Of the allocated memory 4.16 GiB is allocated by PyTorch, and 795.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/tmp/ipykernel_11049/2726313652.py", line 254, in finetune_and_evaluate
    train_output = trainer.train()
  File "/home/troy/Documents/nlp-research/.venv/lib/python3.10/site-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
  File "/home/troy/Documents/nlp-research/.venv/lib/python3.10/site-packages/transfo

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2025-05-12 08:11:53,200 - INFO - Dataset Portuguese structure:
2025-05-12 08:11:53,201 - INFO -   - Split: train, Examples: 3063
2025-05-12 08:11:53,201 - INFO -   - Features: {'tweet': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
2025-05-12 08:11:53,202 - INFO -   - Columns: ['tweet', 'label']
2025-05-12 08:11:53,202 - INFO -   - Split: validation, Examples: 767
2025-05-12 08:11:53,202 - INFO -   - Features: {'tweet': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
2025-05-12 08:11:53,203 - INFO -   - Columns: ['tweet', 'label']
2025-05-12 08:11:53,203 - INFO -   - Split: test, Examples: 3662
2025-05-12 08:11:53,203 - INFO -   - Features: {'tweet': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
2025-05-12 08:11:53,203 - INFO -   - Columns: ['tweet', 'label']
2025-05-12 08:11:53,204 - INFO - For Portuguese, using text_column=tweet, label_column=label
2025-05-12 08:11:53,209 - INFO - Created label mapping for

Map: 100%|██████████| 3063/3063 [00:00<00:00, 21644.46 examples/s]

2025-05-12 08:11:53,408 - INFO - Tokenized train split: 3063 examples
2025-05-12 08:11:53,408 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']



Map: 100%|██████████| 767/767 [00:00<00:00, 20834.34 examples/s]

2025-05-12 08:11:53,469 - INFO - Tokenized validation split: 767 examples
2025-05-12 08:11:53,470 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']



Map: 100%|██████████| 3662/3662 [00:00<00:00, 22204.18 examples/s]

2025-05-12 08:11:53,658 - INFO - Tokenized test split: 3662 examples
2025-05-12 08:11:53,658 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']





2025-05-12 08:11:53,681 - INFO - Starting fine-tuning Portuguese with 3063 examples


Step,Training Loss


2025-05-12 08:11:54,496 - ERROR - Fine-tuning failed for Portuguese: CUDA out of memory. Tried to allocate 734.00 MiB. GPU 0 has a total capacity of 5.68 GiB of which 538.44 MiB is free. Including non-PyTorch memory, this process has 5.09 GiB memory in use. Of the allocated memory 4.17 GiB is allocated by PyTorch, and 806.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/tmp/ipykernel_11049/2726313652.py", line 254, in finetune_and_evaluate
    train_output = trainer.train()
  File "/home/troy/Documents/nlp-research/.venv/lib/python3.10/site-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
  File "/home/troy/Documents/nlp-research/.venv/lib/python3.10/site-packages/tran

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


2025-05-12 08:11:56,281 - INFO - Dataset Sesotho structure:
2025-05-12 08:11:56,282 - INFO -   - Split: train, Examples: 2177
2025-05-12 08:11:56,282 - INFO -   - Features: {'headline': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}
2025-05-12 08:11:56,283 - INFO -   - Columns: ['headline', 'label']
2025-05-12 08:11:56,283 - INFO - For Sesotho, using text_column=headline, label_column=label
2025-05-12 08:11:56,283 - INFO - Creating missing splits: ['validation', 'test'] for Sesotho
2025-05-12 08:11:56,302 - INFO - Created label mapping for Sesotho: {'negative': 0, 'neutral': 1, 'positive': 2}
2025-05-12 08:11:56,319 - INFO - Processed train split: 1741 examples
2025-05-12 08:11:56,324 - INFO - Processed validation split: 218 examples
2025-05-12 08:11:56,329 - INFO - Processed test split: 218 examples


Map: 100%|██████████| 1741/1741 [00:00<00:00, 25323.50 examples/s]

2025-05-12 08:11:56,423 - INFO - Tokenized train split: 1741 examples
2025-05-12 08:11:56,424 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']



Map: 100%|██████████| 218/218 [00:00<00:00, 20183.62 examples/s]

2025-05-12 08:11:56,458 - INFO - Tokenized validation split: 218 examples
2025-05-12 08:11:56,459 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']



Map: 100%|██████████| 218/218 [00:00<00:00, 19098.47 examples/s]

2025-05-12 08:11:56,495 - INFO - Tokenized test split: 218 examples
2025-05-12 08:11:56,496 - INFO - Columns after tokenization: ['label', 'input_ids', 'attention_mask']
2025-05-12 08:11:56,520 - INFO - Starting fine-tuning Sesotho with 1741 examples





Step,Training Loss


2025-05-12 08:11:57,338 - ERROR - Fine-tuning failed for Sesotho: CUDA out of memory. Tried to allocate 734.00 MiB. GPU 0 has a total capacity of 5.68 GiB of which 494.44 MiB is free. Including non-PyTorch memory, this process has 5.13 GiB memory in use. Of the allocated memory 4.17 GiB is allocated by PyTorch, and 847.37 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
Traceback (most recent call last):
  File "/tmp/ipykernel_11049/2726313652.py", line 254, in finetune_and_evaluate
    train_output = trainer.train()
  File "/home/troy/Documents/nlp-research/.venv/lib/python3.10/site-packages/transformers/trainer.py", line 2245, in train
    return inner_training_loop(
  File "/home/troy/Documents/nlp-research/.venv/lib/python3.10/site-packages/transfo

# Data Augmentation

Firstly, we want to check the actual distribution of the classes for Mozambican Portuguese

In [5]:
import os
from datasets import load_from_disk, load_dataset
from collections import Counter

def analyze_sentiment_distribution(dataset, split='train'):
    """Analyze sentiment distribution in a dataset with string labels."""
    if dataset is None:
        return
    
    # Count the occurrences of each sentiment label
    label_counts = Counter(dataset[split]['label'])
    
    # Print the distribution
    print(f"\nSentiment Distribution:")
    total = len(dataset[split])
    
    # Sort labels in a meaningful order: negative, neutral, positive
    ordered_labels = ['negative', 'neutral', 'positive']
    
    for label in ordered_labels:
        if label in label_counts:
            count = label_counts[label]
            percentage = (count / total) * 100
            # Capitalize first letter for display
            display_label = label.capitalize()
            print(f"{display_label}: {count} ({percentage:.2f}%)")
    
    # Check for any other labels not in our expected list
    for label, count in label_counts.items():
        if label not in ordered_labels:
            percentage = (count / total) * 100
            print(f"Other ({label}): {count} ({percentage:.2f}%)")

if __name__ == "__main__":
    swa, por, sot = load_local_datasets()

    if por is not None:
        print(f"Portuguese dataset size: {len(por['train'])} examples")
        print(f"Dataset features: {por['train'].features}")
        print("\nSample entry:")
        print(por['train'][0])
        analyze_sentiment_distribution(por)

Loading Swahili (swa) dataset from disk...
Swahili dataset loaded!
Loading Portuguese (por) dataset from disk...
Portuguese dataset loaded!
Loading Sesotho (sot) dataset from disk...
Sesotho dataset loaded!
Portuguese dataset size: 3063 examples
Dataset features: {'tweet': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None)}

Sample entry:
{'tweet': 'Pedi uma resposta a Deus, ele deu me. Estou muito triste com ela. Mas mais tarde sei que vou entender.', 'label': 'negative'}

Sentiment Distribution:
Negative: 782 (25.53%)
Neutral: 1600 (52.24%)
Positive: 681 (22.23%)


### There is a clear imbalance in the classes and so data augmentation techniques will be used such as dropout and back-translation in order to create synthetic data for the minority classes

In [9]:
import os
import random
from datasets import load_from_disk, load_dataset, Dataset
from collections import Counter

def load_local_datasets():
    swa_path = "./datasets/afrisenti/swa"
    por_path = "./datasets/afrisenti/por"
    sot_path = "./datasets/news"

    if not all(os.path.exists(path) for path in [swa_path, por_path, sot_path]):
        print("One or more dataset directories not found. Please check the paths.")
        return None, None, None

    print("Loading Swahili (swa) dataset from disk...")
    swa_dataset = load_from_disk(swa_path)
    print("Swahili dataset loaded!")

    print("Loading Portuguese (por) dataset from disk...")
    por_dataset = load_from_disk(por_path)
    print("Portuguese dataset loaded!")

    print("Loading Sesotho (sot) dataset from disk...")
    sot_dataset = load_dataset("csv", data_files="datasets/sotho-news/sotho_news_dataset.csv")
    print("Sesotho dataset loaded!")

    return swa_dataset, por_dataset, sot_dataset

def analyze_sentiment_distribution(dataset, split='train'):
    """Analyze sentiment distribution in a dataset with string labels."""
    if dataset is None:
        return
    
    # Check if this is a Dataset object with splits or just a simple Dataset
    if split in dataset:
        # This is a dataset with splits
        data = dataset[split]
    else:
        # This is a simple Dataset without splits
        data = dataset
    
    # Count the occurrences of each sentiment label
    label_counts = Counter(data['label'])
    
    # Print the distribution
    print(f"\nSentiment Distribution:")
    total = len(data)
    
    # Sort labels in a meaningful order: negative, neutral, positive
    ordered_labels = ['negative', 'neutral', 'positive']
    
    for label in ordered_labels:
        if label in label_counts:
            count = label_counts[label]
            percentage = (count / total) * 100
            # Capitalize first letter for display
            display_label = label.capitalize()
            print(f"{display_label}: {count} ({percentage:.2f}%)")
    
    # Check for any other labels not in our expected list
    for label, count in label_counts.items():
        if label not in ordered_labels:
            percentage = (count / total) * 100
            print(f"Other ({label}): {count} ({percentage:.2f}%)")

def simple_augment_text(text):
    """Simple text augmentation without relying on translation models"""
    augmentation_techniques = [
        lambda t: word_deletion(t, p=0.1),
        lambda t: word_swap(t, p=0.1),
        lambda t: add_punctuation(t)
    ]
    
    # Randomly select an augmentation technique
    technique = random.choice(augmentation_techniques)
    return technique(text)

def word_deletion(text, p=0.1):
    """Randomly delete words with probability p"""
    words = text.split()
    if len(words) <= 3:  # Don't delete from very short texts
        return text
        
    new_words = []
    for word in words:
        if random.random() > p:  # Keep the word with probability (1-p)
            new_words.append(word)
    
    # Ensure we don't delete all words
    if not new_words:
        return random.choice(words)
        
    return ' '.join(new_words)

def word_swap(text, p=0.1):
    """Randomly swap adjacent words with probability p"""
    words = text.split()
    if len(words) <= 1:
        return text
        
    for i in range(len(words) - 1):
        if random.random() < p:
            words[i], words[i+1] = words[i+1], words[i]
    
    return ' '.join(words)

def add_punctuation(text):
    """Add or modify punctuation without changing meaning"""
    # Add emphasis for positive/negative texts
    if text[-1] not in '!?.':
        if random.random() < 0.5:
            text += '!'
        else:
            text += '.'
    elif text[-1] == '.' and random.random() < 0.3:
        text = text[:-1] + '!'
    
    return text

def balance_and_augment_dataset(dataset, target_neutral_ratio=0.7, split='train'):
    """
    Balance and augment dataset:
    1. Undersample neutral class
    2. Augment negative and positive classes using simple techniques
    """
    # Count the occurrences of each label
    label_counts = {}
    for label in ['negative', 'neutral', 'positive']:
        label_counts[label] = sum(1 for l in dataset[split]['label'] if l == label)
    
    # Separate data by label
    data_by_label = {
        'negative': [],
        'neutral': [],
        'positive': []
    }
    
    for i in range(len(dataset[split])):
        label = dataset[split][i]['label']
        data_by_label[label].append({
            'tweet': dataset[split][i]['tweet'],
            'label': label
        })
    
    # Undersample neutral class
    neutral_target_size = int(label_counts['neutral'] * target_neutral_ratio)
    sampled_neutral = random.sample(data_by_label['neutral'], neutral_target_size)
    
    # Prepare balanced data with originals
    balanced_data = data_by_label['negative'] + sampled_neutral + data_by_label['positive']
    
    # Augment minority classes
    augmented_data = balanced_data.copy()
    
    # Target count for each class after balancing
    target_count = max(len(data_by_label['negative']), len(data_by_label['positive']), neutral_target_size)
    
    # Augment negative class
    negative_to_add = target_count - len(data_by_label['negative'])
    if negative_to_add > 0:
        # Select samples to augment (can select the same sample multiple times)
        for _ in range(negative_to_add):
            sample = random.choice(data_by_label['negative'])
            augmented_tweet = simple_augment_text(sample['tweet'])
            augmented_data.append({'tweet': augmented_tweet, 'label': 'negative'})
    
    # Augment positive class
    positive_to_add = target_count - len(data_by_label['positive'])
    if positive_to_add > 0:
        for _ in range(positive_to_add):
            sample = random.choice(data_by_label['positive'])
            augmented_tweet = simple_augment_text(sample['tweet'])
            augmented_data.append({'tweet': augmented_tweet, 'label': 'positive'})
    
    # Shuffle the augmented data
    random.shuffle(augmented_data)
    
    # Return the augmented data directly (not as a Dataset object)
    return augmented_data

if __name__ == "__main__":
    swa, por, sot = load_local_datasets()
    
    if por is not None:
        print("Original Portuguese dataset:")
        analyze_sentiment_distribution(por)
        
        # Balance and augment dataset
        augmented_data = balance_and_augment_dataset(por)
        
        # Create a new dataset from the augmented data
        augmented_dataset = Dataset.from_dict({
            'tweet': [item['tweet'] for item in augmented_data],
            'label': [item['label'] for item in augmented_data]
        })
        
        print("\nAfter balancing and augmentation:")
        analyze_sentiment_distribution(augmented_dataset)  # Now analyzing the dataset directly
        
        # Create a dataset with splits for saving
        full_dataset_with_splits = {"train": augmented_dataset}
        full_dataset = Dataset.from_dict({
            'train': augmented_dataset
        })
        
        # Save the balanced and augmented dataset
        full_dataset.save_to_disk("./datasets/afrisenti/por_balanced_augmented")
        print("\nBalanced and augmented dataset saved!")
        

Loading Swahili (swa) dataset from disk...
Swahili dataset loaded!
Loading Portuguese (por) dataset from disk...
Portuguese dataset loaded!
Loading Sesotho (sot) dataset from disk...
Sesotho dataset loaded!
Original Portuguese dataset:

Sentiment Distribution:
Negative: 782 (25.53%)
Neutral: 1600 (52.24%)
Positive: 681 (22.23%)

After balancing and augmentation:

Sentiment Distribution:
Negative: 1120 (33.33%)
Neutral: 1120 (33.33%)
Positive: 1120 (33.33%)


Saving the dataset (1/1 shards): 100%|██████████| 3360/3360 [00:00<00:00, 660056.27 examples/s]


Balanced and augmented dataset saved!





These text augmentation techniques (word deletion, word swap, punctuation changes) are more appropriate for Mozambican Portuguese since they don't rely on external translation models that might not understand the dialect. They preserve the unique characteristics of Mozambican Portuguese while still creating useful variations of the original text.

Now let's see if there is any improvement when training the best performing BERT model on the Mozambican Portuguese data which was Afro-XLMR:

In [None]:
# %pip install -U transformers datasets peft evaluate plotly sentencepiece --quiet


import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from datasets import load_dataset
# import torch_optimizer as optim
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# from tqdm import tqdm
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

def load_data(language):
    try:
        aug_por_path = "./datasets/afrisenti/por_balanced_augmented"

        if language=='por':
            chosen_dataset=load_from_disk(aug_por_path)
        else:
            raise Exception

        for ds in [chosen_dataset]: # Change to all three later
            for lbl in ["train","validation","test"]:
                if ds[lbl].column_names[0]== "tweet":
                    ds[lbl] = ds[lbl].rename_column("tweet","text")
                else:
                    ds[lbl] = ds[lbl].rename_column("headline","text")

        train_df  = chosen_dataset["train"].to_pandas()
        val_df  = chosen_dataset["validation"].to_pandas()
        test_df  = chosen_dataset["test"].to_pandas()
        logger.info(f"Data loaded successfully: {len(train_df)} training, {len(val_df)} validation, {len(test_df)} test examples")
        return train_df, val_df, test_df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise e

# Function to create DataLoaders
def create_data_loaders(train_df, val_df, test_df, tokenizer, batch_size=16, text_column='text', label_column='label'):
    train_dataset = SentimentDataset(
        texts=train_df[text_column].tolist(),
        labels=train_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    val_dataset = SentimentDataset(
        texts=val_df[text_column].tolist(),
        labels=val_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    test_dataset = SentimentDataset(
        texts=test_df[text_column].tolist(),
        labels=test_df[label_column].tolist(),
        tokenizer=tokenizer
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, device):
    """
    Returns:
        Tuple of (loss, accuracy, precision, recall, f1)
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted'
    )

    return total_loss / len(dataloader), accuracy, precision, recall, f1, all_preds, all_labels

def plot_confusion_matrix(true_labels, predictions, class_names):
    cm = confusion_matrix(true_labels, predictions)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.tight_layout()
    plt.savefig('confusion_matrix.png')
    plt.close()

def evaluate_afro_xlmr_for_lang(language):
    config = {
        'model_name': 'Davlan/afro-xlmr-base',
        'num_labels': 3,
        'batch_size': 16,
        'learning_rate': 2e-5,
        'epochs': 3,
        'warmup_steps': 0,
        'max_grad_norm': 1.0,
        'text_column': 'text',
        'label_column': 'label',
        'class_names': ['negative', 'neutral', 'positive']
    }

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info(f"Using device: {device}")

    logger.info(f"Loading model: {config['model_name']}...")
    tokenizer = XLMRobertaTokenizer.from_pretrained(config['model_name'])
    model = XLMRobertaForSequenceClassification.from_pretrained(
        config['model_name'],
        num_labels=config['num_labels']
    )
    model.to(device)

    # Load data
    logger.info("Loading data...")
    train_df, val_df, test_df = load_data(f'{language}')

    # Create data loaders
    logger.info("Creating data loaders...")
    train_loader, val_loader, test_loader = create_data_loaders(
        train_df, val_df, test_df,
        tokenizer,
        batch_size=config['batch_size'],
        text_column=config['text_column'],
        label_column=config['label_column']
    )


    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])
    total_steps = len(train_loader) * config['epochs']
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=total_steps
    )

    logger.info("Starting training...")
    best_val_f1 = 0
    best_model_state = None

    for epoch in range(config['epochs']):
        logger.info(f"Epoch {epoch + 1}/{config['epochs']}")

        # Train
        start_time = time.time()
        train_loss = train_epoch(model, train_loader,optimizer,scheduler, device) # train_epoch(model, train_loader, scheduler, device)
        train_time = time.time() - start_time

        # Validate
        val_loss, val_accuracy, val_precision, val_recall, val_f1, _, _ = evaluate(model, val_loader, device)

        logger.info(f"Epoch {epoch + 1} results:")
        logger.info(f"Train Loss: {train_loss:.4f}, Time: {train_time:.2f}s")
        logger.info(f"Val Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}")

        # Save best model
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            best_model_state = model.state_dict().copy()
            logger.info(f"New best model with F1: {best_val_f1:.4f}")

    # Load best model for testing
    if best_model_state:
        logger.info("Loading best model for testing...")
        model.load_state_dict(best_model_state)

    # Test evaluation
    logger.info("Evaluating on test set...")
    test_loss, test_accuracy, test_precision, test_recall, test_f1, test_preds, test_labels = evaluate(model, test_loader, device)

    logger.info(f"Test Results:")
    logger.info(f"Loss: {test_loss:.4f}")
    logger.info(f"Accuracy: {test_accuracy:.4f}")
    logger.info(f"Precision: {test_precision:.4f}")
    logger.info(f"Recall: {test_recall:.4f}")
    logger.info(f"F1 Score: {test_f1:.4f}")

    results_df_afro = {
    "test_loss": test_loss,
    "test_accuracy": test_accuracy,
    "test_f1": test_f1,
    "test_precision": test_precision,
    "test_recall": test_recall,
    "epochs": config['epochs'],
    "learning_rate": config['learning_rate'],
    "batch_size": config['batch_size'],
}

    path = f"afroxlmr_results_{language}_augmented.csv"
    pd.DataFrame([results_df_afro]).to_csv(path, index=False)
    print(f"Results saved to {path}")

    # Save model
    logger.info("Saving model...")
    model_save_path = f'./xmlr_sentiment_model_{language}_augmented'
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)
    logger.info(f"Model saved to {model_save_path}")

# evaluate_afro_xlmr()
langs = ['por']

for l in langs:
    evaluate_afro_xlmr_for_lang(l)

## Next, we evaluate the use of Adapters to perform cross-lingual transfer
This section will explore the effect of Adapters on model performance when doing cross-lingual transfer. This evaluation will use the Afroxlmr model, because it had the highest accuracy when analysing the Swahili data.

In [29]:
# !pip install -U transformers datasets peft evaluate plotly sentencepiece adapters adapter-transformers --quiet

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from datasets import load_dataset
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import time
import logging
from adapters import AutoAdapterModel,AdapterConfig
from torch.nn import CrossEntropyLoss

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128, lang=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.lang = lang
        if self.lang=='sot':
            self.label_mapping = {"negative": 0, "neutral": 1, "positive": 2}  # Label conversion

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        if self.lang=='sot':
            label = self.label_mapping[self.labels[idx]]
        else:
            label = self.labels[idx]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def load_data(language):
    try:
        if language=='por':
            chosen_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "por",trust_remote_code=True)
        elif language=='swa':
            chosen_dataset=load_dataset("HausaNLP/AfriSenti-Twitter", "swa",trust_remote_code=True)
        elif language=='sot':
            chosen_dataset=load_dataset("hamza-student-123/nlp-assignment-news-data",'sot')
        else:
            raise Exception

        for ds in [chosen_dataset]: # Change to all three later
            for lbl in ["train","validation","test"]:
                if ds[lbl].column_names[0]== "tweet":
                    ds[lbl] = ds[lbl].rename_column("tweet","text")
                else:
                    ds[lbl] = ds[lbl].rename_column("headline","text")

        train_df  = chosen_dataset["train"].to_pandas()
        val_df  = chosen_dataset["validation"].to_pandas()
        test_df  = chosen_dataset["test"].to_pandas()
        logger.info(f"Data loaded successfully: {len(train_df)} training, {len(val_df)} validation, {len(test_df)} test examples")
        return train_df, val_df, test_df
    except Exception as e:
        logger.error(f"Error loading data: {str(e)}")
        raise e


def create_data_loaders(train_df, val_df, test_df, tokenizer, batch_size=16, text_column='text', label_column='label', lang=None):
    train_dataset = SentimentDataset(
        texts=train_df[text_column].tolist(),
        labels=train_df[label_column].tolist(),
        tokenizer=tokenizer,
        lang=lang
    )

    val_dataset = SentimentDataset(
        texts=val_df[text_column].tolist(),
        labels=val_df[label_column].tolist(),
        tokenizer=tokenizer,
        lang=lang
    )

    test_dataset = SentimentDataset(
        texts=test_df[text_column].tolist(),
        labels=test_df[label_column].tolist(),
        tokenizer=tokenizer,
        lang=lang
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

def train_epoch(model, dataloader, optimizer, scheduler, device):
    model.train()
    epoch_loss = 0

    progress_bar = tqdm(dataloader, desc="Training")
    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        epoch_loss += loss.item()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({"loss": f"{loss.item():.4f}"})

    return epoch_loss / len(dataloader)

def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted'
    )

    return total_loss / len(dataloader), accuracy, precision, recall, f1, all_preds, all_labels

def setup_crosslingual_adapters(config, source_language, target_language):

    logger.info(f"Loading model: {config['model_name']}...")
    tokenizer = XLMRobertaTokenizer.from_pretrained(config['extern_model_name'])
    model = AutoAdapterModel.from_pretrained(config['model_name'], num_labels=config['num_labels'])
    logger.info("Successfully loaded as AutoAdapterModel")

    lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)

    source_adapter_name = f"lang_{source_language}"
    model.add_adapter(source_adapter_name, config=lang_adapter_config)
    logger.info(f"Added source language adapter: {source_adapter_name}")

    target_adapter_name = f"lang_{target_language}"
    model.add_adapter(target_adapter_name, config=lang_adapter_config)
    logger.info(f"Added target language adapter: {target_adapter_name}")

    task_adapter_name = "sentiment_task"
    task_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=16)
    model.add_adapter(task_adapter_name, config=task_adapter_config)

    model.add_classification_head(
        task_adapter_name,
        num_labels=config['num_labels'],
        id2label={i: label for i, label in enumerate(config['class_names'])}
    )

    return model, tokenizer, source_adapter_name, target_adapter_name, task_adapter_name

def setup_madx_adapter(config, source_language, target_language):
    tokenizer = XLMRobertaTokenizer.from_pretrained(config['extern_model_name'])
    model = AutoAdapterModel.from_pretrained(config['model_name'], num_labels=config['num_labels'])

    # I chose to use a MAD-X style configuration (using standard AdapterConfig)
    # reason being, it typically uses smaller reduction factors for language adapters
    lang_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=2)
    task_adapter_config = AdapterConfig.load("pfeiffer", reduction_factor=16)

    model.add_adapter(f"lang_{source_language}", config=lang_adapter_config)
    model.add_adapter(f"lang_{target_language}", config=lang_adapter_config)

    model.add_adapter("sentiment", config=task_adapter_config)
    model.add_classification_head("sentiment", num_labels=config['num_labels'])

    return model, tokenizer

def train_cross_lingual_transfer(model, source_train_loader, target_train_loader,
                               val_loader, config, source_adapter, target_adapter, task_adapter):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    criterion = CrossEntropyLoss()

    logger.info("Phase 1: Training source language + task adapters...")

    model.set_active_adapters([source_adapter, task_adapter])
    model.train_adapter([source_adapter, task_adapter])

    trainable_params = []
    for name, param in model.named_parameters():
        if param.requires_grad:
            trainable_params.append(param)

    optimizer_source = torch.optim.AdamW(trainable_params, lr=config['learning_rate'])

    for epoch in range(config['epochs']):
        model.train()
        total_loss = 0

        progress_bar = tqdm(source_train_loader, desc=f"Source Epoch {epoch+1}/{config['epochs']}")

        for batch_idx, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer_source.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(trainable_params, config['max_grad_norm'])
            optimizer_source.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

        avg_loss = total_loss / len(source_train_loader)
        logger.info(f"Source Epoch {epoch+1}: Average Loss = {avg_loss:.4f}")

        val_accuracy = validate_model(model, val_loader, device)
        logger.info(f"Source Epoch {epoch+1}: Validation Accuracy = {val_accuracy:.4f}")


    logger.info("Phase 2: Training target language adapter...")


    model.set_active_adapters([target_adapter, task_adapter])
    model.train_adapter([target_adapter])

    target_params = []
    for name, param in model.named_parameters():
        if param.requires_grad and target_adapter in name:
            target_params.append(param)


    optimizer_target = torch.optim.AdamW(target_params, lr=config['learning_rate'] * 0.1)


    for epoch in range(config['epochs']):
        model.train()
        total_loss = 0

        progress_bar = tqdm(target_train_loader, desc=f"Target Epoch {epoch+1}/{config['epochs']}")

        for batch_idx, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer_target.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(target_params, config['max_grad_norm'])
            optimizer_target.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

        avg_loss = total_loss / len(target_train_loader)
        logger.info(f"Target Epoch {epoch+1}: Average Loss = {avg_loss:.4f}")

        # Validation on target language
        val_accuracy = validate_model(model, val_loader, device)
        logger.info(f"Target Epoch {epoch+1}: Validation Accuracy = {val_accuracy:.4f}")

    return model

def validate_model(model, val_loader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, predicted = torch.max(outputs.logits.data, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

def load_pretrained_lang_adapters(model, source_language, target_language):
    try:
        model.load_adapter(f"lang/{source_language}", source="hf", load_as=f"lang_{source_language}")
        logger.info(f"Loaded pretrained {source_language} adapter")
    except:
        logger.warning(f"No pretrained adapter found for {source_language}, using random initialization")

    try:
        model.load_adapter(f"lang/{target_language}", source="hf", load_as=f"lang_{target_language}")
        logger.info(f"Loaded pretrained {target_language} adapter")
    except:
        logger.warning(f"No pretrained adapter found for {target_language}, using random initialization")

    return model

def train_afro_xlmr_adapter(source_language, target_language='sot'):
    config = {
        'extern_model_name': f'Davlan/afro-xlmr-base',
        'model_name': f'./xmlr_sentiment_model_{source_language}',
        'num_labels': 3,
        'batch_size': 16,
        'learning_rate': 2e-5,
        'epochs': 3,
        'warmup_steps': 0,
        'max_grad_norm': 1.0,
        'text_column': 'text',
        'label_column': 'label',
        'class_names': ['negative', 'neutral', 'positive']
    }

    model, tokenizer, src_adapter, tgt_adapter, task_adapter = setup_crosslingual_adapters(
    config, source_language, target_language)

    logger.info("Loading source data...")
    train_df, val_df, test_df = load_data(f'{source_language}')

    logger.info("Creating data loaders...")
    source_train_loader, source_val_loader, source_test_loader = create_data_loaders(
        train_df, val_df, test_df,
        tokenizer,
        batch_size=config['batch_size'],
        text_column=config['text_column'],
        label_column=config['label_column'],
        lang=source_language
    )

    logger.info("Loading target data...")
    train_df, val_df, test_df = load_data(f'{target_language}')

    logger.info("Creating data loaders...")
    target_train_loader, val_loader, test_loader = create_data_loaders(
        train_df, val_df, test_df,
        tokenizer,
        batch_size=config['batch_size'],
        text_column=config['text_column'],
        label_column=config['label_column'],
        lang=target_language
    )


    print("Starting sequential cross-lingual training...")
    model = train_cross_lingual_transfer(
        model, source_train_loader, target_train_loader, val_loader,
        config, src_adapter, tgt_adapter, task_adapter
    )

    try:
        logger.info("Attempting to save model and tokenizer")
        model.save_pretrained(f"./cross_lingual_model_{source_language}_{target_language}")
        tokenizer.save_pretrained(f"./cross_lingual_model_{source_language}_{target_language}")
    except:
        logger.info("Failed to save model and tokenizer")

    # Test evaluation: Showing results after training and adapters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Evaluating on test set...")
    test_loss, test_accuracy, test_precision, test_recall, test_f1, test_preds, test_labels = evaluate(model, test_loader, device)

    logger.info(f"Test Results:")
    logger.info(f"Loss: {test_loss:.4f}")
    logger.info(f"Accuracy: {test_accuracy:.4f}")
    logger.info(f"Precision: {test_precision:.4f}")
    logger.info(f"Recall: {test_recall:.4f}")
    logger.info(f"F1 Score: {test_f1:.4f}")

    results_df_afro = {
    "test_loss": test_loss,
    "test_accuracy": test_accuracy,
    "test_f1": test_f1,
    "test_precision": test_precision,
    "test_recall": test_recall,
    "epochs": config['epochs'],
    "learning_rate": config['learning_rate'],
    "batch_size": config['batch_size'],
}

    path = f"afroxlmr_results_{target_language}_adapters.csv"
    pd.DataFrame([results_df_afro]).to_csv(path, index=False)
    print(f"Results saved to {path}")

    # Saving the adapters
    model.save_adapter(f"./adapters/{src_adapter}", src_adapter)
    model.save_adapter(f"./adapters/{tgt_adapter}", tgt_adapter)
    model.save_adapter(f"./adapters/{task_adapter}", task_adapter)


train_afro_xlmr_adapter('swa','sot')


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
2025-05-24 11:24:58,280 - INFO - Loading model: ./xmlr_sentiment_model_swa...
2025-05-24 11:24:59,952 - INFO - Adding head 'default' with config {'head_type': 'classification', 'num_labels': 3, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}, 'use_pooler': False, 'bias': True, 'dropout_prob': None}.
Some weights of XLMRobertaAdapterModel were not initialized from the model checkpoint at ./xmlr_sentiment_model_swa and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
2025-05-24 11:25:00,018 - INFO - Successfully loaded as AutoAdapterModel
2025-05-24 11:25:00,019 - INFO - Adding adapter 'lang_swa'.
2025-05-24 11:25:00,084 - INFO - Added source language adapter: lang_swa


Starting sequential cross-lingual training...


Source Epoch 1/3: 100%|██████████| 114/114 [06:17<00:00,  3.31s/it, loss=0.468]
2025-05-24 11:31:24,699 - INFO - Source Epoch 1: Average Loss = 0.7073
2025-05-24 11:31:50,179 - INFO - Source Epoch 1: Validation Accuracy = 0.0172
Source Epoch 2/3: 100%|██████████| 114/114 [06:24<00:00,  3.38s/it, loss=0.235]
2025-05-24 11:38:15,134 - INFO - Source Epoch 2: Average Loss = 0.6068
2025-05-24 11:38:40,638 - INFO - Source Epoch 2: Validation Accuracy = 0.0172
Source Epoch 3/3: 100%|██████████| 114/114 [06:26<00:00,  3.39s/it, loss=0.882]
2025-05-24 11:45:07,466 - INFO - Source Epoch 3: Average Loss = 0.5808
2025-05-24 11:45:33,192 - INFO - Source Epoch 3: Validation Accuracy = 0.0172
2025-05-24 11:45:33,194 - INFO - Phase 2: Training target language adapter...
Target Epoch 1/3: 100%|██████████| 109/109 [06:00<00:00,  3.31s/it, loss=1.5]  
2025-05-24 11:51:33,690 - INFO - Target Epoch 1: Average Loss = 1.3688
2025-05-24 11:51:58,532 - INFO - Target Epoch 1: Validation Accuracy = 0.7106
Target

Results saved to afroxlmr_results_sot_adapters.csv


FileNotFoundError: [WinError 3] The system cannot find the path specified: './adapters/lang_swa'

In [27]:
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )

            loss = outputs.loss
            total_loss += loss.item()

            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()

            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average='weighted'
    )

    return total_loss / len(dataloader), accuracy, precision, recall, f1, all_preds, all_labels


def evaluate_afro_xlmr_adapter(source_language,target_language):
    config = {
        'extern_model_name': f'Davlan/afro-xlmr-base',
        'model_name': f'./xmlr_sentiment_model_{source_language}',
        'adapter_model_name': f'./cross_lingual_model_{source_language}_{target_language}',
        'num_labels': 3,
        'batch_size': 16,
        'learning_rate': 2e-5,
        'epochs': 3,
        'warmup_steps': 0,
        'max_grad_norm': 1.0,
        'text_column': 'text',
        'label_column': 'label',
        'class_names': ['negative', 'neutral', 'positive']
    }

    tokenizer = XLMRobertaTokenizer.from_pretrained(config['extern_model_name'])

    try:
        model = AutoAdapterModel.from_pretrained(config['adapter_model_name'], num_labels=config['num_labels'])
        logger.info("Successfully loaded as AutoAdapterModel")
    except:
        exit(9)

    logger.info("Loading target data...")
    train_df, val_df, test_df = load_data(f'{target_language}')

    logger.info("Creating data loaders...")
    target_train_loader, val_loader, test_loader = create_data_loaders(
        train_df, val_df, test_df,
        tokenizer,
        batch_size=config['batch_size'],
        text_column=config['text_column'],
        label_column=config['label_column'],
        lang=target_language
    )
    # Test evaluation: Showing results after training and adapters
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger.info("Evaluating on test set...")
    test_loss, test_accuracy, test_precision, test_recall, test_f1, test_preds, test_labels = evaluate(model, test_loader, device)

    logger.info(f"Test Results:")
    logger.info(f"Loss: {test_loss:.4f}")
    logger.info(f"Accuracy: {test_accuracy:.4f}")
    logger.info(f"Precision: {test_precision:.4f}")
    logger.info(f"Recall: {test_recall:.4f}")
    logger.info(f"F1 Score: {test_f1:.4f}")

    results_df_afro = {
    "test_loss": test_loss,
    "test_accuracy": test_accuracy,
    "test_f1": test_f1,
    "test_precision": test_precision,
    "test_recall": test_recall,
    "epochs": config['epochs'],
    "learning_rate": config['learning_rate'],
    "batch_size": config['batch_size'],
}

    path = f"afroxlmr_results_{target_language}_adapters.csv"
    pd.DataFrame([results_df_afro]).to_csv(path, index=False)
    print(f"Results saved to {path}")

    # Saving the individual adapters
    # model.save_adapter(f"./adapters/{src_adapter}", src_adapter)
    # model.save_adapter(f"./adapters/{tgt_adapter}", tgt_adapter)
    # model.save_adapter(f"./adapters/{task_adapter}", task_adapter)

evaluate_afro_xlmr_adapter('swa','sot')

2025-05-24 10:16:50,951 - INFO - Adding head 'default' with config {'head_type': 'classification', 'num_labels': 3, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'LABEL_0': 0, 'LABEL_1': 1, 'LABEL_2': 2}, 'use_pooler': False, 'bias': True, 'dropout_prob': None}.
2025-05-24 10:16:50,953 - INFO - Adding head 'sentiment_task' with config {'head_type': 'classification', 'num_labels': 3, 'layers': 2, 'activation_function': 'tanh', 'label2id': {'negative': 0, 'neutral': 1, 'positive': 2}, 'use_pooler': False, 'bias': True, 'dropout_prob': None}.
2025-05-24 10:16:51,088 - INFO - Successfully loaded as AutoAdapterModel
2025-05-24 10:16:51,088 - INFO - Loading target data...
2025-05-24 10:16:55,320 - INFO - Data loaded successfully: 1740 training, 349 validation, 349 test examples
2025-05-24 10:16:55,321 - INFO - Creating data loaders...
2025-05-24 10:16:55,322 - INFO - Evaluating on test set...
Evaluating: 100%|██████████| 22/22 [00:21<00:00,  1.02it/s]
  _warn_prf(average, modifier

Results saved to afroxlmr_results_sot_adapters.csv
