In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd
import shap
import numpy as np

# ==========================================
# 1️⃣ Load and Prepare Data
# ==========================================
# Raw corpus exactly as stored on disk; kept untouched so later cells can still inspect it.
df = pd.read_csv("cleaned_corpus_gpt_SUBTOKENIZED.csv")

# Drop rows with a missing text or label *before* casting to str:
# ``astype(str)`` turns NaN into the literal string "nan", which would otherwise
# be trained on as a real document and registered as an extra class in
# ``label2id`` (inflating ``num_labels`` for both classifiers).
labelled_df = df.dropna(subset=["text_hybrid", "label"])

# Use multilingual text and label columns
texts = labelled_df["text_hybrid"].astype(str).tolist()
labels = labelled_df["label"].astype(str).tolist()

# Encode labels to integers.
# Sorting makes the label -> id mapping deterministic across runs.
unique_labels = sorted(set(labels))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}
numeric_labels = [label2id[l] for l in labels]

# Train-test split: 80% train / 20% test, fixed seed for reproducibility.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, numeric_labels, test_size=0.2, random_state=42
)

In [6]:
# 2️⃣ Dataset and DataLoader
# ==========================================
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes a single text on every lookup.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer class ids, aligned with ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=..., padding=..., max_length=..., return_tensors=...)``
            whose result can be indexed with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sequence is truncated / padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` / ``labels`` tensors for example ``idx``."""
        raw_text, class_id = self.texts[idx], self.labels[idx]

        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # flatten() collapses each tokenizer tensor to 1-D so the DataLoader
        # can stack the examples of a batch along a new leading axis.
        sample = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        sample['labels'] = torch.tensor(class_id, dtype=torch.long)
        return sample


In [7]:
# 3️⃣ Model Setup
# ==========================================
# Prefer the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device set to use {device}")


def _load_tokenizer_and_classifier(checkpoint):
    """Load the tokenizer and a sequence classifier for ``checkpoint``.

    The classifier is configured with one output per entry of ``unique_labels``
    and the notebook's ``id2label`` / ``label2id`` mappings, then moved to
    ``device``.

    Returns:
        ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    classifier = AutoModelForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=len(unique_labels),
        id2label=id2label,
        label2id=label2id,
    )
    return tokenizer, classifier.to(device)


# AfroXLMR
xlmr_name = "Davlan/afro-xlmr-base"
tokenizer_xlmr, model_xlmr = _load_tokenizer_and_classifier(xlmr_name)

# AfriBERTa
afri_name = "castorini/afriberta_large"
tokenizer_afri, model_afri = _load_tokenizer_and_classifier(afri_name)


Device set to use cpu


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at castorini/afriberta_large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 4️⃣ Training Function
# ==========================================
def train_model(model, tokenizer, train_texts, train_labels, epochs=1, batch_size=8, lr=1e-5):
    """Fine-tune ``model`` on the given texts with AdamW and return it.

    Args:
        model: Classifier called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes the training loss as ``.loss``.
        tokenizer: Tokenizer handed to ``SentimentDataset`` to encode each text.
        train_texts: Training strings.
        train_labels: Integer class ids aligned with ``train_texts``.
        epochs: Number of full passes over the training data.
        batch_size: Examples per optimisation step.
        lr: AdamW learning rate.

    Returns:
        The same ``model`` object, with its weights updated in place and its
        train/eval mode restored to what it was on entry.

    Raises:
        ValueError: If ``train_texts`` is empty.
    """
    # Fail early with a clear message instead of an opaque sampler error or a
    # division by zero when averaging the loss over zero batches.
    if len(train_texts) == 0:
        raise ValueError("train_texts is empty; nothing to train on")

    dataset = SentimentDataset(train_texts, train_labels, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Send batches to wherever the model actually lives rather than to a
    # notebook-level global, so a model that was never moved to the global
    # device (e.g. one reloaded from disk) still trains without a device mismatch.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    optimizer = AdamW(model.parameters(), lr=lr)

    # Remember the caller's mode: leaving the model in training mode would keep
    # dropout active during later inference (evaluation, SHAP explanations).
    was_training = model.training
    model.train()
    try:
        for epoch in range(epochs):
            total_loss = 0.0
            for batch in loader:
                optimizer.zero_grad()
                input_ids = batch['input_ids'].to(model_device)
                attention_mask = batch['attention_mask'].to(model_device)
                labels = batch['labels'].to(model_device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
                total_loss += loss.item()
                loss.backward()
                optimizer.step()

            # Mean per-batch loss for the epoch.
            print(f"Epoch {epoch+1} | Loss: {total_loss / len(loader):.4f}")
    finally:
        model.train(was_training)

    return model

In [9]:
# 5️⃣ Train Both Models
# ==========================================
# Fine-tune each classifier for a single epoch on the shared training split;
# batch size and learning rate are left at train_model's defaults.
print("Training AfroXLMR...")
model_xlmr = train_model(
    model_xlmr,
    tokenizer_xlmr,
    train_texts=train_texts,
    train_labels=train_labels,
    epochs=1,
)

print("Training AfriBERTa...")
model_afri = train_model(
    model_afri,
    tokenizer_afri,
    train_texts=train_texts,
    train_labels=train_labels,
    epochs=1,
)

Training AfroXLMR...
Epoch 1 | Loss: 0.9996
Training AfriBERTa...
Epoch 1 | Loss: 1.0046


In [3]:
# 6️⃣ Explainable AI (XAI) - SHAP Explanations
# ==========================================
# NOTE: this cell needs `test_texts`, `model_xlmr`/`tokenizer_xlmr` and
# `model_afri`/`tokenizer_afri` to exist already; on a fresh kernel run the data
# cell and either the training cells or the "load trained models" cell first,
# otherwise it fails with a NameError.
def _make_logit_fn(model, tokenizer):
    """Wrap a classifier as the ``f(texts) -> ndarray`` callable handed to SHAP.

    Args:
        model: Sequence classifier whose output exposes ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A function mapping a batch of strings to a NumPy array of logits with
        one row per input text.
    """
    def predict_logits(texts):
        # SHAP may hand over the masked texts as a NumPy array of strings; the
        # tokenizer expects a plain list of Python str.
        if isinstance(texts, str):
            texts = [texts]
        text_batch = [str(t) for t in texts]

        model.eval()  # disable dropout so repeated calls give the same logits
        model_device = next(model.parameters()).device

        encoded = tokenizer(text_batch, return_tensors="pt", truncation=True, padding=True)
        # Inputs must live on the same device as the model weights.
        encoded = {name: tensor.to(model_device) for name, tensor in encoded.items()}

        with torch.no_grad():  # inference only: skip building the autograd graph
            logits = model(**encoded).logits
        # .cpu() is required before .numpy() when the model runs on a GPU.
        return logits.cpu().numpy()

    return predict_logits


print("\nExplaining AfroXLMR...")
sample_texts = test_texts[:5]
explainer_xlmr = shap.Explainer(_make_logit_fn(model_xlmr, tokenizer_xlmr), tokenizer_xlmr)
shap_values_xlmr = explainer_xlmr(sample_texts)
shap.plots.text(shap_values_xlmr[0])

print("\nExplaining AfriBERTa...")
explainer_afri = shap.Explainer(_make_logit_fn(model_afri, tokenizer_afri), tokenizer_afri)
shap_values_afri = explainer_afri(sample_texts)
shap.plots.text(shap_values_afri[0])



Explaining AfroXLMR...


NameError: name 'tokenizer_xlmr' is not defined

In [11]:
# Save the trained models and their tokenizers.
# Each artifact is written to its own directory; the reload cell reads the same paths.
XLMR_MODEL_DIR = "trained_afroxlmr_model"
XLMR_TOKENIZER_DIR = "trained_afroxlmr_tokenizer"
AFRI_MODEL_DIR = "trained_afriberta_model"
AFRI_TOKENIZER_DIR = "trained_afriberta_tokenizer"

model_xlmr.save_pretrained(XLMR_MODEL_DIR)
tokenizer_xlmr.save_pretrained(XLMR_TOKENIZER_DIR)

model_afri.save_pretrained(AFRI_MODEL_DIR)
tokenizer_afri.save_pretrained(AFRI_TOKENIZER_DIR)

Non-default generation parameters: {'max_length': 512}


('trained_afriberta_tokenizer\\tokenizer_config.json',
 'trained_afriberta_tokenizer\\special_tokens_map.json',
 'trained_afriberta_tokenizer\\sentencepiece.bpe.model',
 'trained_afriberta_tokenizer\\added_tokens.json',
 'trained_afriberta_tokenizer\\tokenizer.json')

In [None]:
# Reload the trained models and tokenizers from the directories written by the
# save cell. The import is repeated here so this cell also runs on a fresh kernel.
from transformers import AutoModelForSequenceClassification, AutoTokenizer

XLMR_MODEL_DIR = "trained_afroxlmr_model"
XLMR_TOKENIZER_DIR = "trained_afroxlmr_tokenizer"
AFRI_MODEL_DIR = "trained_afriberta_model"
AFRI_TOKENIZER_DIR = "trained_afriberta_tokenizer"

tokenizer_xlmr = AutoTokenizer.from_pretrained(XLMR_TOKENIZER_DIR)
model_xlmr = AutoModelForSequenceClassification.from_pretrained(XLMR_MODEL_DIR)

tokenizer_afri = AutoTokenizer.from_pretrained(AFRI_TOKENIZER_DIR)
model_afri = AutoModelForSequenceClassification.from_pretrained(AFRI_MODEL_DIR)