In [None]:
# Install the shap package (imported in the next cell) with the %pip line magic.
%pip install shap --user

In [None]:
import numpy as np
import pandas as pd
import shap
import torch
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline

# Load the dataset
file_path = r"expanded_lexicon_v4_working.xlsx"  # Replace with your file path
df = pd.read_excel(file_path)

languages = ['Sepedi', 'Xhosa', 'Shona', 'Afrikaans', 'Zulu']

# Flatten the per-language columns into a single corpus: each language's entry
# in a row is paired with that row's 'Sentiment' value.
all_texts = []
all_labels = []

for lang in languages:
    # Drop rows with a missing translation or label BEFORE casting to str;
    # otherwise astype(str) turns NaN into the literal text "nan", which would
    # be tokenized and trained on as if it were a real word.
    valid_rows = df[[lang, 'Sentiment']].dropna()
    all_texts.extend(valid_rows[lang].astype(str).tolist())
    all_labels.extend(valid_rows['Sentiment'].tolist())

# Encode the sentiment labels as integer class ids; `le` is kept so the ids
# can be mapped back to the original label names later.
le = LabelEncoder()
all_labels_encoded = le.fit_transform(all_labels)
# Check the first few examples
print(all_texts[:5])
print(all_labels[:5])

In [None]:
# Hold out 20% of the examples as a test set, with a fixed seed and
# stratification on the encoded labels.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels_encoded,
    test_size=0.2,
    random_state=42,
    stratify=all_labels_encoded,
)

In [None]:
class SentimentDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable collection of texts (each item is cast to ``str``).
        labels: Indexable collection of labels aligned with ``texts``.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors.
        max_len: Length every encoded text is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=64):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        # One example per text.
        return len(self.texts)

    def __getitem__(self, idx):
        raw_text = str(self.texts[idx])
        target = self.labels[idx]
        tokenized = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        # Collapse each returned tensor to 1-D so the DataLoader can stack
        # the examples into a batch.
        sample = {
            field: tokenized[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample


In [None]:
# 5️⃣ Load Models and Tokenizers
# ==============================
# AfroXLMR: slow (non-fast) tokenizer plus a classification head with one
# output per encoded sentiment class
xlmr_name = "Davlan/afro-xlmr-base"
tokenizer_xlmr = AutoTokenizer.from_pretrained(
    xlmr_name,
    use_fast=False,
)
model_xlmr = AutoModelForSequenceClassification.from_pretrained(
    xlmr_name,
    num_labels=len(le.classes_),
)

# AfriBERTa: loaded the same way as AfroXLMR above
afri_name = "castorini/afriberta_base"
tokenizer_afri = AutoTokenizer.from_pretrained(
    afri_name,
    use_fast=False,
)
model_afri = AutoModelForSequenceClassification.from_pretrained(
    afri_name,
    num_labels=len(le.classes_),
)


In [None]:
def train_model(model, tokenizer, train_texts, train_labels, epochs=1, batch_size=2, lr=2e-5):
    """Fine-tune a sequence-classification model and return it ready for inference.

    Args:
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        tokenizer: Tokenizer passed to ``SentimentDataset`` to encode the texts.
        train_texts: Training texts.
        train_labels: Labels aligned with ``train_texts``.
        epochs: Number of passes over the training data.
        batch_size: Number of examples per optimisation step.
        lr: Learning rate for AdamW.

    Returns:
        The same model object, moved to the GPU when one is available
        (otherwise the CPU) and switched to eval mode.
    """
    dataset = SentimentDataset(train_texts, train_labels, tokenizer)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for batch in loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            # Passing labels makes the model compute the loss itself.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
        print(f"Epoch {epoch+1} | Avg Loss: {total_loss / max(len(loader), 1):.4f}")

    # Leave train mode before handing the model back: otherwise dropout stays
    # active and every later prediction made with this model is noisy and
    # non-deterministic.
    model.eval()
    return model


In [None]:
# Train Models
# ==============================
# Fine-tune each model for a single epoch on the same training split.
print("Training AfroXLMR...")
model_xlmr = train_model(
    model=model_xlmr,
    tokenizer=tokenizer_xlmr,
    train_texts=train_texts,
    train_labels=train_labels,
    epochs=1,
)

print("Training AfriBERTa...")
model_afri = train_model(
    model=model_afri,
    tokenizer=tokenizer_afri,
    train_texts=train_texts,
    train_labels=train_labels,
    epochs=1,
)


In [None]:
# SHAP Explainable AI
# ==============================
# Force eval mode before inference: fine-tuning switches the models to train
# mode, and a model handed to pipeline() as an object keeps whatever mode it is
# in. With dropout still active the scores (and so the SHAP values) would be
# noisy and differ from run to run.
model_xlmr.eval()
model_afri.eval()

# Create pipelines (top_k=None returns a score for every class, not just the best one)
pipe_xlmr = pipeline("text-classification", model=model_xlmr, tokenizer=tokenizer_xlmr, top_k=None)
pipe_afri = pipeline("text-classification", model=model_afri, tokenizer=tokenizer_afri, top_k=None)

# Small sample for SHAP
sample_texts = test_texts[:3]

explainer_xlmr = shap.Explainer(pipe_xlmr)
explainer_afri = shap.Explainer(pipe_afri)

# Explain the sample with each model and plot the first example.
print("Explaining AfroXLMR predictions...")
shap_values_xlmr = explainer_xlmr(sample_texts)
shap.plots.text(shap_values_xlmr[0])

print("Explaining AfriBERTa predictions...")
shap_values_afri = explainer_afri(sample_texts)
shap.plots.text(shap_values_afri[0])

In [None]:
# Ensemble Learning
# ==============================
def get_probs(pipe, texts):
    """Return per-class scores for each text, with columns ordered by class id.

    With ``top_k=None`` a text-classification pipeline returns each text's
    scores sorted from highest to lowest score, not by class. Reading them in
    that order would put the winning class in column 0 for every text, so the
    columns of two models would not line up and ``argmax`` would always be 0.
    Each score is therefore written into the column given by the model's
    ``label2id`` mapping.

    Args:
        pipe: Text-classification pipeline; must expose
            ``pipe.model.config.label2id``.
        texts: List of input texts.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), n_classes)`` where column ``i``
        holds the score of class id ``i``.
    """
    label2id = pipe.model.config.label2id
    preds = pipe(texts, top_k=None)
    probs = np.zeros((len(preds), len(label2id)), dtype=float)
    for row, scores in enumerate(preds):
        for entry in scores:
            probs[row, int(label2id[entry['label']])] = entry['score']
    return probs

# Score the full test split with each fine-tuned model.
print("Getting predictions from AfroXLMR...")
probs_xlmr = get_probs(pipe_xlmr, test_texts)

print("Getting predictions from AfriBERTa...")
probs_afri = get_probs(pipe_afri, test_texts)

# Soft voting: average the two score matrices with equal weight, then take
# the highest-scoring column for each text.
ensemble_probs = (probs_xlmr + probs_afri) / 2
ensemble_preds = np.argmax(ensemble_probs, axis=1)

# Compare the ensemble predictions with the held-out labels.
acc = accuracy_score(test_labels, ensemble_preds)
print(f"\nEnsemble Accuracy (multilingual): {acc:.4f}")
print("\nClassification Report:")
report = classification_report(
    test_labels,
    ensemble_preds,
    target_names=le.classes_,
)
print(report)