In [11]:
import numpy as np
import pandas as pd
import shap
import torch
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# --- Configuration: everything a reader might want to tune ------------------
LEXICON_PATH = r"C:\Users\HP\OneDrive\Desktop\expanded_lexicon_v4.xlsx"
TEXT_COLUMN = "Zulu"          # target language column of the lexicon
LABEL_COLUMN = "Sentiment"
# NOTE(review): 'neuitre' looks like a typo for 'neutre' -- confirm against the
# spreadsheet. Values missing from this map raise a ValueError further down
# instead of silently turning into NaN labels.
LABEL_MAP = {'negatif': 0, 'neuitre': 1, 'positif': 2}
MAX_LENGTH = 128              # explicit cap so `truncation=True` really truncates
TEST_SIZE = 0.2
SEED = 42

# --- Models ------------------------------------------------------------------
# AfroXLMR
xlmr_model_name = "Davlan/afro-xlmr-base"
tokenizer_xlmr = AutoTokenizer.from_pretrained(xlmr_model_name, use_fast=False)
model_xlmr = AutoModelForSequenceClassification.from_pretrained(xlmr_model_name, num_labels=len(LABEL_MAP))
model_xlmr.eval()

# AfriBERTa
afri_model_name = "castorini/afriberta_base"
tokenizer_afri = AutoTokenizer.from_pretrained(afri_model_name, use_fast=False)
model_afri = AutoModelForSequenceClassification.from_pretrained(afri_model_name, num_labels=len(LABEL_MAP))
model_afri.eval()

# --- Data --------------------------------------------------------------------
lexicon = pd.read_excel(LEXICON_PATH)

# Rows without a text or a label cannot be used; without this filter
# `.astype(str)` would turn missing texts into the literal string "nan".
lexicon_labeled = lexicon.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])

texts = lexicon_labeled[TEXT_COLUMN].astype(str)
labels = lexicon_labeled[LABEL_COLUMN].map(LABEL_MAP)

unmapped_labels = lexicon_labeled.loc[labels.isna(), LABEL_COLUMN].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unmapped {LABEL_COLUMN} values {sorted(map(str, unmapped_labels))}; "
        "extend LABEL_MAP or fix the spreadsheet."
    )
labels = labels.astype(int)

# Split into train and test sets (plain lists: tokenizers and torch.tensor
# both accept them, and no pandas index is carried along).
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts.tolist(),
    labels.tolist(),
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=labels.tolist(),   # keep the class balance identical in both splits
)
assert len(train_texts) + len(test_texts) == len(texts)


class EncodedTextDataset(Dataset):
    """Map-style dataset whose items are dicts of model inputs plus ``labels``.

    ``Trainer``'s default collator needs mapping-like items; the tuples that a
    ``TensorDataset`` yields cannot be collated by it.
    """

    def __init__(self, encodings, labels):
        # Only the two tensors the models consume are kept.
        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.input_ids[idx],
            "attention_mask": self.attention_mask[idx],
            "labels": self.labels[idx],
        }


def encode_texts(tokenizer, batch_texts):
    """Tokenize ``batch_texts`` into padded PyTorch tensors capped at MAX_LENGTH."""
    return tokenizer(
        batch_texts,
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )


train_enc_xlmr = encode_texts(tokenizer_xlmr, train_texts)
test_enc_xlmr  = encode_texts(tokenizer_xlmr, test_texts)

train_enc_afri = encode_texts(tokenizer_afri, train_texts)
test_enc_afri  = encode_texts(tokenizer_afri, test_texts)

train_dataset_xlmr = EncodedTextDataset(train_enc_xlmr, train_labels)
eval_dataset_xlmr  = EncodedTextDataset(test_enc_xlmr, test_labels)

train_dataset_afri = EncodedTextDataset(train_enc_afri, train_labels)
eval_dataset_afri  = EncodedTextDataset(test_enc_afri, test_labels)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at Davlan/afro-xlmr-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/446M [00:00<?, ?B/s]

In [6]:

# 4️⃣ Define metrics
# ========================
def compute_metrics(pred):
    """Compute accuracy and support-weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the argmax over axis 1 is taken
            as the predicted class).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    # Uses the scorers this notebook imports. `zero_division=0` yields 0 for a
    # class that is never predicted, without emitting a warning per evaluation.
    weighted = {"average": "weighted", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, **weighted),
        "precision": precision_score(labels, preds, **weighted),
        "recall": recall_score(labels, preds, **weighted),
    }

# ========================
# 5️⃣ TrainingArguments (low-memory friendly)
# ========================
def make_training_args(run_tag):
    """Build the low-memory ``TrainingArguments`` shared by both models.

    Args:
        run_tag: Short suffix (e.g. ``"xlmr"``) used for the output and log
            directories, ``./results_<run_tag>`` and ``./logs_<run_tag>``.

    Returns:
        A ``TrainingArguments`` instance.
    """
    return TrainingArguments(
        output_dir=f"./results_{run_tag}",
        num_train_epochs=3,
        per_device_train_batch_size=2,   # tiny batches keep memory usage low
        per_device_eval_batch_size=2,
        logging_dir=f"./logs_{run_tag}",
        logging_steps=5,
        save_strategy="epoch",
    )


training_args_xlmr = make_training_args("xlmr")
training_args_afri = make_training_args("afri")

# ========================
# 6️⃣ Create Trainers
# ========================
trainer_xlmr = Trainer(
    model=model_xlmr,
    args=training_args_xlmr,
    train_dataset=train_dataset_xlmr,
    eval_dataset=eval_dataset_xlmr,
    compute_metrics=compute_metrics
)

trainer_afri = Trainer(
    model=model_afri,
    args=training_args_afri,
    train_dataset=train_dataset_afri,
    eval_dataset=eval_dataset_afri,
    compute_metrics=compute_metrics
)

# ========================
# 7️⃣ Train models
# ========================
trainer_xlmr.train()
trainer_afri.train()

# No evaluation strategy is configured, so `compute_metrics` only runs when
# evaluation is requested explicitly -- do that once per model after training.
eval_metrics_xlmr = trainer_xlmr.evaluate()
eval_metrics_afri = trainer_afri.evaluate()

# Side-by-side table of held-out metrics (rich display as the cell output).
pd.DataFrame({"AfroXLMR": eval_metrics_xlmr, "AfriBERTa": eval_metrics_afri})


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
#XAI

# AfroXLMR
def _predict_proba(texts, tokenizer, model, batch_size=16, max_length=128):
    """Return softmax class probabilities for ``texts``.

    Args:
        texts: A single string or any iterable of text-like items (list,
            NumPy array, pandas Series, ...). Items are converted with ``str``.
        tokenizer: Tokenizer matching ``model``.
        model: Sequence-classification model exposing ``.logits`` on its output.
        batch_size: Number of texts per forward pass; bounds peak memory.
        max_length: Token cap applied when truncating; should match the length
            used at training time.

    Returns:
        ``np.ndarray`` of shape ``(len(texts), n_classes)``.
    """
    if isinstance(texts, str):
        texts = [texts]
    # Tokenizers only accept str / list[str], not NumPy arrays or Series.
    texts = [str(text) for text in texts]
    if not texts:
        return np.empty((0, model.config.num_labels), dtype=np.float32)

    # Training may have moved the model to another device and left it in train
    # mode; inference needs dropout off and inputs on the model's device.
    device = next(model.parameters()).device
    model.eval()

    batch_probs = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            enc = tokenizer(
                texts[start:start + batch_size],
                truncation=True,
                padding=True,
                max_length=max_length,
                return_tensors="pt",
            )
            enc = {name: tensor.to(device) for name, tensor in enc.items()}
            logits = model(**enc).logits
            # `.cpu()` first: `.numpy()` fails on tensors that live on a GPU.
            batch_probs.append(torch.nn.functional.softmax(logits, dim=-1).cpu().numpy())
    return np.concatenate(batch_probs, axis=0)


def predict_xlmr(texts):
    """Class probabilities from the AfroXLMR model for ``texts``."""
    return _predict_proba(texts, tokenizer_xlmr, model_xlmr)


# AfriBERTa
def predict_afri(texts):
    """Class probabilities from the AfriBERTa model for ``texts``."""
    return _predict_proba(texts, tokenizer_afri, model_afri)


# Small sample of training texts kept as SHAP background data. This statement
# used to sit after `return` inside `predict_afri`, where it could never run.
background_texts = train_texts[:5]

    # Create explainers using partition algorithm (lightweight)
# Output names in class-id order (0, 1, 2) so the SHAP plots are labelled.
SENTIMENT_CLASS_NAMES = ["negative", "neutral", "positive"]


def make_text_explainer(predict_fn, tokenizer):
    """Build a partition-based SHAP explainer for a text classifier.

    Args:
        predict_fn: Callable mapping a list of strings to an array of class
            probabilities, one row per text.
        tokenizer: Tokenizer used by the text masker to split each input into
            the tokens that get masked.

    Returns:
        A ``shap.Explainer`` that can be called on a list of strings.
    """
    def predict_masked(masked_texts):
        # SHAP hands over a NumPy array of strings; tokenizers want list[str].
        return predict_fn([str(text) for text in masked_texts])

    # Text inputs need a text masker; a list of raw strings is not a valid
    # masker for shap.Explainer.
    return shap.Explainer(
        predict_masked,
        shap.maskers.Text(tokenizer),
        algorithm="partition",
        output_names=SENTIMENT_CLASS_NAMES,
    )


explainer_xlmr = make_text_explainer(predict_xlmr, tokenizer_xlmr)
explainer_afri = make_text_explainer(predict_afri, tokenizer_afri)

sample_texts = list(test_texts)  # or select a few examples if memory is tight

shap_values_xlmr = explainer_xlmr(sample_texts)
shap_values_afri  = explainer_afri(sample_texts)

for i, text in enumerate(sample_texts):
    print(f"Text: {text}\n--- AfroXLMR ---")
    shap.plots.text(shap_values_xlmr[i])
    print(f"--- AfriBERTa ---")
    shap.plots.text(shap_values_afri[i])

In [None]:
# Ensemble: soft voting over the two fine-tuned models

# Class probabilities per model, AfroXLMR first, then AfriBERTa
probs_xlmr, probs_afri = (
    predict(test_texts) for predict in (predict_xlmr, predict_afri)
)

# Soft vote = element-wise mean of both probability matrices;
# the predicted class is the column with the highest averaged probability.
ensemble_probs = (probs_xlmr + probs_afri) / 2
ensemble_preds = ensemble_probs.argmax(axis=1)

# Score the ensemble against the held-out labels
acc = accuracy_score(y_true=test_labels, y_pred=ensemble_preds)
f1 = f1_score(y_true=test_labels, y_pred=ensemble_preds, average="weighted")

print("Ensemble Predictions:", ensemble_preds)
print(f"Ensemble Accuracy: {acc:.3f}, F1-score: {f1:.3f}")