In [2]:
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer,
    pipeline
)
from datasets import Dataset, DatasetDict, concatenate_datasets
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)


  from .autonotebook import tqdm as notebook_tqdm





In [5]:
# -----------------------
# 1. Load and Prepare Dataset
# -----------------------

# Load the ShonaSenti corpus (from Part B)
shona_senti_path = "data/shona_senti.csv"
df_shona = pd.read_csv(shona_senti_path)

print(f"Dataset shape: {df_shona.shape}")
print(f"Columns: {df_shona.columns.tolist()}")
print("Label distribution:")
print(df_shona['label'].value_counts().sort_index())

# Check label format and convert if necessary
if df_shona['label'].dtype == 'object':
    # Convert string labels to numerical
    label_mapping = {'negative': 0, 'neutral': 1, 'positive': 2}
    df_shona['label'] = df_shona['label'].map(label_mapping)
    print("Converted string labels to numerical")

# Split into train and validation sets
train_df, val_df = train_test_split(
    df_shona, 
    test_size=0.2, 
    random_state=42,
    stratify=df_shona['label']
)

print(f"Training samples: {len(train_df)}")
print(f"Validation samples: {len(val_df)}")

# Optional: Augment with lexicon data if 
lexicon_aug_path = "expanded_lexicon_v4.csv"
if os.path.exists(lexicon_aug_path):
    df_lexicon_aug = pd.read_csv(lexicon_aug_path)
    print(f"Lexicon augmentation samples: {len(df_lexicon_aug)}")
    train_df = pd.concat([train_df, df_lexicon_aug], ignore_index=True)
    print(f"Training samples after augmentation: {len(train_df)}")

# Create Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['text', 'label']])

dataset = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset
})

print("Dataset preparation complete!")

Dataset shape: (15959, 3)
Columns: ['text', 'label', 'text_en']
Label distribution:
label
negative    7820
neutral     3395
positive    4744
Name: count, dtype: int64
Converted string labels to numerical
Training samples: 12767
Validation samples: 3192
Dataset preparation complete!


In [7]:
# -----------------------
# 2. Initialize Models and Tokenizers
# -----------------------

# Model configurations
MODELS_CONFIG = {
    "afroxlmr": {
        "name": "afroxlmr-base",
        "display_name": "AfroXLMR"
    },
    "afriberta": {
        "name": "castorini/afriberta_large",  # or "castorini/afriberta_base"
        "display_name": "AfriBERTa"
    }
}

# Initialize tokenizers and models
tokenizers = {}
models = {}

for model_key, config in MODELS_CONFIG.items():
    print(f"Loading {config['display_name']}...")
    
    # Load tokenizer
    tokenizers[model_key] = AutoTokenizer.from_pretrained(config['name'])
    
    # Load model for sequence classification (3 classes: negative, neutral, positive)
    models[model_key] = AutoModelForSequenceClassification.from_pretrained(
        config['name'], 
        num_labels=3,
        id2label={0: "negative", 1: "neutral", 2: "positive"},
        label2id={"negative": 0, "neutral": 1, "positive": 2}
    )

print("Models and tokenizers loaded successfully!")


Loading AfroXLMR...


OSError: afroxlmr-base is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `hf auth login` or by passing `token=<your_token>`

In [None]:
# -----------------------
# 3. Tokenization Function
# -----------------------

def tokenize_function(examples, tokenizer_key):
    """Tokenize the text data"""
    tokenizer = tokenizers[tokenizer_key]
    return tokenizer(
        examples['text'], 
        padding=True, 
        truncation=True, 
        max_length=256,
        return_tensors=None  # Let Trainer handle tensor conversion
    )

# Tokenize datasets for each model
tokenized_datasets = {}

for model_key in MODELS_CONFIG.keys():
    print(f"Tokenizing data for {MODELS_CONFIG[model_key]['display_name']}...")
    
    tokenized_datasets[model_key] = dataset.map(
        lambda examples: tokenize_function(examples, model_key),
        batched=True,
        remove_columns=dataset['train'].column_names
    )

print("Tokenization complete!")


In [None]:
# -----------------------
# 4. Training Setup
# -----------------------

print("\n4. Setting up training...")

# Training arguments
training_args = TrainingArguments(
    output_dir=f"./results/{model_key}",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir=f"./logs/{model_key}",
    logging_steps=50,
    report_to=None,  # Disable wandb if not needed
)

# Compute metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    
    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro'),
        'f1_weighted': f1_score(labels, predictions, average='weighted')
    }


In [None]:
# -----------------------
# 5. Train Individual Models
# -----------------------

print("\n5. Training individual models...")

trainers = {}
model_predictions = {}

for model_key in MODELS_CONFIG.keys():
    print(f"\n--- Training {MODELS_CONFIG[model_key]['display_name']} ---")
    
    # Update output directories for each model
    model_training_args = TrainingArguments(
        **{**training_args.to_dict(), 
           'output_dir': f"./results/{model_key}",
           'logging_dir': f"./logs/{model_key}"}
    )
    
    # Create trainer
    trainer = Trainer(
        model=models[model_key],
        args=model_training_args,
        train_dataset=tokenized_datasets[model_key]['train'],
        eval_dataset=tokenized_datasets[model_key]['validation'],
        tokenizer=tokenizers[model_key],
        compute_metrics=compute_metrics
    )
    
    # Train the model
    train_result = trainer.train()
    
    # Save trainer and model
    trainers[model_key] = trainer
    
    # Evaluate on validation set
    print(f"Evaluating {MODELS_CONFIG[model_key]['display_name']}...")
    eval_results = trainer.evaluate()
    print(f"{MODELS_CONFIG[model_key]['display_name']} Validation Results:")
    for key, value in eval_results.items():
        print(f"  {key}: {value:.4f}")
    
    # Get predictions for ensemble
    predictions = trainer.predict(tokenized_datasets[model_key]['validation'])
    model_predictions[model_key] = predictions

print("Individual model training complete!")


In [None]:
# -----------------------
# 6. Ensemble Learning
# -----------------------

print("\n6. Implementing ensemble learning...")

# Get prediction probabilities from both models
afroxlmr_probs = torch.softmax(torch.tensor(model_predictions['afroxlmr'].predictions), dim=1).numpy()
afriberta_probs = torch.softmax(torch.tensor(model_predictions['afriberta'].predictions), dim=1).numpy()

# Simple averaging ensemble
ensemble_probs = (afroxlmr_probs + afriberta_probs) / 2
ensemble_predictions = np.argmax(ensemble_probs, axis=1)
true_labels = model_predictions['afroxlmr'].label_ids

# Evaluate ensemble
ensemble_accuracy = accuracy_score(true_labels, ensemble_predictions)
ensemble_f1_macro = f1_score(true_labels, ensemble_predictions, average='macro')
ensemble_f1_weighted = f1_score(true_labels, ensemble_predictions, average='weighted')

print("Ensemble Results:")
print(f"  Accuracy: {ensemble_accuracy:.4f}")
print(f"  F1 Macro: {ensemble_f1_macro:.4f}")
print(f"  F1 Weighted: {ensemble_f1_weighted:.4f}")

# Compare with individual models
print("\nModel Comparison:")
models_comparison = []
for model_key in MODELS_CONFIG.keys():
    eval_results = trainers[model_key].evaluate()
    models_comparison.append({
        'Model': MODELS_CONFIG[model_key]['display_name'],
        'Accuracy': eval_results['eval_accuracy'],
        'F1_Macro': eval_results['eval_f1_macro'],
        'F1_Weighted': eval_results['eval_f1_weighted']
    })

models_comparison.append({
    'Model': 'Ensemble (Average)',
    'Accuracy': ensemble_accuracy,
    'F1_Macro': ensemble_f1_macro,
    'F1_Weighted': ensemble_f1_weighted
})

comparison_df = pd.DataFrame(models_comparison)
print(comparison_df.round(4))

In [None]:
# -----------------------
# 7. Explainable AI (XAI) - Attention Visualization
# -----------------------

print("\n7. Applying Explainable AI techniques...")

def visualize_attention(text, model, tokenizer, model_name):
    """Visualize attention for a given text"""
    # Create a pipeline for the model
    classifier = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        return_all_scores=True
    )
    
    # Get prediction
    results = classifier(text)
    
    print(f"\n{model_name} Prediction for: '{text}'")
    for result in results[0]:
        print(f"  {result['label']}: {result['score']:.4f}")
    
    # For attention visualization, we need to manually extract attention
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    
    # Get model outputs with attention
    with torch.no_grad():
        outputs = model(**inputs, output_attentions=True)
    
    # Get attention weights from the last layer
    attention = outputs.attentions[-1]  # Last layer
    attention = attention.mean(dim=1)   # Average over attention heads
    
    # Visualize attention
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    attention_weights = attention[0].mean(dim=0).detach().numpy()  # Average over layers
    
    # Plot attention visualization
    plt.figure(figsize=(12, 4))
    plt.bar(range(len(tokens)), attention_weights[:len(tokens)])
    plt.xticks(range(len(tokens)), tokens, rotation=45)
    plt.title(f"Attention Weights - {model_name}")
    plt.tight_layout()
    plt.show()
    
    return attention_weights

# Test XAI on sample texts
sample_texts = [
    "This is absolutely wonderful and amazing!",  # Should be positive
    "I hate this terrible experience",           # Should be negative  
    "The weather is okay today"                  # Should be neutral
]

print("\nXAI Analysis on Sample Texts:")
for i, text in enumerate(sample_texts, 1):
    print(f"\n--- Sample {i} ---")
    for model_key in MODELS_CONFIG.keys():
        try:
            _ = visualize_attention(
                text, 
                models[model_key], 
                tokenizers[model_key],
                MODELS_CONFIG[model_key]['display_name']
            )
        except Exception as e:
            print(f"Error in XAI for {MODELS_CONFIG[model_key]['display_name']}: {e}")


In [None]:
# -----------------------
# 8. Detailed Performance Analysis
# -----------------------

print("\n8. Generating detailed performance analysis...")

# Confusion matrices for each model
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, (model_key, ax) in enumerate(zip(MODELS_CONFIG.keys(), axes[:2])):
    predictions = np.argmax(model_predictions[model_key].predictions, axis=1)
    cm = confusion_matrix(true_labels, predictions)
    
    sns.heatmap(cm, annot=True, fmt='d', ax=ax, cmap='Blues',
                xticklabels=['Negative', 'Neutral', 'Positive'],
                yticklabels=['Negative', 'Neutral', 'Positive'])
    ax.set_title(f'{MODELS_CONFIG[model_key]["display_name"]} Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')

# Ensemble confusion matrix
cm_ensemble = confusion_matrix(true_labels, ensemble_predictions)
sns.heatmap(cm_ensemble, annot=True, fmt='d', ax=axes[2], cmap='Blues',
            xticklabels=['Negative', 'Neutral', 'Positive'],
            yticklabels=['Negative', 'Neutral', 'Positive'])
axes[2].set_title('Ensemble Confusion Matrix')
axes[2].set_xlabel('Predicted')
axes[2].set_ylabel('Actual')

plt.tight_layout()
plt.savefig('./results/confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.show()

# Detailed classification reports
print("\nDetailed Classification Reports:")
for model_key in MODELS_CONFIG.keys():
    predictions = np.argmax(model_predictions[model_key].predictions, axis=1)
    print(f"\n{MODELS_CONFIG[model_key]['display_name']}:")
    print(classification_report(true_labels, predictions, 
                              target_names=['Negative', 'Neutral', 'Positive']))

print("\nEnsemble:")
print(classification_report(true_labels, ensemble_predictions,
                          target_names=['Negative', 'Neutral', 'Positive']))

# -----------------------
# 9. Save Results and Models
# -----------------------

print("\n9. Saving results and models...")

# Save performance comparison
comparison_df.to_csv('./results/model_performance_comparison.csv', index=False)

# Save predictions
predictions_df = pd.DataFrame({
    'true_labels': true_labels,
    'afroxlmr_predictions': np.argmax(model_predictions['afroxlmr'].predictions, axis=1),
    'afriberta_predictions': np.argmax(model_predictions['afriberta'].predictions, axis=1),
    'ensemble_predictions': ensemble_predictions
})
predictions_df.to_csv('./results/model_predictions.csv', index=False)

# Save fine-tuned models
for model_key in MODELS_CONFIG.keys():
    model_save_path = f'./saved_models/{model_key}'
    trainers[model_key].save_model(model_save_path)
    print(f"Saved {MODELS_CONFIG[model_key]['display_name']} to {model_save_path}")

print("\n=== Part C Complete! ===")
print("Results saved in './results/' directory")
print("Models saved in './saved_models/' directory")

# Print summary of findings
print("\n" + "="*50)
print("SUMMARY OF FINDINGS")
print("="*50)
print(f"1. Dataset: {len(train_df)} training, {len(val_df)} validation samples")
print(f"2. Individual Model Performance:")
for _, row in comparison_df.iterrows():
    if row['Model'] != 'Ensemble (Average)':
        print(f"   - {row['Model']}: Accuracy={row['Accuracy']:.3f}, F1_Macro={row['F1_Macro']:.3f}")
print(f"3. Ensemble Performance: Accuracy={ensemble_accuracy:.3f}, F1_Macro={ensemble_f1_macro:.3f}")
print(f"4. Ensemble Improvement: {((ensemble_accuracy - comparison_df.iloc[0]['Accuracy']) / comparison_df.iloc[0]['Accuracy'] * 100):.1f}% over best individual model")
print("5. XAI techniques applied for model interpretability")