# Yorùbá Sentiment Analysis Results Visualization

This notebook visualizes the results of our Yorùbá sentiment analysis model based on AfriBERTa.


In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.preprocessing import label_binarize

# Global look and feel: ggplot base style, then seaborn's whitegrid theme and a
# colorblind-friendly palette applied on top (order matters, later calls win)
plt.style.use('ggplot')
sns.set_style("whitegrid")
sns.set_palette("colorblind")

# Larger default canvas and fonts so exported figures stay readable
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
})

## Loading Model Results

First, we'll load the model results from the log files.


In [None]:
# Function to load model logs
def load_model_logs(logs_dir="logs"):
    """Load every ``*.json`` file found directly inside ``logs_dir``.

    Files are read in sorted filename order. ``os.listdir`` returns entries in
    arbitrary order, and later cells label models by their position in the
    returned list ("Model 1", "Model 2", ...), so a stable order is needed for
    those labels to refer to the same log on every run.

    Args:
        logs_dir: Directory to scan for JSON log files.

    Returns:
        A list with the parsed content of each file that could be read and
        decoded. Files that fail to open or parse are reported with ``print``
        and skipped.

    Raises:
        OSError: If ``logs_dir`` itself cannot be listed (e.g. it does not exist).
    """
    model_logs = []

    for filename in sorted(os.listdir(logs_dir)):
        if not filename.endswith(".json"):
            continue

        # open() is inside the try block so that one unreadable entry does not
        # abort loading of the remaining logs
        try:
            with open(os.path.join(logs_dir, filename), 'r', encoding='utf-8') as f:
                model_logs.append(json.load(f))
        except Exception as e:
            print(f"Error loading {filename}: {e}")

    return model_logs

# Read all logs from the default directory and report how many were found
model_logs = load_model_logs()
print(f"Loaded {len(model_logs)} model logs")

# Build one summary row per log; each score is multiplied by 100 to show it as a percentage
metrics_data = []
for entry in model_logs:
    summary = {"model": f"{entry['model_name']}_{entry['timestamp']}"}
    scores = entry.get("metrics", {})
    for key in ("accuracy", "precision", "recall", "f1", "sample_test_accuracy"):
        summary[key] = scores.get(key, 0) * 100  # a missing metric is recorded as 0

    # Keep the raw label sequences next to the scores when sample predictions were logged
    if "sample_test_results" in entry:
        samples = entry["sample_test_results"]
        summary["true_labels"] = [sample["true_label"] for sample in samples]
        summary["pred_labels"] = [sample["predicted_label"] for sample in samples]

    metrics_data.append(summary)

metrics_df = pd.DataFrame(metrics_data)
metrics_df

## 1. Correlation Heatmap of Metrics

This shows how different metrics correlate with each other.

In [None]:
# Pairwise correlation between the score columns (one observation per model)
numeric_columns = ['accuracy', 'precision', 'recall', 'f1', 'sample_test_accuracy']
correlation_matrix = metrics_df[numeric_columns].corr()

# Boolean mask covering the diagonal and everything above it; masked cells are not drawn
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

# Draw on an explicit Axes so the title/save calls target this figure unambiguously
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1, vmax=1,
    linewidths=1,
    square=True,
    cbar_kws={"shrink": .8},
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Heatmap of Model Metrics', fontsize=18, pad=20)
heatmap_fig.tight_layout()
heatmap_fig.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

## 2. Model Comparison - Accuracy Scores

In [None]:
# Short sequential labels ("Model 1", "Model 2", ...) replace the long original names on the axis
metrics_df['model_name'] = [f"Model {n}" for n in range(1, len(metrics_df) + 1)]

# Accuracy bar chart, one bar per model
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=metrics_df,
    x='model_name',
    y='accuracy',
    hue='model_name',
    palette='Blues_d',
    width=0.5,
    legend=False,
    ax=ax,
)

# Write each bar's value one unit above its top
for position, value in enumerate(metrics_df['accuracy']):
    ax.text(position, value + 1, f"{value:.2f}%", ha='center', fontweight='bold')

ax.set_title('Model Accuracy Comparison', fontsize=18, pad=20)
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy (%)')
ax.set_ylim(0, 100)  # full percentage scale
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=25)
fig.tight_layout()
fig.savefig('model_accuracy_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 3. Model Comparison - Precision Scores

In [None]:
# Precision bar chart, one bar per model
# (uses the 'model_name' column added to metrics_df in the accuracy section)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=metrics_df,
    x='model_name',
    y='precision',
    hue='model_name',
    palette='Greens_d',
    width=0.5,
    legend=False,
    ax=ax,
)

# Write each bar's value one unit above its top
for position, value in enumerate(metrics_df['precision']):
    ax.text(position, value + 1, f"{value:.2f}%", ha='center', fontweight='bold')

ax.set_title('Model Precision Comparison', fontsize=18, pad=20)
ax.set_xlabel('Model')
ax.set_ylabel('Precision (%)')
ax.set_ylim(0, 100)  # full percentage scale
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('model_precision_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 4. Model Comparison - Recall Scores

In [None]:
# Recall bar chart, one bar per model
# (uses the 'model_name' column added to metrics_df in the accuracy section)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=metrics_df,
    x='model_name',
    y='recall',
    hue='model_name',
    palette='Oranges_d',
    width=0.5,
    legend=False,
    ax=ax,
)

# Write each bar's value one unit above its top
for position, value in enumerate(metrics_df['recall']):
    ax.text(position, value + 1, f"{value:.2f}%", ha='center', fontweight='bold')

ax.set_title('Model Recall Comparison', fontsize=18, pad=20)
ax.set_xlabel('Model')
ax.set_ylabel('Recall (%)')
ax.set_ylim(0, 100)  # full percentage scale
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('model_recall_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Model Comparison - F1 Scores 

In [None]:
# F1 bar chart, one bar per model
# (uses the 'model_name' column added to metrics_df in the accuracy section)
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=metrics_df,
    x='model_name',
    y='f1',
    hue='model_name',
    palette='Purples_d',  # purple tones distinguish this chart from the other metric charts
    width=0.5,
    legend=False,
    ax=ax,
)

# Write each bar's value one unit above its top
for position, value in enumerate(metrics_df['f1']):
    ax.text(position, value + 1, f"{value:.2f}%", ha='center', fontweight='bold')

ax.set_title('Model F1 Score Comparison', fontsize=18, pad=20)
ax.set_xlabel('Model')
ax.set_ylabel('F1 Score (%)')
ax.set_ylim(0, 100)  # full percentage scale
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
fig.savefig('model_f1_comparison.png', dpi=300, bbox_inches='tight')
plt.show()


## 6. Model Comparison - Accuracy, Precision, Recall and F1 Together

In [None]:
# Reshape to long form (one row per model/metric pair) so several metrics share one chart
metrics_melted = pd.melt(
    metrics_df, 
    id_vars=['model', 'model_name'], 
    value_vars=['accuracy', 'precision', 'recall', 'f1'],
    var_name='metric', 
    value_name='score'
)

# Line chart zoomed in on the range the scores actually occupy
plt.figure(figsize=(14, 8))

# Per-metric styling; the lists are indexed in the same order as plotted_metrics
plotted_metrics = ['accuracy', 'precision', 'recall', 'f1']
markers = ['o', 's', 'D', '^']  # circle, square, diamond, triangle
colors = ['#1f77b4', '#2ca02c', '#ff7f0e', '#9467bd']  # blue, green, orange, purple
linestyles = ['-', '-', '-', '-']  # all solid; lines are told apart by color, marker and width
zorders = [4, 3, 2, 1]  # Higher zorder will be on top
offset_step = 0.05  # vertical shift added per metric so near-identical lines do not hide each other

# Plot each metric as a line with variations to prevent visual overlapping
for i, metric in enumerate(plotted_metrics):
    metric_data = metrics_melted[metrics_melted['metric'] == metric]
    
    # Purely visual offset: the plotted line is shifted, the printed labels keep the real values
    offset = i * offset_step
    
    plt.plot(
        metric_data['model_name'], 
        metric_data['score'] + offset,
        marker=markers[i],
        linestyle=linestyles[i],
        linewidth=3 + (3-i)*0.5,  # Vary line thickness
        markersize=12 + (3-i)*1,  # Vary marker size
        color=colors[i],
        label=f"{metric.capitalize()} (+{offset:.2f}% visual offset)",  # Note offset in legend
        zorder=zorders[i]
    )
    
    # Value labels sit just above the shifted marker but show the original (non-offset) score
    for x, y in zip(metric_data['model_name'], metric_data['score']):
        plt.text(
            x, y + offset + 0.15,
            f"{y:.2f}%", 
            ha='center',
            fontsize=10,
            fontweight='bold',
            color=colors[i]
        )

# Derive the zoomed y-range from the data instead of hard-coding it, so scores outside
# one particular range are not clipped. Limits are padded by 0.5 (plus the largest
# offset at the top) and snapped outward to the nearest 0.5.
max_offset = offset_step * (len(plotted_metrics) - 1)
valid_scores = metrics_melted['score'].dropna()
if valid_scores.empty:
    y_min, y_max = 0.0, 100.0  # nothing to zoom in on; fall back to the full percentage scale
else:
    y_min = float(np.floor((valid_scores.min() - 0.5) * 2) / 2)
    y_max = float(np.ceil((valid_scores.max() + max_offset + 0.5) * 2) / 2)
plt.ylim(y_min, y_max)

plt.title('Model Metrics Comparison (Zoomed In)', fontsize=18, pad=20)
plt.xlabel('Model', fontsize=14)
plt.ylabel('Score (%)', fontsize=14)

# Fine horizontal grid: minor lines every 0.1% on a narrow range, every 1% on a wide one
# (a 0.1 step over a wide range would generate an excessive number of ticks)
minor_step = 0.1 if (y_max - y_min) <= 10 else 1.0
plt.grid(axis='y', which='major', alpha=0.3, linestyle='-')
plt.grid(axis='y', which='minor', alpha=0.15, linestyle='--')
plt.minorticks_on()
plt.gca().yaxis.set_minor_locator(plt.MultipleLocator(minor_step))

# Legend below the axes, laid out in two columns
plt.legend(
    title='Metrics (with visual offsets)',
    loc='lower center',
    bbox_to_anchor=(0.5, -0.28),
    ncol=2,
    fontsize=10,
    title_fontsize=12,
    framealpha=0.9,  # More opaque background
)

# Footnote explaining the visualization choices; the quoted range is the one actually applied
plt.figtext(
    0.5, 0.01, 
    f"Note: Y-axis is zoomed in (range: {y_min:.1f}-{y_max:.1f}%). Small offsets added to each metric line for better visibility. Original values shown in labels.",
    ha='center',
    fontsize=10,
    fontstyle='italic',
    bbox=dict(boxstyle='round,pad=0.5', facecolor='white', alpha=0.7)
)

plt.tight_layout(rect=[0, 0.05, 1, 1])  # Make room for the note at bottom
plt.savefig('model_metrics_comparison_zoomed.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. ROC Curves for Sentiment Analysis

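This visualization shows ROC curves for each model. Because the task is multiclass (positive, negative, neutral), we draw a one-vs-rest ROC curve for each class. The scores are built from the confidence of the predicted class only (all other classes get a score of 0), so treat these curves as an approximation.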

In [None]:
# First, we need to extract predictions and true labels
def extract_multiclass_predictions(log_entry):
    """Extract true labels and per-class scores for one-vs-rest ROC analysis.

    Args:
        log_entry: Mapping for one model log. Its "sample_test_results" entry,
            when present, is a list of dicts with "true_label",
            "predicted_label" and optionally "confidence".

    Returns:
        A 3-tuple ``(y_true, y_scores, classes)``:
          * ``y_true``: 1-D integer array of class indices, one per sample.
          * ``y_scores``: array of shape (n_samples, n_classes) that holds the
            sample's confidence (0.5 when missing) in the column of the
            predicted class and 0.0 everywhere else.
          * ``classes``: sorted list of all labels seen as either a true or a
            predicted label; positions match the indices above.
        When there are no sample results the tuple is ``(None, None, None)``,
        so callers can always unpack three values.
    """
    results = log_entry.get("sample_test_results", [])
    if not results:
        return None, None, None

    # Include predicted labels too: a label the model predicts but that never
    # occurs as a true label must still have an index
    classes = sorted(
        {r["true_label"] for r in results} | {r["predicted_label"] for r in results}
    )
    class_to_idx = {label: idx for idx, label in enumerate(classes)}

    y_true = np.array([class_to_idx[r["true_label"]] for r in results])

    # Only the predicted class's confidence is available, so every other class scores 0.0
    y_scores = np.zeros((len(results), len(classes)), dtype=float)
    for row, r in enumerate(results):
        y_scores[row, class_to_idx[r["predicted_label"]]] = r.get("confidence", 0.5)

    return y_true, y_scores, classes

# Plot one-vs-rest ROC curves for every model on a single figure
plt.figure(figsize=(14, 10))

# enumerate() gives each log its positional label directly; looking the position up
# with list.index() would give two logs with equal content the same label
for model_idx, log in enumerate(model_logs):
    model_name = f"Model {model_idx + 1}"

    # Inspect the first element before unpacking so that a "no data" result
    # is handled regardless of how many placeholders it carries
    extracted = extract_multiclass_predictions(log)
    if extracted[0] is None or len(extracted[0]) == 0:
        print(f"No valid prediction data found for {model_name}")
        continue
    y_true, y_scores, classes = extracted

    for i, class_name in enumerate(classes):
        # One-vs-rest target for this class, built directly from the class indices.
        # label_binarize is avoided here because it returns a single column when
        # there are only two classes, which breaks per-class column indexing.
        class_truth = (np.asarray(y_true) == i).astype(int)

        # A ROC curve needs both positive and negative examples
        if class_truth.min() == class_truth.max():
            print(f"Skipping ROC for {model_name} - {class_name}: true labels contain only one outcome")
            continue

        # Scores are the predicted-class confidence (0 for other classes), so the curve is approximate
        fpr, tpr, _ = roc_curve(class_truth, y_scores[:, i])
        roc_auc = auc(fpr, tpr)

        plt.plot(
            fpr, tpr, 
            lw=2, 
            label=f'{model_name}... - {class_name} (AUC = {roc_auc:.2f})'
        )

# Chance-level reference line
plt.plot([0, 1], [0, 1], 'k--', lw=2)

plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Sentiment Analysis Models', fontsize=18, pad=20)
plt.legend(loc="lower right", fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('roc_curves.png', dpi=300, bbox_inches='tight')
plt.show()

# If we have issues with ROC curve generation, we can use a simpler approach with confusion matrices instead
def plot_confusion_matrices(model_logs):
    """Plot and save a row-normalised confusion matrix for each model.

    Args:
        model_logs: Sequence of log mappings. Logs without a non-empty
            "sample_test_results" list are skipped.

    Returns:
        None. For every plotted model a heatmap is shown and written to
        ``confusion_matrix_Model_<N>.png``, where N is the log's 1-based
        position in ``model_logs``.
    """
    # enumerate() yields the position directly; list.index() would give two
    # logs with equal content the same label
    for model_idx, log in enumerate(model_logs):
        model_name = f"Model {model_idx + 1}"
        results = log.get("sample_test_results", [])

        if not results:
            continue

        true_labels = [r["true_label"] for r in results]
        pred_labels = [r["predicted_label"] for r in results]

        # Every label seen as either truth or prediction gets a row/column
        classes = sorted(list(set(true_labels + pred_labels)))

        # Convert string labels to indices
        class_to_idx = {cls: i for i, cls in enumerate(classes)}
        y_true = [class_to_idx[label] for label in true_labels]
        y_pred = [class_to_idx[label] for label in pred_labels]

        cm = confusion_matrix(y_true, y_pred)

        # Normalise each row by its number of true samples. A class that only
        # occurs as a prediction has an all-zero row; leave it at 0.0 instead
        # of dividing by zero and filling the row with NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm_norm = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )

        plt.figure(figsize=(10, 8))
        sns.heatmap(
            cm_norm, 
            annot=True, 
            fmt='.2f', 
            cmap='Blues',
            xticklabels=classes,
            yticklabels=classes
        )
        plt.title(f'Confusion Matrix - {model_name}')
        plt.ylabel('True Label')
        plt.xlabel('Predicted Label')
        plt.tight_layout()
        plt.savefig(f'confusion_matrix_{model_name.replace(" ", "_")}.png', dpi=300, bbox_inches='tight')
        plt.show()

# Plot confusion matrices as backup: one heatmap per model log that contains
# sample test results (logs without them are skipped by the function)
plot_confusion_matrices(model_logs)

## 8. Training Loss Curves

Let's visualize the training loss over epochs (and the validation accuracy, when it was logged) to see how training progressed.

In [None]:
# Combined training-loss and validation-accuracy figures across all models.
# Explicit Figure/Axes handles are used instead of selecting figures by number:
# numbers 1 and 2 only refer to these figures if no other figure is open, and
# some backends (notably the notebook inline backend) close figures on
# plt.show(), so re-selecting a figure after show() yields a new, empty one.
loss_fig, loss_ax = plt.subplots(figsize=(14, 8))
accuracy_fig, accuracy_ax = plt.subplots(figsize=(14, 8))

# Use different colors and line styles for each model
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b']
linestyles = ['-', '--', '-.', ':', '-', '--']

# Track whether any models had validation metrics
has_validation_metrics = False

# Iterate through models and add to combined plots
for idx, log in enumerate(model_logs):
    color = colors[idx % len(colors)]  # Cycle through colors if more models than colors
    linestyle = linestyles[idx % len(linestyles)]
    
    model_name = f"Model {idx+1}"  # Simplified model name for legend
    training_info = log.get('training_info', {})
    epochs_data = training_info.get('epochs', [])
    
    if not epochs_data:
        print(f"No training data found for {model_name}")
        continue
    
    # Epoch numbers fall back to the 1-based position; a missing loss is plotted as 0
    epochs = [epoch_data.get('epoch', i+1) for i, epoch_data in enumerate(epochs_data)]
    train_losses = [epoch_data.get('train_loss', 0) for epoch_data in epochs_data]
    
    loss_ax.plot(
        epochs, train_losses, 
        marker='o', 
        linestyle=linestyle,
        linewidth=2, 
        markersize=8,
        color=color,
        label=model_name
    )
    
    # The first epoch record decides whether this model logged validation accuracy
    if 'accuracy' in epochs_data[0]:
        has_validation_metrics = True
        val_accuracy = [epoch_data.get('accuracy', 0) * 100 for epoch_data in epochs_data]
        
        accuracy_ax.plot(
            epochs, val_accuracy, 
            marker='s',  # Square markers to distinguish from loss curves
            linestyle=linestyle,
            linewidth=2, 
            markersize=8,
            color=color,
            label=model_name
        )

# Finalize and save the training loss plot
loss_ax.set_title('Training Loss Comparison Across Models', fontsize=18, pad=20)
loss_ax.set_xlabel('Epoch', fontsize=14)
loss_ax.set_ylabel('Training Loss', fontsize=14)
loss_ax.grid(True, alpha=0.3)
if loss_ax.get_legend_handles_labels()[0]:  # only draw a legend when something was plotted
    loss_ax.legend(loc='upper right', fontsize=12)
loss_fig.tight_layout()
loss_fig.savefig('combined_training_loss.png', dpi=300, bbox_inches='tight')

# Finalize and save the validation accuracy plot if we have validation metrics
if has_validation_metrics:
    accuracy_ax.set_title('Validation Accuracy Comparison Across Models', fontsize=18, pad=20)
    accuracy_ax.set_xlabel('Epoch', fontsize=14)
    accuracy_ax.set_ylabel('Validation Accuracy (%)', fontsize=14)
    accuracy_ax.grid(True, alpha=0.3)
    accuracy_ax.legend(loc='lower right', fontsize=12)
    accuracy_fig.tight_layout()
    accuracy_fig.savefig('combined_validation_accuracy.png', dpi=300, bbox_inches='tight')
else:
    # Discard the unused figure so an empty plot is not displayed
    plt.close(accuracy_fig)
    print("No validation metrics available for any model")

# Display once, after both figures have been fully drawn and saved
plt.show()

# Per-model versions of the same curves, for closer inspection of a single run
for idx, log in enumerate(model_logs):
    model_name = f"{log['model_name']}_{log['timestamp']}"
    epochs_data = log.get('training_info', {}).get('epochs', [])

    # Nothing to draw for logs without per-epoch records
    if not epochs_data:
        continue

    # Epoch numbers fall back to the 1-based position; a missing loss is plotted as 0
    epochs = [record.get('epoch', position) for position, record in enumerate(epochs_data, start=1)]
    train_losses = [record.get('train_loss', 0) for record in epochs_data]

    # Training loss for this model
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(epochs, train_losses, 'o-', linewidth=2, markersize=8)
    ax.set_title(f'Training Loss - {model_name}', fontsize=18, pad=20)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Training Loss')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(f'training_loss_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Validation accuracy is drawn only when the first epoch record contains it
    if 'accuracy' not in epochs_data[0]:
        continue

    val_accuracy = [record.get('accuracy', 0) * 100 for record in epochs_data]

    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(epochs, val_accuracy, 'o-', linewidth=2, markersize=8, color='green')
    ax.set_title(f'Validation Accuracy - {model_name}', fontsize=18, pad=20)
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Validation Accuracy (%)')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    fig.savefig(f'validation_accuracy_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

## 9. Class Distribution Analysis

Let's analyze the distribution of true and predicted sentiment classes in our test results, along with the prediction confidence for each predicted class.

In [None]:
# Per-model class balance (true vs. predicted) and confidence spread by predicted class
for log in model_logs:
    model_name = f"{log['model_name']}_{log['timestamp']}"
    results = log.get("sample_test_results", [])

    # Skip logs that carry no sample-level predictions
    if not results:
        continue

    # One row per test sample; a missing "correct" becomes False and a missing "confidence" 0
    results_df = pd.DataFrame({
        'true_label': [sample["true_label"] for sample in results],
        'predicted_label': [sample["predicted_label"] for sample in results],
        'correct': [sample.get("correct", False) for sample in results],
        'confidence': [sample.get("confidence", 0) for sample in results],
    })

    # Side-by-side count plots: true labels on the left, predicted labels on the right
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))
    panels = [
        ('true_label', 'True Label Distribution'),
        ('predicted_label', 'Predicted Label Distribution'),
    ]
    for panel_ax, (column, panel_title) in zip(axes, panels):
        sns.countplot(data=results_df, x=column, ax=panel_ax)
        panel_ax.set_title(panel_title)
        panel_ax.set_xlabel('Sentiment Class')
        panel_ax.set_ylabel('Count')

    fig.suptitle(f'Class Distribution - {model_name}', fontsize=18, y=1.05)
    fig.tight_layout()
    fig.savefig(f'class_distribution_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Box plot of confidence scores grouped by the class the model predicted
    box_fig, box_ax = plt.subplots(figsize=(12, 8))
    sns.boxplot(data=results_df, x='predicted_label', y='confidence', ax=box_ax)
    box_ax.set_title(f'Prediction Confidence by Class - {model_name}', fontsize=18, pad=20)
    box_ax.set_xlabel('Predicted Sentiment Class')
    box_ax.set_ylabel('Confidence Score')
    box_ax.grid(True, alpha=0.3)
    box_fig.tight_layout()
    box_fig.savefig(f'confidence_by_class_{model_name}.png', dpi=300, bbox_inches='tight')
    plt.show()