In [1]:
def generate_ensemble_combinations_analysis(results_df):
    """
    Generate analysis of different ensemble combinations (2, 3, or 4 models)
    using the existing predictions from individual models
    """
    print("\nüîπ Analyzing different ensemble combinations...")
    
    # Fix model names (remove brackets and numbers from BioBert)
    original_model_cols = [f"{name}_Prediction" for name in model_names]
    fixed_model_names = [name.replace("AraBert", "AraBERT") for name in model_names]
    fixed_model_names = [name.replace("distilBert", "Multilingual DistilBERT") for name in model_names]
    fixed_model_names = [name.replace("BioBert (2)", "BioBert") for name in model_names]
    fixed_model_names = [name.replace("multiBert", "mBERT") for name in model_names]
    fixed_model_names = [name.replace("xlmRoBERTa", "'XLM-RoBERTa") for name in model_names]

    
    # Get the ground truth
    y_true = results_df["True_Category"]
    
    # Function to perform soft voting ensemble on a combination of models
    def soft_voting_ensemble(model_cols):
        # Get one-hot encodings for each model's predictions
        encodings = []
        for col in model_cols:
            # Get predictions
            preds = results_df[col].values
            
            # Convert to one-hot encoding
            one_hot = np.zeros((len(preds), len(valid_categories)))
            for i, pred in enumerate(preds):
                category_idx = valid_categories.index(pred)
                one_hot[i, category_idx] = 1
            
            encodings.append(one_hot)
        
        # Average the one-hot encodings
        avg_encoding = np.mean(encodings, axis=0)
        
        # Get the most probable category for each sample
        ensemble_preds = [valid_categories[np.argmax(avg_encoding[i])] for i in range(len(avg_encoding))]
        
        # Calculate accuracy
        accuracy = np.mean(np.array(ensemble_preds) == y_true.values)
        
        return ensemble_preds, accuracy
    
    # =============================================================================
    # Analyze pairs of models (2-model ensembles)
    # =============================================================================
    print("Analyzing 2-model combinations...")
    
    # Generate all combinations of 2 models
    pairs = []
    pair_accuracies = []
    pair_names = []
    
    from itertools import combinations
    for combo in combinations(range(len(original_model_cols)), 2):
        model_cols = [original_model_cols[i] for i in combo]
        model_names_combo = [fixed_model_names[i] for i in combo]
        
        # Get predictions and accuracy for this combination
        ensemble_preds, accuracy = soft_voting_ensemble(model_cols)
        
        # Store results
        pairs.append((model_cols, ensemble_preds))
        pair_accuracies.append(accuracy)
        pair_names.append(" + ".join(model_names_combo))
    
    # Create DataFrame with results
    pair_results = pd.DataFrame({
        'Model_Combination': pair_names,
        'Accuracy': pair_accuracies
    })
    
    # Sort by accuracy (descending)
    pair_results = pair_results.sort_values('Accuracy', ascending=False).reset_index(drop=True)
    
    # Save results
    pair_results.to_csv("visualizations/pair_ensemble_results.csv", index=False)
    
    # Plot top pairs
    plt.figure(figsize=(14, 8))
    bars = plt.bar(pair_results['Model_Combination'], pair_results['Accuracy'], color='lightblue')
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
    
    plt.title("Accuracy of 2-Model Ensemble Combinations", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Model Combination", fontsize=14)
    plt.ylim(pair_results['Accuracy'].min() * 0.98, pair_results['Accuracy'].max() * 1.02)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=90)
    plt.tight_layout()
    
    plt.savefig("visualizations/pair_ensemble_accuracy.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # =============================================================================
    # Analyze triplets of models (3-model ensembles)
    # =============================================================================
    print("Analyzing 3-model combinations...")
    
    # Generate all combinations of 3 models
    triplets = []
    triplet_accuracies = []
    triplet_names = []
    
    for combo in combinations(range(len(original_model_cols)), 3):
        model_cols = [original_model_cols[i] for i in combo]
        model_names_combo = [fixed_model_names[i] for i in combo]
        
        # Get predictions and accuracy for this combination
        ensemble_preds, accuracy = soft_voting_ensemble(model_cols)
        
        # Store results
        triplets.append((model_cols, ensemble_preds))
        triplet_accuracies.append(accuracy)
        triplet_names.append(" + ".join(model_names_combo))
    
    # Create DataFrame with results
    triplet_results = pd.DataFrame({
        'Model_Combination': triplet_names,
        'Accuracy': triplet_accuracies
    })
    
    # Sort by accuracy (descending)
    triplet_results = triplet_results.sort_values('Accuracy', ascending=False).reset_index(drop=True)
    
    # Save results
    triplet_results.to_csv("visualizations/triplet_ensemble_results.csv", index=False)
    
    # Plot top triplets
    plt.figure(figsize=(14, 8))
    bars = plt.bar(triplet_results['Model_Combination'], triplet_results['Accuracy'], color='lightgreen')
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
    
    plt.title("Accuracy of 3-Model Ensemble Combinations", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Model Combination", fontsize=14)
    plt.ylim(triplet_results['Accuracy'].min() * 0.98, triplet_results['Accuracy'].max() * 1.02)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=90)
    plt.tight_layout()
    
    plt.savefig("visualizations/triplet_ensemble_accuracy.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # =============================================================================
    # Analyze quartets of models (4-model ensembles) if we have at least 4 models
    # =============================================================================
    if len(original_model_cols) >= 4:
        print("Analyzing 4-model combinations...")
        
        # Generate all combinations of 4 models
        quartets = []
        quartet_accuracies = []
        quartet_names = []
        
        for combo in combinations(range(len(original_model_cols)), 4):
            model_cols = [original_model_cols[i] for i in combo]
            model_names_combo = [fixed_model_names[i] for i in combo]
            
            # Get predictions and accuracy for this combination
            ensemble_preds, accuracy = soft_voting_ensemble(model_cols)
            
            # Store results
            quartets.append((model_cols, ensemble_preds))
            quartet_accuracies.append(accuracy)
            quartet_names.append(" + ".join(model_names_combo))
        
        # Create DataFrame with results
        quartet_results = pd.DataFrame({
            'Model_Combination': quartet_names,
            'Accuracy': quartet_accuracies
        })
        
        # Sort by accuracy (descending)
        quartet_results = quartet_results.sort_values('Accuracy', ascending=False).reset_index(drop=True)
        
        # Save results
        quartet_results.to_csv("visualizations/quartet_ensemble_results.csv", index=False)
        
        # Plot quartets
        plt.figure(figsize=(14, 8))
        bars = plt.bar(quartet_results['Model_Combination'], quartet_results['Accuracy'], color='lightsalmon')
        
        # Add accuracy values on top of bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                    f'{height:.4f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
        
        plt.title("Accuracy of 4-Model Ensemble Combinations", fontsize=16)
        plt.ylabel("Accuracy", fontsize=14)
        plt.xlabel("Model Combination", fontsize=14)
        plt.ylim(quartet_results['Accuracy'].min() * 0.98, quartet_results['Accuracy'].max() * 1.02)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.xticks(rotation=90)
        plt.tight_layout()
        
        plt.savefig("visualizations/quartet_ensemble_accuracy.png", dpi=300, bbox_inches="tight")
        plt.close()
    
    # =============================================================================
    # Create a summary comparison chart of best combinations from each size
    # =============================================================================
    print("Creating ensemble summary comparison...")
    
    # Collect best models from each group
    best_models = []
    best_accuracies = []
    group_names = []
    
    # Add individual models
    individual_accuracies = []
    for i, model_name in enumerate(fixed_model_names):
        original_name = model_names[i]
        accuracy = (results_df["True_Category"] == results_df[f"{original_name}_Prediction"]).mean()
        individual_accuracies.append((model_name, accuracy))
    
    best_individual = max(individual_accuracies, key=lambda x: x[1])
    best_models.append(best_individual[0])
    best_accuracies.append(best_individual[1])
    group_names.append("Best Single Model")
    
    # Add best pair
    if len(pair_results) > 0:
        best_models.append(pair_results.iloc[0]['Model_Combination'])
        best_accuracies.append(pair_results.iloc[0]['Accuracy'])
        group_names.append("Best 2-Model Ensemble")
    
    # Add best triplet
    if len(triplet_results) > 0:
        best_models.append(triplet_results.iloc[0]['Model_Combination'])
        best_accuracies.append(triplet_results.iloc[0]['Accuracy'])
        group_names.append("Best 3-Model Ensemble")
    
    # Add best quartet
    if len(original_model_cols) >= 4 and len(quartet_results) > 0:
        best_models.append(quartet_results.iloc[0]['Model_Combination'])
        best_accuracies.append(quartet_results.iloc[0]['Accuracy'])
        group_names.append("Best 4-Model Ensemble")
    
    # Add full ensemble
    full_ensemble_accuracy = (results_df["True_Category"] == results_df["Ensemble_Prediction"]).mean()
    best_models.append("All Models")
    best_accuracies.append(full_ensemble_accuracy)
    group_names.append(f"Full Ensemble ({len(original_model_cols)} Models)")
    
    # Create summary DataFrame
    summary_df = pd.DataFrame({
        'Ensemble_Type': group_names,
        'Models': best_models,
        'Accuracy': best_accuracies
    })
    
    # Save summary
    summary_df.to_csv("visualizations/ensemble_summary.csv", index=False)
    
    # Plot summary comparison
    plt.figure(figsize=(12, 8))
    
    # Use a color gradient for the bars
    colors = plt.cm.viridis(np.linspace(0.1, 0.9, len(group_names)))
    
    bars = plt.bar(group_names, best_accuracies, color=colors)
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold')
    
    plt.title("Accuracy Comparison of Best Ensemble Combinations", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Ensemble Type", fontsize=14)
    plt.ylim(min(best_accuracies) * 0.98, max(best_accuracies) * 1.02)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/ensemble_summary_comparison.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # =============================================================================
    # Create a detailed comparison chart showing performance by category for the best models
    # =============================================================================
    print("Creating category-wise performance comparison for best ensembles...")
    
    # Fix Arabic category names
    arabic_categories = {cat: fix_arabic_text(cat) for cat in valid_categories}
    
    # Calculate per-category accuracies for each best ensemble
    category_performance = {}
    
    # Initialize category performance dictionary
    for category in valid_categories:
        category_performance[arabic_categories[category]] = {}
    
    # Function to get predictions for a specific ensemble combination
    def get_ensemble_predictions(model_combination_str, original_model_cols):
        if model_combination_str == "All Models":
            return results_df["Ensemble_Prediction"].values
        
        # Parse model combination string to get individual models
        if " + " in model_combination_str:
            model_names_combo = model_combination_str.split(" + ")
            model_cols = []
            
            for model_name in model_names_combo:
                # Find the corresponding original column
                for i, original_name in enumerate(fixed_model_names):
                    if original_name == model_name:
                        model_cols.append(original_model_cols[i])
                        break
            
            # Get predictions using soft voting
            ensemble_preds, _ = soft_voting_ensemble(model_cols)
            return ensemble_preds
        else:
            # It's a single model
            idx = fixed_model_names.index(model_combination_str)
            return results_df[original_model_cols[idx]].values
    
    # Calculate accuracies by category for each best ensemble
    for ensemble_type, model_combo in zip(group_names, best_models):
        ensemble_predictions = get_ensemble_predictions(model_combo, original_model_cols)
        
        for category in valid_categories:
            # Filter for this category
            category_mask = results_df["True_Category"] == category
            category_indices = np.where(category_mask)[0]
            
            if len(category_indices) > 0:
                # Extract predictions for this category
                category_preds = [ensemble_predictions[i] for i in category_indices]
                
                # Calculate accuracy
                correct = sum(pred == category for pred in category_preds)
                accuracy = correct / len(category_indices)
            else:
                accuracy = 0
                
            category_performance[arabic_categories[category]][ensemble_type] = accuracy
    
    # Convert to DataFrame for easier plotting
    category_comparison_df = pd.DataFrame(category_performance).T
    
    # Save the data
    category_comparison_df.to_csv("visualizations/best_ensembles_category_performance.csv")
    
    # Plot the comparison
    plt.figure(figsize=(18, 10))
    category_comparison_df.plot(kind='bar', figsize=(18, 10))
    plt.title("Category Performance of Best Ensemble Combinations", fontsize=16)
    plt.xlabel("Category", fontsize=14)
    plt.ylabel("Accuracy", fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title="Ensemble Type", loc='upper left', bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/best_ensembles_category_performance.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    print("\n‚úÖ Ensemble combination analysis completed")

In [3]:
# Add these imports at the top of your file
import arabic_reshaper
from bidi.algorithm import get_display

# Function to fix Arabic text rendering
def fix_arabic_text(text):
    """
    Reshape Arabic text for proper display in matplotlib
    """
    reshaped_text = arabic_reshaper.reshape(text)
    bidi_text = get_display(reshaped_text)
    return bidi_text

# Updated visualization functions with Arabic text support
def generate_visualizations(results_df):
    """
    Generate and save visualizations for research paper with Arabic text support
    
    Parameters:
    - results_df: DataFrame containing all predictions
    """
    print("\nüîπ Generating visualizations for research paper...")
    
    # Setup for better Arabic text display
    plt.rcParams['font.family'] = 'Arial'
    
    # Fix model names (remove brackets and numbers from BioBert)
    fixed_model_names = [name.replace("BioBert (2)", "BioBert") for name in model_names]
    
    # Update results_df column names if BioBert (2) exists
    if "BioBert (2)_Prediction" in results_df.columns:
        results_df = results_df.rename(columns={"BioBert (2)_Prediction": "BioBert_Prediction"})
    
    # 1. Generate confusion matrices
    generate_confusion_matrices(results_df, fixed_model_names)
    
    # 2. Generate accuracy comparison bar chart
    generate_accuracy_comparison(results_df, fixed_model_names)
    
    # 3. Generate per-category performance chart
    generate_category_performance(results_df, fixed_model_names)
    
    # 4. Generate model agreement heatmap
    generate_model_agreement_heatmap(results_df, fixed_model_names)
    
    # 5. Generate ensemble improvement chart
    generate_ensemble_improvement_chart(results_df, fixed_model_names)
    
    # 6. Generate ensemble combinations analysis
    generate_ensemble_combinations_analysis(results_df)
    
    print("\n‚úÖ All visualizations generated and saved in 'visualizations' directory")

def generate_confusion_matrices(results_df, fixed_model_names):
    """Generate confusion matrices for each model and the ensemble with Arabic text support"""
    # Fix Arabic category names
    arabic_categories = [fix_arabic_text(cat) for cat in valid_categories]
    
    plt.figure(figsize=(20, 16))
    
    # Create subplots for each model + ensemble
    models_to_plot = fixed_model_names + ["Ensemble"]
    num_models = len(models_to_plot)
    rows = (num_models + 1) // 2
    
    for i, model_name in enumerate(models_to_plot):
        plt.subplot(rows, 2, i+1)
        
        if model_name == "Ensemble":
            y_true = results_df["True_Category"]
            y_pred = results_df["Ensemble_Prediction"]
        else:
            y_true = results_df["True_Category"]
            original_name = model_names[fixed_model_names.index(model_name)]
            y_pred = results_df[f"{original_name}_Prediction"]
            
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=valid_categories)
        
        # Plot confusion matrix
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=arabic_categories, yticklabels=arabic_categories)
        plt.title(f"{model_name} Confusion Matrix")
        plt.xlabel("Predicted Category")
        plt.ylabel("True Category")
        plt.xticks(rotation=45, ha="right")
        plt.yticks(rotation=45)
        plt.tight_layout()
    
    plt.savefig("visualizations/confusion_matrices.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # Also save individual high-res confusion matrices
    for model_name in models_to_plot:
        plt.figure(figsize=(12, 10))
        
        if model_name == "Ensemble":
            y_true = results_df["True_Category"]
            y_pred = results_df["Ensemble_Prediction"]
        else:
            y_true = results_df["True_Category"]
            original_name = model_names[fixed_model_names.index(model_name)]
            y_pred = results_df[f"{original_name}_Prediction"]
            
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=valid_categories)
        
        # Plot confusion matrix
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=arabic_categories, yticklabels=arabic_categories)
        plt.title(f"{model_name} Confusion Matrix")
        plt.xlabel("Predicted Category")
        plt.ylabel("True Category")
        plt.xticks(rotation=45, ha="right")
        plt.yticks(rotation=45)
        plt.tight_layout()
        
        plt.savefig(f"visualizations/confusion_matrix_{model_name}.png", dpi=300, bbox_inches="tight")
        plt.close()

def generate_accuracy_comparison(results_df, fixed_model_names):
    """Generate bar chart comparing accuracy of all models"""
    # Calculate accuracies
    accuracies = []
    model_labels = []
    
    # Add individual model accuracies
    for i, model_name in enumerate(fixed_model_names):
        original_name = model_names[i]
        accuracy = (results_df["True_Category"] == results_df[f"{original_name}_Prediction"]).mean()
        accuracies.append(accuracy)
        model_labels.append(model_name)
    
    # Add ensemble accuracy
    ensemble_accuracy = (results_df["True_Category"] == results_df["Ensemble_Prediction"]).mean()
    accuracies.append(ensemble_accuracy)
    model_labels.append("Ensemble")
    
    # Create bar chart
    plt.figure(figsize=(12, 8))
    bars = plt.bar(model_labels, accuracies, color=['skyblue']*len(fixed_model_names) + ['darkblue'])
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold')
    
    plt.title("Model Accuracy Comparison", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Model", fontsize=14)
    plt.ylim(0, max(accuracies) * 1.15)  # Add some space above bars
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/accuracy_comparison.png", dpi=300, bbox_inches="tight")
    plt.close()

def generate_category_performance(results_df, fixed_model_names):
    """Generate per-category performance chart for all models"""
    # Fix Arabic category names
    arabic_categories = {cat: fix_arabic_text(cat) for cat in valid_categories}
    
    # Calculate per-category accuracies for each model
    category_performance = {}
    
    # Initialize category performance dictionary
    for category in valid_categories:
        category_performance[arabic_categories[category]] = {}
    
    # Calculate accuracies by category for each model
    all_models = fixed_model_names + ["Ensemble"]
    for i, model_name in enumerate(all_models):
        for category in valid_categories:
            # Filter for this category
            category_mask = results_df["True_Category"] == category
            
            if model_name == "Ensemble":
                correct_predictions = results_df.loc[category_mask, "Ensemble_Prediction"] == category
            else:
                original_name = model_names[fixed_model_names.index(model_name)] if model_name in fixed_model_names else model_name
                correct_predictions = results_df.loc[category_mask, f"{original_name}_Prediction"] == category
            
            # Calculate accuracy
            if sum(category_mask) > 0:
                accuracy = correct_predictions.sum() / sum(category_mask)
            else:
                accuracy = 0
                
            category_performance[arabic_categories[category]][model_name] = accuracy
    
    # Convert to DataFrame for easier plotting
    category_df = pd.DataFrame(category_performance).T
    
    # Plot
    plt.figure(figsize=(16, 10))
    category_df.plot(kind='bar', figsize=(16, 10))
    plt.title("Model Performance by Category", fontsize=16)
    plt.xlabel("Category", fontsize=14)
    plt.ylabel("Accuracy", fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title="Model", loc='upper left', bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/category_performance.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # Also save the data
    category_df.to_csv("visualizations/category_performance.csv")

def generate_model_agreement_heatmap(results_df, fixed_model_names):
    """Generate heatmap showing agreement between models"""
    # Create a dictionary to store agreement percentages
    agreement_matrix = {}
    
    # Include all individual models and ensemble
    all_models = fixed_model_names + ["Ensemble"]
    
    # Initialize the matrix
    for model1 in all_models:
        agreement_matrix[model1] = {}
        for model2 in all_models:
            agreement_matrix[model1][model2] = 0.0
    
    # Calculate agreement percentages
    for model1 in all_models:
        for model2 in all_models:
            # Get prediction columns
            if model1 == "Ensemble":
                pred1 = results_df["Ensemble_Prediction"]
            else:
                original_name1 = model_names[fixed_model_names.index(model1)]
                pred1 = results_df[f"{original_name1}_Prediction"]
                
            if model2 == "Ensemble":
                pred2 = results_df["Ensemble_Prediction"]
            else:
                original_name2 = model_names[fixed_model_names.index(model2)]
                pred2 = results_df[f"{original_name2}_Prediction"]
            
            # Calculate agreement percentage
            agreement = np.mean(pred1 == pred2)
            agreement_matrix[model1][model2] = agreement
    
    # Convert to DataFrame
    agreement_df = pd.DataFrame(agreement_matrix)
    
    # Plot heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(agreement_df, annot=True, fmt=".3f", cmap="YlGnBu", vmin=0, vmax=1)
    plt.title("Model Agreement Heatmap", fontsize=16)
    plt.tight_layout()
    
    plt.savefig("visualizations/model_agreement.png", dpi=300, bbox_inches="tight")
    plt.close()

def generate_ensemble_improvement_chart(results_df, fixed_model_names):
    """Generate chart showing where ensemble improves over individual models"""
    # Calculate where ensemble is correct but models are wrong
    improvements = {}
    
    # For each individual model
    for i, model_name in enumerate(fixed_model_names):
        original_name = model_names[i]
        # Cases where ensemble is correct
        ensemble_correct = results_df["True_Category"] == results_df["Ensemble_Prediction"]
        
        # Cases where this model is incorrect
        model_incorrect = results_df["True_Category"] != results_df[f"{original_name}_Prediction"]
        
        # Cases where ensemble improves over this model
        improvement = ensemble_correct & model_incorrect
        improvements[model_name] = improvement.sum()
    
    # Plot improvement chart
    plt.figure(figsize=(12, 6))
    plt.bar(improvements.keys(), improvements.values(), color='green')
    plt.title("Number of Cases Where Ensemble Corrects Individual Model Errors", fontsize=16)
    plt.ylabel("Number of Improvements", fontsize=14)
    plt.xlabel("Base Model", fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45, ha="right")
    
    # Add counts on top of bars
    for i, (model, count) in enumerate(improvements.items()):
        plt.text(i, count + 5, str(count), ha='center', fontweight='bold')
    
    plt.tight_layout()
    
    plt.savefig("visualizations/ensemble_improvements.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # Also calculate the percentage of total samples where ensemble corrects models
    total_samples = len(results_df)
    improvement_percentages = {model: count/total_samples*100 for model, count in improvements.items()}
    
    # Save these stats
    improvement_df = pd.DataFrame({
        'Model': list(improvements.keys()),
        'Improvement_Count': list(improvements.values()),
        'Improvement_Percentage': [improvement_percentages[model] for model in improvements.keys()]
    })
    improvement_df.to_csv("visualizations/ensemble_improvement_stats.csv", index=False)

In [2]:
pip install arabic-reshaper


Collecting arabic-reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl.metadata (12 kB)
Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic-reshaper
Successfully installed arabic-reshaper-3.0.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Main script to run all the analyses on the saved ensemble predictions

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import os
import arabic_reshaper
from bidi.algorithm import get_display
from itertools import combinations

# Create directory for visualizations
os.makedirs("visualizations", exist_ok=True)

# Define valid medical categories
valid_categories = [
    "ÿßŸÖÿ±ÿßÿ∂ ŸÜÿ≥ÿßÿ¶Ÿäÿ©",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπÿ∂ŸÑÿßÿ™ ŸàÿßŸÑÿπÿ∏ÿßŸÖ Ÿà ÿßŸÑŸÖŸÅÿßÿµŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨Ÿáÿßÿ≤ ÿßŸÑŸáÿ∂ŸÖŸä",
    "ÿßŸÑÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨ŸÜÿ≥Ÿäÿ©",
    "ÿ∑ÿ® ÿßŸÑÿßÿ≥ŸÜÿßŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑŸÇŸÑÿ® Ÿà ÿßŸÑÿ¥ÿ±ÿßŸäŸäŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπŸäŸàŸÜ",
    "ÿßŸÜŸÅ ÿßÿ∞ŸÜ Ÿàÿ≠ŸÜÿ¨ÿ±ÿ©",
    "ÿ¨ÿ±ÿßÿ≠ÿ© ÿ™ÿ¨ŸÖŸäŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿØŸÖ"
]

# Original model names from your code
model_names = [
    "AraBert",
    "BioBert (2)",
    "distilBert",
    "multiBert",
    "xlmRoBERTaa"
]

# Function to fix Arabic text rendering
def fix_arabic_text(text):
    """
    Reshape Arabic text for proper display in matplotlib
    """
    reshaped_text = arabic_reshaper.reshape(text)
    bidi_text = get_display(reshaped_text)
    return bidi_text

# Import the visualization functions
# Note: These would be imported from the previous artifacts
# For brevity, I'm assuming those functions are available

def run_ensemble_analysis():
    """
    Main function to run all analyses on the saved ensemble predictions
    """
    print("üîπ Starting ensemble analysis on saved predictions...")
    
    # Load the saved predictions
    try:
        results_df = pd.read_csv("/kaggle/input/ensembleee/ensemble_individual_prediction.csv")
        print(f"‚úÖ Loaded predictions data with {len(results_df)} samples.")
    except FileNotFoundError:
        print("‚ùå Error: The file 'ensemble_individual_prediction.csv' was not found.")
        print("Please make sure you've run the ensemble prediction code first.")
        return
    
    # Check if the data has the expected format
    required_columns = ["Text", "True_Category", "Ensemble_Prediction"]
    for model in model_names:
        required_columns.append(f"{model}_Prediction")
    
    missing_columns = [col for col in required_columns if col not in results_df.columns]
    
    if missing_columns:
        print(f"‚ùå Error: The following required columns are missing from the data: {missing_columns}")
        return
    
    # Fix model names (remove brackets and numbers from BioBert)
    fixed_model_names = [name.replace("BioBert (2)", "BioBert") for name in model_names]
    
    # Update results_df column names if BioBert (2) exists
    if "BioBert (2)_Prediction" in results_df.columns:
        results_df = results_df.rename(columns={"BioBert (2)_Prediction": "BioBert_Prediction"})
    
    # Print data summary
    print("\nüîπ Dataset Summary:")
    print(f"Total samples: {len(results_df)}")
    print("\nCategory distribution:")
    category_counts = results_df["True_Category"].value_counts()
    for category, count in category_counts.items():
        print(f"  - {category}: {count} samples ({count/len(results_df)*100:.2f}%)")
    
    # Calculate individual model accuracies
    print("\nüîπ Individual Model Accuracies:")
    for i, model_name in enumerate(model_names):
        accuracy = (results_df["True_Category"] == results_df[f"{model_name}_Prediction"]).mean()
        print(f"  - {fixed_model_names[i]}: {accuracy:.4f}")
    
    # Calculate ensemble accuracy
    ensemble_accuracy = (results_df["True_Category"] == results_df["Ensemble_Prediction"]).mean()
    print(f"  - Ensemble (All Models): {ensemble_accuracy:.4f}")
    
    # Generate all visualizations
    print("\nüîπ Generating all visualizations...")
    
    # Generate confusion matrices
    generate_confusion_matrices(results_df, fixed_model_names)
    
    # Generate accuracy comparison chart
    generate_accuracy_comparison(results_df, fixed_model_names)
    
    # Generate per-category performance chart
    generate_category_performance(results_df, fixed_model_names)
    
    # Generate model agreement heatmap
    generate_model_agreement_heatmap(results_df, fixed_model_names)
    
    # Generate ensemble improvement chart
    generate_ensemble_improvement_chart(results_df, fixed_model_names)
    
    # Generate ensemble combinations analysis
    generate_ensemble_combinations_analysis(results_df)
    
    print("\n‚úÖ Analysis complete! All visualizations saved to the 'visualizations' directory.")

# Run the analysis if this script is executed directly
# Run the analysis if this script is executed directly
if __name__ == "__main__":
    run_ensemble_analysis()


üîπ Starting ensemble analysis on saved predictions...
‚úÖ Loaded predictions data with 39789 samples.

üîπ Dataset Summary:
Total samples: 39789

Category distribution:
  - ÿßŸÖÿ±ÿßÿ∂ ŸÜÿ≥ÿßÿ¶Ÿäÿ©: 14032 samples (35.27%)
  - ÿßŸÜŸÅ ÿßÿ∞ŸÜ Ÿàÿ≠ŸÜÿ¨ÿ±ÿ©: 3912 samples (9.83%)
  - ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπÿ∂ŸÑÿßÿ™ ŸàÿßŸÑÿπÿ∏ÿßŸÖ Ÿà ÿßŸÑŸÖŸÅÿßÿµŸÑ: 3712 samples (9.33%)
  - ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπŸäŸàŸÜ: 3660 samples (9.20%)
  - ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑŸÇŸÑÿ® Ÿà ÿßŸÑÿ¥ÿ±ÿßŸäŸäŸÜ: 3190 samples (8.02%)
  - ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨Ÿáÿßÿ≤ ÿßŸÑŸáÿ∂ŸÖŸä: 3177 samples (7.98%)
  - ÿßŸÑÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨ŸÜÿ≥Ÿäÿ©: 2219 samples (5.58%)
  - ÿ∑ÿ® ÿßŸÑÿßÿ≥ŸÜÿßŸÜ: 2202 samples (5.53%)
  - ÿ¨ÿ±ÿßÿ≠ÿ© ÿ™ÿ¨ŸÖŸäŸÑ: 1969 samples (4.95%)
  - ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿØŸÖ: 1716 samples (4.31%)

üîπ Individual Model Accuracies:
  - AraBert: 0.9140


KeyError: 'BioBert (2)_Prediction'

In [12]:
# Main script to run all the analyses on the saved ensemble predictions

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import os
import arabic_reshaper
from bidi.algorithm import get_display
from itertools import combinations

# Create directory for visualizations
os.makedirs("visualizations", exist_ok=True)

# Define valid medical categories
valid_categories = [
    "ÿßŸÖÿ±ÿßÿ∂ ŸÜÿ≥ÿßÿ¶Ÿäÿ©",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπÿ∂ŸÑÿßÿ™ ŸàÿßŸÑÿπÿ∏ÿßŸÖ Ÿà ÿßŸÑŸÖŸÅÿßÿµŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨Ÿáÿßÿ≤ ÿßŸÑŸáÿ∂ŸÖŸä",
    "ÿßŸÑÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿ¨ŸÜÿ≥Ÿäÿ©",
    "ÿ∑ÿ® ÿßŸÑÿßÿ≥ŸÜÿßŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑŸÇŸÑÿ® Ÿà ÿßŸÑÿ¥ÿ±ÿßŸäŸäŸÜ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿπŸäŸàŸÜ",
    "ÿßŸÜŸÅ ÿßÿ∞ŸÜ Ÿàÿ≠ŸÜÿ¨ÿ±ÿ©",
    "ÿ¨ÿ±ÿßÿ≠ÿ© ÿ™ÿ¨ŸÖŸäŸÑ",
    "ÿßŸÖÿ±ÿßÿ∂ ÿßŸÑÿØŸÖ"
]

# Original model names from your code
model_names = [
    "AraBert",
    "BioBert (2)",  # Removing the "(2)" here to match the expected column names in the DataFrame
    "distilBert",
    "multiBert",
    "xlmRoBERTaa"  # Fixed typo in "xlmRoBERTaa"
]

# Function to fix Arabic text rendering
def fix_arabic_text(text):
    """
    Reshape Arabic text for proper display in matplotlib
    """
    reshaped_text = arabic_reshaper.reshape(text)
    bidi_text = get_display(reshaped_text)
    return bidi_text

# Define all the visualization functions
def generate_confusion_matrices(results_df, fixed_model_names):
    """Generate confusion matrices for each model and the ensemble with Arabic text support"""
    # Fix Arabic category names
    arabic_categories = [fix_arabic_text(cat) for cat in valid_categories]
    
    plt.figure(figsize=(20, 16))
    
    # Create subplots for each model + ensemble
    models_to_plot = fixed_model_names + ["Ensemble"]
    num_models = len(models_to_plot)
    rows = (num_models + 1) // 2
    
    for i, model_name in enumerate(models_to_plot):
        plt.subplot(rows, 2, i+1)
        
        if model_name == "Ensemble":
            y_true = results_df["True_Category"]
            y_pred = results_df["Ensemble_Prediction"]
        else:
            y_true = results_df["True_Category"]
            original_name = model_names[fixed_model_names.index(model_name)]
            y_pred = results_df[f"{original_name}_Prediction"]
            
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=valid_categories)
        
        # Plot confusion matrix
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=arabic_categories, yticklabels=arabic_categories)
        plt.title(f"{model_name} Confusion Matrix")
        plt.xlabel("Predicted Category")
        plt.ylabel("True Category")
        plt.xticks(rotation=45, ha="right")
        plt.yticks(rotation=45)
        plt.tight_layout()
    
    plt.savefig("visualizations/confusion_matrices.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # Also save individual high-res confusion matrices
    for model_name in models_to_plot:
        plt.figure(figsize=(12, 10))
        
        if model_name == "Ensemble":
            y_true = results_df["True_Category"]
            y_pred = results_df["Ensemble_Prediction"]
        else:
            y_true = results_df["True_Category"]
            original_name = model_names[fixed_model_names.index(model_name)]
            y_pred = results_df[f"{original_name}_Prediction"]
            
        # Calculate confusion matrix
        cm = confusion_matrix(y_true, y_pred, labels=valid_categories)
        
        # Plot confusion matrix
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=arabic_categories, yticklabels=arabic_categories)
        plt.title(f"{model_name} Confusion Matrix")
        plt.xlabel("Predicted Category")
        plt.ylabel("True Category")
        plt.xticks(rotation=45, ha="right")
        plt.yticks(rotation=45)
        plt.tight_layout()
        
        plt.savefig(f"visualizations/confusion_matrix_{model_name}.png", dpi=300, bbox_inches="tight")
        plt.close()

def generate_accuracy_comparison(results_df, fixed_model_names):
    """Generate bar chart comparing accuracy of all models"""
    # Calculate accuracies
    accuracies = []
    model_labels = []
    
    # Add individual model accuracies
    for i, model_name in enumerate(fixed_model_names):
        original_name = model_names[i]
        accuracy = (results_df["True_Category"] == results_df[f"{original_name}_Prediction"]).mean()
        accuracies.append(accuracy)
        model_labels.append(model_name)
    
    # Add ensemble accuracy
    ensemble_accuracy = (results_df["True_Category"] == results_df["Ensemble_Prediction"]).mean()
    accuracies.append(ensemble_accuracy)
    model_labels.append("Ensemble")
    
    # Create bar chart
    plt.figure(figsize=(12, 8))
    bars = plt.bar(model_labels, accuracies, color=['skyblue']*len(fixed_model_names) + ['darkblue'])
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold')
    
    plt.title("Model Accuracy Comparison", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Model", fontsize=14)
    plt.ylim(0, max(accuracies) * 1.15)  # Add some space above bars
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/accuracy_comparison.png", dpi=300, bbox_inches="tight")
    plt.close()

def generate_category_performance(results_df, fixed_model_names):
    """Generate per-category performance chart for all models"""
    # Fix Arabic category names
    arabic_categories = {cat: fix_arabic_text(cat) for cat in valid_categories}
    
    # Calculate per-category accuracies for each model
    category_performance = {}
    
    # Initialize category performance dictionary
    for category in valid_categories:
        category_performance[arabic_categories[category]] = {}
    
    # Calculate accuracies by category for each model
    all_models = fixed_model_names + ["Ensemble"]
    for i, model_name in enumerate(all_models):
        for category in valid_categories:
            # Filter for this category
            category_mask = results_df["True_Category"] == category
            
            if model_name == "Ensemble":
                correct_predictions = results_df.loc[category_mask, "Ensemble_Prediction"] == category
            else:
                original_name = model_names[fixed_model_names.index(model_name)] if model_name in fixed_model_names else model_name
                correct_predictions = results_df.loc[category_mask, f"{original_name}_Prediction"] == category
            
            # Calculate accuracy
            if sum(category_mask) > 0:
                accuracy = correct_predictions.sum() / sum(category_mask)
            else:
                accuracy = 0
                
            category_performance[arabic_categories[category]][model_name] = accuracy
    
    # Convert to DataFrame for easier plotting
    category_df = pd.DataFrame(category_performance).T
    
    # Plot
    plt.figure(figsize=(16, 10))
    category_df.plot(kind='bar', figsize=(16, 10))
    plt.title("Model Performance by Category", fontsize=16)
    plt.xlabel("Category", fontsize=14)
    plt.ylabel("Accuracy", fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title="Model", loc='upper left', bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/category_performance.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # Also save the data
    category_df.to_csv("visualizations/category_performance.csv")

def generate_model_agreement_heatmap(results_df, fixed_model_names):
    """Generate heatmap showing agreement between models"""
    # Create a dictionary to store agreement percentages
    agreement_matrix = {}
    
    # Include all individual models and ensemble
    all_models = fixed_model_names + ["Ensemble"]
    
    # Initialize the matrix
    for model1 in all_models:
        agreement_matrix[model1] = {}
        for model2 in all_models:
            agreement_matrix[model1][model2] = 0.0
    
    # Calculate agreement percentages
    for model1 in all_models:
        for model2 in all_models:
            # Get prediction columns
            if model1 == "Ensemble":
                pred1 = results_df["Ensemble_Prediction"]
            else:
                original_name1 = model_names[fixed_model_names.index(model1)]
                pred1 = results_df[f"{original_name1}_Prediction"]
                
            if model2 == "Ensemble":
                pred2 = results_df["Ensemble_Prediction"]
            else:
                original_name2 = model_names[fixed_model_names.index(model2)]
                pred2 = results_df[f"{original_name2}_Prediction"]
            
            # Calculate agreement percentage
            agreement = np.mean(pred1 == pred2)
            agreement_matrix[model1][model2] = agreement
    
    # Convert to DataFrame
    agreement_df = pd.DataFrame(agreement_matrix)
    
    # Plot heatmap
    plt.figure(figsize=(12, 10))
    sns.heatmap(agreement_df, annot=True, fmt=".3f", cmap="YlGnBu", vmin=0, vmax=1)
    plt.title("Model Agreement Heatmap", fontsize=16)
    plt.tight_layout()
    
    plt.savefig("visualizations/model_agreement.png", dpi=300, bbox_inches="tight")
    plt.close()

def generate_ensemble_improvement_chart(results_df, fixed_model_names):
    """Generate chart showing where ensemble improves over individual models"""
    # Calculate where ensemble is correct but models are wrong
    improvements = {}
    
    # For each individual model
    for i, model_name in enumerate(fixed_model_names):
        original_name = model_names[i]
        # Cases where ensemble is correct
        ensemble_correct = results_df["True_Category"] == results_df["Ensemble_Prediction"]
        
        # Cases where this model is incorrect
        model_incorrect = results_df["True_Category"] != results_df[f"{original_name}_Prediction"]
        
        # Cases where ensemble improves over this model
        improvement = ensemble_correct & model_incorrect
        improvements[model_name] = improvement.sum()
    
    # Plot improvement chart
    plt.figure(figsize=(12, 6))
    plt.bar(improvements.keys(), improvements.values(), color='green')
    plt.title("Number of Cases Where Ensemble Corrects Individual Model Errors", fontsize=16)
    plt.ylabel("Number of Improvements", fontsize=14)
    plt.xlabel("Base Model", fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45, ha="right")
    
    # Add counts on top of bars
    for i, (model, count) in enumerate(improvements.items()):
        plt.text(i, count + 5, str(count), ha='center', fontweight='bold')
    
    plt.tight_layout()
    
    plt.savefig("visualizations/ensemble_improvements.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # Also calculate the percentage of total samples where ensemble corrects models
    total_samples = len(results_df)
    improvement_percentages = {model: count/total_samples*100 for model, count in improvements.items()}
    
    # Save these stats
    improvement_df = pd.DataFrame({
        'Model': list(improvements.keys()),
        'Improvement_Count': list(improvements.values()),
        'Improvement_Percentage': [improvement_percentages[model] for model in improvements.keys()]
    })
    improvement_df.to_csv("visualizations/ensemble_improvement_stats.csv", index=False)

def generate_ensemble_combinations_analysis(results_df):
    """
    Generate analysis of different ensemble combinations (2, 3, or 4 models)
    using the existing predictions from individual models
    """
    print("\nüîπ Analyzing different ensemble combinations...")
    
    # Fix model names (remove brackets and numbers from BioBert)
    original_model_cols = [f"{name}_Prediction" for name in model_names]
    fixed_model_names = [name.replace("AraBert", "AraBERT") for name in model_names]
    fixed_model_names = [name.replace("distilBert", "Multilingual DistilBERT") for name in fixed_model_names]
    fixed_model_names = [name.replace("BioBert (2)", "BioBERT") for name in fixed_model_names]
    fixed_model_names = [name.replace("multiBert", "mBERT") for name in fixed_model_names]
    fixed_model_names = [name.replace("xlmRoBERTaa", "XLM-RoBERTa") for name in fixed_model_names]
    
    # Get the ground truth
    y_true = results_df["True_Category"]
    
    # Function to perform soft voting ensemble on a combination of models
    def soft_voting_ensemble(model_cols):
        # Get one-hot encodings for each model's predictions
        encodings = []
        for col in model_cols:
            # Get predictions
            preds = results_df[col].values
            
            # Convert to one-hot encoding
            one_hot = np.zeros((len(preds), len(valid_categories)))
            for i, pred in enumerate(preds):
                category_idx = valid_categories.index(pred)
                one_hot[i, category_idx] = 1
            
            encodings.append(one_hot)
        
        # Average the one-hot encodings
        avg_encoding = np.mean(encodings, axis=0)
        
        # Get the most probable category for each sample
        ensemble_preds = [valid_categories[np.argmax(avg_encoding[i])] for i in range(len(avg_encoding))]
        
        # Calculate accuracy
        accuracy = np.mean(np.array(ensemble_preds) == y_true.values)
        
        return ensemble_preds, accuracy
    
    # =============================================================================
    # Analyze pairs of models (2-model ensembles)
    # =============================================================================
    print("Analyzing 2-model combinations...")
    
    # Generate all combinations of 2 models
    pairs = []
    pair_accuracies = []
    pair_names = []
    
    from itertools import combinations
    for combo in combinations(range(len(original_model_cols)), 2):
        model_cols = [original_model_cols[i] for i in combo]
        model_names_combo = [fixed_model_names[i] for i in combo]
        
        # Get predictions and accuracy for this combination
        ensemble_preds, accuracy = soft_voting_ensemble(model_cols)
        
        # Store results
        pairs.append((model_cols, ensemble_preds))
        pair_accuracies.append(accuracy)
        pair_names.append(" + ".join(model_names_combo))
    
    # Create DataFrame with results
    pair_results = pd.DataFrame({
        'Model_Combination': pair_names,
        'Accuracy': pair_accuracies
    })
    
    # Sort by accuracy (descending)
    pair_results = pair_results.sort_values('Accuracy', ascending=False).reset_index(drop=True)
    
    # Save results
    pair_results.to_csv("visualizations/pair_ensemble_results.csv", index=False)
    
    # Plot top pairs
    plt.figure(figsize=(14, 8))
    bars = plt.bar(pair_results['Model_Combination'], pair_results['Accuracy'], color='lightblue')
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
    
    plt.title("Accuracy of 2-Model Ensemble Combinations", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Model Combination", fontsize=14)
    plt.ylim(pair_results['Accuracy'].min() * 0.98, pair_results['Accuracy'].max() * 1.02)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=90)
    plt.tight_layout()
    
    plt.savefig("visualizations/pair_ensemble_accuracy.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # =============================================================================
    # Analyze triplets of models (3-model ensembles)
    # =============================================================================
    print("Analyzing 3-model combinations...")
    
    # Generate all combinations of 3 models
    triplets = []
    triplet_accuracies = []
    triplet_names = []
    
    for combo in combinations(range(len(original_model_cols)), 3):
        model_cols = [original_model_cols[i] for i in combo]
        model_names_combo = [fixed_model_names[i] for i in combo]
        
        # Get predictions and accuracy for this combination
        ensemble_preds, accuracy = soft_voting_ensemble(model_cols)
        
        # Store results
        triplets.append((model_cols, ensemble_preds))
        triplet_accuracies.append(accuracy)
        triplet_names.append(" + ".join(model_names_combo))
    
    # Create DataFrame with results
    triplet_results = pd.DataFrame({
        'Model_Combination': triplet_names,
        'Accuracy': triplet_accuracies
    })
    
    # Sort by accuracy (descending)
    triplet_results = triplet_results.sort_values('Accuracy', ascending=False).reset_index(drop=True)
    
    # Save results
    triplet_results.to_csv("visualizations/triplet_ensemble_results.csv", index=False)
    
    # Plot top triplets
    plt.figure(figsize=(14, 8))
    bars = plt.bar(triplet_results['Model_Combination'], triplet_results['Accuracy'], color='lightgreen')
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
    
    plt.title("Accuracy of 3-Model Ensemble Combinations", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Model Combination", fontsize=14)
    plt.ylim(triplet_results['Accuracy'].min() * 0.98, triplet_results['Accuracy'].max() * 1.02)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=90)
    plt.tight_layout()
    
    plt.savefig("visualizations/triplet_ensemble_accuracy.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # =============================================================================
    # Analyze quartets of models (4-model ensembles) if we have at least 4 models
    # =============================================================================
    if len(original_model_cols) >= 4:
        print("Analyzing 4-model combinations...")
        
        # Generate all combinations of 4 models
        quartets = []
        quartet_accuracies = []
        quartet_names = []
        
        for combo in combinations(range(len(original_model_cols)), 4):
            model_cols = [original_model_cols[i] for i in combo]
            model_names_combo = [fixed_model_names[i] for i in combo]
            
            # Get predictions and accuracy for this combination
            ensemble_preds, accuracy = soft_voting_ensemble(model_cols)
            
            # Store results
            quartets.append((model_cols, ensemble_preds))
            quartet_accuracies.append(accuracy)
            quartet_names.append(" + ".join(model_names_combo))
        
        # Create DataFrame with results
        quartet_results = pd.DataFrame({
            'Model_Combination': quartet_names,
            'Accuracy': quartet_accuracies
        })
        
        # Sort by accuracy (descending)
        quartet_results = quartet_results.sort_values('Accuracy', ascending=False).reset_index(drop=True)
        
        # Save results
        quartet_results.to_csv("visualizations/quartet_ensemble_results.csv", index=False)
        
        # Plot quartets
        plt.figure(figsize=(14, 8))
        bars = plt.bar(quartet_results['Model_Combination'], quartet_results['Accuracy'], color='lightsalmon')
        
        # Add accuracy values on top of bars
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                    f'{height:.4f}', ha='center', va='bottom', fontweight='bold', fontsize=9)
        
        plt.title("Accuracy of 4-Model Ensemble Combinations", fontsize=16)
        plt.ylabel("Accuracy", fontsize=14)
        plt.xlabel("Model Combination", fontsize=14)
        plt.ylim(quartet_results['Accuracy'].min() * 0.98, quartet_results['Accuracy'].max() * 1.02)
        plt.grid(axis='y', linestyle='--', alpha=0.7)
        plt.xticks(rotation=90)
        plt.tight_layout()
        
        plt.savefig("visualizations/quartet_ensemble_accuracy.png", dpi=300, bbox_inches="tight")
        plt.close()
    
    # =============================================================================
    # Create a summary comparison chart of best combinations from each size
    # =============================================================================
    print("Creating ensemble summary comparison...")
    
    # Collect best models from each group
    best_models = []
    best_accuracies = []
    group_names = []
    
    # Add individual models
    individual_accuracies = []
    for i, model_name in enumerate(fixed_model_names):
        original_name = model_names[i]
        accuracy = (results_df["True_Category"] == results_df[f"{original_name}_Prediction"]).mean()
        individual_accuracies.append((model_name, accuracy))
    
    best_individual = max(individual_accuracies, key=lambda x: x[1])
    best_models.append(best_individual[0])
    best_accuracies.append(best_individual[1])
    group_names.append("Best Single Model")
    
    # Add best pair
    if len(pair_results) > 0:
        best_models.append(pair_results.iloc[0]['Model_Combination'])
        best_accuracies.append(pair_results.iloc[0]['Accuracy'])
        group_names.append("Best 2-Model Ensemble")
    
    # Add best triplet
    if len(triplet_results) > 0:
        best_models.append(triplet_results.iloc[0]['Model_Combination'])
        best_accuracies.append(triplet_results.iloc[0]['Accuracy'])
        group_names.append("Best 3-Model Ensemble")
    
    # Add best quartet
    if len(original_model_cols) >= 4 and 'quartet_results' in locals():
        best_models.append(quartet_results.iloc[0]['Model_Combination'])
        best_accuracies.append(quartet_results.iloc[0]['Accuracy'])
        group_names.append("Best 4-Model Ensemble")
    
    # Add full ensemble
    full_ensemble_accuracy = (results_df["True_Category"] == results_df["Ensemble_Prediction"]).mean()
    best_models.append("All Models")
    best_accuracies.append(full_ensemble_accuracy)
    group_names.append(f"Full Ensemble ({len(original_model_cols)} Models)")
    
    # Create summary DataFrame
    summary_df = pd.DataFrame({
        'Ensemble_Type': group_names,
        'Models': best_models,
        'Accuracy': best_accuracies
    })
    
    # Save summary
    summary_df.to_csv("visualizations/ensemble_summary.csv", index=False)
    
    # Plot summary comparison
    plt.figure(figsize=(12, 8))
    
    # Use a color gradient for the bars
    colors = plt.cm.viridis(np.linspace(0.1, 0.9, len(group_names)))
    
    bars = plt.bar(group_names, best_accuracies, color=colors)
    
    # Add accuracy values on top of bars
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height + 0.005,
                f'{height:.4f}', ha='center', va='bottom', fontweight='bold')
    
    plt.title("Accuracy Comparison of Best Ensemble Combinations", fontsize=16)
    plt.ylabel("Accuracy", fontsize=14)
    plt.xlabel("Ensemble Type", fontsize=14)
    plt.ylim(min(best_accuracies) * 0.98, max(best_accuracies) * 1.02)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/ensemble_summary_comparison.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    # =============================================================================
    # Create a detailed comparison chart showing performance by category for the best models
    # =============================================================================
    print("Creating category-wise performance comparison for best ensembles...")
    
    # Fix Arabic category names
    arabic_categories = {cat: fix_arabic_text(cat) for cat in valid_categories}
    
    # Calculate per-category accuracies for each best ensemble
    category_performance = {}
    
    # Initialize category performance dictionary
    for category in valid_categories:
        category_performance[arabic_categories[category]] = {}
    
    # Function to get predictions for a specific ensemble combination
    def get_ensemble_predictions(model_combination_str, original_model_cols):
        if model_combination_str == "All Models":
            return results_df["Ensemble_Prediction"].values
        
        # Parse model combination string to get individual models
        if " + " in model_combination_str:
            model_names_combo = model_combination_str.split(" + ")
            model_cols = []
            
            for model_name in model_names_combo:
                # Find the corresponding original column
                for i, original_name in enumerate(fixed_model_names):
                    if original_name == model_name:
                        model_cols.append(original_model_cols[i])
                        break
            
            # Get predictions using soft voting
            ensemble_preds, _ = soft_voting_ensemble(model_cols)
            return ensemble_preds
        else:
            # It's a single model
            idx = fixed_model_names.index(model_combination_str)
            return results_df[original_model_cols[idx]].values
    
    # Calculate accuracies by category for each best ensemble
    for ensemble_type, model_combo in zip(group_names, best_models):
        ensemble_predictions = get_ensemble_predictions(model_combo, original_model_cols)
        
        for category in valid_categories:
            # Filter for this category
            category_mask = results_df["True_Category"] == category
            category_indices = np.where(category_mask)[0]
            
            if len(category_indices) > 0:
                # Extract predictions for this category
                category_preds = [ensemble_predictions[i] for i in category_indices]
                
                # Calculate accuracy
                correct = sum(pred == category for pred in category_preds)
                accuracy = correct / len(category_indices)
            else:
                accuracy = 0
                
            category_performance[arabic_categories[category]][ensemble_type] = accuracy
    
    # Convert to DataFrame for easier plotting
    category_comparison_df = pd.DataFrame(category_performance).T
    
    # Save the data
    category_comparison_df.to_csv("visualizations/best_ensembles_category_performance.csv")
    
    # Plot the comparison
    plt.figure(figsize=(18, 10))
    category_comparison_df.plot(kind='bar', figsize=(18, 10))
    plt.title("Category Performance of Best Ensemble Combinations", fontsize=16)
    plt.xlabel("Category", fontsize=14)
    plt.ylabel("Accuracy", fontsize=14)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.legend(title="Ensemble Type", loc='upper left', bbox_to_anchor=(1, 1))
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    
    plt.savefig("visualizations/best_ensembles_category_performance.png", dpi=300, bbox_inches="tight")
    plt.close()
    
    print("\n‚úÖ Ensemble combination analysis completed")

def run_ensemble_analysis():
    """
    Main function to run all analyses on the saved ensemble predictions
    """
    print("üîπ Starting ensemble analysis on saved predictions...")
    
    # Load the saved predictions
    try:
        results_df = pd.read_csv("/kaggle/input/ensembleee/ensemble_individual_prediction.csv")
        print(f"‚úÖ Loaded predictions data with {len(results_df)} samples.")
    except FileNotFoundError:
        print("‚ùå Error: The file 'ensemble_individual_prediction.csv' was not found.")
        print("Please make sure you've run the ensemble prediction code first.")
        return
    
    # Check if the data has the expected columns
    expected_cols = ["True_Category", "Ensemble_Prediction"]
    model_cols = [f"{name}_Prediction" for name in model_names]
    
    missing_cols = [col for col in expected_cols + model_cols if col not in results_df.columns]
    if missing_cols:
        print(f"‚ùå Error: The following expected columns are missing: {missing_cols}")
        return
    
    # Fix model names for display purposes
    fixed_model_names = [name.replace("AraBert", "AraBERT") for name in model_names]
    fixed_model_names = [name.replace("distilBert", "Multilingual DistilBERT") for name in fixed_model_names]
    fixed_model_names = [name.replace("BioBert (2)", "BioBERT") for name in fixed_model_names]
    fixed_model_names = [name.replace("multiBert", "mBERT") for name in fixed_model_names]
    fixed_model_names = [name.replace("xlmRoBERTaa", "XLM-RoBERTa") for name in fixed_model_names]
    
    print("\nüîπ Generating confusion matrices...")
    generate_confusion_matrices(results_df, fixed_model_names)
    
    print("\nüîπ Generating accuracy comparison...")
    generate_accuracy_comparison(results_df, fixed_model_names)
    
    print("\nüîπ Generating category performance analysis...")
    generate_category_performance(results_df, fixed_model_names)
    
    print("\nüîπ Generating model agreement heatmap...")
    generate_model_agreement_heatmap(results_df, fixed_model_names)
    
    print("\nüîπ Analyzing ensemble improvements...")
    generate_ensemble_improvement_chart(results_df, fixed_model_names)
    
    print("\nüîπ Analyzing different ensemble combinations...")
    generate_ensemble_combinations_analysis(results_df)
    
    print("\n‚úÖ All analyses completed successfully!")
    print("üìä Visualizations saved in the 'visualizations' directory.")
    
    # Return the results DataFrame in case it's needed for further analysis
    return results_df

# Execute the analysis if this script is run directly
if __name__ == "__main__":
    run_ensemble_analysis()
    

üîπ Starting ensemble analysis on saved predictions...
‚úÖ Loaded predictions data with 39789 samples.

üîπ Generating confusion matrices...

üîπ Generating accuracy comparison...

üîπ Generating category performance analysis...

üîπ Generating model agreement heatmap...

üîπ Analyzing ensemble improvements...

üîπ Analyzing different ensemble combinations...

üîπ Analyzing different ensemble combinations...
Analyzing 2-model combinations...
Analyzing 3-model combinations...
Analyzing 4-model combinations...
Creating ensemble summary comparison...
Creating category-wise performance comparison for best ensembles...

‚úÖ Ensemble combination analysis completed

‚úÖ All analyses completed successfully!
üìä Visualizations saved in the 'visualizations' directory.


<Figure size 1600x1000 with 0 Axes>

<Figure size 1800x1000 with 0 Axes>

In [13]:
import shutil
    print("\nüîπ Compressing visualizations directory...")
    try:
        shutil.make_archive("visualizations_archive", "zip", "visualizations")
        print("‚úÖ Visualizations compressed to 'visualizations_archive.zip'")
    except Exception as e:
        print(f"‚ùå Error compressing visualizations: {str(e)}")

IndentationError: unexpected indent (<ipython-input-13-1449c162ecd2>, line 2)

In [15]:
# Add this at the appropriate place in your run_ensemble_analysis function
print("\n‚úÖ All analyses completed successfully!")
print("üìä Visualizations saved in the 'visualizations' directory.")

# Compress visualizations directory
import shutil
print("\nüîπ Compressing visualizations directory...")
try:
    shutil.make_archive("visualizations_archive", "zip", "visualizations")
    print("‚úÖ Visualizations compressed to 'visualizations_archive.zip'")
except Exception as e:
    print(f"‚ùå Error compressing visualizations: {str(e)}")

# Return the results DataFrame in case it's needed for further analysis



‚úÖ All analyses completed successfully!
üìä Visualizations saved in the 'visualizations' directory.

üîπ Compressing visualizations directory...
‚úÖ Visualizations compressed to 'visualizations_archive.zip'
