In [1]:
!pip install ragas nltk datasets 

Collecting ragas
  Downloading ragas-0.3.9-py3-none-any.whl.metadata (22 kB)
Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting tiktoken (from ragas)
  Downloading tiktoken-0.12.0-cp310-cp310-win_amd64.whl.metadata (6.9 kB)
Collecting appdirs (from ragas)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting diskcache>=5.6.3 (from ragas)
  Downloading diskcache-5.6.3-py3-none-any.whl.metadata (20 kB)
Collecting instructor (from ragas)
  Downloading instructor-1.13.0-py3-none-any.whl.metadata (11 kB)
Collecting gitpython (from ragas)
  Downloading gitpython-3.1.45-py3-none-any.whl.metadata (13 kB)
Collecting scikit-network (from ragas)
  Downloading scikit_network-0.33.4-cp310-cp310-win_amd64.whl.metadata (4.6 kB)
Collecting langchain-community (from ragas)
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting lang

In [7]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting absl-py (from rouge_score)
  Downloading absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Downloading absl_py-2.3.1-py3-none-any.whl (135 kB)
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=25027 sha256=90694e861ed57adf710955b8356417af74a161b61bbb43cea31927713a9d6872
  Stored in directory: c:\users\rohan\appdata\local\pip\cache\wheels\5f\dd\89\461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: absl-py, rouge_score

   -------------------- ------------------- 1/2 [rouge_score]
   ---------------------------------------- 2/2 [rouge_score]


  DEPRECATION: Building 'rouge_score' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'rouge_score'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [3]:
import pandas as pd
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from ragas import evaluate
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    answer_correctness,
    answer_similarity
)
from datasets import Dataset
from sentence_transformers import SentenceTransformer
from clients.openai_client import OpenAIClientManager
from nltk.translate.meteor_score import meteor_score

from rouge_score import rouge_scorer
# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)

from config import OpenAIConfig
# Initialize LLM for RAGAS (you can use OpenAI or other providers)
llm = OpenAIClientManager(OpenAIConfig).initialize()
embeddings = SentenceTransformer("./bge-small")



âœ“ OpenAI client created successfully
  Base URL: https://api.ai.it.ufl.edu
  Client type: <class 'openai.OpenAI'>
  Has chat attribute: True
  Has completions attribute: True


In [4]:
def calculate_bleu_score(reference, candidate):
    """Calculate BLEU score between reference and candidate answers"""
    try:
        # Handle NaN or None values
        if pd.isna(reference) or pd.isna(candidate):
            return 0.0
        
        reference_tokens = str(reference).lower().split()
        candidate_tokens = str(candidate).lower().split()
        
        # Use smoothing function to avoid zero scores
        smoothie = SmoothingFunction().method4
        score = sentence_bleu([reference_tokens], candidate_tokens, 
                             smoothing_function=smoothie)
        return round(score, 4)
    except Exception as e:
        print(f"Error calculating BLEU score: {e}")
        return 0.0

In [5]:
def calculate_rouge_scores(reference, candidate):
    """Calculate ROUGE-1, ROUGE-2, and ROUGE-L scores"""
    try:
        if pd.isna(reference) or pd.isna(candidate):
            return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}
        
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
        scores = scorer.score(str(reference), str(candidate))
        
        return {
            'rouge1': round(scores['rouge1'].fmeasure, 4),
            'rouge2': round(scores['rouge2'].fmeasure, 4),
            'rougeL': round(scores['rougeL'].fmeasure, 4)
        }
    except Exception as e:
        return {'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0}

In [6]:
def calculate_meteor_score(reference, candidate):
    """Calculate METEOR score"""
    try:
        if pd.isna(reference) or pd.isna(candidate):
            return 0.0
        
        reference_tokens = str(reference).lower().split()
        candidate_tokens = str(candidate).lower().split()
        
        score = meteor_score([reference_tokens], candidate_tokens)
        return round(score, 4)
    except Exception as e:
        return 0.0

In [11]:
def evaluate_answers(excel_file, sheet_name='Sheet1' 
                    ):
    """
    Evaluate answers from Excel sheet with BLEU, ROUGE, METEOR, and Perplexity
    
    Parameters:
    -----------
    excel_file : str
        Path to Excel file
    sheet_name : str
        Name of the sheet to read
    calculate_ppl : bool
        Whether to calculate perplexity (slower)
    ppl_model_name : str
        Model to use for perplexity calculation (e.g., "gpt2", "gpt2-medium")
    use_gpu : bool
        Whether to use GPU for perplexity calculation
    
    Expected columns: Question, Human_Answer, gpt_oss, llama_answer, gemma_answer
    """
    
    # Read Excel file
    df = pd.read_excel(excel_file, sheet_name=sheet_name)
    
    print(f" Loaded {len(df)} rows from Excel")
    print(f" Columns: {df.columns.tolist()}")
    
    # Standardize column names
    df.columns = df.columns.str.strip()
    column_mapping = {}
    for col in df.columns:
        col_lower = col.lower()
        if 'question' in col_lower:
            column_mapping[col] = 'Question'
        elif 'human' in col_lower and 'answer' in col_lower:
            column_mapping[col] = 'Human_Answer'
        elif 'gpt' in col_lower or 'oss' in col_lower:
            column_mapping[col] = 'GPT_Answer'
        elif 'llama' in col_lower:
            column_mapping[col] = 'Llama_Answer'
        elif 'gemma' in col_lower:
            column_mapping[col] = 'Gemma_Answer'
    
    df.rename(columns=column_mapping, inplace=True)
    print(f" Standardized columns: {df.columns.tolist()}\n")
    
    # Identify model columns
    model_columns = [col for col in df.columns if col.endswith('_Answer') and col != 'Human_Answer']
    models = [col.replace('_Answer', '') for col in model_columns]
    
    print(f" Found models: {models}\n")
    
    
    
    # ==================== CALCULATE METRICS ====================
    
    print("="*60)
    print("CALCULATING METRICS")
    print("="*60)
    
    for model in models:
        model_col = f'{model}_Answer'
        
        print(f"\n Evaluating {model}...")
        
        # BLEU Score
        print("   â†’ BLEU...")
        df[f'BLEU_{model}'] = df.apply(
            lambda row: calculate_bleu_score(row['Human_Answer'], row[model_col]), 
            axis=1
        )
        
        # ROUGE Scores
        print("   â†’ ROUGE...")
        rouge_scores = df.apply(
            lambda row: calculate_rouge_scores(row['Human_Answer'], row[model_col]), 
            axis=1
        )
        df[f'ROUGE1_{model}'] = [score['rouge1'] for score in rouge_scores]
        df[f'ROUGE2_{model}'] = [score['rouge2'] for score in rouge_scores]
        df[f'ROUGEL_{model}'] = [score['rougeL'] for score in rouge_scores]
        
        # METEOR Score
        print("   â†’ METEOR...")
        df[f'METEOR_{model}'] = df.apply(
            lambda row: calculate_meteor_score(row['Human_Answer'], row[model_col]), 
            axis=1
        )
        

        
        print(f"   âœ… {model} evaluation completed!")
    
    # ==================== AGGREGATE SCORES ====================
    
    print(f"\n{'='*60}")
    print("CALCULATING AGGREGATE SCORES")
    print("="*60)
    
    for model in models:
        # Average of BLEU, ROUGE-L, and METEOR (higher is better)
     
        score_columns = [
            f'BLEU_{model}',
            f'ROUGEL_{model}',
            f'METEOR_{model}'
        ]
        
        df[f'Aggregate_Score_{model}'] = df[score_columns].mean(axis=1).round(4)
        print(f"âœ… Aggregate score calculated for {model}")
    
    # ==================== SUMMARY STATISTICS ====================
    
    print(f"\n{'='*60}")
    print("GENERATING SUMMARY STATISTICS")
    print("="*60)
    
    summary_data = {
        'Model': [],
        'Avg_BLEU': [],
        'Avg_ROUGE-1': [],
        'Avg_ROUGE-2': [],
        'Avg_ROUGE-L': [],
        'Avg_METEOR': [],
        'Avg_Aggregate_Score': []
    }
    
    for model in models:
        summary_data['Model'].append(model)
        summary_data['Avg_BLEU'].append(round(df[f'BLEU_{model}'].mean(), 4))
        summary_data['Avg_ROUGE-1'].append(round(df[f'ROUGE1_{model}'].mean(), 4))
        summary_data['Avg_ROUGE-2'].append(round(df[f'ROUGE2_{model}'].mean(), 4))
        summary_data['Avg_ROUGE-L'].append(round(df[f'ROUGEL_{model}'].mean(), 4))
        summary_data['Avg_METEOR'].append(round(df[f'METEOR_{model}'].mean(), 4))
        
        
        summary_data['Avg_Aggregate_Score'].append(round(df[f'Aggregate_Score_{model}'].mean(), 4))
    
    summary_df = pd.DataFrame(summary_data)
    
    # ==================== SAVE RESULTS ====================
    
    output_file = excel_file.replace('.xlsx', '_evaluated.xlsx')
    
    print(f"\nðŸ’¾ Saving results to: {output_file}")
    
    with pd.ExcelWriter(output_file, engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name='Detailed_Results', index=False)
        summary_df.to_excel(writer, sheet_name='Summary', index=False)
    
    print(f"âœ… Results saved successfully!\n")
    
    # ==================== DISPLAY RESULTS ====================
    
    print("="*60)
    print("SUMMARY STATISTICS")
    print("="*60)
    print(summary_df.to_string(index=False))
    
    # Display best performing model
    best_model_idx = summary_df['Avg_Aggregate_Score'].idxmax()
    best_model = summary_df.loc[best_model_idx, 'Model']
    best_score = summary_df.loc[best_model_idx, 'Avg_Aggregate_Score']
    print(f"\n Best Performing Model: {best_model} (Aggregate Score: {best_score})")
    
    return df, summary_df

In [12]:
excel_file_path='test_Q_A_with_oss_answers.xlsx'

detailed_results, summary = evaluate_answers(
    excel_file_path,
    sheet_name='Sheet1',

)

 Loaded 51 rows from Excel
 Columns: ['Question', 'Human_Answer', 'Gpt_Answer', 'Gemma_Answer', 'Llama_Answer']
 Standardized columns: ['Question', 'Human_Answer', 'GPT_Answer', 'Gemma_Answer', 'Llama_Answer']

 Found models: ['GPT', 'Gemma', 'Llama']

CALCULATING METRICS

 Evaluating GPT...
   â†’ BLEU...
   â†’ ROUGE...
   â†’ METEOR...
   âœ… GPT evaluation completed!

 Evaluating Gemma...
   â†’ BLEU...
   â†’ ROUGE...
   â†’ METEOR...
   âœ… Gemma evaluation completed!

 Evaluating Llama...
   â†’ BLEU...
   â†’ ROUGE...
   â†’ METEOR...
   âœ… Llama evaluation completed!

CALCULATING AGGREGATE SCORES
âœ… Aggregate score calculated for GPT
âœ… Aggregate score calculated for Gemma
âœ… Aggregate score calculated for Llama

GENERATING SUMMARY STATISTICS

ðŸ’¾ Saving results to: test_Q_A_with_oss_answers_evaluated.xlsx
âœ… Results saved successfully!

SUMMARY STATISTICS
Model  Avg_BLEU  Avg_ROUGE-1  Avg_ROUGE-2  Avg_ROUGE-L  Avg_METEOR  Avg_Aggregate_Score
  GPT    0.0636       0.3407

In [None]:
import seaborn as sns 
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
def create_visualizations(summary_df, detailed_df, output_folder='plots'):
    """
    Create comprehensive visualizations for presentation
    
    Parameters:
    -----------
    summary_df : DataFrame
        Summary statistics dataframe
    detailed_df : DataFrame
        Detailed results dataframe
    output_folder : str
        Folder to save plots
    """
    import os
    os.makedirs(output_folder, exist_ok=True)
    
    models = summary_df['Model'].tolist()
    
    print("\n" + "="*60)
    print("CREATING VISUALIZATIONS")
    print("="*60)
    
    # ==================== PLOT 1: Overall Performance Comparison ====================
    print("\n Creating Plot 1: Overall Performance Comparison...")
    
    fig, ax = plt.subplots(figsize=(12, 6))
    x = np.arange(len(models))
    width = 0.15
    
    metrics = ['Avg_BLEU', 'Avg_ROUGE-L', 'Avg_METEOR', 'Avg_Aggregate_Score']
    colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A']
    
    for i, (metric, color) in enumerate(zip(metrics, colors)):
        values = summary_df[metric].values
        ax.bar(x + i*width, values, width, label=metric.replace('Avg_', ''), color=color, alpha=0.8)
    
    ax.set_xlabel('Models', fontsize=12, fontweight='bold')
    ax.set_ylabel('Score', fontsize=12, fontweight='bold')
    ax.set_title('Overall Performance Comparison Across Models', fontsize=14, fontweight='bold')
    ax.set_xticks(x + width * 1.5)
    ax.set_xticklabels(models)
    ax.legend(loc='upper left', framealpha=0.9)
    ax.set_ylim(0, 1.0)
    ax.grid(axis='y', alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(f'{output_folder}/01_overall_performance.png', dpi=300, bbox_inches='tight')
    print(f"    Saved: {output_folder}/01_overall_performance.png")
    plt.close()
    
    
    # ==================== PLOT 2: Radar Chart ====================
    print("\n Creating Plot 2: Radar Chart...")
    
    from math import pi
    
    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection='polar'))
    
    metrics = ['Avg_BLEU', 'Avg_ROUGE-1', 'Avg_ROUGE-2', 'Avg_ROUGE-L', 'Avg_METEOR']
    num_vars = len(metrics)
    angles = [n / float(num_vars) * 2 * pi for n in range(num_vars)]
    angles += angles[:1]
    
    colors_radar = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#96CEB4']
    
    for idx, model in enumerate(models):
        values = summary_df[summary_df['Model'] == model][metrics].values.flatten().tolist()
        values += values[:1]
        
        ax.plot(angles, values, 'o-', linewidth=2, label=model, color=colors_radar[idx % len(colors_radar)])
        ax.fill(angles, values, alpha=0.15, color=colors_radar[idx % len(colors_radar)])
    
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels([m.replace('Avg_', '').replace('-', '\n') for m in metrics], fontsize=10)
    ax.set_ylim(0, 1)
    ax.set_title('Multi-Metric Performance Comparison\n(Radar Chart)', 
                 fontsize=14, fontweight='bold', pad=20)
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
    ax.grid(True)
    
    plt.tight_layout()
    plt.savefig(f'{output_folder}/02_radar_chart.png', dpi=300, bbox_inches='tight')
    print(f"    Saved: {output_folder}/02_radar_chart.png")
    plt.close()
    
    
    # ==================== PLOT 3: Heatmap ====================
    print("\n Creating Plot 3: Heatmap...")
    
    fig, ax = plt.subplots(figsize=(10, 6))
    
    heatmap_data = summary_df[['Model', 'Avg_BLEU', 'Avg_ROUGE-1', 'Avg_ROUGE-2', 
                                'Avg_ROUGE-L', 'Avg_METEOR', 'Avg_Aggregate_Score']]
    heatmap_data = heatmap_data.set_index('Model')
    
    sns.heatmap(heatmap_data.T, annot=True, fmt='.3f', cmap='RdYlGn', 
                center=0.5, linewidths=1, cbar_kws={'label': 'Score'},
                vmin=0, vmax=1)
    
    plt.title('Performance Heatmap Across All Metrics', fontsize=14, fontweight='bold')
    plt.xlabel('Models', fontsize=12, fontweight='bold')
    plt.ylabel('Metrics', fontsize=12, fontweight='bold')
    plt.tight_layout()
    plt.savefig(f'{output_folder}/03_heatmap.png', dpi=300, bbox_inches='tight')
    print(f"    Saved: {output_folder}/03_heatmap.png")
    plt.close()
    
    
    # ==================== PLOT 4: Box Plot (Score Distribution) ====================
    print("\n Creating Plot 4: Score Distribution (Box Plot)...")
    
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    fig.suptitle('Score Distribution Across Models', fontsize=16, fontweight='bold')
    
    metrics_box = ['BLEU', 'ROUGEL', 'METEOR', 'Aggregate_Score']
    metric_names = ['BLEU Score', 'ROUGE-L Score', 'METEOR Score', 'Aggregate Score']
    
    for idx, (metric, name) in enumerate(zip(metrics_box, metric_names)):
        ax = axes[idx // 2, idx % 2]
        
        data_to_plot = []
        for model in models:
            col_name = f'{metric}_{model}'
            if col_name in detailed_df.columns:
                data_to_plot.append(detailed_df[col_name].values)
        
        bp = ax.boxplot(data_to_plot, labels=models, patch_artist=True,
                        notch=True, showmeans=True)
        
        # Color the boxes
        colors_box = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#96CEB4']
        for patch, color in zip(bp['boxes'], colors_box):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)
        
        ax.set_title(name, fontsize=12, fontweight='bold')
        ax.set_ylabel('Score', fontsize=10)
        ax.grid(axis='y', alpha=0.3)
        ax.set_ylim(0, 1)
    
    plt.tight_layout()
    plt.savefig(f'{output_folder}/04_boxplot_distribution.png', dpi=300, bbox_inches='tight')
    print(f"    Saved: {output_folder}/04_boxplot_distribution.png")
    plt.close()
    
    
    # ==================== PLOT 5: Perplexity Comparison ====================
    if 'Avg_Perplexity' in summary_df.columns and summary_df['Avg_Perplexity'].iloc[0] != 'N/A':
        print("\n Creating Plot 5: Perplexity Comparison...")
        
        fig, ax = plt.subplots(figsize=(10, 6))
        
        ppl_data = summary_df[summary_df['Avg_Perplexity'] != 'N/A'].copy()
        ppl_data['Avg_Perplexity'] = pd.to_numeric(ppl_data['Avg_Perplexity'])
        
        bars = ax.bar(ppl_data['Model'], ppl_data['Avg_Perplexity'], 
                     color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#96CEB4'][:len(ppl_data)],
                     alpha=0.8)
        
        ax.set_xlabel('Models', fontsize=12, fontweight='bold')
        ax.set_ylabel('Perplexity', fontsize=12, fontweight='bold')
        ax.set_title('Perplexity Comparison (Lower is Better)', fontsize=14, fontweight='bold')
        ax.grid(axis='y', alpha=0.3)
        
        # Add value labels on bars
        for bar in bars:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                   f'{height:.2f}',
                   ha='center', va='bottom', fontsize=10, fontweight='bold')
        
        plt.tight_layout()
        plt.savefig(f'{output_folder}/05_perplexity_comparison.png', dpi=300, bbox_inches='tight')
        print(f"    Saved: {output_folder}/05_perplexity_comparison.png")
        plt.close()
    
    
    # ==================== PLOT 6: Ranking Chart ====================
    print("\nðŸ“Š Creating Plot 6: Model Ranking...")
    
    fig, ax = plt.subplots(figsize=(12, 6))
    
    # Create ranking based on aggregate score
    ranking_df = summary_df.sort_values('Avg_Aggregate_Score', ascending=False).reset_index(drop=True)
    ranking_df['Rank'] = range(1, len(ranking_df) + 1)
    
    colors_rank = ['#FFD700', '#C0C0C0', '#CD7F32', '#87CEEB', '#90EE90']  # Gold, Silver, Bronze, etc.
    
    bars = ax.barh(ranking_df['Model'], ranking_df['Avg_Aggregate_Score'], 
                   color=colors_rank[:len(ranking_df)], alpha=0.8)
    
    # Add rank labels
    for idx, (score, model, rank) in enumerate(zip(ranking_df['Avg_Aggregate_Score'], 
                                                     ranking_df['Model'], 
                                                     ranking_df['Rank'])):
        ax.text(score + 0.01, idx, f'#{rank} - {score:.4f}', 
               va='center', fontsize=11, fontweight='bold')
    
    ax.set_xlabel('Aggregate Score', fontsize=12, fontweight='bold')
    ax.set_ylabel('Models', fontsize=12, fontweight='bold')
    ax.set_title('Model Ranking by Aggregate Performance', fontsize=14, fontweight='bold')
    ax.set_xlim(0, 1.1)
    ax.grid(axis='x', alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(f'{output_folder}/06_model_ranking.png', dpi=300, bbox_inches='tight')
    print(f"    Saved: {output_folder}/06_model_ranking.png")
    plt.close()
    
    
    # ==================== PLOT 7: ROUGE Variants Comparison ====================
    print("\nðŸ“Š Creating Plot 7: ROUGE Variants Comparison...")
    
    fig, ax = plt.subplots(figsize=(12, 6))
    
    x = np.arange(len(models))
    width = 0.25
    
    rouge1 = summary_df['Avg_ROUGE-1'].values
    rouge2 = summary_df['Avg_ROUGE-2'].values
    rougeL = summary_df['Avg_ROUGE-L'].values
    
    ax.bar(x - width, rouge1, width, label='ROUGE-1', color='#FF6B6B', alpha=0.8)
    ax.bar(x, rouge2, width, label='ROUGE-2', color='#4ECDC4', alpha=0.8)
    ax.bar(x + width, rougeL, width, label='ROUGE-L', color='#45B7D1', alpha=0.8)
    
    ax.set_xlabel('Models', fontsize=12, fontweight='bold')
    ax.set_ylabel('ROUGE Score', fontsize=12, fontweight='bold')
    ax.set_title('ROUGE Variants Comparison (1-gram, 2-gram, LCS)', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(models)
    ax.legend(framealpha=0.9)
    ax.set_ylim(0, 1.0)
    ax.grid(axis='y', alpha=0.3)
    
    plt.tight_layout()
    plt.savefig(f'{output_folder}/07_rouge_variants.png', dpi=300, bbox_inches='tight')
    print(f"    Saved: {output_folder}/07_rouge_variants.png")
    plt.close()
    
    
    print("\n" + "="*60)
    print(f" ALL VISUALIZATIONS SAVED IN '{output_folder}/' FOLDER")
    print("="*60)
    print("\n Generated Plots:")
    print("   1. Overall Performance Comparison (Bar Chart)")
    print("   2. Multi-Metric Radar Chart")
    print("   3. Performance Heatmap")
    print("   4. Score Distribution (Box Plots)")
    print("   6. Model Ranking Chart")
    print("   7. ROUGE Variants Comparison")



In [22]:
!pip install seaborn matplotlib



In [13]:
create_visualizations(summary, detailed_results, output_folder='presentation_plots')


CREATING VISUALIZATIONS

ðŸ“Š Creating Plot 1: Overall Performance Comparison...
   âœ… Saved: presentation_plots/01_overall_performance.png

ðŸ“Š Creating Plot 2: Radar Chart...
   âœ… Saved: presentation_plots/02_radar_chart.png

ðŸ“Š Creating Plot 3: Heatmap...
   âœ… Saved: presentation_plots/03_heatmap.png

ðŸ“Š Creating Plot 4: Score Distribution (Box Plot)...


  bp = ax.boxplot(data_to_plot, labels=models, patch_artist=True,
  bp = ax.boxplot(data_to_plot, labels=models, patch_artist=True,
  bp = ax.boxplot(data_to_plot, labels=models, patch_artist=True,
  bp = ax.boxplot(data_to_plot, labels=models, patch_artist=True,


   âœ… Saved: presentation_plots/04_boxplot_distribution.png

ðŸ“Š Creating Plot 6: Model Ranking...
   âœ… Saved: presentation_plots/06_model_ranking.png

ðŸ“Š Creating Plot 7: ROUGE Variants Comparison...
   âœ… Saved: presentation_plots/07_rouge_variants.png

âœ… ALL VISUALIZATIONS SAVED IN 'presentation_plots/' FOLDER

ðŸ“Š Generated Plots:
   1. Overall Performance Comparison (Bar Chart)
   2. Multi-Metric Radar Chart
   3. Performance Heatmap
   4. Score Distribution (Box Plots)
   5. Perplexity Comparison (if calculated)
   6. Model Ranking Chart
   7. ROUGE Variants Comparison
