# In [None]:
# Ablation Study 2 - KG Only Evaluation Analysis

import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import warnings
warnings.filterwarnings('ignore')

# Result files for Ablation Study 2 - KG Only, keyed by model and then by
# prompting strategy (only One-Shot is used in this study).
FILES = {
    model_name: {'One-Shot': results_csv}
    for model_name, results_csv in (
        ('Mixtral-8x22B', 'Ablation_2_kg_only_evaluation(mixtral-8x22b-instruct-v0.1)_results.csv'),
        ('Llama-3.3-70B', 'Llama_3.3_70B_kg_only_evaluation_results.csv'),
        ('DeepSeek-R1', 'DeepSeek-R1_Distill_Llma70B_kg_only_evaluation_results.csv'),
        ('Qwen3-32B', 'Qwen3_32B_kg_only_evaluation_results.csv'),
        ('Gemma-2-27B-IT', 'Gemma_27B_kg_only_evaluation_results.csv'),
    )
}

# Score columns read from every results file.
METRICS = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']

# Number of judges (5 different LLM models) required for a majority vote.
NUM_JUDGES = 5

class AblationStudy2EvaluationAnalyzer:
    def __init__(self):
        self.df = None
        self.majority_vote_df = None
        self.model_majority_agreement = {} # New attribute to store per-model agreement

    def load_all_data(self):
        """Load every CSV listed in ``FILES`` and combine them into one DataFrame.

        Each file that loads is tagged with ``model`` and ``shot_type`` columns
        taken from the ``FILES`` keys. A file that cannot be read is reported
        on stdout and skipped. Missing ``Overall_Score`` values are filled with
        the row-wise mean of the ``METRICS`` columns; values already present
        are kept as they are. The result is stored on ``self.df`` and
        ``compute_majority_vote`` is then run on it.

        Returns:
            The combined DataFrame.

        Raises:
            ValueError: If none of the files could be loaded.
        """
        frames = []

        for model, shot_types in FILES.items():
            for shot_type, filename in shot_types.items():
                try:
                    frame = pd.read_csv(filename)
                except (OSError, ValueError) as e:
                    # Best-effort loading: missing/unreadable files raise
                    # OSError, pandas parse and empty-data errors derive from
                    # ValueError. Report and carry on with the other files.
                    print(f"Error loading {filename}: {e}")
                    continue

                frame['model'] = model
                frame['shot_type'] = shot_type
                frames.append(frame)
                print(f"Loaded: {filename} ({len(frame)} rows)")

        if not frames:
            raise ValueError("No data files could be loaded!")

        combined_df = pd.concat(frames, ignore_index=True)
        print(f"\nTotal records loaded: {len(combined_df)}")

        # Ensure Overall_Score is present. Only gaps are filled, so a single
        # file without the column does not overwrite the scores supplied by
        # the other files.
        if 'Overall_Score' not in combined_df.columns:
            combined_df['Overall_Score'] = combined_df[METRICS].mean(axis=1)
        elif combined_df['Overall_Score'].isna().any():
            computed_overall = combined_df[METRICS].mean(axis=1)
            combined_df['Overall_Score'] = combined_df['Overall_Score'].fillna(computed_overall)

        self.df = combined_df

        # Print dataset overview. NaN is dropped before sorting because
        # sorted() cannot compare NaN with strings.
        print(f"\nDataset Overview:")
        print(f"Unique QA pairs: {combined_df['qa_id'].nunique()}")
        print(f"Generation methods: {sorted(combined_df['generation_method'].dropna().unique()) if 'generation_method' in combined_df.columns else 'N/A'}")
        print(f"Question types: {sorted(combined_df['question_type'].dropna().unique())}")
        print(f"Ablation study: {combined_df['ablation_study'].unique()[0] if 'ablation_study' in combined_df.columns else 'N/A'}")

        # Compute majority vote dataframe (and per-model agreement stats)
        self.compute_majority_vote()

        return combined_df
    
    def compute_majority_vote(self):
        """
        Compute the per-metric majority vote for every fully judged QA pair.

        Each model in ``self.df`` acts as one judge. Only ``qa_id`` values
        scored by exactly ``NUM_JUDGES`` distinct models, with exactly one row
        per model, are used. For each such QA pair and each metric in
        ``METRICS`` the most frequent score is stored as ``<metric>_majority``
        with its vote count in ``<metric>_majority_count``. Missing (NaN)
        scores are ignored when counting votes. When no score is held by more
        than half of the judges, the stored value is only the most frequent
        score (the first one seen wins ties) and the metric is counted under
        "no majority".

        Agreement statistics are printed per metric, per QA pair and per
        model. For the per-model figures every model score is compared with
        the stored majority value; a missing model score counts as a
        disagreement.

        Side effects:
            Sets ``self.majority_vote_df`` and ``self.model_majority_agreement``
            (model -> ``{'agreements', 'total', 'percentage'}``) and prints
            the statistics to stdout.

        Returns:
            The majority-vote DataFrame (empty when no QA pair qualifies).
        """
        print("\n" + "="*60)
        print("COMPUTING MAJORITY VOTE AND AGREEMENT STATISTICS")
        print("="*60)

        # Reset up front so that no early return leaves stale statistics
        # from a previous run behind.
        self.model_majority_agreement = {}

        # Group once; the groups are reused below instead of re-scanning the
        # whole frame for every qa_id.
        by_qa_id = self.df.groupby('qa_id')
        qa_id_counts = by_qa_id['model'].nunique().reset_index(name='num_models')

        # Debug: show distribution of model counts per qa_id
        print("\nDistribution of models (judges) per qa_id:")
        count_distribution = qa_id_counts['num_models'].value_counts().sort_index()
        for model_count, instances in count_distribution.items():
            print(f"  {model_count} models: {instances} qa_ids")

        # Keep only qa_ids that were evaluated by all judges
        valid_qa_ids = qa_id_counts[qa_id_counts['num_models'] == NUM_JUDGES]['qa_id'].tolist()

        print(f"\nTotal unique qa_ids: {len(qa_id_counts)}")
        print(f"qa_ids evaluated by all {NUM_JUDGES} models: {len(valid_qa_ids)}")

        if not valid_qa_ids:
            print(f"\nWARNING: No qa_ids were evaluated by all {NUM_JUDGES} models!")
            self.majority_vote_df = pd.DataFrame()
            return self.majority_vote_df

        optional_columns = ('generation_method', 'num_source_triples', 'text_context_used', 'ablation_study')

        majority_vote_data = []
        agreement_stats_by_metric = {metric: {'full_agreement': 0, 'majority_exists': 0, 'no_majority': 0} for metric in METRICS}
        agreement_stats_by_qa = {'all_metrics_majority': 0, 'some_metrics_majority': 0, 'no_metrics_majority': 0}
        # Per-model tallies, filled in the same pass as the votes. Seeded in
        # first-appearance order so the report lists every loaded model.
        agreement_tally = {model: {'agreements': 0, 'total': 0} for model in self.df['model'].unique()}

        for qa_id in valid_qa_ids:
            # All evaluations for this qa_id (one from each model/judge)
            qa_data = by_qa_id.get_group(qa_id)

            # Duplicate rows for a model give more rows than judges; skip
            # such qa_ids because the vote would be ambiguous.
            if len(qa_data) != NUM_JUDGES:
                continue

            # Metadata is taken from the first row
            first_row = qa_data.iloc[0]
            row_data = {
                'qa_id': qa_id,
                'question_type': first_row['question_type'],
                'question': first_row['question'],
                'answer': first_row['answer'],
            }

            # Add KG-specific metadata if available
            for optional_col in optional_columns:
                if optional_col in first_row:
                    row_data[optional_col] = first_row[optional_col]

            metrics_with_majority = 0

            for metric in METRICS:
                # NaN must not vote: every NaN is a distinct Counter key and
                # could otherwise be picked as the "most common" score.
                votes = qa_data[metric].dropna()

                if votes.empty:
                    row_data[f'{metric}_majority'] = np.nan
                    row_data[f'{metric}_majority_count'] = 0
                    agreement_stats_by_metric[metric]['no_majority'] += 1
                    continue

                # Most common score and its count (first seen wins ties)
                majority_score, majority_count = Counter(votes.values).most_common(1)[0]
                row_data[f'{metric}_majority'] = majority_score
                row_data[f'{metric}_majority_count'] = majority_count

                if majority_count == NUM_JUDGES:
                    # Every judge gave the same score
                    agreement_stats_by_metric[metric]['full_agreement'] += 1
                    metrics_with_majority += 1
                elif majority_count > NUM_JUDGES / 2:
                    # Simple majority
                    agreement_stats_by_metric[metric]['majority_exists'] += 1
                    metrics_with_majority += 1
                else:
                    # Tie, or no score held by more than half of the judges
                    agreement_stats_by_metric[metric]['no_majority'] += 1

                # Compare every judge's score with the stored majority value
                for model, model_vote in zip(qa_data['model'], qa_data[metric]):
                    tally = agreement_tally.setdefault(model, {'agreements': 0, 'total': 0})
                    tally['total'] += 1
                    if model_vote == majority_score:
                        tally['agreements'] += 1

            # Track QA-level agreement
            if metrics_with_majority == len(METRICS):
                agreement_stats_by_qa['all_metrics_majority'] += 1
            elif metrics_with_majority > 0:
                agreement_stats_by_qa['some_metrics_majority'] += 1
            else:
                agreement_stats_by_qa['no_metrics_majority'] += 1

            # Overall score is the mean of the available majority votes
            available_scores = [
                row_data[f'{metric}_majority']
                for metric in METRICS
                if not pd.isna(row_data[f'{metric}_majority'])
            ]
            row_data['Overall_Score_majority'] = np.mean(available_scores) if available_scores else np.nan

            majority_vote_data.append(row_data)

        self.majority_vote_df = pd.DataFrame(majority_vote_data)

        # Only proceed if we have data
        if len(self.majority_vote_df) == 0:
            print("\nNo majority vote data to analyze.")
            return self.majority_vote_df

        # Print agreement statistics by metric
        print(f"\nTotal qa_ids with majority vote computed: {len(self.majority_vote_df)}")
        print("\nAgreement Statistics by Metric:")
        print("-" * 80)
        print(f"{'Metric':15s} {'Full Agreement':>20s} {'Majority Exists':>20s} {'No Majority':>20s}")
        print("-" * 80)

        for metric in METRICS:
            full = agreement_stats_by_metric[metric]['full_agreement']
            majority = agreement_stats_by_metric[metric]['majority_exists']
            no_maj = agreement_stats_by_metric[metric]['no_majority']
            total = full + majority + no_maj

            if total > 0:
                full_pct = (full / total) * 100
                majority_pct = (majority / total) * 100
                no_maj_pct = (no_maj / total) * 100
                # Each cell is 20 characters wide to line up with the header
                print(f"{metric:15s} {full:>11d} ({full_pct:>5.1f}%) {majority:>11d} ({majority_pct:>5.1f}%) {no_maj:>11d} ({no_maj_pct:>5.1f}%)")

        # Print QA-level agreement statistics
        print("\nAgreement Statistics by QA Pair:")
        print("-" * 50)
        total_qa = sum(agreement_stats_by_qa.values())
        if total_qa > 0:
            all_maj = agreement_stats_by_qa['all_metrics_majority']
            some_maj = agreement_stats_by_qa['some_metrics_majority']
            no_maj = agreement_stats_by_qa['no_metrics_majority']

            print(f"All metrics have majority:  {all_maj:>5d} ({(all_maj/total_qa)*100:>5.1f}%)")
            print(f"Some metrics have majority: {some_maj:>5d} ({(some_maj/total_qa)*100:>5.1f}%)")
            print(f"No metrics have majority:   {no_maj:>5d} ({(no_maj/total_qa)*100:>5.1f}%)")

        # Per-model agreement statistics (how often each model matches the majority)
        print("\n" + "="*60)
        print("PER-MODEL AGREEMENT WITH MAJORITY VOTE")
        print("="*60)

        model_agreement_stats = {}

        for model, tally in agreement_tally.items():
            agreements = tally['agreements']
            total_comparisons = tally['total']

            if total_comparisons > 0:
                agreement_pct = (agreements / total_comparisons) * 100
                model_agreement_stats[model] = {
                    'agreements': agreements,
                    'total': total_comparisons,
                    'percentage': agreement_pct,
                }
                print(f"  {model:25s}: {agreement_pct:.1f}% ({agreements}/{total_comparisons} metric evaluations)")
            else:
                print(f"  {model:25s}: N/A (No comparisons)")

        self.model_majority_agreement = model_agreement_stats

        return self.majority_vote_df
    
    def normalize_to_percentage(self, score, max_score=5):
        """Convert score to percentage (0-100 scale)."""
        return (score / max_score) * 100
    
    def compute_overall_metrics(self):
        """1. Report mean and standard deviation of every metric over all rows.

        Prints one line per entry in ``METRICS`` and one for
        ``Overall_Score``; each line also shows the mean as a percentage
        obtained from ``normalize_to_percentage``.

        Returns:
            dict mapping each metric name to ``{'mean', 'std', 'mean_pct'}``.
            The ``Overall_Score`` figures are printed but not returned.
        """
        heavy_rule = "=" * 60
        print("\n" + heavy_rule)
        print("1. MEAN AND STD DEV PER METRIC (OVERALL) - ABLATION STUDY 2")
        print(heavy_rule)

        data = self.df
        per_metric = {}
        for name in METRICS:
            column = data[name]
            avg = column.mean()
            spread = column.std()
            pct = self.normalize_to_percentage(avg)
            per_metric[name] = {'mean': avg, 'std': spread, 'mean_pct': pct}
            print(f"{name:15s}: {avg:.3f} ± {spread:.3f} ({pct:.1f}%)")

        # Overall score (mean of all metrics)
        overall_column = data['Overall_Score']
        overall_avg = overall_column.mean()
        overall_spread = overall_column.std()
        overall_pct = self.normalize_to_percentage(overall_avg)
        print(f"{'Overall Score':15s}: {overall_avg:.3f} ± {overall_spread:.3f} ({overall_pct:.1f}%)")

        return per_metric
    
    def compute_overall_score_distribution(self):
        """2. Compute overall score distribution."""
        print("\n" + "="*60)
        print("2. OVERALL SCORE DISTRIBUTION - ABLATION STUDY 2")
        print("="*60)
        
        overall_scores = self.df['Overall_Score']
        
        print(f"Mean:          {overall_scores.mean():.3f}")
        print(f"Std Dev:       {overall_scores.std():.3f}")
        print(f"Min:           {overall_scores.min():.3f}")
        print(f"Max:           {overall_scores.max():.3f}")
        
        # Distribution by ranges
        print("\nDistribution by score ranges:")
        ranges = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 5.0)]
        
        for start, end in ranges:
            if start == 4.0:  # Include 5.0 in the last range
                count = ((overall_scores >= start) & (overall_scores <= end)).sum()
            else:
                count = ((overall_scores >= start) & (overall_scores < end)).sum()
            pct = (count / len(overall_scores)) * 100
            print(f"  [{start:.1f}-{end:.1f}]: {count:5d} ({pct:5.1f}%)")
    
    def compute_breakdown_by_question_type(self):
        """3. Report per-metric statistics separately for each question type.

        Question types are processed in sorted order. For every type the
        mean, standard deviation and percentage mean of each ``METRICS``
        column and of ``Overall_Score`` are printed.

        Returns:
            dict mapping question type to
            ``{'count', 'metrics', 'overall'}``, where ``metrics`` maps each
            metric to ``{'mean', 'std', 'mean_pct'}`` and ``overall`` holds
            ``{'mean', 'std', 'pct'}``.
        """
        heavy_rule = "=" * 60
        print("\n" + heavy_rule)
        print("3. BREAKDOWN BY QUESTION TYPE - ABLATION STUDY 2")
        print(heavy_rule)

        type_column = self.df['question_type']
        breakdown = {}

        for q_type in sorted(type_column.unique()):
            subset = self.df.loc[type_column == q_type]
            n_rows = len(subset)

            print(f"\n{q_type} (n={n_rows}):")
            print("-" * 40)

            metric_stats = {}
            for name in METRICS:
                values = subset[name]
                avg = values.mean()
                spread = values.std()
                pct = self.normalize_to_percentage(avg)

                metric_stats[name] = {'mean': avg, 'std': spread, 'mean_pct': pct}
                print(f"  {name:15s}: {avg:.3f} ± {spread:.3f} ({pct:.1f}%)")

            # Overall score
            overall_values = subset['Overall_Score']
            overall_avg = overall_values.mean()
            overall_spread = overall_values.std()
            overall_pct = self.normalize_to_percentage(overall_avg)

            print(f"  {'Overall Score':15s}: {overall_avg:.3f} ± {overall_spread:.3f} ({overall_pct:.1f}%)")

            breakdown[q_type] = {
                'count': n_rows,
                'metrics': metric_stats,
                'overall': {'mean': overall_avg, 'std': overall_spread, 'pct': overall_pct},
            }

        return breakdown
    
    def compute_breakdown_by_generation_method(self):
        """4. Report per-metric statistics separately for each generation method.

        When ``self.df`` has no ``generation_method`` column a notice is
        printed and an empty dict is returned. Otherwise the methods are
        processed in sorted order.

        Returns:
            dict mapping generation method to
            ``{'count', 'metrics', 'overall'}``, where ``metrics`` maps each
            metric to ``{'mean', 'std', 'mean_pct'}`` and ``overall`` holds
            ``{'mean', 'std', 'pct'}``.
        """
        heavy_rule = "=" * 60
        print("\n" + heavy_rule)
        print("4. BREAKDOWN BY GENERATION METHOD - ABLATION STUDY 2")
        print(heavy_rule)

        if 'generation_method' not in self.df.columns:
            print("Generation method information not available in the data.")
            return {}

        method_column = self.df['generation_method']
        breakdown = {}

        for gen_method in sorted(method_column.unique()):
            subset = self.df.loc[method_column == gen_method]
            n_rows = len(subset)

            print(f"\n{gen_method} (n={n_rows}):")
            print("-" * 40)

            metric_stats = {}
            for name in METRICS:
                values = subset[name]
                avg = values.mean()
                spread = values.std()
                pct = self.normalize_to_percentage(avg)

                metric_stats[name] = {'mean': avg, 'std': spread, 'mean_pct': pct}
                print(f"  {name:15s}: {avg:.3f} ± {spread:.3f} ({pct:.1f}%)")

            # Overall score
            overall_values = subset['Overall_Score']
            overall_avg = overall_values.mean()
            overall_spread = overall_values.std()
            overall_pct = self.normalize_to_percentage(overall_avg)

            print(f"  {'Overall Score':15s}: {overall_avg:.3f} ± {overall_spread:.3f} ({overall_pct:.1f}%)")

            breakdown[gen_method] = {
                'count': n_rows,
                'metrics': metric_stats,
                'overall': {'mean': overall_avg, 'std': overall_spread, 'pct': overall_pct},
            }

        return breakdown
    
    def compute_kg_specific_analysis(self):
        """5. KG-specific analysis (number of source triples, text context usage)."""
        print("\n" + "="*60)
        print("5. KG-SPECIFIC ANALYSIS - ABLATION STUDY 2")
        print("="*60)
        
        # Analysis by number of source triples
        if 'num_source_triples' in self.df.columns:
            print("\n5.1 Analysis by Number of Source Triples:")
            print("-" * 40)
            
            # Create bins for source triples
            if not self.df['num_source_triples'].isna().all():
                source_triple_stats = self.df.groupby('num_source_triples')['Overall_Score'].agg(['mean', 'std', 'count']).reset_index()
                source_triple_stats = source_triple_stats.sort_values('num_source_triples')
                
                for _, row in source_triple_stats.iterrows():
                    triples = int(row['num_source_triples'])
                    mean_score = row['mean']
                    std_score = row['std']
                    count = int(row['count'])
                    mean_pct = self.normalize_to_percentage(mean_score)
                    
                    print(f"  {triples:2d} triples: {mean_score:.3f} ± {std_score:.3f} ({mean_pct:.1f}%) [n={count}]")
        
        # Analysis by text context usage
        if 'text_context_used' in self.df.columns:
            print("\n5.2 Analysis by Text Context Usage:")
            print("-" * 40)
            
            for context_used in sorted(self.df['text_context_used'].unique()):
                context_df = self.df[self.df['text_context_used'] == context_used]
                count = len(context_df)
                
                overall_mean = context_df['Overall_Score'].mean()
                overall_std = context_df['Overall_Score'].std()
                overall_pct = self.normalize_to_percentage(overall_mean)
                
                print(f"  Text context {context_used}: {overall_mean:.3f} ± {overall_std:.3f} ({overall_pct:.1f}%) [n={count}]")
    
    def compute_individual_model_analysis(self):
        """6. Print per-model results (One-Shot only).

        Models are handled in sorted order. For each one, section 6.1 prints
        the mean, standard deviation and percentage mean of every
        ``METRICS`` column and of ``Overall_Score``; section 6.2 prints the
        mean ``Overall_Score`` per question type. Nothing is returned.
        """
        heavy_rule = "=" * 60
        light_rule = "-" * 40
        print("\n" + heavy_rule)
        print("6. INDIVIDUAL MODEL ANALYSIS - ABLATION STUDY 2")
        print(heavy_rule)

        model_names = self.df['model'].unique()

        for model in sorted(model_names):
            print(f"\n{heavy_rule}")
            print(f"MODEL: {model}")
            print(f"{heavy_rule}")

            model_rows = self.df.loc[self.df['model'] == model]

            # 6.1 Overall performance for this model
            print(f"\n6.1 Overall Performance for {model}:")
            print(light_rule)
            for name in METRICS:
                values = model_rows[name]
                avg = values.mean()
                spread = values.std()
                pct = self.normalize_to_percentage(avg)
                print(f"  {name:15s}: {avg:.3f} ± {spread:.3f} ({pct:.1f}%)")

            overall_values = model_rows['Overall_Score']
            overall_avg = overall_values.mean()
            overall_spread = overall_values.std()
            overall_pct = self.normalize_to_percentage(overall_avg)
            print(f"  {'Overall Score':15s}: {overall_avg:.3f} ± {overall_spread:.3f} ({overall_pct:.1f}%)")

            # 6.2 By question type
            print(f"\n6.2 {model} - Performance by Question Type:")
            print(light_rule)

            type_column = model_rows['question_type']
            for q_type in sorted(type_column.unique()):
                type_rows = model_rows.loc[type_column == q_type]
                n_rows = len(type_rows)

                type_avg = type_rows['Overall_Score'].mean()
                type_pct = self.normalize_to_percentage(type_avg)

                print(f"  {q_type:20s}: {type_avg:.3f} ({type_pct:.1f}%) [n={n_rows}]")
    
    def compute_model_comparison_summary(self):
        """7. Print a model-by-metric table of mean scores.

        One row per model (sorted by name) with the mean of every
        ``METRICS`` column and of ``Overall_Score``, followed by an
        ``AVERAGE`` row computed over all rows of ``self.df``. Nothing is
        returned.

        Column widths are derived from the labels: with a fixed width of 12
        the 12-character metric names would touch the neighbouring header
        cell. The widths never drop below the previous 20 / 12 characters.
        """
        print("\n" + "="*60)
        print("7. MODEL COMPARISON SUMMARY - ABLATION STUDY 2")
        print("="*60)

        models = sorted(self.df['model'].unique())
        column_titles = [*METRICS, 'Overall']

        # Leave at least two spaces between a label and the next column
        name_width = max([20, *(len(str(model)) + 2 for model in models)])
        value_width = max([12, *(len(title) + 2 for title in column_titles)])
        rule = "-" * (name_width + value_width * len(column_titles))

        def mean_cells(frame):
            """Format the metric means and overall mean of ``frame`` as cells."""
            score_columns = [*METRICS, 'Overall_Score']
            return "".join(f"{frame[col].mean():>{value_width}.3f}" for col in score_columns)

        # Header row
        header_cells = "".join(f"{title:>{value_width}s}" for title in column_titles)
        print("\n" + f"{'Model':{name_width}s}" + header_cells)
        print(rule)

        for model in models:
            model_df = self.df[self.df['model'] == model]
            print(f"{model:{name_width}s}" + mean_cells(model_df))

        # Average across all rows of all models
        print(rule)
        print(f"{'AVERAGE':{name_width}s}" + mean_cells(self.df))
    
    def run_all_analyses(self):
        """Load the data and run every analysis section in order.

        Prints a title banner, calls ``load_all_data`` (which also computes
        the majority vote), runs sections 1-7 and prints a closing banner.
        """
        heavy_rule = "=" * 60
        print("ABLATION STUDY 2 - KG ONLY EVALUATION ANALYSIS")
        print(heavy_rule)
        print("Prompting Strategy: One-Shot Only")
        print(heavy_rule)

        # Load all data (this also computes majority vote)
        self.load_all_data()

        # Analysis sections, in report order
        analysis_steps = (
            self.compute_overall_metrics,
            self.compute_overall_score_distribution,
            self.compute_breakdown_by_question_type,
            self.compute_breakdown_by_generation_method,
            self.compute_kg_specific_analysis,
            self.compute_individual_model_analysis,
            self.compute_model_comparison_summary,
        )
        for run_step in analysis_steps:
            run_step()

        print("\n" + heavy_rule)
        print("ABLATION STUDY 2 ANALYSIS COMPLETE")
        print(heavy_rule)

def main():
    """Run the full Ablation Study 2 analysis with a fresh analyzer."""
    AblationStudy2EvaluationAnalyzer().run_all_analyses()


if __name__ == "__main__":
    main()