# In [None]:  (Jupyter notebook cell marker left over from the export)
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import warnings
import json
from datetime import datetime
warnings.filterwarnings('ignore')

# Result files, keyed first by model display name (spelled as in the results
# table) and then by prompting strategy. Every file name follows the pattern
# "<prefix>_<strategy>_evaluation_results.csv", where <strategy> is the shot
# label lower-cased with the hyphen removed (e.g. 'Zero-Shot' -> 'zeroshot').
FILES = {
    model_name: {
        shot_label: f'{file_prefix}_{shot_label.replace("-", "").lower()}_evaluation_results.csv'
        for shot_label in ('Zero-Shot', 'One-Shot', 'Few-Shot')
    }
    for model_name, file_prefix in (
        ('Mixtral-8x22B', 'mixtral'),
        ('Llama-3.3-70B', 'Llama'),
        ('DeepSeek-R1', 'DeepSeek'),
        ('Qwen3-32B', 'Qwen'),
        ('Gemma-2-27B-IT', 'gemma'),
    )
}

# Score columns expected in every results file.
METRICS = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']

# Number of judges (models) that must have scored a qa_id for majority voting.
NUM_JUDGES = 5

class MultiModelEvaluationAnalyzer:
    """Combine per-model evaluation CSVs and derive summary statistics.

    Typical use is ``MultiModelEvaluationAnalyzer().run_all_analyses()``,
    which loads every file listed in ``FILES``, runs each analysis, writes
    ``comprehensive_analysis_report.json`` and prints a summary.

    Every ``compute_*`` method stores its result in the matching section of
    ``self.analysis_report`` and also returns it.  Percentages are derived
    with ``normalize_to_percentage`` (default maximum score of 5).
    """

    def __init__(self):
        self.df = None  # combined evaluation rows, set by load_all_data()
        self.majority_vote_df = None  # one row per qa_id scored by all judges
        self.model_majority_agreement = {}  # per-model agreement with the majority
        self.analysis_report = {
            'metadata': {},
            'data_loading': {},
            'majority_vote_analysis': {},
            'overall_metrics': {},
            'score_distribution': {},
            'shot_type_analysis': {},
            'question_type_analysis': {},
            'interaction_matrix': {},
            'individual_model_analysis': {}
        }

    # ------------------------------------------------------------------
    # Private helpers shared by the compute_* methods
    # ------------------------------------------------------------------

    @staticmethod
    def _sorted_levels(frame, column):
        """Return the sorted distinct non-missing values of ``frame[column]``.

        Missing values are dropped because sorting a mix of strings and NaN
        raises ``TypeError``, and an ``==`` filter on NaN never matches rows.
        """
        return sorted(frame[column].dropna().unique())

    def _score_summary(self, series, pct_key='mean_pct'):
        """Return mean, std and mean-as-percentage of ``series`` as floats."""
        mean_value = series.mean()
        return {
            'mean': float(mean_value),
            'std': float(series.std()),
            pct_key: float(self.normalize_to_percentage(mean_value))
        }

    def _metric_summaries(self, frame):
        """Return ``{metric: summary}`` for every column listed in METRICS."""
        return {metric: self._score_summary(frame[metric]) for metric in METRICS}

    def _breakdown_by(self, column):
        """Summarise all metrics and Overall_Score per value of ``column``."""
        breakdown = {}
        for level in self._sorted_levels(self.df, column):
            subset = self.df[self.df[column] == level]
            breakdown[level] = {
                'count': len(subset),
                'metrics': self._metric_summaries(subset),
                'overall': self._score_summary(subset['Overall_Score'], pct_key='pct')
            }
        return breakdown

    def _overall_by(self, frame, column):
        """Return mean Overall_Score, its percentage and row count per value of ``column``."""
        summary = {}
        for level in self._sorted_levels(frame, column):
            subset = frame[frame[column] == level]
            overall_mean = subset['Overall_Score'].mean()
            summary[level] = {
                'mean': float(overall_mean),
                'percentage': float(self.normalize_to_percentage(overall_mean)),
                'count': len(subset)
            }
        return summary

    def _interaction_matrix(self, frame, blank_when_empty=False):
        """Build the question_type x shot_type table of mean Overall_Score.

        Cells without data are ``None``.  With ``blank_when_empty=True`` an
        empty pivot table yields ``{}`` instead of a table of ``None`` cells
        (the behaviour the per-model analysis has always had).
        """
        pivot_df = frame.pivot_table(
            values='Overall_Score',
            index='question_type',
            columns='shot_type',
            aggfunc='mean'
        )
        if blank_when_empty and pivot_df.empty:
            return {}

        shot_types = self._sorted_levels(frame, 'shot_type')
        matrix = {}
        for q_type in self._sorted_levels(frame, 'question_type'):
            cells = {}
            for shot_type in shot_types:
                score = np.nan
                if q_type in pivot_df.index and shot_type in pivot_df.columns:
                    score = pivot_df.loc[q_type, shot_type]
                if pd.notna(score):
                    cells[shot_type] = {
                        'score': float(score),
                        'percentage': float(self.normalize_to_percentage(score))
                    }
                else:
                    cells[shot_type] = None
            matrix[q_type] = cells
        return matrix

    def _model_agreement_stats(self):
        """Count how often each model's score equals the majority vote.

        Only metrics with a non-missing majority vote are compared.  Models
        with no comparisons at all are left out of the result.
        """
        if self.majority_vote_df is None or self.majority_vote_df.empty:
            return {}

        majority_cols = [f'{metric}_majority' for metric in METRICS]
        # One row per (qa_id, model): keep the first evaluation when several exist.
        first_evals = self.df.drop_duplicates(subset=['qa_id', 'model'])
        paired = first_evals[['qa_id', 'model'] + METRICS].merge(
            self.majority_vote_df[['qa_id'] + majority_cols],
            on='qa_id',
            how='inner'
        )

        stats = {}
        for model in self.df['model'].unique():
            model_rows = paired[paired['model'] == model]
            total_comparisons = 0
            agreements = 0
            for metric in METRICS:
                majority_votes = model_rows[f'{metric}_majority']
                comparable = majority_votes.notna()
                total_comparisons += int(comparable.sum())
                # A missing model score compares unequal, so it counts as a disagreement.
                agreements += int((comparable & (model_rows[metric] == majority_votes)).sum())

            if total_comparisons > 0:
                stats[model] = {
                    'agreements': agreements,
                    'total': total_comparisons,
                    'percentage': (agreements / total_comparisons) * 100
                }
        return stats

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def load_all_data(self):
        """Load every CSV listed in FILES and combine them into one DataFrame.

        Each file's rows are tagged with ``model`` and ``shot_type`` columns.
        Files that cannot be read are recorded under ``files_failed`` in the
        ``data_loading`` report section instead of aborting the run.

        Returns:
            The combined DataFrame (also stored in ``self.df``).

        Raises:
            ValueError: if not a single file could be loaded.
        """
        all_data = []
        loading_info = {'files_loaded': [], 'files_failed': [], 'total_records': 0}

        for model, shot_types in FILES.items():
            for shot_type, filename in shot_types.items():
                try:
                    df = pd.read_csv(filename)
                    df['model'] = model
                    df['shot_type'] = shot_type
                    all_data.append(df)
                    loading_info['files_loaded'].append({
                        'filename': filename,
                        'model': model,
                        'shot_type': shot_type,
                        'records': len(df)
                    })
                except Exception as e:
                    # Best effort: a missing or broken file must not stop the
                    # other files from loading; the error is kept in the report.
                    loading_info['files_failed'].append({
                        'filename': filename,
                        'model': model,
                        'shot_type': shot_type,
                        'error': str(e)
                    })

        if not all_data:
            raise ValueError("No data files could be loaded!")

        combined_df = pd.concat(all_data, ignore_index=True)
        loading_info['total_records'] = len(combined_df)

        # NOTE: when the column is absent or has any gap, Overall_Score is
        # recomputed for ALL rows as the plain mean of the METRICS columns.
        if 'Overall_Score' not in combined_df.columns or combined_df['Overall_Score'].isna().any():
            combined_df['Overall_Score'] = combined_df[METRICS].mean(axis=1)

        self.df = combined_df
        self.analysis_report['data_loading'] = loading_info

        # Majority votes (and per-model agreement) depend only on the raw rows.
        self.compute_majority_vote()

        return combined_df

    def compute_majority_vote(self):
        """Compute the most common score per qa_id and metric across judges.

        Each model acts as one judge.  Only qa_ids scored by exactly
        NUM_JUDGES distinct models and having exactly NUM_JUDGES rows are
        used; qa_ids with a different row count are counted under
        ``skipped_qa_ids`` in the report.

        Missing scores are ignored when counting votes.  For each metric the
        most common remaining score is stored as ``<metric>_majority`` with
        its vote count in ``<metric>_majority_count``.  This is the plurality
        score even when no strict majority exists; ties go to the score seen
        first.  A metric with no valid score gets NaN and a count of 0 and is
        not counted in the agreement statistics.

        Returns:
            The majority-vote DataFrame (also stored in ``self.majority_vote_df``).
            Per-model agreement is stored in ``self.model_majority_agreement``.
        """
        majority_analysis = {}

        # How many distinct models scored each qa_id.
        qa_id_counts = self.df.groupby('qa_id')['model'].nunique().reset_index(name='num_models')
        count_distribution = qa_id_counts['num_models'].value_counts().sort_index()
        majority_analysis['model_distribution'] = {
            int(model_count): int(instances)
            for model_count, instances in count_distribution.items()
        }

        fully_judged = qa_id_counts['num_models'] == NUM_JUDGES
        valid_qa_ids = qa_id_counts.loc[fully_judged, 'qa_id'].tolist()

        majority_analysis['total_qa_ids'] = len(qa_id_counts)
        majority_analysis['valid_qa_ids'] = len(valid_qa_ids)

        if not valid_qa_ids:
            majority_analysis['warning'] = f"No qa_ids were evaluated by all {NUM_JUDGES} models!"
            self.majority_vote_df = pd.DataFrame()
            self.model_majority_agreement = {}
            self.analysis_report['majority_vote_analysis'] = majority_analysis
            return self.majority_vote_df

        majority_vote_data = []
        agreement_stats_by_metric = {
            metric: {'full_agreement': 0, 'majority_exists': 0, 'no_majority': 0}
            for metric in METRICS
        }
        agreement_stats_by_qa = {
            'all_metrics_majority': 0,
            'some_metrics_majority': 0,
            'no_metrics_majority': 0
        }
        skipped_qa_ids = 0

        # Group once instead of re-filtering the whole frame for every qa_id.
        rows_by_qa = self.df.groupby('qa_id')

        for qa_id in valid_qa_ids:
            qa_data = rows_by_qa.get_group(qa_id)

            if len(qa_data) != NUM_JUDGES:
                # E.g. a model scored this qa_id more than once; the vote
                # would be ambiguous, so the qa_id is left out but counted.
                skipped_qa_ids += 1
                continue

            first_row = qa_data.iloc[0]
            row_data = {
                'qa_id': qa_id,
                'question_type': first_row['question_type'],
                'question': first_row['question'],
                'answer': first_row['answer']
            }

            metrics_with_majority = 0

            for metric in METRICS:
                # NaN never equals itself, so each missing score would count
                # as its own distinct "vote"; drop them before counting.
                scores = qa_data[metric].dropna().tolist()

                if not scores:
                    row_data[f'{metric}_majority'] = np.nan
                    row_data[f'{metric}_majority_count'] = 0
                    continue

                majority_score, majority_count = Counter(scores).most_common(1)[0]
                row_data[f'{metric}_majority'] = majority_score
                row_data[f'{metric}_majority_count'] = majority_count

                if majority_count == NUM_JUDGES:
                    # Every judge gave the same (valid) score.
                    agreement_stats_by_metric[metric]['full_agreement'] += 1
                    metrics_with_majority += 1
                elif majority_count > NUM_JUDGES / 2:
                    agreement_stats_by_metric[metric]['majority_exists'] += 1
                    metrics_with_majority += 1
                else:
                    # Tie, or no score backed by more than half of the judges.
                    agreement_stats_by_metric[metric]['no_majority'] += 1

            if metrics_with_majority == len(METRICS):
                agreement_stats_by_qa['all_metrics_majority'] += 1
            elif metrics_with_majority > 0:
                agreement_stats_by_qa['some_metrics_majority'] += 1
            else:
                agreement_stats_by_qa['no_metrics_majority'] += 1

            # Overall score is the mean of the metrics that have a majority value.
            majority_scores = [
                row_data[f'{metric}_majority']
                for metric in METRICS
                if not pd.isna(row_data[f'{metric}_majority'])
            ]
            row_data['Overall_Score_majority'] = np.mean(majority_scores) if majority_scores else np.nan

            majority_vote_data.append(row_data)

        self.majority_vote_df = pd.DataFrame(majority_vote_data)

        majority_analysis['agreement_by_metric'] = agreement_stats_by_metric
        majority_analysis['agreement_by_qa'] = agreement_stats_by_qa
        majority_analysis['total_majority_vote_pairs'] = len(self.majority_vote_df)
        majority_analysis['skipped_qa_ids'] = skipped_qa_ids

        model_agreement_stats = self._model_agreement_stats()

        self.model_majority_agreement = model_agreement_stats
        majority_analysis['model_agreement_stats'] = model_agreement_stats
        self.analysis_report['majority_vote_analysis'] = majority_analysis

        return self.majority_vote_df

    def normalize_to_percentage(self, score, max_score=5):
        """Convert ``score`` to a percentage of ``max_score`` (0-100 scale)."""
        return (score / max_score) * 100

    def compute_overall_metrics(self):
        """1. Compute mean, std and percentage per metric across all rows."""
        results = self._metric_summaries(self.df)
        results['Overall_Score'] = self._score_summary(self.df['Overall_Score'])

        self.analysis_report['overall_metrics'] = results
        return results

    def compute_overall_score_distribution(self):
        """2. Compute summary statistics and a bucketed Overall_Score histogram.

        Buckets are half-open ``[start, end)`` except the last, which also
        includes its upper bound.  Percentages are relative to the number of
        rows and are 0.0 when there are no rows.
        """
        overall_scores = self.df['Overall_Score']
        total = len(overall_scores)

        distribution = {
            'statistics': {
                'mean': float(overall_scores.mean()),
                'std': float(overall_scores.std()),
                'min': float(overall_scores.min()),
                'max': float(overall_scores.max())
            },
            'ranges': {}
        }

        ranges = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 5.0)]
        last_position = len(ranges) - 1

        for position, (start, end) in enumerate(ranges):
            if position == last_position:
                # Close the final bucket so the top score is counted.
                in_bucket = (overall_scores >= start) & (overall_scores <= end)
            else:
                in_bucket = (overall_scores >= start) & (overall_scores < end)
            count = int(in_bucket.sum())
            pct = (count / total) * 100 if total else 0.0

            distribution['ranges'][f'{start:.1f}-{end:.1f}'] = {
                'count': count,
                'percentage': float(pct)
            }

        self.analysis_report['score_distribution'] = distribution
        return distribution

    def compute_breakdown_by_shot_type(self):
        """3. Summarise all metrics and Overall_Score per shot type."""
        shot_type_results = self._breakdown_by('shot_type')
        self.analysis_report['shot_type_analysis'] = shot_type_results
        return shot_type_results

    def compute_breakdown_by_question_type(self):
        """4. Summarise all metrics and Overall_Score per question type."""
        question_type_results = self._breakdown_by('question_type')
        self.analysis_report['question_type_analysis'] = question_type_results
        return question_type_results

    def compute_shot_type_x_question_type_matrix(self):
        """5. Create the 2D matrix of shot type x question type mean scores."""
        matrix_data = self._interaction_matrix(self.df)
        self.analysis_report['interaction_matrix'] = matrix_data
        return matrix_data

    def compute_individual_model_analysis(self):
        """6. Per-model performance overall, by question type and by shot type."""
        model_analysis = {}

        for model in self._sorted_levels(self.df, 'model'):
            model_df = self.df[self.df['model'] == model]

            # 6.1 Overall performance for this model
            overall_performance = self._metric_summaries(model_df)
            overall_performance['Overall_Score'] = self._score_summary(model_df['Overall_Score'])

            model_analysis[model] = {
                'overall_performance': overall_performance,
                # 6.2 / 6.3 Mean overall score per question type and shot type
                'by_question_type': self._overall_by(model_df, 'question_type'),
                'by_shot_type': self._overall_by(model_df, 'shot_type'),
                # 6.4 Shot type x question type matrix for this model
                'interaction_matrix': self._interaction_matrix(model_df, blank_when_empty=True)
            }

        self.analysis_report['individual_model_analysis'] = model_analysis
        return model_analysis

    def generate_analysis_report(self):
        """Add metadata and write the report to comprehensive_analysis_report.json.

        NaN statistics (e.g. the std of a single row) are written as the
        non-standard ``NaN`` token, which is ``json.dump``'s default.

        Returns:
            The complete report dictionary.
        """
        has_data = self.df is not None

        def distinct(column):
            return len(self.df[column].unique()) if has_data else 0

        self.analysis_report['metadata'] = {
            'analysis_timestamp': datetime.now().isoformat(),
            'total_models': distinct('model'),
            'total_shot_types': distinct('shot_type'),
            'total_question_types': distinct('question_type'),
            'total_records': len(self.df) if has_data else 0,
            'metrics_evaluated': METRICS,
            'num_judges': NUM_JUDGES
        }

        with open('comprehensive_analysis_report.json', 'w', encoding='utf-8') as f:
            json.dump(self.analysis_report, f, indent=2, ensure_ascii=False)

        return self.analysis_report

    def print_summary_report(self):
        """Print a formatted summary of the sections stored in the report."""
        rule = "=" * 80
        print("\n" + rule)
        print("COMPREHENSIVE LLM-BASED QA EVALUATION ANALYSIS REPORT")
        print(rule)

        # Metadata
        metadata = self.analysis_report['metadata']
        print("\nANALYSIS METADATA:")
        print(f"  Timestamp: {metadata['analysis_timestamp']}")
        print(f"  Total Models: {metadata['total_models']}")
        print(f"  Total Records: {metadata['total_records']}")
        print(f"  Metrics: {', '.join(metadata['metrics_evaluated'])}")

        # Data loading summary
        loading = self.analysis_report['data_loading']
        print("\nDATA LOADING SUMMARY:")
        print(f"  Files Successfully Loaded: {len(loading['files_loaded'])}")
        print(f"  Files Failed: {len(loading['files_failed'])}")
        print(f"  Total Records: {loading['total_records']}")

        # Overall metrics
        print("\nOVERALL PERFORMANCE:")
        for metric, stats in self.analysis_report['overall_metrics'].items():
            print(f"  {metric:15s}: {stats['mean']:.3f} ± {stats['std']:.3f} ({stats['mean_pct']:.1f}%)")

        # Models ranked by mean overall score, best first
        model_scores = sorted(
            (
                (model, data['overall_performance']['Overall_Score']['mean'])
                for model, data in self.analysis_report['individual_model_analysis'].items()
            ),
            key=lambda pair: pair[1],
            reverse=True
        )
        print("\nMODEL RANKING (by Overall Score):")
        for rank, (model, score) in enumerate(model_scores, 1):
            pct = self.normalize_to_percentage(score)
            print(f"  {rank}. {model:25s}: {score:.3f} ({pct:.1f}%)")

        # Agreement with the majority vote (section absent when no qa_id qualified)
        majority = self.analysis_report.get('majority_vote_analysis', {})
        if 'model_agreement_stats' in majority:
            print("\nMODEL AGREEMENT WITH MAJORITY VOTE:")
            for model, stats in majority['model_agreement_stats'].items():
                print(f"  {model:25s}: {stats['percentage']:.1f}% ({stats['agreements']}/{stats['total']})")

        print("\nDETAILED REPORT SAVED TO: comprehensive_analysis_report.json")
        print(rule)

    def run_all_analyses(self):
        """Load the data, run every analysis, save the report and print a summary.

        Returns:
            The complete report dictionary.
        """
        # Loading also computes the majority vote.
        self.load_all_data()

        self.compute_overall_metrics()
        self.compute_overall_score_distribution()
        self.compute_breakdown_by_shot_type()
        self.compute_breakdown_by_question_type()
        self.compute_shot_type_x_question_type_matrix()
        self.compute_individual_model_analysis()

        self.generate_analysis_report()
        self.print_summary_report()

        return self.analysis_report

def main():
    """Run every analysis with a fresh analyzer and return the resulting report."""
    return MultiModelEvaluationAnalyzer().run_all_analyses()

# Run the full analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()