# In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
import warnings
import json
from datetime import datetime
warnings.filterwarnings('ignore')

# Human evaluator label -> CSV file with that evaluator's ratings.
# File names follow the pattern Ev_<name>_<two-digit index>.csv.
_EVALUATOR_NAMES = ('Ramita', 'Sibgha', 'Saim', 'Shahzad', 'Ammar')
HUMAN_FILES = {
    f'Evaluator_{index:02d}': f'Ev_{name}_{index:02d}.csv'
    for index, name in enumerate(_EVALUATOR_NAMES, start=1)
}

# LLM model name -> shot type -> CSV file with that model's ratings.
# File names follow the pattern <prefix>_<shot tag>_evaluation_results.csv.
_SHOT_FILE_TAGS = {
    'Zero-Shot': 'zeroshot',
    'One-Shot': 'oneshot',
    'Few-Shot': 'fewshot',
}
_MODEL_FILE_PREFIXES = {
    'Mixtral-8x22B': 'mixtral',
    'Llama-3.3-70B': 'Llama',
    'DeepSeek-R1': 'DeepSeek',
    'Qwen3-32B': 'Qwen',
    'Gemma-2-27B-IT': 'gemma',
}
LLM_FILES = {
    model: {
        shot: f'{prefix}_{tag}_evaluation_results.csv'
        for shot, tag in _SHOT_FILE_TAGS.items()
    }
    for model, prefix in _MODEL_FILE_PREFIXES.items()
}

# Header in the human evaluation CSVs -> column name used by the analysis.
# The rating headers are matched literally, so the doubled spaces and the
# en dash in "(1–5)" below are significant.
_HUMAN_PASSTHROUGH_HEADERS = ('QA_ID', 'Question_Type', 'Shot_Type', 'Question', 'Answer')
HUMAN_COLUMN_MAPPINGS = {
    **{header: header.lower() for header in _HUMAN_PASSTHROUGH_HEADERS},
    'Relevance (1–5)': 'Relevance',
    'Accuracy  (1–5)': 'Accuracy',
    'Completeness  (1–5)': 'Completeness',
    'Fluency  (1–5)': 'Fluency',
    'KG Alignment  (1–5)': 'KG_Alignment',
}

# Rating columns compared between human and LLM evaluations.
METRICS = [
    'Relevance',
    'Accuracy',
    'Completeness',
    'Fluency',
    'KG_Alignment',
]

class HumanLLMComparisonAnalyzer:
    """Measure agreement between human and LLM ratings of the same QA pairs.

    Human ratings are averaged per ``qa_id`` across the evaluator files in
    ``HUMAN_FILES``; LLM ratings are averaged per ``qa_id`` across the shot
    types of each model in ``LLM_FILES``. The two are then compared metric
    by metric after rounding both sides to the nearest 0.5. Findings
    accumulate in ``self.analysis_report`` and are written to
    ``human_llm_comparison_analysis_report.json`` and
    ``human_llm_comparison_results.csv``.
    """

    def __init__(self):
        """Start with no data loaded and an empty report skeleton."""
        self.human_df = None  # per-qa_id mean human scores plus QA metadata
        self.llm_data = {}  # model name -> per-qa_id mean LLM scores
        self.comparison_results = None  # DataFrame, one row per (model, metric)
        self.analysis_report = {
            'metadata': {},
            'data_loading': {},
            'comparison_results': {},
            'model_performance': {},
            'metric_performance': {},
            'overall_statistics': {},
            'detailed_comparisons': {}
        }

    def round_to_half(self, x):
        """Round a scalar to the nearest multiple of 0.5.

        Built on :func:`round`, so an exact tie is resolved half-to-even on
        the doubled value: 1.25 -> 1.0 and 1.75 -> 2.0.
        """
        return round(x * 2) / 2

    def _rounded_pairs(self, human_scores, llm_scores):
        """Round both score sequences to 0.5, keeping only complete pairs.

        A pair is dropped when either side is NaN, because a missing score
        can neither agree nor disagree with its counterpart.

        Returns:
            Tuple ``(human_rounded, llm_rounded)`` of equal-length float arrays.
        """
        human = np.asarray(human_scores, dtype=float)
        llm = np.asarray(llm_scores, dtype=float)
        complete = ~(np.isnan(human) | np.isnan(llm))
        # np.round breaks ties half-to-even, the same rule round_to_half uses.
        return np.round(human[complete] * 2) / 2, np.round(llm[complete] * 2) / 2

    def _average_scores_by(self, column):
        """Average Exact Match and F1 over result rows sharing a *column* value.

        Returns:
            Dict mapping each group key to
            ``{'avg_exact_match': float, 'avg_f1_score': float}``.
        """
        averages = self.comparison_results.groupby(column)[['Exact Match (%)', 'F1 Score']].mean()
        return {
            key: {
                'avg_exact_match': float(row['Exact Match (%)']),
                'avg_f1_score': float(row['F1 Score'])
            }
            for key, row in averages.iterrows()
        }

    def load_human_evaluations(self):
        """Load every human evaluation file and average the scores per QA_ID.

        A file that cannot be read, or that lacks an expected column, is
        recorded under ``files_failed`` and skipped. Rating cells that are
        not numeric are treated as missing and ignored by the mean.

        Returns:
            DataFrame with one row per ``qa_id``: the QA metadata taken from
            the first evaluator row plus the mean of each metric.

        Raises:
            ValueError: if none of the configured files could be loaded.
        """
        loading_info = {'files_loaded': [], 'files_failed': [], 'total_evaluations': 0, 'unique_qa_pairs': 0}
        metadata_cols = ['qa_id', 'question_type', 'shot_type', 'question', 'answer']
        relevant_cols = metadata_cols + METRICS

        all_human_data = []

        for evaluator, filename in HUMAN_FILES.items():
            # Deliberately broad: one unreadable or malformed file must not
            # abort the whole analysis; the failure is kept in the report.
            try:
                df = pd.read_csv(filename).rename(columns=HUMAN_COLUMN_MAPPINGS)
                df = df[relevant_cols].copy()
                df[METRICS] = df[METRICS].apply(pd.to_numeric, errors='coerce')
            except Exception as e:
                loading_info['files_failed'].append({
                    'evaluator': evaluator,
                    'filename': filename,
                    'error': str(e)
                })
                continue

            all_human_data.append(df)
            loading_info['files_loaded'].append({
                'evaluator': evaluator,
                'filename': filename,
                'evaluations': len(df)
            })

        if not all_human_data:
            raise ValueError("No human evaluation files could be loaded!")

        human_df_all = pd.concat(all_human_data, ignore_index=True)
        loading_info['total_evaluations'] = len(human_df_all)

        # Mean score per QA pair; metadata comes from the first row seen.
        by_qa = human_df_all.groupby('qa_id')
        metric_means = by_qa[METRICS].mean().reset_index()
        metadata = by_qa[metadata_cols[1:]].first().reset_index()

        self.human_df = pd.merge(metadata, metric_means, on='qa_id')
        loading_info['unique_qa_pairs'] = len(self.human_df)

        self.analysis_report['data_loading']['human_evaluations'] = loading_info
        return self.human_df

    def load_llm_evaluations(self, human_qa_ids):
        """Load every LLM evaluation file, restricted to the human QA_IDs.

        For each model the shot-type files are concatenated and the metric
        scores averaged per ``qa_id``. A file that cannot be read or has no
        ``qa_id`` column is recorded with an ``error`` entry. A metric column
        absent from a file is treated as all-missing for that file and
        listed under ``missing_metrics``.

        Args:
            human_qa_ids: collection of ``qa_id`` values to keep.

        Returns:
            Dict mapping model name to its per-``qa_id`` mean scores.
        """
        llm_loading_info = {'models_processed': [], 'models_failed': []}

        for model_name, shot_files in LLM_FILES.items():
            model_info = {'model_name': model_name, 'shot_types': [], 'total_qa_pairs': 0}
            model_data = []

            for shot_type, filename in shot_files.items():
                shot_info = {'shot_type': shot_type, 'filename': filename}
                # Deliberately broad: a bad file is reported, not fatal.
                try:
                    df = pd.read_csv(filename)
                    # .copy() so the column assignments below never write
                    # through to (or warn about) the unfiltered frame.
                    df_filtered = df[df['qa_id'].isin(human_qa_ids)].copy()
                    missing_metrics = [m for m in METRICS if m not in df_filtered.columns]
                    for metric in missing_metrics:
                        df_filtered[metric] = np.nan
                    df_filtered[METRICS] = df_filtered[METRICS].apply(pd.to_numeric, errors='coerce')
                except Exception as e:
                    shot_info['error'] = str(e)
                    model_info['shot_types'].append(shot_info)
                    continue

                shot_info['matching_pairs'] = len(df_filtered)
                shot_info['total_pairs'] = len(df)
                if missing_metrics:
                    shot_info['missing_metrics'] = missing_metrics
                model_info['shot_types'].append(shot_info)

                if not df_filtered.empty:
                    df_filtered['model'] = model_name
                    df_filtered['shot_type'] = shot_type
                    model_data.append(df_filtered)

            if model_data:
                model_df = pd.concat(model_data, ignore_index=True)
                # Average scores across shot types for each qa_id.
                model_avg = model_df.groupby('qa_id')[METRICS].mean().reset_index()
                self.llm_data[model_name] = model_avg
                model_info['total_qa_pairs'] = len(model_avg)
                llm_loading_info['models_processed'].append(model_info)
            else:
                llm_loading_info['models_failed'].append(model_info)

        self.analysis_report['data_loading']['llm_evaluations'] = llm_loading_info
        return self.llm_data

    def compute_exact_match_percentage(self, human_scores, llm_scores):
        """Return the percentage of pairs that are equal after rounding to 0.5.

        Pairs with a NaN on either side are excluded from both the match
        count and the total. Returns ``0.0`` when no complete pair exists.
        """
        human_rounded, llm_rounded = self._rounded_pairs(human_scores, llm_scores)
        total = human_rounded.size
        if total == 0:
            return 0.0

        matches = (human_rounded == llm_rounded).sum()
        return float(matches) / total * 100

    def compute_f1_macro(self, human_scores, llm_scores):
        """Return the macro F1 between the rounded scores, humans as truth.

        Pairs with a NaN on either side are excluded. Returns ``0.0`` when
        no complete pair exists or when scikit-learn rejects the labels.
        """
        human_rounded, llm_rounded = self._rounded_pairs(human_scores, llm_scores)
        if human_rounded.size == 0:
            return 0.0

        # Doubling turns the 0.5-step scores into integer class labels.
        human_labels = (human_rounded * 2).astype(int)
        llm_labels = (llm_rounded * 2).astype(int)

        try:
            return float(f1_score(human_labels, llm_labels, average='macro', zero_division=0))
        except ValueError:
            return 0.0

    def compare_evaluations(self):
        """Compare human and LLM evaluations for all models and metrics.

        Models that share no ``qa_id`` with the human data are skipped, as
        are metrics for which a model has no pair with both scores present.

        Returns:
            DataFrame with columns ``Model``, ``Metric``,
            ``Exact Match (%)`` and ``F1 Score``.
        """
        results = []
        detailed_comparisons = {}
        human_scores_df = self.human_df[['qa_id'] + METRICS]

        for model_name, llm_df in self.llm_data.items():
            merged_df = pd.merge(human_scores_df,
                                 llm_df[['qa_id'] + METRICS],
                                 on='qa_id',
                                 suffixes=('_human', '_llm'))

            if merged_df.empty:
                continue

            model_comparisons = {'matched_qa_pairs': len(merged_df), 'metrics': {}}

            for metric in METRICS:
                human_col = f'{metric}_human'
                llm_col = f'{metric}_llm'

                # Keep only QA pairs scored on both sides for this metric.
                pairs = merged_df[[human_col, llm_col]].dropna()
                if pairs.empty:
                    continue

                human_scores = pairs[human_col].to_numpy(dtype=float)
                llm_scores = pairs[llm_col].to_numpy(dtype=float)

                exact_match = self.compute_exact_match_percentage(human_scores, llm_scores)
                f1 = self.compute_f1_macro(human_scores, llm_scores)

                results.append({
                    'Model': model_name,
                    'Metric': metric,
                    'Exact Match (%)': exact_match,
                    'F1 Score': f1
                })

                model_comparisons['metrics'][metric] = {
                    'exact_match_percentage': float(exact_match),
                    'f1_score': float(f1),
                    'human_mean': float(human_scores.mean()),
                    'llm_mean': float(llm_scores.mean()),
                    'human_std': float(human_scores.std()),
                    'llm_std': float(llm_scores.std()),
                    'valid_pairs': int(len(pairs))
                }

            detailed_comparisons[model_name] = model_comparisons

        self.comparison_results = pd.DataFrame(results)
        self.analysis_report['detailed_comparisons'] = detailed_comparisons
        return self.comparison_results

    def analyze_model_performance(self):
        """Average agreement per model and rank the models, best first.

        Returns:
            Dict with ``rankings`` (``by_exact_match`` and ``by_f1_score``,
            both highest first) and ``performance_data``; ``{}`` when there
            are no comparison results.
        """
        if self.comparison_results is None or self.comparison_results.empty:
            return {}

        model_performance = self._average_scores_by('Model')

        model_performance_analysis = {
            'rankings': {
                'by_exact_match': sorted(model_performance.items(),
                                         key=lambda item: item[1]['avg_exact_match'], reverse=True),
                'by_f1_score': sorted(model_performance.items(),
                                      key=lambda item: item[1]['avg_f1_score'], reverse=True)
            },
            'performance_data': model_performance
        }

        self.analysis_report['model_performance'] = model_performance_analysis
        return model_performance_analysis

    def analyze_metric_performance(self):
        """Average agreement per metric and rank the metrics by difficulty.

        Lower agreement means a harder metric, so the ``easiest_to_hardest_*``
        rankings list the highest-agreement metric first.

        Returns:
            Dict with ``rankings`` and ``performance_data``; ``{}`` when
            there are no comparison results.
        """
        if self.comparison_results is None or self.comparison_results.empty:
            return {}

        metric_performance = self._average_scores_by('Metric')

        metric_performance_analysis = {
            'rankings': {
                # reverse=True: highest agreement (easiest) comes first, as
                # the key names promise.
                'easiest_to_hardest_f1': sorted(metric_performance.items(),
                                                key=lambda item: item[1]['avg_f1_score'], reverse=True),
                'easiest_to_hardest_exact_match': sorted(metric_performance.items(),
                                                         key=lambda item: item[1]['avg_exact_match'],
                                                         reverse=True)
            },
            'performance_data': metric_performance
        }

        self.analysis_report['metric_performance'] = metric_performance_analysis
        return metric_performance_analysis

    def compute_overall_statistics(self):
        """Summarize agreement over every (model, metric) comparison.

        Returns:
            Dict of overall means, counts, the best model and the most
            agreeable metric; ``{}`` when there are no comparison results.
        """
        if self.comparison_results is None or self.comparison_results.empty:
            return {}

        results = self.comparison_results
        by_model = results.groupby('Model')
        by_metric = results.groupby('Metric')

        overall_stats = {
            'overall_exact_match': float(results['Exact Match (%)'].mean()),
            'overall_f1_score': float(results['F1 Score'].mean()),
            'total_comparisons': len(results),
            'total_models': len(self.llm_data),
            'total_metrics': len(METRICS),
            'best_performing_model': {
                'by_exact_match': by_model['Exact Match (%)'].mean().idxmax(),
                'by_f1_score': by_model['F1 Score'].mean().idxmax()
            },
            'most_agreeable_metric': {
                'by_exact_match': by_metric['Exact Match (%)'].mean().idxmax(),
                'by_f1_score': by_metric['F1 Score'].mean().idxmax()
            }
        }

        self.analysis_report['overall_statistics'] = overall_stats
        return overall_stats

    def generate_analysis_report(self):
        """Run the summary analyses and save the full report as JSON.

        Writes ``human_llm_comparison_analysis_report.json`` in the current
        working directory.

        Returns:
            The populated ``self.analysis_report`` dict.
        """
        self.analysis_report['metadata'] = {
            'analysis_timestamp': datetime.now().isoformat(),
            'analysis_type': 'Human vs LLM Evaluation Comparison',
            'total_human_evaluators': len(HUMAN_FILES),
            'total_llm_models': len(LLM_FILES),
            'metrics_compared': METRICS,
            'comparison_method': 'Exact Match and F1 Score (rounded to nearest 0.5)'
        }

        self.analyze_model_performance()
        self.analyze_metric_performance()
        self.compute_overall_statistics()

        # A DataFrame is not JSON serializable; store it as a list of rows.
        if self.comparison_results is not None:
            self.analysis_report['comparison_results'] = self.comparison_results.to_dict('records')

        # Serialize before opening the file so a serialization error cannot
        # leave a truncated report on disk.
        payload = json.dumps(self.analysis_report, indent=2, ensure_ascii=False)
        with open('human_llm_comparison_analysis_report.json', 'w', encoding='utf-8') as f:
            f.write(payload)

        return self.analysis_report

    def print_summary_report(self):
        """Print a formatted summary of the comparison analysis.

        Reads the ``metadata``, ``data_loading``, ``overall_statistics``,
        ``model_performance`` and ``metric_performance`` sections of
        ``self.analysis_report``, so the loading steps and
        ``generate_analysis_report`` must have run first.
        """
        rule = "=" * 80
        report = self.analysis_report

        print("\n" + rule)
        print("COMPREHENSIVE HUMAN vs LLM EVALUATION COMPARISON REPORT")
        print(rule)

        metadata = report['metadata']
        print("\nANALYSIS METADATA:")
        print(f"  Timestamp: {metadata['analysis_timestamp']}")
        print(f"  Human Evaluators: {metadata['total_human_evaluators']}")
        print(f"  LLM Models: {metadata['total_llm_models']}")
        print(f"  Metrics Compared: {', '.join(metadata['metrics_compared'])}")

        loading = report['data_loading']
        print("\nDATA LOADING SUMMARY:")
        human_loading = loading['human_evaluations']
        print(f"  Human Files Loaded: {len(human_loading['files_loaded'])}")
        if human_loading['files_failed']:
            failed_files = ', '.join(item['filename'] for item in human_loading['files_failed'])
            print(f"  Human Files Failed: {failed_files}")
        print(f"  Total Human Evaluations: {human_loading['total_evaluations']}")
        print(f"  Unique QA Pairs: {human_loading['unique_qa_pairs']}")

        llm_loading = loading['llm_evaluations']
        print(f"  LLM Models Processed: {len(llm_loading['models_processed'])}")
        if llm_loading['models_failed']:
            failed_models = ', '.join(item['model_name'] for item in llm_loading['models_failed'])
            print(f"  LLM Models Failed: {failed_models}")

        overall = report['overall_statistics']
        print("\nOVERALL AGREEMENT STATISTICS:")
        print(f"  Overall Exact Match: {overall['overall_exact_match']:.1f}%")
        print(f"  Overall F1 Score: {overall['overall_f1_score']:.3f}")
        print(f"  Total Comparisons: {overall['total_comparisons']}")

        print("\nBEST PERFORMING MODELS:")
        print(f"  By Exact Match: {overall['best_performing_model']['by_exact_match']}")
        print(f"  By F1 Score: {overall['best_performing_model']['by_f1_score']}")

        model_perf = report['model_performance']
        print("\nMODEL RANKINGS (by F1 Score):")
        for i, (model, score) in enumerate(model_perf['rankings']['by_f1_score'], 1):
            print(f"  {i}. {model:25s}: F1={score['avg_f1_score']:.3f}, EM={score['avg_exact_match']:.1f}%")

        metric_perf = report['metric_performance']
        print("\nMETRIC DIFFICULTY (easiest to hardest by F1 Score):")
        for i, (metric, score) in enumerate(metric_perf['rankings']['easiest_to_hardest_f1'], 1):
            print(f"  {i}. {metric:15s}: F1={score['avg_f1_score']:.3f}, EM={score['avg_exact_match']:.1f}%")

        print("\nDETAILED REPORT SAVED TO: human_llm_comparison_analysis_report.json")
        print(rule)

    def run_comparison_analysis(self):
        """Run the whole pipeline: load, compare, report, print, save CSV.

        Returns:
            The populated ``self.analysis_report`` dict.

        Raises:
            ValueError: if no human file, no LLM data, or no comparison
                result could be produced.
        """
        self.load_human_evaluations()
        human_qa_ids = self.human_df['qa_id'].unique()

        self.load_llm_evaluations(human_qa_ids)

        if not self.llm_data:
            raise ValueError("No LLM data could be loaded!")

        self.compare_evaluations()

        if self.comparison_results is None or self.comparison_results.empty:
            raise ValueError("No comparison results generated!")

        self.generate_analysis_report()
        self.print_summary_report()

        # Per-(model, metric) results as a flat CSV next to the JSON report.
        self.comparison_results.to_csv('human_llm_comparison_results.csv', index=False)

        return self.analysis_report

def main():
    """Run the full human-vs-LLM comparison and return its report.

    Any exception raised by the analysis is reported on stdout together
    with its traceback instead of propagating; in that case the function
    returns ``None``.
    """
    analyzer = HumanLLMComparisonAnalyzer()

    try:
        outcome = analyzer.run_comparison_analysis()
    except Exception as exc:
        import traceback

        print(f"\nERROR: {exc}")
        traceback.print_exc()
        return None
    else:
        return outcome


# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    main()