In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict, Counter
import warnings
import json
from datetime import datetime
warnings.filterwarnings('ignore')

# CSV export of each human evaluator, keyed by the anonymised label that is
# written into the `evaluator` column when the files are loaded.
FILES = {
    'Evaluator_01': 'Ev_Ramita_01.csv',
    'Evaluator_02': 'Ev_Sibgha_02.csv',
    'Evaluator_03': 'Ev_Saim_03.csv',
    'Evaluator_04': 'Ev_Shahzad_04.csv',
    'Evaluator_05': 'Ev_Ammar_05.csv'
}

# Raw CSV header -> internal column name. The human evaluation sheets use
# different headers from the names the analysis code works with.
# NOTE: the keys are matched exactly by DataFrame.rename, so the en dash in
# "(1–5)" and the double space in some headers are significant; do not
# "tidy" them.
COLUMN_MAPPINGS = {
    'QA_ID': 'qa_id',
    'Question_Type': 'question_type',
    'Shot_Type': 'shot_type',
    'Question': 'question',
    'Answer': 'answer',
    'Relevance (1–5)': 'Relevance',
    'Accuracy  (1–5)': 'Accuracy',
    'Completeness  (1–5)': 'Completeness',
    'Fluency  (1–5)': 'Fluency',
    'KG Alignment  (1–5)': 'KG_Alignment',
    'Evaluator ID': 'evaluator_id',
    'Comments': 'comments'
}

# Score columns (after renaming) that every analysis aggregates over.
METRICS = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']

# Number of human evaluators. Derived from FILES so that adding or removing an
# evaluator file cannot leave this count out of sync (currently 5).
NUM_EVALUATORS = len(FILES)

class HumanEvaluationAnalyzer:
    def __init__(self):
        self.df = None
        self.majority_vote_df = None
        self.evaluator_majority_agreement = {}
        self.total_qa_pairs = 160  # Total QA pairs evaluated by humans
        self.analysis_report = {
            'metadata': {},
            'data_loading': {},
            'majority_vote_analysis': {},
            'overall_metrics': {},
            'score_distribution': {},
            'shot_type_analysis': {},
            'question_type_analysis': {},
            'interaction_matrix': {},
            'individual_evaluator_analysis': {},
            'inter_evaluator_variance': {}
        }
        
    def load_all_data(self):
        """Load every evaluator CSV, combine them and derive ``Overall_Score``.

        Each file in ``FILES`` is read with ``pd.read_csv``, its headers are
        renamed through ``COLUMN_MAPPINGS`` and an ``evaluator`` column holding
        the evaluator label is added. A file that cannot be read is recorded
        under ``files_failed`` in the loading summary and skipped, so one bad
        file does not abort the whole run.

        After concatenation the metric columns are coerced to numeric values:
        cells that cannot be parsed as numbers become NaN and are counted per
        metric under ``non_numeric_scores`` in the loading summary. Without
        this, a single stray text cell leaves the column as ``object`` dtype
        and breaks the mean/std computations that follow.

        Side effects: sets ``self.df``, stores the loading summary in
        ``self.analysis_report['data_loading']`` and calls
        ``compute_majority_vote()``.

        Returns:
            The combined DataFrame (the same object stored in ``self.df``).

        Raises:
            ValueError: if none of the files could be loaded.
        """
        all_data = []
        loading_info = {'files_loaded': [], 'files_failed': [], 'total_records': 0, 'expected_records': 0}

        for evaluator, filename in FILES.items():
            try:
                df = pd.read_csv(filename)
            except Exception as e:
                # Deliberately broad: loading is best-effort and every failure
                # (missing file, parse error, encoding, ...) is reported in
                # the summary rather than raised.
                loading_info['files_failed'].append({
                    'filename': filename,
                    'evaluator': evaluator,
                    'error': str(e)
                })
                continue

            # Rename columns to match LLM evaluation format
            df = df.rename(columns=COLUMN_MAPPINGS)
            df['evaluator'] = evaluator
            all_data.append(df)
            loading_info['files_loaded'].append({
                'filename': filename,
                'evaluator': evaluator,
                'records': len(df)
            })

        if not all_data:
            raise ValueError("No data files could be loaded!")

        combined_df = pd.concat(all_data, ignore_index=True)
        loading_info['total_records'] = len(combined_df)
        loading_info['expected_records'] = self.total_qa_pairs * NUM_EVALUATORS

        # Force score columns to numeric so that text cells cannot poison the
        # statistics; count how many non-empty cells were lost per metric.
        non_numeric_scores = {}
        for metric in METRICS:
            raw = combined_df.get(metric)
            if not isinstance(raw, pd.Series):
                # Column absent (or duplicated): leave it untouched so any
                # resulting error surfaces exactly as it did before.
                continue
            numeric = pd.to_numeric(raw, errors='coerce')
            dropped = int((numeric.isna() & raw.notna()).sum())
            if dropped:
                non_numeric_scores[metric] = dropped
            combined_df[metric] = numeric
        loading_info['non_numeric_scores'] = non_numeric_scores

        # Ensure Overall_Score is calculated if not present (or incomplete):
        # it is the row-wise mean of the metric columns.
        if 'Overall_Score' not in combined_df.columns or combined_df['Overall_Score'].isna().any():
            combined_df['Overall_Score'] = combined_df[METRICS].mean(axis=1)

        self.df = combined_df
        self.analysis_report['data_loading'] = loading_info

        # Compute majority vote dataframe
        self.compute_majority_vote()

        return combined_df
    
    def compute_majority_vote(self):
        """
        Compute the most common score per QA pair per metric.

        Each human evaluator acts as a judge, so scores are compared across
        evaluators for the same ``qa_id``. Only QA pairs scored by exactly
        ``NUM_EVALUATORS`` distinct evaluators (one row each) are used.

        For every such pair and metric the most common non-missing score is
        stored as ``<metric>_majority`` together with ``<metric>_majority_count``.
        Missing scores (NaN) are not votes: they are ignored when counting, and
        a metric with no scores at all gets a NaN majority with count 0.
        The outcome is classified as:

        * ``full_agreement`` - every evaluator gave the same score;
        * ``majority_exists`` - the top score was given by more than half of
          ``NUM_EVALUATORS``;
        * ``no_majority`` - otherwise. The ``<metric>_majority`` value is then
          only a plurality; on ties it is the tied score that appears first
          in row order.

        Per-evaluator agreement with these values is tallied in the same pass:
        each (QA pair, metric) with a non-missing majority counts as one
        comparison for every evaluator.

        Side effects: sets ``self.majority_vote_df``,
        ``self.evaluator_majority_agreement`` and
        ``self.analysis_report['majority_vote_analysis']``.

        Returns:
            DataFrame with one row per valid QA pair (empty if there is none).
        """
        majority_analysis = {}

        # Group by qa_id to see how many evaluators evaluated each question
        qa_id_counts = self.df.groupby('qa_id')['evaluator'].nunique().reset_index(name='num_evaluators')

        # Distribution of evaluator counts per qa_id (kept in the report for debugging)
        count_distribution = qa_id_counts['num_evaluators'].value_counts().sort_index()
        majority_analysis['evaluator_distribution'] = {
            int(evaluator_count): int(instances)
            for evaluator_count, instances in count_distribution.items()
        }

        # Keep only qa_ids that were evaluated by all evaluators
        valid_qa_ids = qa_id_counts[qa_id_counts['num_evaluators'] == NUM_EVALUATORS]['qa_id'].tolist()

        majority_analysis['total_qa_ids'] = len(qa_id_counts)
        majority_analysis['valid_qa_ids'] = len(valid_qa_ids)

        if not valid_qa_ids:
            majority_analysis['warning'] = f"No qa_ids were evaluated by all {NUM_EVALUATORS} evaluators!"
            self.majority_vote_df = pd.DataFrame()
            self.evaluator_majority_agreement = {}
            self.analysis_report['majority_vote_analysis'] = majority_analysis
            return self.majority_vote_df

        majority_vote_data = []
        agreement_stats_by_metric = {metric: {'full_agreement': 0, 'majority_exists': 0, 'no_majority': 0} for metric in METRICS}
        agreement_stats_by_qa = {'all_metrics_majority': 0, 'some_metrics_majority': 0, 'no_metrics_majority': 0}

        # Per-evaluator tallies, filled while walking the QA pairs.
        comparisons = Counter()
        agreements = Counter()

        valid_lookup = set(valid_qa_ids)
        # A single grouped pass replaces one full-frame boolean filter per
        # qa_id; groups come out in the same sorted order as valid_qa_ids.
        for qa_id, qa_data in self.df.groupby('qa_id'):
            if qa_id not in valid_lookup:
                continue

            if len(qa_data) != NUM_EVALUATORS:
                continue  # Duplicate rows: not exactly one evaluation per evaluator

            # Metadata is taken from the first row of the group
            first_row = qa_data.iloc[0]
            row_data = {
                'qa_id': qa_id,
                'question_type': first_row['question_type'],
                'shot_type': first_row['shot_type'],
                'question': first_row['question'],
                'answer': first_row['answer']
            }

            evaluator_labels = qa_data['evaluator'].tolist()
            metrics_with_majority = 0

            for metric in METRICS:
                all_votes = qa_data[metric].tolist()
                # Missing scores are not votes; NaN never equals itself, so
                # leaving them in would give each NaN its own Counter key.
                votes = qa_data[metric].dropna().tolist()

                if not votes:
                    row_data[f'{metric}_majority'] = np.nan
                    row_data[f'{metric}_majority_count'] = 0
                    agreement_stats_by_metric[metric]['no_majority'] += 1
                    continue

                # Most common score and how many evaluators gave it
                majority_score, majority_count = Counter(votes).most_common(1)[0]
                row_data[f'{metric}_majority'] = majority_score
                row_data[f'{metric}_majority_count'] = majority_count

                if majority_count == len(qa_data):
                    # Full agreement: every evaluator gave this score
                    agreement_stats_by_metric[metric]['full_agreement'] += 1
                    metrics_with_majority += 1
                elif majority_count > NUM_EVALUATORS / 2:
                    # Simple majority
                    agreement_stats_by_metric[metric]['majority_exists'] += 1
                    metrics_with_majority += 1
                else:
                    # No clear majority (tie or no score exceeds half)
                    agreement_stats_by_metric[metric]['no_majority'] += 1

                # Compare each evaluator's own score with the majority value.
                # A missing score still counts as a comparison (and never
                # matches).
                for evaluator, vote in zip(evaluator_labels, all_votes):
                    comparisons[evaluator] += 1
                    if vote == majority_score:
                        agreements[evaluator] += 1

            # Track QA-level agreement
            if metrics_with_majority == len(METRICS):
                agreement_stats_by_qa['all_metrics_majority'] += 1
            elif metrics_with_majority > 0:
                agreement_stats_by_qa['some_metrics_majority'] += 1
            else:
                agreement_stats_by_qa['no_metrics_majority'] += 1

            # Overall score is the mean of the per-metric majority values
            majority_scores = [row_data[f'{metric}_majority'] for metric in METRICS]
            row_data['Overall_Score_majority'] = np.mean(majority_scores) if majority_scores else np.nan

            majority_vote_data.append(row_data)

        self.majority_vote_df = pd.DataFrame(majority_vote_data)

        # Store agreement statistics
        majority_analysis['agreement_by_metric'] = agreement_stats_by_metric
        majority_analysis['agreement_by_qa'] = agreement_stats_by_qa
        majority_analysis['total_majority_vote_pairs'] = len(self.majority_vote_df)

        # Per-evaluator agreement, reported in order of first appearance in
        # the data; evaluators without any comparison are left out.
        evaluator_agreement_stats = {}
        for evaluator in self.df['evaluator'].unique():
            total_comparisons = comparisons[evaluator]
            if total_comparisons > 0:
                matched = agreements[evaluator]
                evaluator_agreement_stats[evaluator] = {
                    'agreements': matched,
                    'total': total_comparisons,
                    'percentage': (matched / total_comparisons) * 100
                }

        self.evaluator_majority_agreement = evaluator_agreement_stats
        majority_analysis['evaluator_agreement_stats'] = evaluator_agreement_stats
        self.analysis_report['majority_vote_analysis'] = majority_analysis

        return self.majority_vote_df
    
    def normalize_to_percentage(self, score, max_score=5):
        """Convert score to percentage (0-100 scale)."""
        return (score / max_score) * 100
    
    def compute_overall_metrics(self):
        """1. Mean, standard deviation and mean-as-percentage for each metric.

        Statistics are taken over every row of ``self.df``, for each entry of
        ``METRICS`` followed by ``Overall_Score``. The result is stored in
        ``self.analysis_report['overall_metrics']`` and returned.
        """
        summary = {}
        # Individual metrics first, then the combined score, so key order in
        # the report is stable.
        for column in [*METRICS, 'Overall_Score']:
            values = self.df[column]
            average = values.mean()
            spread = values.std()
            summary[column] = {
                'mean': float(average),
                'std': float(spread),
                'mean_pct': float(self.normalize_to_percentage(average)),
            }

        self.analysis_report['overall_metrics'] = summary
        return summary
    
    def compute_overall_score_distribution(self):
        """2. Compute overall score distribution."""
        overall_scores = self.df['Overall_Score']
        
        distribution = {
            'statistics': {
                'mean': float(overall_scores.mean()),
                'std': float(overall_scores.std()),
                'min': float(overall_scores.min()),
                'max': float(overall_scores.max())
            },
            'ranges': {}
        }
        
        # Distribution by ranges
        ranges = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 5.0)]
        
        for start, end in ranges:
            if start == 4.0:  # Include 5.0 in the last range
                count = ((overall_scores >= start) & (overall_scores <= end)).sum()
            else:
                count = ((overall_scores >= start) & (overall_scores < end)).sum()
            pct = (count / len(overall_scores)) * 100
            
            distribution['ranges'][f'{start:.1f}-{end:.1f}'] = {
                'count': int(count),
                'percentage': float(pct)
            }
        
        self.analysis_report['score_distribution'] = distribution
        return distribution
    
    def compute_breakdown_by_shot_type(self):
        """3. Per-metric and overall statistics for each shot type.

        Shot types are the distinct non-null, non-blank values of the
        ``shot_type`` column, processed in sorted order. For each one the
        result holds the row count, the number of distinct ``qa_id`` values,
        mean/std/percentage per metric and the same for ``Overall_Score``.
        Stored in ``self.analysis_report['shot_type_analysis']`` and returned.
        """
        labels = sorted(
            [value for value in self.df['shot_type'].dropna().unique() if str(value).strip()]
        )

        breakdown = {}
        for label in labels:
            subset = self.df[self.df['shot_type'] == label]

            per_metric = {}
            for metric in METRICS:
                average = subset[metric].mean()
                spread = subset[metric].std()
                per_metric[metric] = {
                    'mean': float(average),
                    'std': float(spread),
                    'mean_pct': float(self.normalize_to_percentage(average)),
                }

            overall_average = subset['Overall_Score'].mean()
            overall_spread = subset['Overall_Score'].std()

            breakdown[label] = {
                'count': len(subset),
                'unique_qa': subset['qa_id'].nunique(),
                'metrics': per_metric,
                'overall': {
                    'mean': float(overall_average),
                    'std': float(overall_spread),
                    'pct': float(self.normalize_to_percentage(overall_average)),
                },
            }

        self.analysis_report['shot_type_analysis'] = breakdown
        return breakdown
    
    def compute_breakdown_by_question_type(self):
        """4. Per-metric and overall statistics for each question type.

        Question types are the distinct non-null, non-blank values of the
        ``question_type`` column, processed in sorted order. For each one the
        result holds the row count, the number of distinct ``qa_id`` values,
        mean/std/percentage per metric and the same for ``Overall_Score``.
        Stored in ``self.analysis_report['question_type_analysis']`` and
        returned.
        """
        labels = sorted(
            [value for value in self.df['question_type'].dropna().unique() if str(value).strip()]
        )

        breakdown = {}
        for label in labels:
            subset = self.df[self.df['question_type'] == label]

            per_metric = {}
            for metric in METRICS:
                average = subset[metric].mean()
                spread = subset[metric].std()
                per_metric[metric] = {
                    'mean': float(average),
                    'std': float(spread),
                    'mean_pct': float(self.normalize_to_percentage(average)),
                }

            overall_average = subset['Overall_Score'].mean()
            overall_spread = subset['Overall_Score'].std()

            breakdown[label] = {
                'count': len(subset),
                'unique_qa': subset['qa_id'].nunique(),
                'metrics': per_metric,
                'overall': {
                    'mean': float(overall_average),
                    'std': float(overall_spread),
                    'pct': float(self.normalize_to_percentage(overall_average)),
                },
            }

        self.analysis_report['question_type_analysis'] = breakdown
        return breakdown
    
    def compute_shot_type_x_question_type_matrix(self):
        """5. Create 2D matrix of shot type × question type."""
        # Create pivot table
        pivot_df = self.df.pivot_table(
            values='Overall_Score',
            index='question_type',
            columns='shot_type',
            aggfunc='mean'
        )
        
        # Convert to dictionary format for JSON serialization
        matrix_data = {}
        shot_types = sorted([s for s in self.df['shot_type'].dropna().unique() if str(s).strip()])
        q_types = sorted([q for q in self.df['question_type'].dropna().unique() if str(q).strip()])
        
        for q_type in q_types:
            matrix_data[q_type] = {}
            for shot_type in shot_types:
                if q_type in pivot_df.index and shot_type in pivot_df.columns:
                    score = pivot_df.loc[q_type, shot_type]
                    if pd.notna(score):
                        pct = self.normalize_to_percentage(score)
                        matrix_data[q_type][shot_type] = {
                            'score': float(score),
                            'percentage': float(pct)
                        }
                    else:
                        matrix_data[q_type][shot_type] = None
                else:
                    matrix_data[q_type][shot_type] = None
        
        self.analysis_report['interaction_matrix'] = matrix_data
        return matrix_data
    
    def compute_individual_evaluator_analysis(self):
        """6. Per-evaluator statistics, overall and split by question/shot type.

        For each evaluator (in sorted order) the result contains:

        * ``overall_performance`` - mean/std/percentage per metric and for
          ``Overall_Score``;
        * ``by_question_type`` / ``by_shot_type`` - mean ``Overall_Score``,
          its percentage and the row count per label;
        * ``interaction_matrix`` - mean ``Overall_Score`` per question type x
          shot type, ``None`` where a combination has no score.

        Stored in ``self.analysis_report['individual_evaluator_analysis']`` and
        returned.
        """
        def clean_labels(frame, column):
            # Distinct non-null, non-blank labels of a column, sorted.
            return sorted([v for v in frame[column].dropna().unique() if str(v).strip()])

        def score_stats(values):
            # mean / std / percentage summary of one score column.
            average = values.mean()
            spread = values.std()
            return {
                'mean': float(average),
                'std': float(spread),
                'mean_pct': float(self.normalize_to_percentage(average)),
            }

        def overall_by(frame, column):
            # Mean Overall_Score (plus percentage and row count) per label.
            grouped = {}
            for label in clean_labels(frame, column):
                subset = frame[frame[column] == label]
                average = subset['Overall_Score'].mean()
                grouped[label] = {
                    'mean': float(average),
                    'percentage': float(self.normalize_to_percentage(average)),
                    'count': len(subset),
                }
            return grouped

        evaluator_analysis = {}
        for evaluator in sorted(self.df['evaluator'].unique()):
            rows = self.df[self.df['evaluator'] == evaluator]

            # 6.1 Overall performance: every metric, then the combined score
            overall_performance = {
                column: score_stats(rows[column])
                for column in [*METRICS, 'Overall_Score']
            }

            # 6.2 / 6.3 Breakdown by question type and by shot type
            by_question_type = overall_by(rows, 'question_type')
            by_shot_type = overall_by(rows, 'shot_type')

            # 6.4 Shot type x question type matrix for this evaluator
            pivot = rows.pivot_table(
                index='question_type',
                columns='shot_type',
                values='Overall_Score',
                aggfunc='mean',
            )

            matrix = {}
            if not pivot.empty:
                shot_types = clean_labels(rows, 'shot_type')
                for q_type in clean_labels(rows, 'question_type'):
                    matrix[q_type] = {}
                    for shot_type in shot_types:
                        entry = None
                        if q_type in pivot.index and shot_type in pivot.columns:
                            score = pivot.loc[q_type, shot_type]
                            if pd.notna(score):
                                entry = {
                                    'score': float(score),
                                    'percentage': float(self.normalize_to_percentage(score)),
                                }
                        matrix[q_type][shot_type] = entry

            evaluator_analysis[evaluator] = {
                'overall_performance': overall_performance,
                'by_question_type': by_question_type,
                'by_shot_type': by_shot_type,
                'interaction_matrix': matrix,
            }

        self.analysis_report['individual_evaluator_analysis'] = evaluator_analysis
        return evaluator_analysis
        
    def compute_inter_evaluator_variance(self):
        """7. Average spread of scores across evaluators, per metric.

        For every ``qa_id`` with at least two rows, the variance and standard
        deviation of the scores (``np.var`` / ``np.std``) are computed for each
        metric and for ``Overall_Score``; these per-question values are then
        averaged over all such questions. Stored in
        ``self.analysis_report['inter_evaluator_variance']`` and returned.
        """
        score_columns = [*METRICS, 'Overall_Score']

        # One record per question: spread of the scores it received.
        per_question = []
        # sort=False keeps questions in order of first appearance.
        for qa_id, ratings in self.df.groupby('qa_id', sort=False):
            if len(ratings) < 2:  # a single rating has no spread to measure
                continue

            spread = {'qa_id': qa_id}
            for column in score_columns:
                values = ratings[column].values
                spread[f'{column}_variance'] = np.var(values)
                spread[f'{column}_std'] = np.std(values)
            per_question.append(spread)

        variance_df = pd.DataFrame(per_question)

        # Average the per-question spread; columns are absent when no question
        # had two or more ratings, in which case the result stays empty.
        variance_analysis = {}
        for column in score_columns:
            if f'{column}_variance' in variance_df.columns:
                variance_analysis[column] = {
                    'average_variance': float(variance_df[f'{column}_variance'].mean()),
                    'average_std': float(variance_df[f'{column}_std'].mean()),
                }

        self.analysis_report['inter_evaluator_variance'] = variance_analysis
        return variance_analysis
    
    def generate_analysis_report(self):
        """Add run metadata to the report and save it as JSON.

        The metadata section records the timestamp, counts derived from
        ``self.df`` (all zero when no data has been loaded) and the evaluation
        configuration. The whole report is then written to
        ``human_evaluation_analysis_report.json`` in the working directory.

        Non-finite floats (NaN / infinity, e.g. the standard deviation of a
        single-row group) are written as ``null``: the ``json`` module would
        otherwise emit bare ``NaN`` tokens, which are not valid JSON and are
        rejected by strict parsers. Only the file content is affected; the
        in-memory report keeps its original values.

        Returns:
            ``self.analysis_report`` (including the new metadata section).
        """
        df = self.df
        if df is not None:
            total_evaluators = len(df['evaluator'].unique())
            total_shot_types = len([s for s in df['shot_type'].dropna().unique() if str(s).strip()])
            total_question_types = len([q for q in df['question_type'].dropna().unique() if str(q).strip()])
            total_records = len(df)
        else:
            total_evaluators = total_shot_types = total_question_types = total_records = 0

        # Add metadata
        self.analysis_report['metadata'] = {
            'analysis_timestamp': datetime.now().isoformat(),
            'analysis_type': 'Human Evaluation',
            'total_evaluators': total_evaluators,
            'total_shot_types': total_shot_types,
            'total_question_types': total_question_types,
            'total_records': total_records,
            'expected_qa_pairs': self.total_qa_pairs,
            'metrics_evaluated': METRICS,
            'num_evaluators': NUM_EVALUATORS
        }

        def json_safe(value):
            # Copy of `value` in which NaN / +-inf floats are replaced by None.
            if isinstance(value, dict):
                return {key: json_safe(item) for key, item in value.items()}
            if isinstance(value, (list, tuple)):
                return [json_safe(item) for item in value]
            if isinstance(value, float) and not np.isfinite(value):
                return None
            return value

        # Save as JSON
        with open('human_evaluation_analysis_report.json', 'w', encoding='utf-8') as f:
            json.dump(json_safe(self.analysis_report), f, indent=2, ensure_ascii=False)

        return self.analysis_report
    
    def print_summary_report(self):
        """Print a formatted summary of the analysis."""
        print("\n" + "="*80)
        print("COMPREHENSIVE HUMAN-BASED QA EVALUATION ANALYSIS REPORT")
        print("="*80)
        
        # Metadata
        metadata = self.analysis_report['metadata']
        print(f"\nANALYSIS METADATA:")
        print(f"  Timestamp: {metadata['analysis_timestamp']}")
        print(f"  Total Evaluators: {metadata['total_evaluators']}")
        print(f"  Total Records: {metadata['total_records']}")
        print(f"  Expected QA Pairs: {metadata['expected_qa_pairs']}")
        print(f"  Metrics: {', '.join(metadata['metrics_evaluated'])}")
        
        # Data Loading Summary
        loading = self.analysis_report['data_loading']
        print(f"\nDATA LOADING SUMMARY:")
        print(f"  Files Successfully Loaded: {len(loading['files_loaded'])}")
        print(f"  Files Failed: {len(loading['files_failed'])}")
        print(f"  Total Records: {loading['total_records']}")
        print(f"  Expected Records: {loading['expected_records']}")
        
        # Overall Metrics
        overall = self.analysis_report['overall_metrics']
        print(f"\nOVERALL PERFORMANCE:")
        for metric, stats in overall.items():
            print(f"  {metric:15s}: {stats['mean']:.3f} ± {stats['std']:.3f} ({stats['mean_pct']:.1f}%)")
        
        # Best Performing Evaluators (by overall score)
        evaluator_scores = []
        for evaluator, data in self.analysis_report['individual_evaluator_analysis'].items():
            overall_score = data['overall_performance']['Overall_Score']['mean']
            evaluator_scores.append((evaluator, overall_score))
        
        evaluator_scores.sort(key=lambda x: x[1], reverse=True)
        print(f"\nEVALUATOR RANKING (by Overall Score):")
        for i, (evaluator, score) in enumerate(evaluator_scores, 1):
            pct = self.normalize_to_percentage(score)
            print(f"  {i}. {evaluator:30s}: {score:.3f} ({pct:.1f}%)")
        
        # Majority Vote Summary
        if 'majority_vote_analysis' in self.analysis_report:
            majority = self.analysis_report['majority_vote_analysis']
            if 'evaluator_agreement_stats' in majority:
                print(f"\nEVALUATOR AGREEMENT WITH MAJORITY VOTE:")
                for evaluator, stats in majority['evaluator_agreement_stats'].items():
                    print(f"  {evaluator:30s}: {stats['percentage']:.1f}% ({stats['agreements']}/{stats['total']})")
        
        # Inter-evaluator Variance
        if 'inter_evaluator_variance' in self.analysis_report:
            variance = self.analysis_report['inter_evaluator_variance']
            print(f"\nINTER-EVALUATOR VARIANCE:")
            for metric, stats in variance.items():
                print(f"  {metric:15s}: Avg Variance={stats['average_variance']:.3f}, Avg Std={stats['average_std']:.3f}")
        
        print(f"\nDETAILED REPORT SAVED TO: human_evaluation_analysis_report.json")
        print("="*80)
    
    def run_all_analyses(self):
        """Run the full pipeline: load data, analyse, save and print the report.

        Returns:
            ``self.analysis_report`` once every section has been filled in.
        """
        # Loading also computes the majority vote.
        self.load_all_data()

        # Each step fills its own section of the report; order is preserved.
        analysis_steps = (
            self.compute_overall_metrics,
            self.compute_overall_score_distribution,
            self.compute_breakdown_by_shot_type,
            self.compute_breakdown_by_question_type,
            self.compute_shot_type_x_question_type_matrix,
            self.compute_individual_evaluator_analysis,
            self.compute_inter_evaluator_variance,
        )
        for step in analysis_steps:
            step()

        # Persist the report, then show the console summary.
        self.generate_analysis_report()
        self.print_summary_report()

        return self.analysis_report

def main():
    """Run every analysis with a fresh analyzer and return its report."""
    return HumanEvaluationAnalyzer().run_all_analyses()


if __name__ == "__main__":
    main()