In [1]:
import json
import pandas as pd
import requests
import time
from typing import Dict, List, Optional
import logging
from tqdm import tqdm
import re
import os
from datetime import datetime

class ImprovedLLMJudgeEvaluator:
    """
     LLM Judge Evaluation System with detailed scoring criteria 
    aligned with human evaluation guidelines.
    """
    
    def __init__(self, api_key: str, model_name: str = "mistralai/mixtral-8x22b-instruct-v0.1"):
        """
        Initialize the LLM Judge Evaluator.
        
        Args:
            api_key: NScale API key
            model_name: Model identifier for the LLM judge
        """
        self.api_key = api_key
        self.model_name = model_name
        self.base_url = "https://inference.api.nscale.com/v1/chat/completions"
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        
        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        # evaluation prompt with detailed criteria
        self.evaluation_prompt = """You are a STRICT QA evaluator following precise scoring guidelines.

You will evaluate a model-generated question-answer pair on FIVE metrics using a 1-5 scale where:
5 = Excellent  
4 = Good  
3 = Fair  
2 = Poor  
1 = Very Poor

==============================
DETAILED SCORING CRITERIA
==============================

1. **RELEVANCE (1–5):** Does the question appropriately relate to the source text?  
   5: Perfectly relevant to the source, clearly grounded in the text  
   4: Mostly relevant, with minor off-topic elements  
   3: Addresses the main question but misses some important points  
   2: Loosely related, with significant tangents or irrelevance  
   1: Entirely irrelevant or unrelated to the source content

2. **ACCURACY (1–5):** Is the answer factually correct based on the source text?  
   5: All facts are accurate and fully verifiable in the context  
   4: Mostly accurate; contains only minor factual issues  
   3: Some factual inconsistencies or assumptions  
   2: Several factual errors that affect reliability  
   1: Mostly inaccurate or misleading information

3. **COMPLETENESS (1–5):** Does the answer fully address the question?  
   5: Thorough and complete response  
   4: Covers most parts but misses minor aspects  
   3: Addresses main part, omits some key details  
   2: Partial answer with significant gaps  
   1: Severely incomplete or off-topic

4. **FLUENCY (1–5):** Is the answer well-written and grammatically correct?  
   5: Excellent grammar and clarity; highly readable  
   4: Minor grammatical or structural issues  
   3: Understandable, but contains noticeable language errors  
   2: Somewhat unclear due to poor grammar or phrasing  
   1: Difficult to read or understand

5. **KG ALIGNMENT (1–5):** How well does the answer reflect the KG triples?  
   5: Effectively uses KG relationships; no contradictions; may include additional relevant info  
   4: Uses most relevant KG information correctly; may omit minor details, no contradictions  
   3: Uses some KG information; may miss important relationships but generally consistent  
   2: Limited use of KG information; may contain minor contradictions or misinterpretations  
   1: Ignores KG information entirely or includes clear contradictions

==============================
INPUT
==============================
**Question:** {question}  
**Answer:** {answer}  
**Source Context:** {source_context}  
**Knowledge Graph Triples:** {kg_triples}

==============================
RESPONSE FORMAT
==============================

Relevance: X  
Accuracy: X  
Completeness: X  
Fluency: X  
KG_Alignment: X

Where X is a number from 1 to 5 """



    def extract_triples_from_item(self, item: Dict) -> str:
        """Extract and format KG triples from a QA item."""
        def extract_all_triples(obj):
            triples = []
            if isinstance(obj, dict):
                if set(obj.keys()) >= {"subject", "predicate", "object"}:
                    triple = f"{obj['subject']} → {obj['predicate']} → {obj['object']}"
                    triples.append(triple)
                for value in obj.values():
                    triples.extend(extract_all_triples(value))
            elif isinstance(obj, list):
                for item in obj:
                    triples.extend(extract_all_triples(item))
            return triples
        
        triples = extract_all_triples(item)
        if triples:
            formatted_triples = []
            for i, triple in enumerate(triples, 1):
                formatted_triples.append(f"{i}. {triple}")
            return "\n".join(formatted_triples)
        return "No KG triples available"

    def get_source_context(self, item: Dict) -> str:
        """Extract source context from QA item."""
        # Check if source_context is nested under ground_truth
        if "ground_truth" in item and isinstance(item["ground_truth"], dict):
            return item["ground_truth"].get("source_context", "")
        
        # Fallback to root level
        return item.get("source_context", "No source context available")

    def call_llm_judge(self, question: str, answer: str, source_context: str, kg_triples: str, 
                      max_retries: int = 3) -> Optional[Dict[str, int]]:
        """
        Call the LLM judge to evaluate a QA pair with retry logic.
        """
        # Format the prompt
        prompt = self.evaluation_prompt.format(
            question=question,
            answer=answer,
            source_context=source_context,
            kg_triples=kg_triples
        )
        
        # Prepare payload
        payload = {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.0,  
            "max_tokens": 800,
            "top_p": 0.8
        }
    
        for attempt in range(max_retries):
            try:
                # API call
                response = requests.post(
                    self.base_url,
                    headers=self.headers,
                    json=payload,
                    timeout=45
                )
                
                if response.status_code == 200:
                    result = response.json()
                    content = result['choices'][0]['message']['content']
                    
                    if content:
                        parsed_result = self.parse_evaluation_response(content)
                        if parsed_result:
                            return parsed_result
                    else:
                        self.logger.warning(f"Empty response content from Mixtral")
                
                elif response.status_code == 429:
                    wait_time = 2 ** attempt
                    self.logger.warning(f"Rate limit hit, waiting {wait_time}s before retry {attempt + 1}/{max_retries}")
                    time.sleep(wait_time)
                else:
                    self.logger.error(f"API Error {response.status_code}: {response.text}")
            
            except requests.exceptions.Timeout:
                self.logger.warning(f"Timeout on attempt {attempt + 1}/{max_retries}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
            
            except Exception as e:
                self.logger.error(f"Request failed on attempt {attempt + 1}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)
    
        return None

    def parse_evaluation_response(self, response: str) -> Optional[Dict[str, int]]:
        """
       Enhanced parser that can handle DeepSeek reasoning + extract scores
        """
        try:
            scores = {}
            
            # Method 1: Standard format parsing (existing logic)
            lines = response.strip().split('\n')
            for line in lines:
                line = line.strip()
                if ':' in line:
                    parts = line.split(':', 1)
                    if len(parts) == 2:
                        metric = parts[0].strip()
                        score_text = parts[1].strip()
                        
                        score_match = re.search(r':\s*([1-5])(?:/5)?', line)
                        if score_match:
                            score = int(score_match.group(1))
                            
                            metric_lower = metric.lower()
                            if 'relevance' in metric_lower:
                                scores['Relevance'] = score
                            elif 'accuracy' in metric_lower:
                                scores['Accuracy'] = score
                            elif 'completeness' in metric_lower:
                                scores['Completeness'] = score
                            elif 'fluency' in metric_lower:
                                scores['Fluency'] = score
                            elif 'kg' in metric_lower or 'alignment' in metric_lower:
                                scores['KG_Alignment'] = score
            
            # If standard parsing fails, extract from reasoning text
            expected_metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
            if len(scores) < 5:
                # Extract from reasoning patterns like "Relevance should be a 5"
                text_lower = response.lower()
                
                if 'Relevance' not in scores:
                    match = re.search(r'relevance.*?(?:should be|is).*?([1-5])', text_lower)
                    if match:
                        scores['Relevance'] = int(match.group(1))
                
                if 'Accuracy' not in scores:
                    match = re.search(r'accuracy.*?(?:should be|is).*?([1-5])', text_lower)
                    if match:
                        scores['Accuracy'] = int(match.group(1))
                
                if 'Completeness' not in scores:
                    match = re.search(r'completeness.*?(?:should be|is).*?([1-5])', text_lower)
                    if match:
                        scores['Completeness'] = int(match.group(1))
                
                if 'Fluency' not in scores:
                    match = re.search(r'fluency.*?(?:should be|is).*?([1-5])', text_lower)
                    if match:
                        scores['Fluency'] = int(match.group(1))
                
                if 'KG_Alignment' not in scores:
                    match = re.search(r'(?:kg|alignment).*?(?:should be|is).*?([1-5])', text_lower)
                    if match:
                        scores['KG_Alignment'] = int(match.group(1))
            
            
           
            if len(scores) > 0 and len(scores) < 5:
                self.logger.warning(f"Got partial response with {len(scores)} scores: {scores}")
                # Fill missing scores with average of existing ones
                if len(scores) >= 2:
                    avg_score = round(sum(scores.values()) / len(scores))
                    expected_metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
                    for metric in expected_metrics:
                        if metric not in scores:
                            scores[metric] = avg_score
                            self.logger.info(f"Filled missing {metric} with average {avg_score}")
                    return scores
            
            # Final validation (existing code)
            if all(metric in scores for metric in expected_metrics):
                return scores
            else:
                self.logger.warning(f"Missing metrics in response: {response}")
                return self._parse_with_fallback(response)
                
        except Exception as e:
            self.logger.error(f"Failed to parse response: {response}. Error: {str(e)}")
            return None
    
    def _parse_with_fallback(self, response: str) -> Optional[Dict[str, int]]:
        """Fallback parsing with multiple regex patterns."""
        patterns = [
            r':\s*([1-5])(?:/5)?',          #  pattern
            r'\b([1-5])\b',                 
            r'([1-5])\s*(?:out of 5|/5)?'   
        ]
        
        for pattern in patterns:
            try:
                scores = {}
                lines = response.strip().split('\n')
                
                for line in lines:
                    if ':' in line:
                        parts = line.split(':', 1)
                        if len(parts) == 2:
                            metric = parts[0].strip()
                            score_text = parts[1].strip()
                            
                            score_match = re.search(pattern, score_text)
                            if score_match:
                                score = int(score_match.group(1))
                                
                                # Normalize metric names
                                metric_lower = metric.lower()
                                if 'relevance' in metric_lower:
                                    scores['Relevance'] = score
                                elif 'accuracy' in metric_lower:
                                    scores['Accuracy'] = score
                                elif 'completeness' in metric_lower:
                                    scores['Completeness'] = score
                                elif 'fluency' in metric_lower:
                                    scores['Fluency'] = score
                                elif 'kg' in metric_lower or 'alignment' in metric_lower:
                                    scores['KG_Alignment'] = score
                
                # Check if this pattern worked
                expected_metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
                if all(metric in scores for metric in expected_metrics):
                    return scores
                    
            except Exception:
                continue
        
        return None

    def validate_evaluation_scores(self, evaluation: Dict[str, int], qa_id: str) -> Dict[str, int]:
        """
        Validate and clamp evaluation scores to valid range [1, 5].
        """
        validated = {}
        issues = []
        
        for metric, score in evaluation.items():
            original_score = score
            
            # Handle non-numeric values
            if not isinstance(score, (int, float)):
                try:
                    score = int(score)
                except (ValueError, TypeError):
                    score = 3  # Default to middle score
                    issues.append(f"{metric}: non-numeric '{original_score}' → {score}")
            
            # Clamp to valid range
            clamped_score = min(max(int(score), 1), 5)
            validated[metric] = clamped_score
            
            # Log if clamped
            if clamped_score != original_score:
                issues.append(f"{metric}: {original_score} → {clamped_score}")
        
        # Log any issues
        if issues:
            self.logger.warning(f"Score validation issues for {qa_id}: {'; '.join(issues)}")
        
        return validated

    def load_qa_dataset(self, file_path: str) -> List[Dict]:
        """Load QA dataset from JSON file."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            queries = data.get('queries', [])
            self.logger.info(f"Loaded {len(queries)} QA pairs from {file_path}")
            return queries
            
        except Exception as e:
            self.logger.error(f"Failed to load dataset: {str(e)}")
            return []

    def save_checkpoint(self, results: List[Dict], checkpoint_path: str):
        """Save evaluation results as checkpoint."""
        try:
            df = pd.DataFrame(results)
            df.to_csv(checkpoint_path, index=False)
            self.logger.info(f"Checkpoint saved: {len(results)} results to {checkpoint_path}")
        except Exception as e:
            self.logger.error(f"Failed to save checkpoint: {str(e)}")

    def load_checkpoint(self, checkpoint_path: str) -> List[Dict]:
        """Load evaluation results from checkpoint."""
        try:
            if os.path.exists(checkpoint_path):
                df = pd.read_csv(checkpoint_path)
                results = df.to_dict('records')
                self.logger.info(f"Loaded checkpoint: {len(results)} results from {checkpoint_path}")
                return results
            return []
        except Exception as e:
            self.logger.error(f"Failed to load checkpoint: {str(e)}")
            return []

    def evaluate_dataset(self, dataset_path: str, output_path: str, 
                        sample_size: Optional[int] = None, 
                        delay_seconds: float = 1.0,
                        checkpoint_interval: int = 50) -> pd.DataFrame:
        """
        Evaluate QA dataset using LLM judge with checkpointing.
        
        Args:
            dataset_path: Path to the QA dataset JSON file
            output_path: Path to save evaluation results
            sample_size: Number of samples to evaluate (None for all)
            delay_seconds: Delay between API calls to avoid rate limits
            checkpoint_interval: Save checkpoint every N evaluations
            
        Returns:
            DataFrame with evaluation results
        """
        # Load dataset
        qa_items = self.load_qa_dataset(dataset_path)
        if not qa_items:
            self.logger.error("No QA items loaded. Exiting.")
            return pd.DataFrame()
        
        # Sample subset if requested
        if sample_size and sample_size < len(qa_items):
            import random
            random.seed(42)  # For reproducibility
            qa_items = random.sample(qa_items, sample_size)
            self.logger.info(f"Sampling {sample_size} items for evaluation")
        
        # Setup checkpoint
        checkpoint_path = f"{output_path}.checkpoint"
        results = self.load_checkpoint(checkpoint_path)
        
        # Track processed items
        processed_ids = {result['qa_id'] for result in results} if results else set()
        
        # Filter unprocessed items
        remaining_items = [item for item in qa_items 
                          if item.get('id', f'item_{qa_items.index(item)}') not in processed_ids]
        
        if processed_ids:
            self.logger.info(f"Resuming from checkpoint: {len(results)} completed, {len(remaining_items)} remaining")
        
        failed_evaluations = 0
        
        # Progress bar with correct total and initial values
        total_items = len(qa_items)
        completed_items = len(results)
        pbar = tqdm(
            remaining_items, 
            desc="Evaluating QA pairs",
            total=total_items,
            initial=completed_items,
            unit="items"
        )
        
        for i, item in enumerate(remaining_items):
            try:
                # Extract data from item
                qa_id = item.get('id', f'item_{qa_items.index(item)}')
                question = item.get('question', '')
                answer = item.get('answer', '')
                question_type = item.get('question_type', 'unknown')
                
                # Extract source context and KG triples
                source_context = self.get_source_context(item)
                kg_triples = self.extract_triples_from_item(item)
                
                # Skip if essential data is missing
                if not question or not answer:
                    self.logger.warning(f"Skipping item {qa_id}: missing question or answer")
                    pbar.update(1)
                    continue
                
                # Call LLM judge
                evaluation = self.call_llm_judge(question, answer, source_context, kg_triples)
                
                if evaluation:
                    evaluation = self.validate_evaluation_scores(evaluation, qa_id)   

                    # Store results
                    result = {
                        'qa_id': qa_id,
                        'question_type': question_type,
                        'question': question,
                        'answer': answer,
                        'Relevance': evaluation['Relevance'],
                        'Accuracy': evaluation['Accuracy'],
                        'Completeness': evaluation['Completeness'],
                        'Fluency': evaluation['Fluency'],
                        'KG_Alignment': evaluation['KG_Alignment'],
                        'Overall_Score': sum(evaluation.values()) / len(evaluation)
                    }
                    results.append(result)
                    
                    # Update progress bar with detailed status
                    pbar.set_postfix({
                        'Completed': len(results),
                        'Failed': failed_evaluations,
                        'Success_Rate': f"{len(results)/(len(results)+failed_evaluations)*100:.1f}%",
                        'Last_Score': f"{result['Overall_Score']:.1f}"
                    })
                    
                    # Save checkpoint
                    if len(results) % checkpoint_interval == 0:
                        self.save_checkpoint(results, checkpoint_path)
                    
                else:
                    failed_evaluations += 1
                    self.logger.warning(f"Failed to evaluate item {qa_id}")
                
                # Update progress bar
                pbar.update(1)
                
                # Rate limiting
                if delay_seconds > 0:
                    time.sleep(delay_seconds)
                    
            except Exception as e:
                failed_evaluations += 1
                self.logger.error(f"Error processing item {qa_id}: {str(e)}")
                pbar.update(1)
                continue
        
        pbar.close()
        
        # Final save
        if results:
            df = pd.DataFrame(results)
            df.to_csv(output_path, index=False)
            
            # Clean up checkpoint
            if os.path.exists(checkpoint_path):
                os.remove(checkpoint_path)
            
            # Print summary statistics
            self.print_evaluation_summary(df, failed_evaluations)
            
            return df
        else:
            self.logger.error("No successful evaluations completed")
            return pd.DataFrame()

    def print_evaluation_summary(self, df: pd.DataFrame, failed_count: int):
        """Print comprehensive summary statistics of the evaluation."""
        print(f"\n{'='*70}")
        print(f" LLM JUDGE EVALUATION SUMMARY")
        print(f"{'='*70}")
        
        print(f" Evaluation Statistics:")
        print(f"   Total Evaluated: {len(df)}")
        print(f"   Failed Evaluations: {failed_count}")
        print(f"   Success Rate: {len(df)/(len(df)+failed_count)*100:.1f}%")
        print(f"   Evaluation Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        
        print(f"\n Average Scores by Metric:")
        metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
        for metric in metrics:
            mean_score = df[metric].mean()
            std_score = df[metric].std()
            min_score = df[metric].min()
            max_score = df[metric].max()
            print(f"   {metric}: {mean_score:.2f} ± {std_score:.2f} (range: {min_score}-{max_score})")
        
        print(f"\n Overall Performance:")
        print(f"   Mean Overall Score: {df['Overall_Score'].mean():.2f}")
        print(f"   Median Overall Score: {df['Overall_Score'].median():.2f}")
        print(f"   Best Score: {df['Overall_Score'].max():.2f}")
        print(f"   Worst Score: {df['Overall_Score'].min():.2f}")
        
        # Score distribution
        print(f"\n Score Distribution:")
        score_ranges = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 5.0)]
        for min_score, max_score in score_ranges:
            count = len(df[(df['Overall_Score'] >= min_score) & (df['Overall_Score'] < max_score)])
            percentage = (count / len(df)) * 100
            print(f"   {min_score:.1f}-{max_score:.1f}: {count} ({percentage:.1f}%)")
        
        # Perfect scores
        perfect_scores = len(df[df['Overall_Score'] == 5.0])
        print(f"   Perfect (5.0): {perfect_scores} ({(perfect_scores/len(df))*100:.1f}%)")
        
        if 'question_type' in df.columns:
            print(f"\n Performance by Question Type:")
            type_summary = df.groupby('question_type').agg({
                'Overall_Score': ['mean', 'std', 'count'],
                'Relevance': 'mean',
                'Accuracy': 'mean',
                'Completeness': 'mean',
                'Fluency': 'mean',
                'KG_Alignment': 'mean'
            }).round(3)
            
            for qtype in type_summary.index:
                stats = type_summary.loc[qtype]
                mean_score = stats[('Overall_Score', 'mean')]
                std_score = stats[('Overall_Score', 'std')]
                count = stats[('Overall_Score', 'count')]
                print(f"   {qtype}: {mean_score:.2f} ± {std_score:.2f} (n={count})")

def run_multi_dataset_evaluation():
    """Run evaluation on multiple datasets sequentially.

    The Zero-Shot, One-Shot and Few-Shot datasets are evaluated one after
    another with the same judge model. Each dataset gets its own results CSV,
    and all successful runs are concatenated into one combined CSV with a
    per-dataset comparison printed at the end.

    The NScale API key is read from the ``NSCALE_API_KEY`` environment
    variable. The run is refused when no key is configured, rather than
    issuing requests that are all bound to fail. The user is asked for
    confirmation on stdin before any request is sent.
    """

    # Configuration https://console.nscale.com/

    API_KEY = os.environ.get("NSCALE_API_KEY", "")  # your NScale api key
    if not API_KEY:
        print(" ERROR: No NScale API key configured. Set the NSCALE_API_KEY environment variable.")
        return

    # Dataset configurations
    datasets = [
        {
            'name': 'Zero-Shot',
            'path': 'Zero-Shot_qa_dataset.json',
            'output': 'mixtral_zeroshot_evaluation_results.csv'
        },
        {
            'name': 'One-Shot',
            'path': 'One-Shot_qa_dataset.json',
            'output': 'mixtral_oneshot_evaluation_results.csv'
        },
        {
            'name': 'Few-Shot',
            'path': 'Few-Shot_qa_dataset.json',
            'output': 'mixtral_fewshot_evaluation_results.csv'
        }
    ]

    # Initialize evaluator with the Mixtral judge model
    evaluator = ImprovedLLMJudgeEvaluator(
        api_key=API_KEY,
        model_name="mistralai/mixtral-8x22b-instruct-v0.1"
    )

    print("="*80)
    print("MULTI-DATASET EVALUATION WITH Mixtral-8x22b-instruct-v0.1")
    print("="*80)
    print(f"Model: {evaluator.model_name}")
    print(f"Datasets to evaluate: {len(datasets)}")
    for i, dataset in enumerate(datasets, 1):
        print(f"  {i}. {dataset['name']}: {dataset['path']}")
    print("="*80)

    # Confirm before starting
    response = input("\nProceed with multi-dataset evaluation? (y/n): ").strip().lower()
    if response != 'y':
        print("Evaluation cancelled.")
        return

    # Track overall statistics
    overall_start_time = time.time()
    all_results = []

    # Process each dataset
    for i, dataset in enumerate(datasets, 1):
        print(f"\n{'='*60}")
        print(f"PROCESSING DATASET {i}/{len(datasets)}: {dataset['name']}")
        print(f"{'='*60}")
        print(f"Input: {dataset['path']}")
        print(f"Output: {dataset['output']}")
        print(f"Model: {evaluator.model_name}")

        # Check if dataset file exists
        if not os.path.exists(dataset['path']):
            print(f" ERROR: Dataset file not found: {dataset['path']}")
            print(f"Skipping {dataset['name']} dataset...")
            continue

        dataset_start_time = time.time()

        # Run evaluation
        try:
            results_df = evaluator.evaluate_dataset(
                dataset_path=dataset['path'],
                output_path=dataset['output'],
                sample_size=None,  # Process all items
                delay_seconds=1.0,  # 1 second delay between requests
                checkpoint_interval=50  # Save every 50 evaluations
            )

            dataset_end_time = time.time()
            dataset_duration = dataset_end_time - dataset_start_time

            if not results_df.empty:
                # Add dataset type to results
                results_df['dataset_type'] = dataset['name']
                all_results.append(results_df)

                print(f"\n {dataset['name']} EVALUATION COMPLETED!")
                print(f"   Duration: {dataset_duration/60:.1f} minutes")
                print(f"   Evaluated: {len(results_df)} QA pairs")
                print(f"   Results saved to: {dataset['output']}")
                print(f"   Average Score: {results_df['Overall_Score'].mean():.2f}")

            else:
                print(f" {dataset['name']} evaluation failed. Check logs for details.")

        except Exception as e:
            print(f" Error processing {dataset['name']}: {str(e)}")
            continue

    # Overall summary
    overall_end_time = time.time()
    overall_duration = overall_end_time - overall_start_time

    print(f"\n{'='*80}")
    print("MULTI-DATASET EVALUATION SUMMARY")
    print(f"{'='*80}")
    print(f"Total Duration: {overall_duration/60:.1f} minutes")
    print(f"Model Used: {evaluator.model_name}")
    print(f"Datasets Processed: {len(all_results)}/{len(datasets)}")

    if all_results:
        # Combine all results for comparative analysis
        combined_df = pd.concat(all_results, ignore_index=True)
        combined_output = "Mixtral_combined_evaluation_results.csv"
        combined_df.to_csv(combined_output, index=False)

        print(f"\n COMPARATIVE ANALYSIS:")
        print(f"Combined results saved to: {combined_output}")
        print(f"Total QA pairs evaluated: {len(combined_df)}")

        # Performance by dataset
        print(f"\n PERFORMANCE BY DATASET:")
        dataset_summary = combined_df.groupby('dataset_type').agg({
            'Overall_Score': ['mean', 'std', 'count'],
            'Relevance': 'mean',
            'Accuracy': 'mean',
            'Completeness': 'mean',
            'Fluency': 'mean',
            'KG_Alignment': 'mean'
        }).round(3)

        for dataset_type in dataset_summary.index:
            stats = dataset_summary.loc[dataset_type]
            mean_score = stats[('Overall_Score', 'mean')]
            std_score = stats[('Overall_Score', 'std')]
            # The row is a float Series; cast so the count prints as "n=12"
            count = int(stats[('Overall_Score', 'count')])
            print(f"   {dataset_type}: {mean_score:.2f} ± {std_score:.2f} (n={count})")

        # Overall statistics
        print(f"\n OVERALL PERFORMANCE:")
        print(f"   Mean Overall Score: {combined_df['Overall_Score'].mean():.2f}")
        print(f"   Best Dataset: {dataset_summary[('Overall_Score', 'mean')].idxmax()}")
        print(f"   Score Range: {combined_df['Overall_Score'].min():.2f} - {combined_df['Overall_Score'].max():.2f}")

        # Processing efficiency
        total_items = len(combined_df)
        items_per_minute = total_items / (overall_duration / 60)
        print(f"\n⚡ PROCESSING EFFICIENCY:")
        print(f"   Items per minute: {items_per_minute:.1f}")
        print(f"   Average time per item: {overall_duration/total_items:.2f} seconds")

    else:
        print(" No datasets were successfully processed.")

    print(f"\n{'='*80}")
    print("EVALUATION COMPLETE")
    print(f"{'='*80}")

# Script entry point: start the interactive multi-dataset run only when this
# file is executed directly, not when it is imported.
if __name__ == "__main__":
    run_multi_dataset_evaluation()

MULTI-DATASET EVALUATION WITH Mixtral-8x22b-instruct-v0.1
Model: mistralai/mixtral-8x22b-instruct-v0.1
Datasets to evaluate: 3
  1. Zero-Shot: Zero-Shot_qa_dataset.json
  2. One-Shot: One-Shot_qa_dataset.json
  3. Few-Shot: Few-Shot_qa_dataset.json



Proceed with multi-dataset evaluation? (y/n):  y


INFO:__main__:Loaded 1127 QA pairs from Zero-Shot_qa_dataset.json



PROCESSING DATASET 1/3: Zero-Shot
Input: Zero-Shot_qa_dataset.json
Output: mixtral-8x22b-instruct-v0.1_zeroshot_evaluation_results.csv
Model: mistralai/mixtral-8x22b-instruct-v0.1


Evaluating QA pairs:   4%| | 49/1127 [06:35<2:08:36,  7.16s/items, Completed=50,INFO:__main__:Checkpoint saved: 50 results to mixtral-8x22b-instruct-v0.1_zeroshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:   9%| | 99/1127 [13:11<2:45:56,  9.69s/items, Completed=100INFO:__main__:Checkpoint saved: 100 results to mixtral-8x22b-instruct-v0.1_zeroshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  13%|▏| 149/1127 [18:51<1:54:37,  7.03s/items, Completed=15INFO:__main__:Checkpoint saved: 150 results to mixtral-8x22b-instruct-v0.1_zeroshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  18%|▏| 199/1127 [24:28<2:04:17,  8.04s/items, Completed=20INFO:__main__:Checkpoint saved: 200 results to mixtral-8x22b-instruct-v0.1_zeroshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  22%|▏| 249/1127 [30:03<2:06:59,  8.68s/items, Completed=25INFO:__main__:Checkpoint saved: 250 results to mixtral-8x22b-instruct-v0.1_zeroshot_evaluation_results.csv.checkpoint
Evaluating 


 IMPROVED LLM JUDGE EVALUATION SUMMARY
 Evaluation Statistics:
   Total Evaluated: 1127
   Failed Evaluations: 0
   Success Rate: 100.0%
   Evaluation Time: 2025-07-17 00:27:58

 Average Scores by Metric:
   Relevance: 4.98 ± 0.15 (range: 3-5)
   Accuracy: 4.97 ± 0.18 (range: 3-5)
   Completeness: 4.88 ± 0.34 (range: 3-5)
   Fluency: 5.00 ± 0.00 (range: 5-5)
   KG_Alignment: 4.93 ± 0.30 (range: 1-5)

 Overall Performance:
   Mean Overall Score: 4.95
   Median Overall Score: 5.00
   Best Score: 5.00
   Worst Score: 3.20

 Score Distribution:
   1.0-2.0: 0 (0.0%)
   2.0-3.0: 0 (0.0%)
   3.0-4.0: 3 (0.3%)
   4.0-5.0: 149 (13.2%)
   Perfect (5.0): 975 (86.5%)

 Performance by Question Type:
   comparative: 4.92 ± 0.20 (n=270.0)
   factual: 4.97 ± 0.08 (n=305.0)
   inferential: 4.95 ± 0.16 (n=275.0)
   relationship: 4.95 ± 0.17 (n=277.0)

 Zero-Shot EVALUATION COMPLETED!
   Duration: 139.2 minutes
   Evaluated: 1127 QA pairs
   Results saved to: mixtral-8x22b-instruct-v0.1_zeroshot_evaluat

Evaluating QA pairs:   5%| | 49/1080 [05:59<2:04:37,  7.25s/items, Completed=50,INFO:__main__:Checkpoint saved: 50 results to mixtral-8x22b-instruct-v0.1_oneshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:   9%| | 99/1080 [12:21<2:12:20,  8.09s/items, Completed=100INFO:__main__:Checkpoint saved: 100 results to mixtral-8x22b-instruct-v0.1_oneshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  14%|▏| 149/1080 [17:31<1:16:35,  4.94s/items, Completed=15INFO:__main__:Checkpoint saved: 150 results to mixtral-8x22b-instruct-v0.1_oneshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  18%|▏| 199/1080 [24:07<2:25:42,  9.92s/items, Completed=20INFO:__main__:Checkpoint saved: 200 results to mixtral-8x22b-instruct-v0.1_oneshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  23%|▏| 249/1080 [30:59<2:04:02,  8.96s/items, Completed=25INFO:__main__:Checkpoint saved: 250 results to mixtral-8x22b-instruct-v0.1_oneshot_evaluation_results.csv.checkpoint
Evaluating QA pa


 IMPROVED LLM JUDGE EVALUATION SUMMARY
 Evaluation Statistics:
   Total Evaluated: 1080
   Failed Evaluations: 0
   Success Rate: 100.0%
   Evaluation Time: 2025-07-17 02:44:25

 Average Scores by Metric:
   Relevance: 4.98 ± 0.14 (range: 3-5)
   Accuracy: 4.96 ± 0.21 (range: 3-5)
   Completeness: 4.84 ± 0.39 (range: 3-5)
   Fluency: 5.00 ± 0.00 (range: 5-5)
   KG_Alignment: 4.92 ± 0.29 (range: 3-5)

 Overall Performance:
   Mean Overall Score: 4.94
   Median Overall Score: 5.00
   Best Score: 5.00
   Worst Score: 3.40

 Score Distribution:
   1.0-2.0: 0 (0.0%)
   2.0-3.0: 0 (0.0%)
   3.0-4.0: 6 (0.6%)
   4.0-5.0: 199 (18.4%)
   Perfect (5.0): 875 (81.0%)

 Performance by Question Type:
   comparative: 4.93 ± 0.17 (n=275.0)
   factual: 4.97 ± 0.08 (n=281.0)
   inferential: 4.92 ± 0.23 (n=269.0)
   relationship: 4.93 ± 0.13 (n=255.0)

 One-Shot EVALUATION COMPLETED!
   Duration: 136.4 minutes
   Evaluated: 1080 QA pairs
   Results saved to: mixtral-8x22b-instruct-v0.1_oneshot_evaluatio

Evaluating QA pairs:   4%| | 49/1107 [06:10<2:32:13,  8.63s/items, Completed=50,INFO:__main__:Checkpoint saved: 50 results to mixtral-8x22b-instruct-v0.1_fewshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:   9%| | 99/1107 [13:05<2:40:25,  9.55s/items, Completed=100INFO:__main__:Checkpoint saved: 100 results to mixtral-8x22b-instruct-v0.1_fewshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  13%|▏| 149/1107 [19:07<1:51:49,  7.00s/items, Completed=15INFO:__main__:Checkpoint saved: 150 results to mixtral-8x22b-instruct-v0.1_fewshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  18%|▏| 199/1107 [25:30<2:14:06,  8.86s/items, Completed=20INFO:__main__:Checkpoint saved: 200 results to mixtral-8x22b-instruct-v0.1_fewshot_evaluation_results.csv.checkpoint
Evaluating QA pairs:  22%|▏| 249/1107 [32:35<2:15:38,  9.49s/items, Completed=25INFO:__main__:Checkpoint saved: 250 results to mixtral-8x22b-instruct-v0.1_fewshot_evaluation_results.csv.checkpoint
Evaluating QA pa


 IMPROVED LLM JUDGE EVALUATION SUMMARY
 Evaluation Statistics:
   Total Evaluated: 1107
   Failed Evaluations: 0
   Success Rate: 100.0%
   Evaluation Time: 2025-07-17 05:05:43

 Average Scores by Metric:
   Relevance: 4.98 ± 0.17 (range: 2-5)
   Accuracy: 4.96 ± 0.24 (range: 1-5)
   Completeness: 4.86 ± 0.40 (range: 1-5)
   Fluency: 5.00 ± 0.00 (range: 5-5)
   KG_Alignment: 4.89 ± 0.36 (range: 1-5)

 Overall Performance:
   Mean Overall Score: 4.94
   Median Overall Score: 5.00
   Best Score: 5.00
   Worst Score: 2.00

 Score Distribution:
   1.0-2.0: 0 (0.0%)
   2.0-3.0: 2 (0.2%)
   3.0-4.0: 0 (0.0%)
   4.0-5.0: 195 (17.6%)
   Perfect (5.0): 910 (82.2%)

 Performance by Question Type:
   comparative: 4.94 ± 0.16 (n=265.0)
   factual: 4.95 ± 0.13 (n=281.0)
   inferential: 4.91 ± 0.29 (n=271.0)
   relationship: 4.95 ± 0.14 (n=290.0)

 Few-Shot EVALUATION COMPLETED!
   Duration: 141.3 minutes
   Evaluated: 1107 QA pairs
   Results saved to: mixtral-8x22b-instruct-v0.1_fewshot_evaluatio


