# In [None]:
import json
import pandas as pd
import time
from typing import Dict, List, Optional
import logging
from tqdm import tqdm
import re
import os
from datetime import datetime
import subprocess
import sys

# Install and import together package
try:
    from together import Together
except ImportError:
    print("Installing together package...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "together"])
    from together import Together

class KGOnlyLLMJudgeEvaluator:
    """
    Second Ablation Study: KG-Only LLM Judge Evaluation System
    Evaluates QA pairs generated using only Knowledge Graph information without text chunks.
    With Google Gemma model.
    
    Final Production Version - Ready for Testing
    """
    
    def __init__(self, api_key: str, model_name: str = "google/gemma-2-27b-it"):
        """
        Initialize the KG-Only LLM Judge Evaluator with Together.ai.
        
        Args:
            api_key: Together.ai API key
            model_name: Model identifier for the LLM judge
        """
        self.api_key = api_key
        self.model_name = model_name
        
        # Set environment variable and initialize Together client
        os.environ["TOGETHER_API_KEY"] = api_key
        
        try:
            self.client = Together()
            self.logger = self._setup_logging()
            self.logger.info(f"✓ Together.ai client initialized with model: {model_name}")
        except Exception as e:
            raise RuntimeError(f"Failed to initialize Together.ai client: {e}")
        
        self.evaluation_prompt = """You are a STRICT QA evaluator following precise scoring guidelines for a KNOWLEDGE GRAPH-ONLY.

You will evaluate a model-generated question-answer pair on FIVE metrics using a 1-5 scale where:
5 = Excellent  
4 = Good  
3 = Fair  
2 = Poor  
1 = Very Poor


=============================================
DETAILED SCORING CRITERIA FOR KG-ONLY SYSTEM
=============================================

1. RELEVANCE (1-5): Does the question appropriately relate to the available KG information?
   5: Perfectly relevant to the KG triples, clearly grounded in the graph structure
   4: Mostly relevant, with minor deviations from KG scope
   3: Addresses KG information but may miss some connections
   2: Loosely related to KG, with significant irrelevant elements
   1: Entirely irrelevant or unrelated to the available KG triples

2. ACCURACY (1-5): Is the answer factually correct based on the KG triples?
   5: All facts are accurate and fully derivable from KG triples
   4: Mostly accurate; contains only minor factual issues
   3: Some factual inconsistencies or unsupported assumptions
   2: Several factual errors that contradict KG information
   1: Mostly inaccurate or contradicts KG triples

3. COMPLETENESS (1-5): Does the answer fully address the question using available KG information?
   5: Thorough response using all relevant KG connections
   4: Covers most aspects but may miss some KG relationships
   3: Addresses main question but omits important KG details
   2: Partial answer with significant gaps in KG utilization
   1: Severely incomplete or fails to use relevant KG information

4. FLUENCY (1-5): Is the answer well-written and grammatically correct?
   5: Excellent grammar and clarity; highly readable despite KG-only constraints
   4: Minor grammatical or structural issues
   3: Understandable, but contains noticeable language errors
   2: Somewhat unclear due to poor grammar or phrasing
   1: Difficult to read or understand

5. KG ALIGNMENT (1-5): How well does the answer reflect the KG triples?
   5: Effectively uses KG relationships; no contradictions; may include additional relevant info
   4: Uses most relevant KG information correctly; may omit minor details, no contradictions
   3: Uses some KG information; may miss important relationships but generally consistent
   2: Limited use of KG information; may contain minor contradictions or misinterpretations
   1: Ignores KG information entirely or includes clear contradictions

EVALUATION GUIDELINES FOR KG-ONLY SYSTEM:
• Remember: This system has NO access to source text chunks
• Focus on how well the system leverages graph relationships
• Penalize hallucinations not supported by KG triples
• Reward effective connection of multiple KG relationships
• Consider that some limitations are expected due to KG-only constraint


==============================
INPUT
==============================
**Question:** {question}
**Answer:** {answer}
**Available Knowledge Graph Triples:** {kg_triples}

==============================
RESPONSE FORMAT
==============================
Provide ONLY the numerical scores in this exact format (no explanation):

Relevance: X  
Accuracy: X  
Completeness: X  
Fluency: X  
KG_Alignment: X


Where X is a number from 1 to 5."""

    def _setup_logging(self) -> logging.Logger:
        """Setup comprehensive logging for the evaluation process."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        
        # Clear existing handlers
        for handler in logger.handlers[:]:
            logger.removeHandler(handler)
        
        # Create formatter
        formatter = logging.Formatter(
            '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
            datefmt='%Y-%m-%d %H:%M:%S'
        )
        
        # Console handler
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)
        
        # File handler
        try:
            log_filename = f"kg_only_evaluation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
            file_handler = logging.FileHandler(log_filename)
            file_handler.setLevel(logging.DEBUG)
            file_handler.setFormatter(formatter)
            logger.addHandler(file_handler)
            logger.info(f"Log file created: {log_filename}")
        except Exception as e:
            logger.warning(f"Could not create log file: {e}")
        
        return logger

    def extract_triples_from_item(self, item: Dict) -> str:
        """Extract and format KG triples from a QA item with robust error handling."""
        triples = []
        
        try:
            # Extract from ground_truth.source_triples structure
            if "ground_truth" in item and "source_triples" in item["ground_truth"]:
                source_triples = item["ground_truth"]["source_triples"]
                
                # Handle list of triple dictionaries
                if isinstance(source_triples, list):
                    for triple_dict in source_triples:
                        if isinstance(triple_dict, dict) and all(key in triple_dict for key in ["subject", "predicate", "object"]):
                            triple = f"{triple_dict['subject']} → {triple_dict['predicate']} → {triple_dict['object']}"
                            triples.append(triple)
                
                # Handle single triple dictionary
                elif isinstance(source_triples, dict) and all(key in source_triples for key in ["subject", "predicate", "object"]):
                    triple = f"{source_triples['subject']} → {source_triples['predicate']} → {source_triples['object']}"
                    triples.append(triple)
            
            # Fallback: comprehensive search for triples
            if not triples:
                def extract_all_triples(obj, path=""):
                    found_triples = []
                    if isinstance(obj, dict):
                        # Check if this dict is a triple
                        if set(obj.keys()) >= {"subject", "predicate", "object"}:
                            triple = f"{obj['subject']} → {obj['predicate']} → {obj['object']}"
                            found_triples.append(triple)
                        # Recursively search nested structures
                        for key, value in obj.items():
                            found_triples.extend(extract_all_triples(value, f"{path}.{key}" if path else key))
                    elif isinstance(obj, list):
                        for i, item in enumerate(obj):
                            found_triples.extend(extract_all_triples(item, f"{path}[{i}]" if path else f"[{i}]"))
                    return found_triples
                
                triples = extract_all_triples(item)
            
            # Format triples for output
            if triples:
                # Remove duplicates while preserving order
                unique_triples = list(dict.fromkeys(triples))
                formatted_triples = []
                for i, triple in enumerate(unique_triples, 1):
                    formatted_triples.append(f"{i}. {triple}")
                return "\n".join(formatted_triples)
            
        except Exception as e:
            self.logger.error(f"Error extracting triples from item: {e}")
        
        return "No KG triples available"

    def call_llm_judge(self, question: str, answer: str, kg_triples: str, 
                      max_retries: int = 3) -> Optional[Dict[str, int]]:
        """
        Call the LLM judge with robust error handling and retry logic.
        
        Args:
            question: The question to evaluate
            answer: The answer to evaluate
            kg_triples: Knowledge graph triples (only source of information)
            max_retries: Maximum number of retry attempts
            
        Returns:
            Dict with evaluation scores or None if failed
        """
        if not question.strip() or not answer.strip() or not kg_triples.strip():
            self.logger.warning("Empty question, answer, or KG triples provided")
            return None
        
        # Format the prompt for KG-only evaluation
        prompt = self.evaluation_prompt.format(
            question=question.strip(),
            answer=answer.strip(),
            kg_triples=kg_triples.strip()
        )
        
        for attempt in range(max_retries):
            try:
                # Add timeout and improved parameters for Gemma model
                response = self.client.chat.completions.create(
                    model=self.model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=0.0,  # Deterministic for consistent scoring
                    max_tokens=200,   # Increased for better response parsing
                    top_p=0.9,       # Slightly higher for Gemma
                    repetition_penalty=1.1,  # Reduce repetition
                    stop=["###", "---", "END"]  # Stop sequences
                )
                
                content = response.choices[0].message.content.strip()
                self.logger.debug(f"LLM Response (attempt {attempt + 1}): {content}")
                
                parsed_result = self.parse_evaluation_response(content)
                if parsed_result:
                    return parsed_result
                else:
                    self.logger.warning(f"Failed to parse response on attempt {attempt + 1}: {content}")
                    
            except Exception as e:
                error_msg = str(e).lower()
                if any(term in error_msg for term in ["rate", "limit", "quota", "429"]):
                    wait_time = min(2 ** attempt, 30)  # Cap at 30 seconds
                    self.logger.warning(f"Rate limit hit, waiting {wait_time}s before retry {attempt + 1}/{max_retries}")
                    time.sleep(wait_time)
                elif "timeout" in error_msg:
                    wait_time = 5 * (attempt + 1)
                    self.logger.warning(f"Timeout error, waiting {wait_time}s before retry {attempt + 1}/{max_retries}")
                    time.sleep(wait_time)
                else:
                    self.logger.error(f"API Error on attempt {attempt + 1}: {e}")
                    if attempt < max_retries - 1:
                        time.sleep(2 ** attempt)
        
        self.logger.error(f"Failed to get valid response after {max_retries} attempts")
        return None

    def parse_evaluation_response(self, response: str) -> Optional[Dict[str, int]]:
        """
        Parse LLM response with improved pattern matching and validation.
        """
        if not response or not response.strip():
            return None
            
        try:
            scores = {}
            lines = response.strip().split('\n')
            
            # Primary parsing with exact format matching
            for line in lines:
                line = line.strip()
                if ':' in line and not line.startswith('#'):
                    parts = line.split(':', 1)
                    if len(parts) == 2:
                        metric = parts[0].strip()
                        score_part = parts[1].strip()
                        
                        # Extract numeric score with multiple patterns
                        score_patterns = [
                            r'^([1-5])(?:/5)?(?:\s|$)',  # Start with score
                            r':\s*([1-5])(?:/5)?',       # After colon
                            r'\b([1-5])(?:/5)?\b'        # Anywhere in text
                        ]
                        
                        score = None
                        for pattern in score_patterns:
                            match = re.search(pattern, score_part)
                            if match:
                                score = int(match.group(1))
                                break
                        
                        if score is not None and 1 <= score <= 5:
                            # Normalize metric names with fuzzy matching
                            metric_lower = metric.lower().replace('_', ' ')
                            if any(term in metric_lower for term in ['relevance', 'relevant']):
                                scores['Relevance'] = score
                            elif any(term in metric_lower for term in ['accuracy', 'accurate']):
                                scores['Accuracy'] = score
                            elif any(term in metric_lower for term in ['completeness', 'complete']):
                                scores['Completeness'] = score
                            elif any(term in metric_lower for term in ['fluency', 'fluent']):
                                scores['Fluency'] = score
                            elif any(term in metric_lower for term in ['kg', 'alignment', 'align']):
                                scores['KG_Alignment'] = score
            
            # Validate all required metrics are present
            expected_metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
            if all(metric in scores for metric in expected_metrics):
                # Additional validation: check score ranges
                if all(1 <= score <= 5 for score in scores.values()):
                    self.logger.debug(f"Successfully parsed scores: {scores}")
                    return scores
                else:
                    self.logger.warning(f"Scores out of valid range (1-5): {scores}")
            else:
                missing = set(expected_metrics) - set(scores.keys())
                self.logger.warning(f"Missing metrics: {missing}. Found: {scores}")
                
                # Attempt fallback parsing
                return self._parse_with_fallback(response)
                
        except Exception as e:
            self.logger.error(f"Error parsing response: {e}")
            return self._parse_with_fallback(response)
        
        return None
    
    def _parse_with_fallback(self, response: str) -> Optional[Dict[str, int]]:
        """Enhanced fallback parsing with multiple strategies."""
        fallback_patterns = [
            # Pattern 1: Numbers after metric names
            (r'relevance[:\s]*([1-5])', 'Relevance'),
            (r'accuracy[:\s]*([1-5])', 'Accuracy'),
            (r'completeness[:\s]*([1-5])', 'Completeness'),
            (r'fluency[:\s]*([1-5])', 'Fluency'),
            (r'(?:kg|alignment)[:\s]*([1-5])', 'KG_Alignment'),
        ]
        
        scores = {}
        response_lower = response.lower()
        
        for pattern, metric in fallback_patterns:
            matches = re.findall(pattern, response_lower)
            if matches:
                try:
                    score = int(matches[0])
                    if 1 <= score <= 5:
                        scores[metric] = score
                except (ValueError, IndexError):
                    continue
        
        # If we found all 5 metrics, return the scores
        expected_metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
        if len(scores) == 5 and all(metric in scores for metric in expected_metrics):
            self.logger.info(f"Fallback parsing successful: {scores}")
            return scores
        
        # Last resort: extract any 5 numbers between 1-5
        all_numbers = re.findall(r'\b([1-5])\b', response)
        if len(all_numbers) >= 5:
            try:
                result = {}
                for i, metric in enumerate(expected_metrics):
                    result[metric] = int(all_numbers[i])
                self.logger.warning(f"Used last resort parsing: {result}")
                return result
            except (ValueError, IndexError):
                pass
        
        return None

    def load_qa_dataset(self, file_path: str) -> List[Dict]:
        """Load KG-only QA dataset with comprehensive error handling."""
        if not os.path.exists(file_path):
            self.logger.error(f"Dataset file not found: {file_path}")
            return []
        
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Handle various dataset structures
            queries = []
            if isinstance(data, list):
                queries = data
            elif isinstance(data, dict):
                # Try different possible keys
                for key in ["metadata.queries", "queries", "data", "items"]:
                    if "." in key:
                        # Handle nested keys
                        current = data
                        for part in key.split("."):
                            if isinstance(current, dict) and part in current:
                                current = current[part]
                            else:
                                current = None
                                break
                        if isinstance(current, list):
                            queries = current
                            break
                    elif key in data and isinstance(data[key], list):
                        queries = data[key]
                        break
            
            if not queries:
                self.logger.error(f"No valid queries found in dataset structure: {list(data.keys()) if isinstance(data, dict) else 'list'}")
                return []
            
            self.logger.info(f"✓ Loaded {len(queries)} KG-only QA pairs from {file_path}")
            
            # Log sample structure for debugging
            if queries:
                sample = queries[0]
                self.logger.info(f"Sample structure keys: {list(sample.keys())}")
                if "ground_truth" in sample:
                    self.logger.info(f"Ground truth keys: {list(sample['ground_truth'].keys())}")
                    
                # Validate essential fields
                required_fields = ['question', 'answer']
                missing_fields = [field for field in required_fields if field not in sample]
                if missing_fields:
                    self.logger.warning(f"Missing required fields in sample: {missing_fields}")
            
            return queries
            
        except json.JSONDecodeError as e:
            self.logger.error(f"Invalid JSON format in dataset file: {e}")
        except Exception as e:
            self.logger.error(f"Failed to load dataset: {e}")
        
        return []

    def save_checkpoint(self, results: List[Dict], checkpoint_path: str):
        """Save evaluation results with error handling."""
        try:
            df = pd.DataFrame(results)
            df.to_csv(checkpoint_path, index=False)
            self.logger.info(f" Checkpoint saved: {len(results)} results to {checkpoint_path}")
        except Exception as e:
            self.logger.error(f"Failed to save checkpoint: {e}")

    def load_checkpoint(self, checkpoint_path: str) -> List[Dict]:
        """Load evaluation results from checkpoint with validation."""
        try:
            if os.path.exists(checkpoint_path):
                df = pd.read_csv(checkpoint_path)
                
                # Validate checkpoint format
                required_columns = ['qa_id', 'Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
                missing_columns = [col for col in required_columns if col not in df.columns]
                if missing_columns:
                    self.logger.warning(f"Checkpoint missing required columns: {missing_columns}")
                    return []
                
                results = df.to_dict('records')
                self.logger.info(f"✓ Loaded checkpoint: {len(results)} results from {checkpoint_path}")
                return results
            return []
        except Exception as e:
            self.logger.error(f"Failed to load checkpoint: {e}")
            return []

    def validate_dataset_item(self, item: Dict, qa_id: str) -> bool:
        """Validate individual dataset item for required fields."""
        required_fields = ['question', 'answer']
        for field in required_fields:
            if field not in item or not str(item[field]).strip():
                self.logger.warning(f"Item {qa_id}: missing or empty field '{field}'")
                return False
        return True

    def evaluate_dataset(self, dataset_path: str, output_path: str, 
                        sample_size: Optional[int] = None, 
                        delay_seconds: float = 1.0,
                        checkpoint_interval: int = 25) -> pd.DataFrame:
        """
        Evaluate a KG-only QA dataset with the LLM judge and save the scores.

        Items are loaded, optionally sub-sampled (fixed seed 42), filtered
        against an existing ``<output_path>.checkpoint`` file and validated.
        Each remaining item is scored via ``call_llm_judge``; items without KG
        triples are skipped. Results are checkpointed periodically, written to
        ``output_path`` as CSV at the end, and summarised on stdout.

        Args:
            dataset_path: Path to the KG-only QA dataset JSON file
            output_path: Path to save evaluation results
            sample_size: Number of samples to evaluate (None for all)
            delay_seconds: Delay between API calls to avoid rate limits
            checkpoint_interval: Save checkpoint every N successful
                evaluations; values <= 0 disable periodic checkpoints

        Returns:
            DataFrame with evaluation results (empty when nothing could be
            loaded or evaluated). If the final CSV cannot be written, the
            in-memory results are still returned and the checkpoint is kept.
        """
        self.logger.info(f" Starting KG-only evaluation...")
        self.logger.info(f"Dataset: {dataset_path}")
        self.logger.info(f"Output: {output_path}")
        self.logger.info(f"Model: {self.model_name}")
        
        # Load dataset
        qa_items = self.load_qa_dataset(dataset_path)
        if not qa_items:
            self.logger.error(" No QA items loaded. Exiting.")
            return pd.DataFrame()
        
        # Sample subset if requested
        if sample_size and sample_size < len(qa_items):
            import random
            # A private generator yields the same seed-42 sample without
            # reseeding the process-wide random module.
            qa_items = random.Random(42).sample(qa_items, sample_size)
            self.logger.info(f" Sampling {sample_size} items for evaluation")
        
        # Setup checkpoint
        checkpoint_path = f"{output_path}.checkpoint"
        results = self.load_checkpoint(checkpoint_path)
        
        # Track processed items. Ids are compared as strings because ids read
        # back from the checkpoint CSV may come back with a different type
        # than the ids in the JSON dataset (e.g. "12" vs 12).
        processed_ids = {str(result['qa_id']) for result in results}
        
        # Filter unprocessed items
        remaining_items = []
        for i, item in enumerate(qa_items):
            qa_id = item.get('id', f'item_{i}')
            if str(qa_id) not in processed_ids and self.validate_dataset_item(item, qa_id):
                remaining_items.append((i, item))
        
        if processed_ids:
            self.logger.info(f" Resuming from checkpoint: {len(results)} completed, {len(remaining_items)} remaining")
        
        if not remaining_items:
            self.logger.info("✅ All items already processed!")
            return pd.DataFrame(results)
        
        failed_evaluations = 0
        skipped_items = 0
        interrupted = False
        
        # Progress bar: the total counts only work that can actually happen
        # (already completed + still to do), so the bar can reach 100%.
        completed_items = len(results)
        pbar = tqdm(
            remaining_items, 
            desc=" Evaluating KG-only QA pairs",
            total=completed_items + len(remaining_items),
            initial=completed_items,
            unit="items",
            ncols=100
        )
        
        for item_index, item in pbar:
            # Bound before the try so the error handler never reports a
            # stale or undefined id.
            qa_id = item.get('id', f'item_{item_index}')
            try:
                # Extract data from item
                question = str(item.get('question', '')).strip()
                answer = str(item.get('answer', '')).strip()
                question_type = item.get('question_type', 'unknown')
                generation_method = item.get('generation_method', 'unknown')
                
                # Extract KG triples (only source of information for ablation study)
                kg_triples = self.extract_triples_from_item(item)
                
                # Skip if no KG triples available. Iterating the tqdm wrapper
                # already advances the bar, so no manual update() here.
                if kg_triples == "No KG triples available":
                    self.logger.warning(f" Skipping item {qa_id}: no KG triples found")
                    skipped_items += 1
                    continue
                
                # Call LLM judge (KG-only evaluation)
                evaluation = self.call_llm_judge(question, answer, kg_triples)
                
                if evaluation:
                    ground_truth = item.get('ground_truth', {})
                    # Store results with comprehensive metadata
                    result = {
                        'qa_id': qa_id,
                        'question_type': question_type,
                        'generation_method': generation_method,
                        'question': question,
                        'answer': answer,
                        'kg_triples': kg_triples,
                        'num_source_triples': len(kg_triples.split('\n')),
                        'text_context_used': ground_truth.get('text_context_used', False),
                        'ablation_study': ground_truth.get('ablation_study', 'kg_only'),
                        'Relevance': evaluation['Relevance'],
                        'Accuracy': evaluation['Accuracy'],
                        'Completeness': evaluation['Completeness'],
                        'Fluency': evaluation['Fluency'],
                        'KG_Alignment': evaluation['KG_Alignment'],
                        'Overall_Score': sum(evaluation.values()) / len(evaluation),
                        'evaluation_timestamp': datetime.now().isoformat()
                    }
                    results.append(result)
                    
                    # Update progress bar with rich status
                    success_rate = len(results) / (len(results) + failed_evaluations + skipped_items) * 100
                    pbar.set_postfix({
                        'Success': f"{success_rate:.1f}%",
                        'Failed': failed_evaluations,
                        'Skipped': skipped_items,
                        'Last': f"{result['Overall_Score']:.1f}"
                    })
                    
                    # Save checkpoint at intervals (guard against interval 0)
                    if checkpoint_interval > 0 and len(results) % checkpoint_interval == 0:
                        self.save_checkpoint(results, checkpoint_path)
                    
                else:
                    failed_evaluations += 1
                    self.logger.warning(f" Failed to evaluate item {qa_id}")
                
                # Rate limiting
                if delay_seconds > 0:
                    time.sleep(delay_seconds)
                    
            except KeyboardInterrupt:
                self.logger.info(" Evaluation interrupted by user")
                interrupted = True
                break
            except Exception as e:
                failed_evaluations += 1
                self.logger.error(f" Error processing item {qa_id}: {e}")
                continue
        
        pbar.close()
        
        if not results:
            self.logger.error(" No successful evaluations completed")
            return pd.DataFrame()
        
        # Final save and cleanup
        df = pd.DataFrame(results)
        try:
            df.to_csv(output_path, index=False)
        except Exception as e:
            self.logger.error(f"Failed to save final results: {e}")
            # Keep everything recoverable: refresh the checkpoint and hand
            # the in-memory results back instead of discarding them.
            self.save_checkpoint(results, checkpoint_path)
            return df
        
        if interrupted:
            # The run is incomplete: persist progress so a rerun can resume.
            self.save_checkpoint(results, checkpoint_path)
        elif os.path.exists(checkpoint_path):
            # Clean up checkpoint file
            try:
                os.remove(checkpoint_path)
                self.logger.info(f" Cleaned up checkpoint file")
            except OSError as e:
                self.logger.warning(f"Could not remove checkpoint file: {e}")
        
        # Print comprehensive summary; a reporting problem must not cost the
        # caller the already-saved results.
        try:
            self.print_evaluation_summary(df, failed_evaluations, skipped_items)
        except Exception as e:
            self.logger.error(f"Failed to print evaluation summary: {e}")
        
        return df

    def print_evaluation_summary(self, df: pd.DataFrame, failed_count: int, skipped_count: int):
        """Print comprehensive and visually appealing evaluation summary."""
        print(f"\n{'='*80}")
        print(f" SECOND ABLATION STUDY: KG-ONLY EVALUATION SUMMARY")
        print(f"{'='*80}")
        
        # Basic statistics
        total_attempted = len(df) + failed_count + skipped_count
        success_rate = (len(df) / total_attempted * 100) if total_attempted > 0 else 0
        
        print(f" Evaluation Statistics:")
        print(f"   Total Items Processed: {total_attempted}")
        print(f"   Successfully Evaluated: {len(df)}")
        print(f"   Failed Evaluations: {failed_count}")
        print(f"   Skipped Items: {skipped_count}")
        print(f"   Success Rate: {success_rate:.1f}%")
        print(f"   Model Used: {self.model_name}")
        print(f"   Evaluation Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        
        if len(df) == 0:
            print("\n No data to analyze!")
            return
        
        # Metric-by-metric analysis
        print(f"\n Average Scores by Metric (KG-Only System):")
        metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
        
        for metric in metrics:
            if metric in df.columns:
                mean_score = df[metric].mean()
                std_score = df[metric].std()
                min_score = df[metric].min()
                max_score = df[metric].max()
                median_score = df[metric].median()
                
                # Add performance indicator
                if mean_score >= 4.0:
                    indicator = "🟢"
                elif mean_score >= 3.0:
                    indicator = "🟡"
                else:
                    indicator = "🔴"
                
                print(f"   {indicator} {metric}: {mean_score:.2f} ± {std_score:.2f} "
                      f"(range: {min_score}-{max_score}, median: {median_score:.1f})")
        
        # Overall performance metrics
        print(f"\n Overall Performance:")
        overall_stats = {
            'Mean': df['Overall_Score'].mean(),
            'Median': df['Overall_Score'].median(),
            'Best': df['Overall_Score'].max(),
            'Worst': df['Overall_Score'].min(),
            'Std Dev': df['Overall_Score'].std()
        }
        
        for stat_name, value in overall_stats.items():
            print(f"   {stat_name}: {value:.2f}")
        
        # Score distribution analysis
        print(f"\n Score Distribution:")
        score_ranges = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 5.0)]
        for min_score, max_score in score_ranges:
            count = len(df[(df['Overall_Score'] >= min_score) & (df['Overall_Score'] < max_score)])
            percentage = (count / len(df)) * 100
            bar = "" * int(percentage / 5)  # Visual bar
            print(f"   {min_score:.1f}-{max_score:.1f}: {count:3d} ({percentage:5.1f}%) {bar}")
        
        # Perfect scores
        perfect_scores = len(df[df['Overall_Score'] == 5.0])
        print(f"   Perfect (5.0): {perfect_scores:3d} ({(perfect_scores/len(df))*100:5.1f}%)")
        
        # KG-specific analysis
        print(f"\n KG-Only System Analysis:")
        print(f"   Average KG Alignment: {df['KG_Alignment'].mean():.2f}")
        print(f"   KG Alignment Range: {df['KG_Alignment'].min():.1f} - {df['KG_Alignment'].max():.1f}")
        
        # Dataset insights
        if 'num_source_triples' in df.columns:
            print(f"   Average Source Triples: {df['num_source_triples'].mean():.1f}")
            print(f"   Triple Count Range: {df['num_source_triples'].min()}-{df['num_source_triples'].max()}")
            
            # Correlation analysis
            if len(df) > 1:
                correlation = df['num_source_triples'].corr(df['Overall_Score'])
                print(f"   Triple Count vs Performance Correlation: {correlation:.3f}")
        
        # Performance by question type
        if 'question_type' in df.columns and df['question_type'].nunique() > 1:
            print(f"\n🔍 Performance by Question Type:")
            type_summary = df.groupby('question_type').agg({
                'Overall_Score': ['mean', 'std', 'count'],
                'KG_Alignment': 'mean'
            }).round(3)
            
            for qtype in type_summary.index:
                stats = type_summary.loc[qtype]
                mean_score = stats[('Overall_Score', 'mean')]
                std_score = stats[('Overall_Score', 'std')]
                count = stats[('Overall_Score', 'count')]
                kg_align = stats[('KG_Alignment', 'mean')]
                print(f"   {qtype}: {mean_score:.2f} ± {std_score:.2f} (n={count}, KG={kg_align:.2f})")
        
        # Quality assessment
        print(f"\n Quality Assessment:")
        high_quality = len(df[df['Overall_Score'] >= 4.0])
        medium_quality = len(df[(df['Overall_Score'] >= 3.0) & (df['Overall_Score'] < 4.0)])
        low_quality = len(df[df['Overall_Score'] < 3.0])
        
        print(f"   High Quality (≥4.0): {high_quality} ({(high_quality/len(df))*100:.1f}%)")
        print(f"   Medium Quality (3.0-3.9): {medium_quality} ({(medium_quality/len(df))*100:.1f}%)")
        print(f"   Low Quality (<3.0): {low_quality} ({(low_quality/len(df))*100:.1f}%)")
        
        # KG alignment specific insights
        print(f"\n KG Alignment Insights:")
        excellent_kg = len(df[df['KG_Alignment'] >= 4])
        poor_kg = len(df[df['KG_Alignment'] <= 2])
        print(f"   Excellent KG Alignment (≥4): {excellent_kg} ({(excellent_kg/len(df))*100:.1f}%)")
        print(f"   Poor KG Alignment (≤2): {poor_kg} ({(poor_kg/len(df))*100:.1f}%)")
        
        # Recommendations
        print(f"\n Key Insights & Recommendations:")
        avg_score = df['Overall_Score'].mean()
        if avg_score >= 4.0:
            print(f"    Excellent performance! The KG-only system shows strong capabilities.")
        elif avg_score >= 3.5:
            print(f"    Good performance with room for improvement in specific areas.")
        elif avg_score >= 3.0:
            print(f"    Moderate performance. Consider enhancing KG utilization strategies.")
        else:
            print(f"    Performance below expectations. Review KG extraction and reasoning.")
        
        # Specific recommendations based on metrics
        if df['KG_Alignment'].mean() < 3.5:
            print(f"    Focus on improving KG triple utilization and alignment.")
        if df['Completeess'].mean() < 3.5:
            print(f"    Enhance answer completeness by leveraging more KG relationships.")
        if df['Accuracy'].mean() < 3.5:
            print(f"    Improve factual accuracy by better grounding in available KG triples.")
        
        print(f"\n Next Steps:")
        print(f"   1. Compare results with baseline/original system")
        print(f"   2. Analyze high-performing vs low-performing examples")
        print(f"   3. Identify question types that benefit most from KG-only approach")
        print(f"   4. Investigate correlation between triple count and performance")
        print(f"   5. Consider hybrid approaches for completeness improvement")

def run_kg_only_evaluation():
    """
    Main function to run KG-only evaluation for the second ablation study.

    Steps, in order:
      1. Print a startup banner.
      2. Pre-flight checks: the dataset file exists and parses as JSON, an
         API key is available, and the output directory is writable. Any
         failed check prints a message and returns early.
      3. Ask for confirmation on stdin (anything other than ``y`` cancels).
      4. Build a ``KGOnlyLLMJudgeEvaluator`` and call ``evaluate_dataset`` on
         the full dataset, then print a short run summary.

    The API key is taken from the ``API_KEY`` constant below; when that is
    left empty, the ``TOGETHER_API_KEY`` environment variable is used instead
    so the secret does not have to be committed to the source file.

    Returns:
        None. All outcomes are reported via ``print``.
    """

    # Configuration
    API_KEY = ""  # enter your API key here, or leave empty to use $TOGETHER_API_KEY
    if not API_KEY:
        API_KEY = os.environ.get("TOGETHER_API_KEY", "")
    MODEL_NAME = "google/gemma-2-27b-it"
    DATASET_PATH = "Ablation_2_kg_only_qa_dataset.json"
    OUTPUT_PATH = "Gemma_27B_kg_only_evaluation_results.csv"

    # Display startup banner. Rows are padded with ljust so the right border
    # always lines up with the 78-column frame, whatever the text length.
    banner_width = 78
    banner_rows = [
        "",
        "   SECOND ABLATION STUDY: KG-ONLY EVALUATION SYSTEM",
        "",
        "   Evaluates QA pairs generated using ONLY Knowledge Graph triples",
        "   Using Google Gemma 2-27B model",
        "   Evaluation with comprehensive monitoring",
        "",
    ]
    print("┌" + "─" * banner_width + "┐")
    for row in banner_rows:
        print("│" + row.ljust(banner_width) + "│")
    print("└" + "─" * banner_width + "┘")

    # Pre-flight checks
    print(f"\n Pre-flight Checks:")

    # Check dataset file
    if not os.path.exists(DATASET_PATH):
        print(f"    Dataset file not found: {DATASET_PATH}")
        print(f"    Please ensure the dataset file exists in the current directory")
        return

    print(f"    Dataset file found: {DATASET_PATH}")
    try:
        # The parsed content is not needed here; this only validates the file.
        with open(DATASET_PATH, 'r', encoding='utf-8') as f:
            json.load(f)
    except ValueError:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses. A bare `except:` would also trap KeyboardInterrupt.
        print(f"   Dataset file has invalid JSON format")
        return
    except OSError as e:
        print(f"   Dataset file could not be read: {e}")
        return
    print(f"    Dataset file is valid JSON")

    # Check API key
    if API_KEY and len(API_KEY) > 20:
        print(f"    API key configured")
    else:
        print(f"    API key not properly configured")
        print(f"    Set API_KEY in this script or export TOGETHER_API_KEY")
        return

    # Check output path (a bare filename means the current directory)
    output_dir = os.path.dirname(OUTPUT_PATH) or "."
    if os.access(output_dir, os.W_OK):
        print(f"    Output directory is writable")
    else:
        print(f"    Cannot write to output directory: {output_dir}")
        return

    print(f"\n Configuration:")
    print(f"    Model: {MODEL_NAME}")
    print(f"    Dataset: {DATASET_PATH}")
    print(f"    Output: {OUTPUT_PATH}")
    print(f"    API: Together.ai")

    # User confirmation
    print(f"\n" + "─" * 60)
    response = input(" Ready to start evaluation? (y/n): ").strip().lower()
    if response != 'y':
        print(" Evaluation cancelled.")
        return

    # Initialize evaluator
    try:
        print(f"\n Initializing evaluator...")
        evaluator = KGOnlyLLMJudgeEvaluator(api_key=API_KEY, model_name=MODEL_NAME)
        print(f" Evaluator initialized successfully!")
    except Exception as e:
        print(f" Failed to initialize evaluator: {e}")
        return

    # Run evaluation
    start_time = time.time()
    print(f"\n Starting evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    try:
        results_df = evaluator.evaluate_dataset(
            dataset_path=DATASET_PATH,
            output_path=OUTPUT_PATH,
            sample_size=None,  # Process all items
            delay_seconds=1.0,  # 1 second delay for stability
            checkpoint_interval=25  # Save every 25 evaluations
        )

        duration = time.time() - start_time

        if not results_df.empty:
            print(f"\n EVALUATION COMPLETED SUCCESSFULLY!")
            print(f"  Total time: {duration/60:.1f} minutes")
            print(f" Items evaluated: {len(results_df)}")
            print(f" Results saved to: {OUTPUT_PATH}")
            print(f" Average score: {results_df['Overall_Score'].mean():.2f}/5.0")

            # Performance stats
            items_per_minute = len(results_df) / (duration / 60) if duration > 0 else 0
            print(f"⚡ Processing rate: {items_per_minute:.1f} items/minute")

            # Quick quality check
            high_quality = len(results_df[results_df['Overall_Score'] >= 4.0])
            quality_rate = (high_quality / len(results_df)) * 100
            print(f" High quality responses (≥4.0): {quality_rate:.1f}%")

            print(f"\n✨ Evaluation complete! Check the detailed summary above for insights.")

        else:
            print(f" Evaluation failed - no results generated.")
            print(f" Check the logs for detailed error information.")

    except KeyboardInterrupt:
        print(f"\n  Evaluation interrupted by user")
        print(f" Partial results may be saved in checkpoint file")
    except Exception as e:
        # Top-level boundary for the run: report and return instead of crashing.
        print(f"\n Evaluation failed with error: {e}")
        print(f" Check the logs for detailed error information")

def main():
    """
    Script entry point.

    Tries to import each required third-party package, pip-installs the ones
    that fail to import, and then hands over to ``run_kg_only_evaluation``.
    Any exception raised along the way is reported as a fatal error instead
    of propagating.
    """
    def _cannot_import(name):
        # True when importing `name` raises ImportError.
        try:
            __import__(name)
        except ImportError:
            return True
        return False

    try:
        # Ensure required packages are available
        absent = [pkg for pkg in ('pandas', 'tqdm', 'together') if _cannot_import(pkg)]

        if absent:
            print(f" Installing missing packages: {', '.join(absent)}")
            pip_prefix = [sys.executable, "-m", "pip", "install"]
            for pkg in absent:
                subprocess.check_call(pip_prefix + [pkg])
            print(f" All packages installed successfully!")

        # Run the evaluation
        run_kg_only_evaluation()

    except Exception as err:
        print(f" Fatal error: {err}")
        print(f" Please check your Python environment and try again")

# Run main() only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()