# In [None]:
import json
import pandas as pd
import requests
import time
from typing import Dict, List, Optional
import logging
from tqdm import tqdm
import re
import os
from datetime import datetime

class KGOnlyLLMJudgeEvaluator:
    """
    Second Ablation Study: KG-Only LLM Judge Evaluation System
    Evaluates QA pairs generated using only Knowledge Graph information without text chunks.
    """
    
    def __init__(self, api_key: str, model_name: str = "deepseek-ai/DeepSeek-R1-Distill-Llama-70B"):
        """
        Initialize the KG-Only LLM Judge Evaluator.
        
        Args:
            api_key: NScale API key
            model_name: Model identifier for the LLM judge
        """
        self.api_key = api_key
        self.model_name = model_name
        self.base_url = "https://inference.api.nscale.com/v1/chat/completions"
        self.headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}"
        }
        
        # Setup logging
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        # KG-Only evaluation prompt (modified for second ablation study)
        self.evaluation_prompt = """You are a STRICT QA evaluator following precise scoring guidelines for a KNOWLEDGE GRAPH-ONLY.

You will evaluate a model-generated question-answer pair on FIVE metrics using a 1-5 scale where:
5 = Excellent  
4 = Good  
3 = Fair  
2 = Poor  
1 = Very Poor


=============================================
DETAILED SCORING CRITERIA FOR KG-ONLY SYSTEM
=============================================

1. RELEVANCE (1-5): Does the question appropriately relate to the available KG information?
   5: Perfectly relevant to the KG triples, clearly grounded in the graph structure
   4: Mostly relevant, with minor deviations from KG scope
   3: Addresses KG information but may miss some connections
   2: Loosely related to KG, with significant irrelevant elements
   1: Entirely irrelevant or unrelated to the available KG triples

2. ACCURACY (1-5): Is the answer factually correct based on the KG triples?
   5: All facts are accurate and fully derivable from KG triples
   4: Mostly accurate; contains only minor factual issues
   3: Some factual inconsistencies or unsupported assumptions
   2: Several factual errors that contradict KG information
   1: Mostly inaccurate or contradicts KG triples

3. COMPLETENESS (1-5): Does the answer fully address the question using available KG information?
   5: Thorough response using all relevant KG connections
   4: Covers most aspects but may miss some KG relationships
   3: Addresses main question but omits important KG details
   2: Partial answer with significant gaps in KG utilization
   1: Severely incomplete or fails to use relevant KG information

4. FLUENCY (1-5): Is the answer well-written and grammatically correct?
   5: Excellent grammar and clarity; highly readable despite KG-only constraints
   4: Minor grammatical or structural issues
   3: Understandable, but contains noticeable language errors
   2: Somewhat unclear due to poor grammar or phrasing
   1: Difficult to read or understand

5. KG ALIGNMENT (1-5): How well does the answer reflect the KG triples?
   5: Effectively uses KG relationships; no contradictions; may include additional relevant info
   4: Uses most relevant KG information correctly; may omit minor details, no contradictions
   3: Uses some KG information; may miss important relationships but generally consistent
   2: Limited use of KG information; may contain minor contradictions or misinterpretations
   1: Ignores KG information entirely or includes clear contradictions

EVALUATION GUIDELINES FOR KG-ONLY SYSTEM:
• Remember: This system has NO access to source text chunks
• Focus on how well the system leverages graph relationships
• Penalize hallucinations not supported by KG triples
• Reward effective connection of multiple KG relationships
• Consider that some limitations are expected due to KG-only constraint


==============================
INPUT
==============================
**Question:** {question}
**Answer:** {answer}
**Available Knowledge Graph Triples:** {kg_triples}

==============================
RESPONSE FORMAT
==============================
Provide ONLY the numerical scores in this exact format (no explanation):

Relevance: X  
Accuracy: X  
Completeness: X  
Fluency: X  
KG_Alignment: X


Where X is a number from 1 to 5."""

    def extract_triples_from_item(self, item: Dict) -> str:
        """
        Format the KG triples attached to a QA item as a numbered list.

        Triples are read from ``item["ground_truth"]["source_triples"]``, which
        may be a list of triple dicts or a single triple dict. If that yields
        nothing (key missing, ``ground_truth`` not a dict, or no well-formed
        triples), the whole item is searched recursively for any dict that has
        ``subject``, ``predicate`` and ``object`` keys.

        Args:
            item: One QA record from the dataset

        Returns:
            Lines of the form ``"<n>. <subject> → <predicate> → <object>"``
            joined by newlines, or the sentinel string
            ``"No KG triples available"`` when no triple is found.
        """
        required_keys = ("subject", "predicate", "object")

        def is_triple(candidate) -> bool:
            return isinstance(candidate, dict) and all(key in candidate for key in required_keys)

        def render(triple: Dict) -> str:
            return f"{triple['subject']} → {triple['predicate']} → {triple['object']}"

        def collect(node) -> List[str]:
            # Depth-first search through nested dicts/lists for triple-shaped dicts
            found = []
            if isinstance(node, dict):
                if is_triple(node):
                    found.append(render(node))
                for value in node.values():
                    found.extend(collect(value))
            elif isinstance(node, list):
                for element in node:
                    found.extend(collect(element))
            return found

        triples = []

        # Preferred location. The isinstance guard matters: a record with
        # "ground_truth": null must fall through to the recursive search
        # rather than raise a TypeError on the membership test.
        ground_truth = item.get("ground_truth") if isinstance(item, dict) else None
        if isinstance(ground_truth, dict) and "source_triples" in ground_truth:
            source_triples = ground_truth["source_triples"]
            if isinstance(source_triples, list):
                triples = [render(entry) for entry in source_triples if is_triple(entry)]
            elif is_triple(source_triples):
                triples = [render(source_triples)]

        # Fallback: look for triples anywhere in the record
        if not triples:
            triples = collect(item)

        if not triples:
            return "No KG triples available"
        return "\n".join(f"{number}. {triple}" for number, triple in enumerate(triples, 1))

    def call_llm_judge(self, question: str, answer: str, kg_triples: str,
                      max_retries: int = 3) -> Optional[Dict[str, int]]:
        """
        Call the LLM judge to evaluate a KG-only QA pair with retry logic.

        Retry policy:
          * HTTP 200 with an empty or unparseable body: retried immediately.
          * HTTP 429, 408 and 5xx: retried after an exponential backoff
            (1s, 2s, 4s, ...); no sleep after the final attempt.
          * Any other status (e.g. 400/401/403): logged and not retried,
            because resending the identical request cannot succeed.
          * Timeouts and other request errors: retried after a backoff.

        Args:
            question: The question to evaluate
            answer: The answer to evaluate
            kg_triples: Knowledge graph triples (only source of information)
            max_retries: Maximum number of attempts

        Returns:
            Dict with the five metric scores, or None if every attempt failed
        """
        # Format the prompt for KG-only evaluation
        prompt = self.evaluation_prompt.format(
            question=question,
            answer=answer,
            kg_triples=kg_triples
        )

        # Prepare API request
        payload = {
            "model": self.model_name,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": 0.0,  # Deterministic for consistent scoring
            "max_tokens": 800,
            "top_p": 0.8
        }

        for attempt in range(max_retries):
            has_retry_left = attempt < max_retries - 1
            wait_time = 2 ** attempt

            try:
                response = requests.post(
                    self.base_url,
                    headers=self.headers,
                    json=payload,
                    timeout=45
                )
                status = response.status_code

                if status == 200:
                    result = response.json()
                    msg = result['choices'][0].get('message', {})
                    # Some models return their text under "reasoning_content"
                    # and leave "content" empty; accept either.
                    content = msg.get("content", "") or msg.get("reasoning_content", "")

                    if content:
                        parsed_result = self.parse_evaluation_response(content)
                        if parsed_result:
                            return parsed_result
                    else:
                        self.logger.warning("Empty response content from model")

                elif status == 429:
                    if has_retry_left:
                        self.logger.warning(f"Rate limit hit, waiting {wait_time}s before retry {attempt + 1}/{max_retries}")
                        time.sleep(wait_time)
                    else:
                        self.logger.warning(f"Rate limit hit on final attempt {attempt + 1}/{max_retries}")

                elif status == 408 or status >= 500:
                    # Transient server-side failure: back off before retrying
                    self.logger.error(f"API Error {status}: {response.text}")
                    if has_retry_left:
                        time.sleep(wait_time)

                else:
                    # Permanent client error; the same payload will fail again
                    self.logger.error(f"API Error {status}: {response.text}")
                    break

            except requests.exceptions.Timeout:
                self.logger.warning(f"Timeout on attempt {attempt + 1}/{max_retries}")
                if has_retry_left:
                    time.sleep(wait_time)

            except Exception as e:
                # Deliberately broad: this is the retry boundary, and a single
                # malformed response must not abort a long evaluation run.
                self.logger.error(f"Request failed on attempt {attempt + 1}: {str(e)}")
                if has_retry_left:
                    time.sleep(wait_time)

        return None

    def parse_evaluation_response(self, response: str) -> Optional[Dict[str, int]]:
        """
        Parse the LLM response to extract numerical scores for KG-only evaluation.
        
        Expected format includes KG_Utilization instead of KG_Alignment:
        Relevance: 4
        Accuracy: 5
        Completeness: 3
        Fluency: 4
        Kg Alignement 2
        """
        try:
            scores = {}
            lines = response.strip().split('\n')
            
            for line in lines:
                line = line.strip()
                if ':' in line:
                    parts = line.split(':', 1)
                    if len(parts) == 2:
                        metric = parts[0].strip()
                        score_text = parts[1].strip()
                        
                        # Improved regex pattern
                        score_match = re.search(r':\s*([1-5])(?:/5)?', line)
                        if score_match:
                            score = int(score_match.group(1))
                            
                            # Normalize metric names (adapted for KG-only)
                            metric_lower = metric.lower()
                            if 'relevance' in metric_lower:
                                scores['Relevance'] = score
                            elif 'accuracy' in metric_lower:
                                scores['Accuracy'] = score
                            elif 'completeness' in metric_lower:
                                scores['Completeness'] = score
                            elif 'fluency' in metric_lower:
                                scores['Fluency'] = score
                            elif 'kg' in metric_lower or 'alignment' in metric_lower:
                                scores['KG_Alignment'] = score
            
            # Validate we have all 5 scores for KG-only evaluation
            expected_metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
            if all(metric in scores for metric in expected_metrics):
                return scores
            else:
                self.logger.warning(f"Missing metrics in response: {response}")
                return self._parse_with_fallback(response)
                
        except Exception as e:
            self.logger.error(f"Failed to parse response: {response}. Error: {str(e)}")
            return None
    
    def _parse_with_fallback(self, response: str) -> Optional[Dict[str, int]]:
        """Fallback parsing with multiple regex patterns for KG-only evaluation."""
        patterns = [
            r':\s*([1-5])(?:/5)?',          # Primary pattern
            r'\b([1-5])\b',                 # Simple number pattern
            r'([1-5])\s*(?:out of 5|/5)?'   # Alternative pattern
        ]
        
        for pattern in patterns:
            try:
                scores = {}
                lines = response.strip().split('\n')
                
                for line in lines:
                    if ':' in line:
                        parts = line.split(':', 1)
                        if len(parts) == 2:
                            metric = parts[0].strip()
                            score_text = parts[1].strip()
                            
                            score_match = re.search(pattern, score_text)
                            if score_match:
                                score = int(score_match.group(1))
                                
                                # Normalize metric names for KG-only
                                metric_lower = metric.lower()
                                if 'relevance' in metric_lower:
                                    scores['Relevance'] = score
                                elif 'accuracy' in metric_lower:
                                    scores['Accuracy'] = score
                                elif 'completeness' in metric_lower:
                                    scores['Completeness'] = score
                                elif 'fluency' in metric_lower:
                                    scores['Fluency'] = score
                                elif 'kg' in metric_lower or 'alignment' in metric_lower:
                                    scores['KG_Alignment'] = score
                
                # Check if this pattern worked
                expected_metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
                if all(metric in scores for metric in expected_metrics):
                    return scores
                    
            except Exception:
                continue
        
        return None

    def load_qa_dataset(self, file_path: str) -> List[Dict]:
        """Load KG-only QA dataset from JSON file with correct structure."""
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            
            # Handle nested structure: root -> metadata -> queries
            if "metadata" in data and "queries" in data["metadata"]:
                queries = data["metadata"]["queries"]
            elif "queries" in data:
                queries = data["queries"] 
            else:
                # Fallback: assume data is the queries list directly
                queries = data if isinstance(data, list) else []
            
            self.logger.info(f"Loaded {len(queries)} KG-only QA pairs from {file_path}")
            
            # Log sample structure for debugging
            if queries:
                sample = queries[0]
                self.logger.info(f"Sample structure keys: {list(sample.keys())}")
                if "ground_truth" in sample:
                    self.logger.info(f"Ground truth keys: {list(sample['ground_truth'].keys())}")
            
            return queries
            
        except Exception as e:
            self.logger.error(f"Failed to load dataset: {str(e)}")
            return []

    def save_checkpoint(self, results: List[Dict], checkpoint_path: str):
        """Save evaluation results as checkpoint."""
        try:
            df = pd.DataFrame(results)
            df.to_csv(checkpoint_path, index=False)
            self.logger.info(f"Checkpoint saved: {len(results)} results to {checkpoint_path}")
        except Exception as e:
            self.logger.error(f"Failed to save checkpoint: {str(e)}")

    def load_checkpoint(self, checkpoint_path: str) -> List[Dict]:
        """Load evaluation results from checkpoint."""
        try:
            if os.path.exists(checkpoint_path):
                df = pd.read_csv(checkpoint_path)
                results = df.to_dict('records')
                self.logger.info(f"Loaded checkpoint: {len(results)} results from {checkpoint_path}")
                return results
            return []
        except Exception as e:
            self.logger.error(f"Failed to load checkpoint: {str(e)}")
            return []

    def evaluate_dataset(self, dataset_path: str, output_path: str,
                        sample_size: Optional[int] = None,
                        delay_seconds: float = 0.5,
                        checkpoint_interval: int = 50) -> pd.DataFrame:
        """
        Evaluate KG-only QA dataset using LLM judge.

        The run can be resumed: results are periodically written to
        ``<output_path>.checkpoint`` (also when the run is interrupted with
        Ctrl-C), and items whose id already appears in the checkpoint are
        skipped on the next run. The checkpoint is deleted once the final
        CSV has been written.

        Args:
            dataset_path: Path to the KG-only QA dataset JSON file
            output_path: Path to save evaluation results
            sample_size: Number of samples to evaluate (None for all)
            delay_seconds: Delay between API calls to avoid rate limits
            checkpoint_interval: Save checkpoint every N successful
                evaluations; values <= 0 disable periodic checkpoints

        Returns:
            DataFrame with evaluation results (empty if nothing succeeded)
        """
        # Load dataset
        qa_items = self.load_qa_dataset(dataset_path)
        if not qa_items:
            self.logger.error("No QA items loaded. Exiting.")
            return pd.DataFrame()

        # Sample subset if requested
        if sample_size and sample_size < len(qa_items):
            import random
            random.seed(42)  # For reproducibility
            qa_items = random.sample(qa_items, sample_size)
            self.logger.info(f"Sampling {sample_size} items for evaluation")

        # Setup checkpoint
        checkpoint_path = f"{output_path}.checkpoint"
        results = self.load_checkpoint(checkpoint_path)

        # Track processed items. Ids are compared as strings because the CSV
        # round trip can change their type (e.g. "123" comes back as 123).
        # NOTE: ids with leading zeros ("007") still lose them in the CSV and
        # will not match on resume.
        processed_ids = {str(result['qa_id']) for result in results if 'qa_id' in result}

        # Give every item its id exactly once. Items without an "id" get a
        # positional id; using the position (rather than list.index) keeps
        # this linear and gives equal-looking items distinct ids.
        identified_items = []
        for position, item in enumerate(qa_items):
            fallback_id = f'item_{position}'
            item_id = item.get('id', fallback_id) if isinstance(item, dict) else fallback_id
            identified_items.append((item_id, item))

        # Filter unprocessed items
        remaining_items = [(item_id, item) for item_id, item in identified_items
                           if str(item_id) not in processed_ids]

        if processed_ids:
            self.logger.info(f"Resuming from checkpoint: {len(results)} completed, {len(remaining_items)} remaining")

        failed_evaluations = 0

        # Progress bar with correct total and initial values; it is advanced
        # manually with update(1) for every handled item.
        total_items = len(qa_items)
        completed_items = len(results)
        pbar = tqdm(
            desc="Evaluating KG-only QA pairs",
            total=total_items,
            initial=completed_items,
            unit="items"
        )

        try:
            for qa_id, item in remaining_items:
                try:
                    # Extract data from item with KG-only dataset structure
                    question = item.get('question', '')
                    answer = item.get('answer', '')
                    question_type = item.get('question_type', 'unknown')
                    generation_method = item.get('generation_method', 'unknown')
                    ground_truth = item.get('ground_truth')
                    if not isinstance(ground_truth, dict):
                        ground_truth = {}

                    # Verify this is from KG-only ablation study
                    if generation_method != 'ablation_kg_only':
                        self.logger.warning(f"Item {qa_id} not from KG-only ablation (method: {generation_method})")

                    # Extract KG triples (only source of information for ablation study)
                    kg_triples = self.extract_triples_from_item(item)

                    # Skip if essential data is missing
                    if not question or not answer:
                        self.logger.warning(f"Skipping item {qa_id}: missing question or answer")
                        pbar.update(1)
                        continue

                    # Skip if no KG triples available
                    if kg_triples == "No KG triples available":
                        self.logger.warning(f"Skipping item {qa_id}: no KG triples found")
                        pbar.update(1)
                        continue

                    # Call LLM judge (KG-only evaluation)
                    evaluation = self.call_llm_judge(question, answer, kg_triples)

                    if evaluation:
                        # Store results with additional KG-only metadata
                        result = {
                            'qa_id': qa_id,
                            'question_type': question_type,
                            'generation_method': generation_method,
                            'question': question,
                            'answer': answer,
                            'num_source_triples': ground_truth.get('num_source_triples', 0),
                            'text_context_used': ground_truth.get('text_context_used', False),
                            'ablation_study': ground_truth.get('ablation_study', 'unknown'),
                            'Relevance': evaluation['Relevance'],
                            'Accuracy': evaluation['Accuracy'],
                            'Completeness': evaluation['Completeness'],
                            'Fluency': evaluation['Fluency'],
                            'KG_Alignment': evaluation['KG_Alignment'],
                            'Overall_Score': sum(evaluation.values()) / len(evaluation)
                        }
                        results.append(result)

                        # Update progress bar with detailed status
                        pbar.set_postfix({
                            'Completed': len(results),
                            'Failed': failed_evaluations,
                            'Success_Rate': f"{len(results)/(len(results)+failed_evaluations)*100:.1f}%",
                            'Last_Score': f"{result['Overall_Score']:.1f}"
                        })

                        # Save checkpoint
                        if checkpoint_interval > 0 and len(results) % checkpoint_interval == 0:
                            self.save_checkpoint(results, checkpoint_path)

                    else:
                        failed_evaluations += 1
                        self.logger.warning(f"Failed to evaluate item {qa_id}")

                    # Update progress bar
                    pbar.update(1)

                    # Rate limiting
                    if delay_seconds > 0:
                        time.sleep(delay_seconds)

                except Exception as e:
                    # One bad item must not stop the whole run
                    failed_evaluations += 1
                    self.logger.error(f"Error processing item {qa_id}: {str(e)}")
                    pbar.update(1)
                    continue

        except KeyboardInterrupt:
            # Keep what has been evaluated since the last periodic checkpoint
            if results:
                self.save_checkpoint(results, checkpoint_path)
            self.logger.warning(f"Interrupted: {len(results)} results kept in {checkpoint_path}")
            raise

        finally:
            pbar.close()

        # Final save
        if results:
            df = pd.DataFrame(results)
            df.to_csv(output_path, index=False)

            # Clean up checkpoint
            if os.path.exists(checkpoint_path):
                os.remove(checkpoint_path)

            # Print summary statistics
            self.print_evaluation_summary(df, failed_evaluations)

            return df
        else:
            self.logger.error("No successful evaluations completed")
            return pd.DataFrame()

    def print_evaluation_summary(self, df: pd.DataFrame, failed_count: int):
        """
        Print comprehensive summary statistics of the KG-only evaluation.

        Sections printed: run statistics, per-metric averages, overall score
        statistics, score distribution, KG-specific analysis and, if the
        column exists, a breakdown by question type. The optional columns
        ``num_source_triples``, ``text_context_used`` and ``ablation_study``
        are only reported when present.

        For an empty DataFrame only the run statistics are printed, because
        every later section would divide by zero or index missing columns.

        Args:
            df: Evaluation results, one row per successfully judged QA pair
            failed_count: Number of QA pairs whose evaluation failed
        """
        evaluated = len(df)

        print(f"\n{'='*70}")
        print(f" SECOND ABLATION STUDY: KG-ONLY EVALUATION SUMMARY")
        print(f"{'='*70}")

        print(f" Evaluation Statistics:")
        print(f"   Total Evaluated: {evaluated}")
        print(f"   Failed Evaluations: {failed_count}")

        if evaluated == 0:
            print("   No successful evaluations to summarize.")
            return

        print(f"   Success Rate: {evaluated/(evaluated+failed_count)*100:.1f}%")
        print(f"   Evaluation Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

        print(f"\n Average Scores by Metric (KG-Only System):")
        metrics = ['Relevance', 'Accuracy', 'Completeness', 'Fluency', 'KG_Alignment']
        for metric in metrics:
            mean_score = df[metric].mean()
            std_score = df[metric].std()
            min_score = df[metric].min()
            max_score = df[metric].max()
            print(f"   {metric}: {mean_score:.2f} ± {std_score:.2f} (range: {min_score}-{max_score})")

        print(f"\n Overall Performance:")
        print(f"   Mean Overall Score: {df['Overall_Score'].mean():.2f}")
        print(f"   Median Overall Score: {df['Overall_Score'].median():.2f}")
        print(f"   Best Score: {df['Overall_Score'].max():.2f}")
        print(f"   Worst Score: {df['Overall_Score'].min():.2f}")

        # Score distribution. Buckets are half-open [low, high), so a perfect
        # 5.0 falls outside the last bucket and is reported on its own line.
        print(f"\n Score Distribution:")
        score_ranges = [(1.0, 2.0), (2.0, 3.0), (3.0, 4.0), (4.0, 5.0)]
        for min_score, max_score in score_ranges:
            count = len(df[(df['Overall_Score'] >= min_score) & (df['Overall_Score'] < max_score)])
            percentage = (count / evaluated) * 100
            print(f"   {min_score:.1f}-{max_score:.1f}: {count} ({percentage:.1f}%)")

        # Perfect scores
        perfect_scores = len(df[df['Overall_Score'] == 5.0])
        print(f"   Perfect (5.0): {perfect_scores} ({(perfect_scores/evaluated)*100:.1f}%)")

        # KG-specific analysis
        print(f"\n KG-Only System Analysis:")
        print(f"   Average KG Alignment: {df['KG_Alignment'].mean():.2f}")
        print(f"   KG Alignment Range: {df['KG_Alignment'].min():.1f} - {df['KG_Alignment'].max():.1f}")

        # Dataset-specific insights
        if 'num_source_triples' in df.columns:
            print(f"   Average Source Triples: {df['num_source_triples'].mean():.1f}")
            print(f"   Triple Count Range: {df['num_source_triples'].min()}-{df['num_source_triples'].max()}")

        if 'text_context_used' in df.columns:
            text_context_count = df['text_context_used'].sum()
            print(f"   Text Context Used: {text_context_count} ({(text_context_count/evaluated)*100:.1f}%)")

        if 'ablation_study' in df.columns:
            kg_only_count = len(df[df['ablation_study'] == 'kg_only'])
            print(f"   KG-Only Items: {kg_only_count} ({(kg_only_count/evaluated)*100:.1f}%)")

        # Low KG alignment items
        low_kg_align = len(df[df['KG_Alignment'] <= 2])
        print(f"   Poor KG Alignment (≤2): {low_kg_align} ({(low_kg_align/evaluated)*100:.1f}%)")

        # Correlation analysis between triple count and performance
        if 'num_source_triples' in df.columns and evaluated > 1:
            correlation = df['num_source_triples'].corr(df['Overall_Score'])
            print(f"   Triple Count vs Performance Correlation: {correlation:.3f}")

        if 'question_type' in df.columns:
            print(f"\n Performance by Question Type (KG-Only):")
            type_summary = df.groupby('question_type').agg({
                'Overall_Score': ['mean', 'std', 'count'],
                'Relevance': 'mean',
                'Accuracy': 'mean',
                'Completeness': 'mean',
                'Fluency': 'mean',
                'KG_Alignment': 'mean'
            }).round(3)

            for qtype in type_summary.index:
                stats = type_summary.loc[qtype]
                mean_score = stats[('Overall_Score', 'mean')]
                std_score = stats[('Overall_Score', 'std')]
                # Selecting a row upcasts the count to float; print it as an
                # integer ("n=2", not "n=2.0").
                count = int(stats[('Overall_Score', 'count')])
                kg_align = stats[('KG_Alignment', 'mean')]
                print(f"   {qtype}: {mean_score:.2f} ± {std_score:.2f} (n={count}, KG_Align={kg_align:.2f})")

def run_kg_only_evaluation():
    """
    Run KG-only evaluation for the second ablation study.

    Interactive entry point: prints a banner, asks for confirmation, runs
    the evaluation over the whole dataset and prints timing and score
    statistics.

    The API key is taken from the ``API_KEY`` constant below or, when that
    is left empty, from the ``NSCALE_API_KEY`` environment variable. Without
    a key the function stops immediately, since every request would
    otherwise be rejected only after going through the retry logic.
    """

    # Configuration
    API_KEY = ""  # Replace with your actual API key, or set NSCALE_API_KEY
    DATASET_PATH = "Ablation_2_kg_only_qa_dataset.json"  # Update with your KG-only dataset path
    OUTPUT_PATH = "DeepSeek-R1_Distill_Llma70B_kg_only_evaluation_results.csv"

    api_key = API_KEY or os.environ.get("NSCALE_API_KEY", "")
    if not api_key:
        print("✗ No API key configured. Set API_KEY in run_kg_only_evaluation() "
              "or the NSCALE_API_KEY environment variable.")
        return

    # Initialize KG-only evaluator
    evaluator = KGOnlyLLMJudgeEvaluator(api_key=api_key)

    print("┌" + "─" * 68 + "┐")
    print("│ SECOND ABLATION STUDY: KG-ONLY EVALUATION                       │")
    print("├" + "─" * 68 + "┤")
    print("│ This evaluates QA pairs generated using ONLY Knowledge Graph    │")
    print("│ information without any source text chunks.                     │")

    # Confirm before starting
    response = input("\nProceed with KG-only evaluation? (y/n): ").strip().lower()
    if response != 'y':
        print("✗ Evaluation cancelled.")
        return

    start_time = time.time()

    # Run evaluation on KG-only dataset
    results_df = evaluator.evaluate_dataset(
        dataset_path=DATASET_PATH,
        output_path=OUTPUT_PATH,
        sample_size=None,  # Process all items
        delay_seconds=0.5,  # 0.5 second delay between requests
        checkpoint_interval=50  # Save every 50 evaluations
    )

    end_time = time.time()
    duration = end_time - start_time

    if not results_df.empty:
        print(" KG-ONLY EVALUATION COMPLETED!")
        print(f"  Total time: {duration/60:.1f} minutes")
        print(f"  Evaluated: {len(results_df)} QA pairs")
        print(f"  Results saved to: {OUTPUT_PATH}")

        # Additional analysis
        print("\n Processing Statistics:")
        print(f"   Average processing time: {duration/len(results_df):.2f} seconds per QA pair")
        print(f"   Items per minute: {len(results_df)/(duration/60):.1f}")

        # KG-specific insights
        print("\n KG-Only System Insights:")
        print(f"   Mean Overall Score: {results_df['Overall_Score'].mean():.2f}")
        print(f"   Mean KG Alignment: {results_df['KG_Alignment'].mean():.2f}")
        print(f"   Score Range: {results_df['Overall_Score'].min():.2f} - {results_df['Overall_Score'].max():.2f}")
        print(f"   Standard Deviation: {results_df['Overall_Score'].std():.2f}")

    else:
        print(" Evaluation failed. Check logs for details.")


# Run the interactive evaluation only when executed as a script
if __name__ == "__main__":
    run_kg_only_evaluation()