In [1]:
import os
import math
import pandas as pd
import logging
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from tqdm import tqdm
import numpy as np



## Loop over input_files

## Process

In [1]:
import math
import pandas as pd
import logging
from tqdm import tqdm
import numpy as np
import json
import os


def log_error(error_msg, error_file="error_log.txt"):
    """Append one timestamped error line to a log file.

    Args:
        error_msg: Text to record.
        error_file: Path of the log file; it is opened in append mode, so
            earlier entries are kept.
    """
    now = pd.Timestamp.now()
    entry = "[{}] {}\n".format(now.strftime("%Y-%m-%d %H:%M:%S"), error_msg)
    with open(error_file, 'a') as log_handle:
        log_handle.write(entry)


def get_scores(file_path1):
    """
    Label each question of one evaluation JSON file as hallucination or fact.

    For every key "Q1" .. "Q15" present in the file, the entry's "score" and
    "direct_score" values are read and converted to float:

    * both are 0/1 but disagree -> "direct_score" decides
      (1 -> "hallucination", 0 -> "fact");
    * otherwise any score equal to 1 -> "hallucination";
    * otherwise any score equal to 0 -> "fact";
    * otherwise -> "N/A".

    Questions where either score is missing are reported on stdout and left
    out of the result.

    Args:
        file_path1: Path to the evaluation JSON file.

    Returns:
        Dict mapping question key to its label, or None if anything raised
        while reading or interpreting the file (the error is printed).
    """
    try:
        with open(file_path1, 'r', encoding='utf-8') as handle:
            payload = json.load(handle)

        labels = {}
        for key in (f"Q{idx}" for idx in range(1, 16)):
            # Questions absent from the file are silently ignored.
            if key not in payload:
                continue

            entry = payload[key]
            raw_score = entry.get("score")
            raw_direct = entry.get("direct_score")

            if raw_score is None or raw_direct is None:
                print(f"Warning: Missing score for {key} in one of the files")
                continue

            score = float(raw_score)
            direct = float(raw_direct)

            both_binary = score in (0, 1) and direct in (0, 1)
            if both_binary and score != direct:
                # Two valid but contradictory verdicts: trust direct_score.
                print(f"Found conflicting scores for {key}: {score} vs {direct}, using direct_score")
                labels[key] = "hallucination" if direct == 1 else "fact"
            elif score == 1 or direct == 1:
                labels[key] = "hallucination"
            elif score == 0 or direct == 0:
                labels[key] = "fact"
            else:
                labels[key] = "N/A"
                print(f"Found N/A scores for {key}: {score} vs {direct}")

        return labels

    except Exception as e:
        print(f"Error comparing files: {e}")
        return None

def add_evaluation_to_file(results, target_file, output_file):
    """
    Merge evaluation labels into a model-answer file and save the result.

    Every top-level entry of ``target_file`` is rewritten as
    ``{"answer": <original value>}``; entries whose key also appears in
    ``results`` additionally get ``"eval_databases": results[key]``.

    Both files are handled as UTF-8 (matching ``get_scores``) so that answers
    containing non-ASCII text do not depend on the platform's default
    encoding.

    Args:
        results: Dict mapping question key to its label, as returned by
            ``get_scores``.
        target_file: Path to the JSON file holding the answers.
        output_file: Path where the merged JSON is written.

    Returns:
        None. Failures are not raised: any exception is printed and the
        function returns None, so callers cannot tell success from failure
        through the return value.
    """
    try:
        with open(target_file, 'r', encoding='utf-8') as f:
            target_data = json.load(f)

        updated_data = {}
        for q_key, text in target_data.items():
            entry = {"answer": text}
            if q_key in results:
                entry["eval_databases"] = results[q_key]
            updated_data[q_key] = entry

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(updated_data, f, indent=2)

        print(f"Successfully added evaluation results to {output_file}")

    except Exception as e:
        # Best-effort by design: report and carry on with the next file.
        print(f"Error adding evaluation results: {e}")
        return None

def create_summary_report(granularity_levels=None, input_kinds=None,
                          base_dir=None,
                          report_path="evaluation_summary_report_1.txt"):
    """
    Create a detailed summary report of the evaluation results.

    For every granularity / input-type combination, the JSON files in
    ``<base_dir>/<granularity>_level/Inference_<input_type>_<granularity>``
    are read and the ``"eval_databases"`` label of each question is tallied,
    both overall and per question (Q1..Q15) per combination. A plain-text
    report is written to ``report_path``.

    Entries that are not dicts, carry no ``"eval_databases"`` label, carry a
    label other than "fact" / "hallucination" / "N/A", or use a question key
    outside Q1..Q15 are skipped (the last two with a printed warning), so the
    overall totals always agree with the per-question counts and one odd
    entry does not discard the rest of its file.

    Args:
        granularity_levels: Granularity names to report on. Defaults to the
            module-level ``granularities``.
        input_kinds: Input-type names to report on. Defaults to the
            module-level ``input_types``.
        base_dir: Root directory of the labelled files. Defaults to the
            module-level ``output_base_path``.
        report_path: Where the text report is written.

    Returns:
        Tuple ``(total_summary, combination_summaries)``: the overall counts
        and a dict mapping "<granularity>_<input_type>" to per-question
        counts.
    """
    # Fall back to the module-level configuration when no override is given.
    levels = list(granularities if granularity_levels is None else granularity_levels)
    kinds = list(input_types if input_kinds is None else input_kinds)
    root_dir = output_base_path if base_dir is None else base_dir

    question_keys = [f"Q{i}" for i in range(1, 16)]
    # Label stored in the files -> field name used in the overall summary.
    total_field = {"fact": "facts", "hallucination": "hallucinations", "N/A": "N/A"}

    total_summary = {
        "total_processed": 0,
        "facts": 0,
        "hallucinations": 0,
        "N/A": 0,
    }

    combination_summaries = {
        f"{granularity}_{input_type}": {
            q_key: {"fact": 0, "hallucination": 0, "N/A": 0}
            for q_key in question_keys
        }
        for granularity in levels
        for input_type in kinds
    }

    # Process each combination
    for granularity in levels:
        for input_type in kinds:
            output_dir = os.path.join(
                root_dir,
                f"{granularity}_level",
                f"Inference_{input_type}_{granularity}"
            )

            if not os.path.exists(output_dir):
                continue

            per_question = combination_summaries[f"{granularity}_{input_type}"]

            json_files = sorted(f for f in os.listdir(output_dir) if f.endswith('.json'))
            for json_file in json_files:
                file_path = os.path.join(output_dir, json_file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                except (OSError, ValueError) as e:
                    # ValueError covers both malformed JSON and decode errors.
                    print(f"Error processing {file_path}: {e}")
                    continue

                if not isinstance(data, dict):
                    print(f"Error processing {file_path}: top-level JSON value is not an object")
                    continue

                for q_key, content in data.items():
                    if not isinstance(content, dict) or "eval_databases" not in content:
                        continue

                    result = content["eval_databases"]

                    # Validate before counting so totals and per-question
                    # tallies can never drift apart.
                    if not isinstance(result, str) or result not in total_field:
                        print(f"Warning: unexpected label {result!r} for {q_key} in {file_path}, skipping")
                        continue
                    if q_key not in per_question:
                        print(f"Warning: unexpected question key {q_key!r} in {file_path}, skipping")
                        continue

                    total_summary[total_field[result]] += 1
                    total_summary["total_processed"] += 1
                    per_question[q_key][result] += 1

    # Create the report
    report = []
    report.append("=" * 80)
    report.append("EVALUATION SUMMARY REPORT")
    report.append("=" * 80)
    report.append("\nOVERALL SUMMARY:")
    report.append("-" * 40)
    report.append(f"Total questions processed: {total_summary['total_processed']}")
    report.append(f"Facts: {total_summary['facts']}")
    report.append(f"Hallucinations: {total_summary['hallucinations']}")
    report.append(f"N/A: {total_summary['N/A']}")

    # Add detailed summaries for each combination
    for granularity in levels:
        for input_type in kinds:
            comb_key = f"{granularity}_{input_type}"
            report.append(f"\n\nDETAILED SUMMARY FOR {comb_key.upper()}")
            report.append("-" * 60)

            # Summary table header
            report.append("\nQuestion | Facts | Hallucinations | N/A")
            report.append("-" * 50)

            # Add data for each question
            for q_key in question_keys:
                q_data = combination_summaries[comb_key][q_key]
                report.append(
                    f"{q_key:8} | {q_data['fact']:5} | {q_data['hallucination']:13} | {q_data['N/A']:11}"
                )

    # Save the report
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(report))

    print(f"\nDetailed summary report has been saved to: {report_path}")

    return total_summary, combination_summaries

########## CODE EXECUTION ###################

# Run configuration: every (granularity, input type) pair below is processed
# by the driver loop and included in the summary report.
# The trailing comments list the full sets of values.
granularities = ["Paragraph"]         # ["Paragraph", "Sentence", "Claim"]
input_types =   ["title", "summary", "json"]  # ["title", "json", "summary"]

# Base paths. Each is later joined with "<granularity>_level" and a
# per-combination sub-directory name.
# base_path: root of the model answers (inference output) to be labelled.
base_path = "Database_dependent_evaluation/Clinical_trials/3-Inference/Model_Answer/Llama3.3-70B"
# output_base_path: root under which labelled files are written and from which
# the summary report reads them back.
# NOTE(review): this path already ends in "Paragraph_level" and the code below
# appends f"{granularity}_level" again - confirm the doubled segment is intended.
output_base_path = "Database_dependent_evaluation/Clinical_trials/4-Evaluation/Evaluation_factual/Llama3.3-70B/Paragraph_level"

# eval_vs_summary_base_path: root of the evaluation score files.
# NOTE(review): currently the same string as output_base_path.
eval_vs_summary_base_path = "Database_dependent_evaluation/Clinical_trials/4-Evaluation/Evaluation_factual/Llama3.3-70B/Paragraph_level"

# Driver: for every (granularity, input type) combination, label each model
# answer file with the verdicts derived from its evaluation file, then build
# the summary report over everything that was written.
for granularity in granularities:
    for input_type in input_types:
        # Directory holding the model answers for this combination
        inference_input_dir = os.path.join(
            base_path,
            f"{granularity}_level",
            f"Inference_{input_type}_{granularity}"
        )

        # Directory holding the matching evaluation score files
        eval_vs_summary_input_dir = os.path.join(
            eval_vs_summary_base_path,
            f"{granularity}_level",
            f"Evaluation_{input_type}_{granularity}"
        )

        # Directory receiving the labelled answer files
        output_dir = os.path.join(
            output_base_path,
            f"{granularity}_level",
            f"Inference_{input_type}_{granularity}"
        )
        os.makedirs(output_dir, exist_ok=True)

        print(f"\nProcessing: Granularity={granularity}, Input Type={input_type}")

        # Get all JSON files in this directory
        json_files = sorted(f for f in os.listdir(inference_input_dir) if f.endswith('.json'))
        print(f"Found {len(json_files)} JSON files in directory")

        # Process files; failures are collected so one bad file does not stop the run
        error_files = []
        for json_file in tqdm(json_files, desc=f"Processing {granularity}-{input_type}"):
            try:
                inference_file_path = os.path.join(inference_input_dir, json_file)
                eval_1_file_path = os.path.join(eval_vs_summary_input_dir, json_file)
                output_file_path = os.path.join(output_dir, json_file)

                # Check that both input files exist. The check must skip the
                # whole json_file, so the missing paths are gathered first and
                # `continue` is issued at this loop level rather than inside a
                # nested loop, where it would only advance that inner loop.
                missing_paths = [
                    path for path in (inference_file_path, eval_1_file_path)
                    if not os.path.exists(path)
                ]
                if missing_paths:
                    for missing_path in missing_paths:
                        error_msg = f"File not found: {missing_path}"
                        log_error(error_msg)
                        error_files.append((json_file, error_msg))
                    continue

                # Derive the per-question labels from the evaluation file
                results = get_scores(eval_1_file_path)

                if results is None:
                    error_msg = f"Error comparing scores for {json_file}"
                    log_error(error_msg)
                    error_files.append((json_file, error_msg))
                    continue

                # Write the answers together with their labels to the output directory
                add_evaluation_to_file(results, inference_file_path, output_file_path)

            except Exception as e:
                error_msg = f"Error processing {json_file}: {str(e)}"
                log_error(error_msg)
                error_files.append((json_file, error_msg))
                continue

        # Print summary of errors
        if error_files:
            print("\nFiles with errors:")
            for file_name, error in error_files:
                print(f"- {file_name}: {error}")

# Generate and print summary report
total_summary, combination_summaries = create_summary_report()