## Reduce each file to its first-token statistics and concatenate the results

In [1]:
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm

def get_fixed_column_names():
    """
    Build the canonical, ordered list of feature column names.

    The list contains one "model_<i>_<metric>_<stat>" entry for every model,
    metric and statistic, followed by one "kl_<i>_vs_<j>_<stat>" entry for
    every unordered pair of models and every statistic. Model 5 is left out
    of both groups. Every output file uses this same list, so all files share
    an identical column layout.

    Returns:
        list[str]: Column names in their fixed order.
    """
    # Model ids 1..6 with model 5 excluded.
    model_ids = [idx for idx in range(1, 7) if idx != 5]

    metric_names = (
        'gold_lp', 'rank', 'Entropy',
        'gold_lp_cumsum', 'gold_prob', 'gold_lp_normalized',
    )
    stat_names = (
        'median', 'max', 'min', 'std',
        'moment1', 'moment2', 'moment3', 'moment4', 'moment5',
        'q95', 'q90', 'q85', 'q80', 'q20', 'q15', 'q10', 'q05',
    )

    # Per-model metric columns: model-major, then metric, then statistic.
    per_model_columns = [
        f"model_{idx}_{metric}_{stat}"
        for idx in model_ids
        for metric in metric_names
        for stat in stat_names
    ]

    # KL columns: every pair (i, j) with i < j, in ascending order.
    kl_columns = [
        f"kl_{left}_vs_{right}_{stat}"
        for pos, left in enumerate(model_ids)
        for right in model_ids[pos + 1:]
        for stat in stat_names
    ]

    return per_model_columns + kl_columns

def get_label_from_json(trial_name, question_number, label_folder_path):
    """
    Look up the label of one question from the trial's JSON file.

    The file "<label_folder_path>/<trial_name>.json" is read and its 'label'
    entry is compared with question_number.

    Args:
        trial_name: Base name (without extension) of the JSON file.
        question_number: Question index to compare against the stored label.
        label_folder_path: Folder holding the JSON files; a falsy value
            disables the lookup.

    Returns:
        'fact' if the stored label equals question_number, 'hallucination'
        otherwise, or None when no folder was given or the file could not be
        read or parsed (the error is printed).
    """
    if not label_folder_path:
        return None

    label_file = os.path.join(label_folder_path, f"{trial_name}.json")

    try:
        with open(label_file, 'r') as handle:
            payload = json.load(handle)
        # The comparison stays inside the try so that any failure here is
        # reported the same way as a read or parse error.
        if payload.get('label') == question_number:
            return 'fact'
        return 'hallucination'
    except Exception as e:
        print(f"Error reading JSON for {trial_name}: {e}")
        return None

def extract_trial_and_question(file_path):
    """
    Extract trial name and question number from file path

    The question number is the integer between the last underscore and the
    first following dot of the file name (e.g. "logprob_matrix_3.csv" -> 3).
    The trial name is the name of the directory that directly contains the
    file.

    Args:
        file_path: Path of a CSV file, as a string or an os.PathLike object.

    Returns:
        tuple: (trial_name, question_num), or (None, None) when the path has
        no parent directory component, the file name does not end in an
        integer, or the argument is not a usable path. The error is printed
        in that case.
    """
    try:
        path_str = os.fspath(file_path)
        filename = os.path.basename(path_str)
        question_num = int(filename.split('_')[-1].split('.')[0])
        # Paths built with os.path.join / os.walk use the native separator,
        # which is not '/' on every platform; normalise it before splitting
        # so the parent directory is found regardless of the OS.
        trial_name = path_str.replace(os.sep, '/').split('/')[-2]
        return trial_name, question_num
    except (ValueError, IndexError, TypeError, AttributeError) as e:
        print(f"Error extracting trial and question from {file_path}: {e}")
        return None, None

def compute_statistics(series):
    """
    Map every statistic name to the first value of the series.

    No aggregation is performed: the first element of series is converted to
    float and reported under each of the 17 statistic names, so the returned
    keys match the statistic suffixes of the fixed column layout.

    Args:
        series: Object whose first element is reachable as series.iloc[0].

    Returns:
        dict: statistic name -> float first value, or statistic name -> NaN
        for every key when the first value cannot be read or converted (the
        error is printed).
    """
    stat_names = (
        'median', 'max', 'min', 'std',
        'moment1', 'moment2', 'moment3', 'moment4', 'moment5',
        'q95', 'q90', 'q85', 'q80', 'q20', 'q15', 'q10', 'q05',
    )

    try:
        leading_value = float(series.iloc[0])
    except Exception as e:
        print(f"Error getting first value: {e}")
        return dict.fromkeys(stat_names, np.nan)

    return dict.fromkeys(stat_names, leading_value)

def process_metrics_and_kl(input_folder, kl_folder, output_folder, label_folder_path=None):
    """
    Process original metrics and KL divergence terms with consistent column ordering

    Every CSV found under "<input_folder>/model_1" defines one sample. For
    each sample the file at the same relative path is read from every model
    folder and from kl_folder (first row only), the values are expanded into
    the fixed column layout, and a one-row CSV is written to the same
    relative path under output_folder. Columns with no available value stay
    NaN.

    Args:
        input_folder: Folder containing the "model_<i>" sub-folders.
        kl_folder: Folder with the KL CSVs, mirroring model_1's layout.
        output_folder: Destination root; sub-folders are created as needed.
        label_folder_path: Folder with "<trial_name>.json" label files. When
            falsy, the 'label' column is left as None.

    Returns:
        None. Results are written to disk; a failure on one file is printed
        and the remaining files are still processed.
    """
    # Everything below is identical for every file, so build it once.
    fixed_columns = get_fixed_column_names()
    columns_with_label = fixed_columns + ['label']
    metrics = ['gold_lp', 'rank', 'Entropy', 'gold_lp_cumsum', 'gold_prob', 'gold_lp_normalized']
    models = [f"model_{i}" for i in range(1, 7) if i != 5]

    # model_1 is the reference tree: its CSV files define the set of samples.
    csv_files = []
    model1_path = os.path.join(input_folder, "model_1")
    for root, dirs, files in os.walk(model1_path):
        # Prune hidden directories in place so os.walk does not descend into them.
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for file in files:
            if not file.startswith('.') and file.endswith('.csv'):
                csv_files.append((os.path.join(root, file), file))

    print(f"Found {len(csv_files)} CSV files to process")
    print(f"Output will have {len(fixed_columns) + 1} columns")

    for file_path1, file_name in tqdm(csv_files, desc="Processing files"):
        try:
            # Location of this sample relative to the model_1 root; the same
            # relative path is used in every other tree.
            rel_path = os.path.relpath(file_path1, model1_path)

            trial_name, question_num = extract_trial_and_question(file_path1)

            # Compare against None explicitly: a question number of 0 is a
            # valid value and must not be treated as "missing".
            if trial_name and question_num is not None and label_folder_path:
                label = get_label_from_json(trial_name, question_num, label_folder_path)
            else:
                label = None

            all_stats = {col: np.nan for col in fixed_columns}
            all_stats['label'] = label

            for model in models:
                file_path = os.path.join(input_folder, model, rel_path)
                if not os.path.exists(file_path):
                    print(f"Skipping {file_name} - no matching file in {model}")
                    continue

                df = pd.read_csv(file_path, nrows=1)  # Only read first row

                for metric in metrics:
                    if metric in df.columns:
                        stats = compute_statistics(df[metric])
                        for stat_name, value in stats.items():
                            col_name = f"{model}_{metric}_{stat_name}"
                            all_stats[col_name] = value

            kl_file_path = os.path.join(kl_folder, rel_path)
            if os.path.exists(kl_file_path):
                kl_df = pd.read_csv(kl_file_path, nrows=1)  # Only read first row

                kl_cols = [col for col in kl_df.columns if col.startswith('kl_')]

                # KL columns whose names are not part of the fixed layout are
                # stored here but dropped by the column selection below.
                for kl_col in kl_cols:
                    stats = compute_statistics(kl_df[kl_col])
                    for stat_name, value in stats.items():
                        col_name = f"{kl_col}_{stat_name}"
                        all_stats[col_name] = value

            # Selecting by the fixed list enforces both column set and order.
            result_df = pd.DataFrame([all_stats])[columns_with_label]

            output_file_path = os.path.join(output_folder, rel_path)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            result_df.to_csv(output_file_path, index=False)

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"\nError processing {file_name}:")
            print(f"Error type: {type(e)}")
            print(f"Error message: {str(e)}")
            continue

def concatenate_output_files(output_folder):
    """
    Read every CSV below output_folder and stack them into one DataFrame.

    Hidden files and hidden directories (names starting with '.') are
    skipped. Files are read in sorted order, and each frame gets two extra
    columns before stacking: 'source_file' (the file's base name) and
    'file_path' (its path relative to output_folder). Files that cannot be
    read are reported and skipped.

    Args:
        output_folder: Root folder to search recursively for CSV files.

    Returns:
        The concatenated DataFrame with a fresh integer index, or None when
        no file could be read.
    """
    csv_files = []
    for current_dir, subdirs, names in os.walk(output_folder):
        # In-place pruning keeps os.walk out of hidden directories.
        subdirs[:] = [sub for sub in subdirs if not sub.startswith('.')]
        csv_files.extend(
            (os.path.join(current_dir, name), name)
            for name in names
            if name.endswith('.csv') and not name.startswith('.')
        )

    print(f"Found {len(csv_files)} CSV files to concatenate")
    csv_files.sort()

    all_dfs = []
    for file_path, file_name in tqdm(csv_files, desc="Reading files"):
        try:
            frame = pd.read_csv(file_path)
            frame['source_file'] = file_name
            frame['file_path'] = os.path.relpath(file_path, output_folder)
            all_dfs.append(frame)
        except Exception as e:
            print(f"\nError reading {file_name}:")
            print(f"Error type: {type(e)}")
            print(f"Error message: {str(e)}")
            continue

    if not all_dfs:
        print("No files were successfully read!")
        return None

    final_df = pd.concat(all_dfs, ignore_index=True)
    print("\nFinal DataFrame info:")
    print(f"Shape: {final_df.shape}")
    print(f"Number of samples: {len(final_df)}")
    print(f"Number of features: {len(final_df.columns)}")

    if 'label' in final_df.columns:
        print("\nLabel distribution:")
        print(final_df['label'].value_counts())

    return final_df

# Script entry point: reduce every sample to a one-row CSV, then merge all of
# those CSVs into a single table and save it.
if __name__ == "__main__":
    # Paths
    # Root folder that holds the per-model sub-folders (model_1, model_2, ...).
    input_folder = "Database_free_evaluation/MedQA/1-Run_ensemble/Features/Test"
    # Folder with the KL-divergence CSVs, addressed by the same relative paths.
    kl_folder = "kl_analysis_Paragraph_title_features"
    # Destination for the per-sample one-row CSV files.
    output_folder = "Output_folder_Paragraph_title_first_token"
    # Folder searched for "<trial_name>.json" label files (same as input_folder here).
    label_folder_path = "Database_free_evaluation/MedQA/1-Run_ensemble/Features/Test"
    
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: write one reduced CSV per sample under output_folder.
    process_metrics_and_kl(input_folder, kl_folder, output_folder, label_folder_path)

    # Step 2: stack all reduced CSVs; the result is None when nothing could be read.
    final_df = concatenate_output_files(output_folder)
    
    if final_df is not None:
        final_df.to_csv("concatenated_results_Paragraph_title_MEDQA_TESTSET_first_token.csv", index=False)

Found 5092 CSV files to process
Output will have 681 columns


Processing files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5092/5092 [01:59<00:00, 42.61it/s]


Found 5092 CSV files to concatenate


Reading files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5092/5092 [00:36<00:00, 138.98it/s]



Final DataFrame info:
Shape: (5092, 683)
Number of samples: 5092
Number of features: 683

Label distribution:
label
hallucination    3819
fact             1273
Name: count, dtype: int64


In [2]:
final_df.head(20)

Unnamed: 0,model_1_gold_lp_median,model_1_gold_lp_max,model_1_gold_lp_min,model_1_gold_lp_std,model_1_gold_lp_moment1,model_1_gold_lp_moment2,model_1_gold_lp_moment3,model_1_gold_lp_moment4,model_1_gold_lp_moment5,model_1_gold_lp_q95,...,kl_4_vs_6_q90,kl_4_vs_6_q85,kl_4_vs_6_q80,kl_4_vs_6_q20,kl_4_vs_6_q15,kl_4_vs_6_q10,kl_4_vs_6_q05,label,source_file,file_path
0,-0.007472,-0.007472,-0.007472,-0.007472,-0.007472,-0.007472,-0.007472,-0.007472,-0.007472,-0.007472,...,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,hallucination,logprob_matrix_1.csv,NCT00001/logprob_matrix_1.csv
1,-9.929261,-9.929261,-9.929261,-9.929261,-9.929261,-9.929261,-9.929261,-9.929261,-9.929261,-9.929261,...,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,fact,logprob_matrix_2.csv,NCT00001/logprob_matrix_2.csv
2,-8.588479,-8.588479,-8.588479,-8.588479,-8.588479,-8.588479,-8.588479,-8.588479,-8.588479,-8.588479,...,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,hallucination,logprob_matrix_3.csv,NCT00001/logprob_matrix_3.csv
3,-13.147138,-13.147138,-13.147138,-13.147138,-13.147138,-13.147138,-13.147138,-13.147138,-13.147138,-13.147138,...,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,0.001155,hallucination,logprob_matrix_4.csv,NCT00001/logprob_matrix_4.csv
4,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,...,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,hallucination,logprob_matrix_1.csv,NCT00002/logprob_matrix_1.csv
5,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,-9.694468,...,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,hallucination,logprob_matrix_2.csv,NCT00002/logprob_matrix_2.csv
6,-3.258713,-3.258713,-3.258713,-3.258713,-3.258713,-3.258713,-3.258713,-3.258713,-3.258713,-3.258713,...,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,hallucination,logprob_matrix_3.csv,NCT00002/logprob_matrix_3.csv
7,-0.040836,-0.040836,-0.040836,-0.040836,-0.040836,-0.040836,-0.040836,-0.040836,-0.040836,-0.040836,...,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,0.192503,fact,logprob_matrix_4.csv,NCT00002/logprob_matrix_4.csv
8,-7.302943,-7.302943,-7.302943,-7.302943,-7.302943,-7.302943,-7.302943,-7.302943,-7.302943,-7.302943,...,0.00035,0.00035,0.00035,0.00035,0.00035,0.00035,0.00035,hallucination,logprob_matrix_1.csv,NCT00003/logprob_matrix_1.csv
9,-0.330877,-0.330877,-0.330877,-0.330877,-0.330877,-0.330877,-0.330877,-0.330877,-0.330877,-0.330877,...,0.00035,0.00035,0.00035,0.00035,0.00035,0.00035,0.00035,fact,logprob_matrix_2.csv,NCT00003/logprob_matrix_2.csv
