## Reduce each feature file to summary statistics and concatenate the results

In [1]:
import pandas as pd
import numpy as np
import os
import json
from tqdm import tqdm

def get_fixed_column_names():
    """
    Build the canonical, ordered list of feature column names.

    The list holds one name per (model, metric, statistic) combination,
    followed by one name per (model pair, statistic) combination for the
    KL divergence terms. Model 5 is left out of both groups.

    Returns:
        list[str]: 680 unique column names, always in the same order.
    """
    # Model ids 1..6 with model 5 skipped
    model_ids = [idx for idx in range(1, 7) if idx != 5]

    metric_names = ('gold_lp', 'rank', 'Entropy', 'gold_lp_cumsum', 'gold_prob', 'gold_lp_normalized')

    stat_names = (
        'median', 'max', 'min', 'std',
        'moment1', 'moment2', 'moment3', 'moment4', 'moment5',
        'q95', 'q90', 'q85', 'q80', 'q20', 'q15', 'q10', 'q05',
    )

    # Per-model columns, ordered model -> metric -> statistic
    metric_columns = [
        f"model_{model_id}_{metric}_{stat}"
        for model_id in model_ids
        for metric in metric_names
        for stat in stat_names
    ]

    # KL columns for every unordered model pair (first id < second id)
    kl_columns = [
        f"kl_{first}_vs_{second}_{stat}"
        for position, first in enumerate(model_ids)
        for second in model_ids[position + 1:]
        for stat in stat_names
    ]

    return metric_columns + kl_columns

def get_label_from_json(trial_name, question_number, label_folder_path):
    """
    Look up the label stored in a trial's JSON file.

    The file is read from ``<label_folder_path>/<trial_name>.json`` and the
    label is taken from its top-level ``'label'`` key.

    Args:
        trial_name: Name of the trial; used as the JSON file's base name.
        question_number: Kept for interface compatibility; the lookup does
            not use it.
        label_folder_path: Folder containing the JSON files. A falsy value
            (``None`` or ``""``) disables the lookup.

    Returns:
        The value of the ``'label'`` key, or ``None`` when the lookup is
        disabled, the file cannot be read or parsed, the top-level JSON
        value is not an object, or the key is absent.
    """
    if not label_folder_path:
        return None

    json_path = os.path.join(label_folder_path, f"{trial_name}.json")

    try:
        # Read as UTF-8 explicitly instead of relying on the platform locale.
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing or unreadable file. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading JSON for {trial_name}: {e}")
        return None

    if not isinstance(data, dict):
        # A list/scalar at the top level has no 'label' key to look up.
        print(f"Error reading JSON for {trial_name}: expected a JSON object, got {type(data).__name__}")
        return None

    return data.get('label')

def extract_trial_and_question(file_path):
    """
    Split a feature-file path into its trial name and question number.

    The trial name is the name of the file's parent directory. The question
    number is the integer found after the last underscore of the file name
    and before the first dot that follows it, e.g.
    ``.../NCT00001/logprob_matrix_3.csv`` gives ``("NCT00001", 3)``.

    Args:
        file_path: Path to the file (string or path-like object).

    Returns:
        tuple: ``(trial_name, question_num)``. ``(None, None)`` is returned,
        and a message printed, when the file name carries no integer
        suffix, the path has no parent directory, or the argument is not a
        path.
    """
    try:
        # os.path honours the platform separator; splitting on a literal '/'
        # fails for the backslash-separated paths os.walk yields on Windows.
        parent_dir, filename = os.path.split(file_path)
        question_num = int(filename.split('_')[-1].split('.')[0])
        if not parent_dir:
            raise ValueError("path has no parent directory to take the trial name from")
        trial_name = os.path.basename(parent_dir)
        return trial_name, question_num
    except (TypeError, ValueError) as e:
        print(f"Error extracting trial and question from {file_path}: {e}")
        return None, None

def compute_statistics(series):
    """
    Summarise a numeric pandas Series with a fixed set of statistics.

    Args:
        series: pandas Series of numeric values.

    Returns:
        dict: keys, in order, 'median', 'max', 'min', 'std',
        'moment1'..'moment5', 'q95', 'q90', 'q85', 'q80', 'q20', 'q15',
        'q10', 'q05'. 'moment1' holds the plain mean; 'moment2'..'moment5'
        are the means of the deviations from that mean raised to the given
        power.
    """
    center = series.mean()
    deviation = series - center

    summary = {
        'median': series.median(),
        'max': series.max(),
        'min': series.min(),
        'std': series.std(),
        # Stored under 'moment1' even though it is the raw mean
        'moment1': center,
    }

    # Central moments of order 2 to 5
    for order in range(2, 6):
        summary[f'moment{order}'] = (deviation ** order).mean()

    # Upper- and lower-tail quantiles
    quantile_levels = (
        ('q95', 0.95),
        ('q90', 0.90),
        ('q85', 0.85),
        ('q80', 0.80),
        ('q20', 0.20),
        ('q15', 0.15),
        ('q10', 0.10),
        ('q05', 0.05),
    )
    for key, level in quantile_levels:
        summary[key] = series.quantile(level)

    return summary

def process_metrics_and_kl(input_folder, kl_folder, output_folder, label_folder_path=None):
    """
    Reduce per-file metric and KL-divergence tables to one-row summaries.

    Every CSV found under ``<input_folder>/model_1`` (hidden files and
    directories are skipped) defines one sample. For each sample the file
    at the same relative path is read for every model (model_1..model_6
    except model_5) and from ``kl_folder``; each metric column and each
    ``kl_*`` column is reduced with ``compute_statistics``; and a one-row
    CSV with the fixed column order plus ``label`` is written to the same
    relative path under ``output_folder``.

    Args:
        input_folder: Folder holding one sub-folder per model.
        kl_folder: Folder holding the KL-divergence CSV files.
        output_folder: Folder that receives the one-row summary CSV files.
        label_folder_path: Folder with ``<trial_name>.json`` label files, or
            ``None`` to leave the ``label`` column empty.

    Returns:
        None. Results are written to disk.

    Notes:
        A model or KL file that is missing leaves its columns as NaN. Any
        exception raised while handling one sample is printed and that
        sample is skipped, so one bad file does not stop the run.
    """
    # Fixed column layout shared by every output file
    fixed_columns = get_fixed_column_names()
    columns_with_label = fixed_columns + ['label']

    # Original metrics to analyze
    metrics = ['gold_lp', 'rank', 'Entropy', 'gold_lp_cumsum', 'gold_prob', 'gold_lp_normalized']

    # Models to process (excluding model_5)
    models = [f"model_{i}" for i in range(1, 7) if i != 5]

    # The model_1 tree is the reference list of samples
    csv_files = []
    model1_path = os.path.join(input_folder, "model_1")
    for root, dirs, files in os.walk(model1_path):
        # Prune hidden directories in place so os.walk does not descend into them
        dirs[:] = [d for d in dirs if not d.startswith('.')]

        for file in files:
            if not file.startswith('.') and file.endswith('.csv'):
                csv_files.append((os.path.join(root, file), file))

    print(f"Found {len(csv_files)} CSV files to process")
    print(f"Output will have {len(fixed_columns) + 1} columns")  # +1 for label

    for file_path1, file_name in tqdm(csv_files, desc="Processing files"):
        try:
            # Get trial name and question number
            trial_name, question_num = extract_trial_and_question(file_path1)

            # Compare against None rather than relying on truthiness, so a
            # question numbered 0 still gets its label.
            if trial_name and question_num is not None and label_folder_path:
                label = get_label_from_json(trial_name, question_num, label_folder_path)
            else:
                label = None

            # Start from NaN so columns without data stay present in the output
            all_stats = {col: np.nan for col in fixed_columns}
            all_stats['label'] = label

            # Location of this sample relative to the model_1 tree; the same
            # relative path is used for the other models, the KL folder and
            # the output folder.
            rel_path = os.path.relpath(file_path1, model1_path)

            # Step 1: Process original metrics for each model
            for model in models:
                file_path = os.path.join(input_folder, model, rel_path)
                if not os.path.exists(file_path):
                    print(f"Skipping {file_name} - no matching file in {model}")
                    continue

                df = pd.read_csv(file_path)

                # Compute statistics for each metric present in the file
                for metric in metrics:
                    if metric in df.columns:
                        stats = compute_statistics(df[metric])
                        for stat_name, value in stats.items():
                            all_stats[f"{model}_{metric}_{stat_name}"] = value

            # Step 2: Process KL divergence terms
            kl_file_path = os.path.join(kl_folder, rel_path)
            if os.path.exists(kl_file_path):
                kl_df = pd.read_csv(kl_file_path)

                # Every column whose name starts with 'kl_' is summarised;
                # names outside the fixed layout are dropped by the column
                # selection below.
                kl_cols = [col for col in kl_df.columns if col.startswith('kl_')]

                for kl_col in kl_cols:
                    stats = compute_statistics(kl_df[kl_col])
                    for stat_name, value in stats.items():
                        all_stats[f"{kl_col}_{stat_name}"] = value

            # One-row DataFrame restricted to the fixed column order plus label
            result_df = pd.DataFrame([all_stats])[columns_with_label]

            # Mirror the input directory structure under output_folder
            output_file_path = os.path.join(output_folder, rel_path)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            # Save results
            result_df.to_csv(output_file_path, index=False)

        except Exception as e:
            # Per-file boundary: report the failure and move on to the next file
            print(f"\nError processing {file_name}:")
            print(f"Error type: {type(e)}")
            print(f"Error message: {str(e)}")
            continue

def concatenate_output_files(output_folder):
    """
    Merge every CSV file below ``output_folder`` into a single DataFrame.

    Hidden files and hidden directories are ignored. Files are read in
    sorted order of their full path, and each file's rows are tagged with
    two extra columns: ``source_file`` (the file name) and ``file_path``
    (the path relative to ``output_folder``).

    Args:
        output_folder: Root folder to search recursively.

    Returns:
        The concatenated DataFrame, or ``None`` when no file could be read.
        A file that fails to load is reported and skipped.
    """
    discovered = []
    for current_dir, subdirs, filenames in os.walk(output_folder):
        # Prune hidden directories in place so os.walk does not descend into them
        subdirs[:] = [name for name in subdirs if not name.startswith('.')]

        discovered.extend(
            (os.path.join(current_dir, name), name)
            for name in filenames
            if name.endswith('.csv') and not name.startswith('.')
        )

    print(f"Found {len(discovered)} CSV files to concatenate")
    # Alphabetical order by full path
    discovered.sort()

    frames = []
    for csv_path, csv_name in tqdm(discovered, desc="Reading files"):
        try:
            frame = pd.read_csv(csv_path)

            # Provenance columns for tracking where each row came from
            frame['source_file'] = csv_name
            frame['file_path'] = os.path.relpath(csv_path, output_folder)
        except Exception as err:
            print(f"\nError reading {csv_name}:")
            print(f"Error type: {type(err)}")
            print(f"Error message: {str(err)}")
        else:
            frames.append(frame)

    if not frames:
        print("No files were successfully read!")
        return None

    final_df = pd.concat(frames, ignore_index=True)

    # Short summary of the merged table
    print("\nFinal DataFrame info:")
    print(f"Shape: {final_df.shape}")
    print(f"Number of samples: {len(final_df)}")
    print(f"Number of features: {len(final_df.columns)}")

    if 'label' in final_df.columns:
        print("\nLabel distribution:")
        print(final_df['label'].value_counts())

    return final_df

if __name__ == "__main__":
    # Entry point: write one summary CSV per input file, then merge all of
    # them into a single table and save it next to the notebook.

    # Paths
    # input_folder is searched for a "model_1" sub-folder (plus the other model folders)
    input_folder = "Database_free_evaluation/HealthBench/Features/Paragraph_hallucination"
    kl_folder = "kl_analysis_Paragraph_hallucination"
    output_folder = "Output_folder_Paragraph_hallucination"

    # Label folder path (set to None if not using labels)
    label_folder_path = "Database_free_evaluation/HealthBench/Features/Paragraph_hallucination"
    # To run without labels, use instead:
    # label_folder_path = None
    
    # Create output directory if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    # Step 1: Process files and create individual outputs
    process_metrics_and_kl(input_folder, kl_folder, output_folder, label_folder_path)

    # Step 2: Concatenate all output files (returns None if nothing could be read)
    final_df = concatenate_output_files(output_folder)
    
    # Optional: Save concatenated DataFrame
    if final_df is not None:
        final_df.to_csv("concatenated_results_Paragraph_hallucination_HEALTHBENCH.csv", index=False)

Found 2716 CSV files to process
Output will have 681 columns


Processing files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2716/2716 [05:39<00:00,  7.99it/s]


Found 2716 CSV files to concatenate


Reading files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2716/2716 [00:20<00:00, 132.10it/s]



Final DataFrame info:
Shape: (2716, 683)
Number of samples: 2716
Number of features: 683

Label distribution:
label
concensus    2197
hard          519
Name: count, dtype: int64


In [2]:
# Preview the first 20 rows of the concatenated table
final_df.head(20)

Unnamed: 0,model_1_gold_lp_median,model_1_gold_lp_max,model_1_gold_lp_min,model_1_gold_lp_std,model_1_gold_lp_moment1,model_1_gold_lp_moment2,model_1_gold_lp_moment3,model_1_gold_lp_moment4,model_1_gold_lp_moment5,model_1_gold_lp_q95,...,kl_4_vs_6_q90,kl_4_vs_6_q85,kl_4_vs_6_q80,kl_4_vs_6_q20,kl_4_vs_6_q15,kl_4_vs_6_q10,kl_4_vs_6_q05,label,source_file,file_path
0,-0.237119,0.0,-14.250458,1.965077,-1.056251,3.85531,-23.955418,220.303265,-2157.758141,-4.76837e-07,...,0.358536,0.24901,0.200951,0.000531,0.000166,6.8e-05,1.4e-05,hard,logprob_matrix_1.csv,NCT00001/logprob_matrix_1.csv
1,-0.382158,0.0,-13.79038,2.332232,-1.443241,5.426354,-30.964518,273.17892,-2453.615018,-4.708766e-07,...,0.457574,0.351031,0.246409,0.004891,0.001675,0.000445,8.3e-05,hard,logprob_matrix_1.csv,NCT00002/logprob_matrix_1.csv
2,-0.702453,-1.192093e-07,-16.410725,2.683826,-1.758912,7.183966,-45.847645,445.454114,-4458.254837,-0.0001604607,...,0.279393,0.237424,0.201693,0.017044,0.008007,0.002565,0.000475,hard,logprob_matrix_1.csv,NCT00004/logprob_matrix_1.csv
3,-1.172437,0.0,-10.500189,2.740999,-2.261026,7.468617,-29.488776,240.525298,-1659.260922,-1.523483e-05,...,0.407856,0.328881,0.299065,0.035432,0.009065,0.001908,0.000303,hard,logprob_matrix_1.csv,NCT00005/logprob_matrix_1.csv
4,-0.143004,-8.344647e-07,-14.628739,2.353165,-1.281366,5.525627,-33.885256,305.516095,-2852.272773,-3.868267e-05,...,0.31188,0.233264,0.180044,0.002432,0.000953,0.000315,7.4e-05,hard,logprob_matrix_1.csv,NCT00007/logprob_matrix_1.csv
5,-0.367616,0.0,-18.127249,2.732081,-1.634488,7.446945,-48.276527,490.034499,-5383.350356,-7.927386e-06,...,0.429294,0.350966,0.286266,0.009194,0.003093,0.001345,0.000159,hard,logprob_matrix_1.csv,NCT00010/logprob_matrix_1.csv
6,-0.407537,0.0,-10.0,2.518676,-1.590172,6.320826,-32.306112,261.406203,-2040.148718,0.0,...,0.415294,0.314335,0.236265,0.000832,0.000404,0.000149,3.1e-05,hard,logprob_matrix_1.csv,NCT00011/logprob_matrix_1.csv
7,-0.402228,0.0,-16.687504,2.523332,-1.497952,6.357629,-40.516316,393.099953,-3994.961942,-4.76837e-07,...,0.414824,0.332008,0.281975,0.002459,0.001077,0.000436,8.6e-05,hard,logprob_matrix_1.csv,NCT00012/logprob_matrix_1.csv
8,-0.326601,-3.576272e-06,-10.938822,2.299073,-1.348808,5.268738,-30.436774,251.415623,-2067.443443,-0.0006081341,...,0.31913,0.255133,0.218301,0.009795,0.005935,0.0025,0.000554,hard,logprob_matrix_1.csv,NCT00013/logprob_matrix_1.csv
9,-0.656569,0.0,-15.686972,2.431109,-1.573241,5.892272,-37.04695,378.120248,-4066.327061,-5.465731e-06,...,0.377684,0.262506,0.218452,0.004078,0.001554,0.000692,0.000322,hard,logprob_matrix_1.csv,NCT00025/logprob_matrix_1.csv


## Gold-logprob grouped by NCT study

In [30]:
# Tag every row with the NCT id found in its file path, then keep, for each
# NCT id, the single row with the largest model_1 mean gold log-prob.
# NOTE: df_result is the same object as final_df, so the new column is added to final_df too.
df_result = final_df
df_result['nct_number'] = df_result['file_path'].str.extract(r'(NCT\d+)')

# Index label of the best row within each NCT group
best_row_index = df_result.groupby('nct_number')['model_1_gold_lp_moment1'].idxmax()

selected_df = df_result.loc[best_row_index].reset_index(drop=True)
selected_df

Unnamed: 0,model_1_gold_lp_median,model_1_gold_lp_max,model_1_gold_lp_min,model_1_gold_lp_std,model_1_gold_lp_moment1,model_1_gold_lp_moment2,model_1_gold_lp_moment3,model_1_gold_lp_moment4,model_1_gold_lp_moment5,model_1_gold_lp_q95,...,kl_4_vs_6_q85,kl_4_vs_6_q80,kl_4_vs_6_q20,kl_4_vs_6_q15,kl_4_vs_6_q10,kl_4_vs_6_q05,label,source_file,file_path,nct_number
0,-0.000019,0.000000e+00,-6.800458,1.697467,-0.437432,2.701308,-16.027784,102.487482,-651.912252,0.000000e+00,...,0.001029,0.000649,0.000002,7.912583e-07,4.297463e-07,2.674032e-07,hallucination,logprob_matrix_1.csv,NCT00001/logprob_matrix_1.csv,NCT00001
1,-0.001007,-1.645075e-05,-9.937939,3.752193,-1.428837,12.067675,-85.568573,752.392199,-6367.735112,-1.684413e-05,...,0.157562,0.123565,0.000005,3.429367e-06,2.779497e-06,2.129628e-06,fact,logprob_matrix_4.csv,NCT00002/logprob_matrix_4.csv,NCT00002
2,-0.000008,0.000000e+00,-12.219146,3.260488,-1.109728,9.922062,-92.305538,1023.135020,-11300.067677,0.000000e+00,...,0.004237,0.002132,0.000003,5.863995e-07,2.377398e-07,1.528450e-07,hallucination,logprob_matrix_3.csv,NCT00003/logprob_matrix_3.csv,NCT00003
3,-0.000008,0.000000e+00,-3.837925,0.843255,-0.219459,0.677218,-2.252348,8.168240,-29.540368,0.000000e+00,...,0.034336,0.015903,0.000005,2.003963e-06,1.785156e-06,2.490747e-07,hallucination,logprob_matrix_2.csv,NCT00004/logprob_matrix_2.csv,NCT00004
4,-0.000743,0.000000e+00,-7.633425,2.874610,-1.115848,7.082901,-38.442850,258.992420,-1678.758294,-3.576278e-08,...,0.055925,0.044902,0.000023,1.393937e-05,9.511742e-06,5.084118e-06,fact,logprob_matrix_2.csv,NCT00005/logprob_matrix_2.csv,NCT00005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268,-0.000049,-2.384185e-07,-10.302875,2.965274,-1.031048,8.060115,-65.602955,616.761655,-5709.254193,-2.384185e-07,...,0.416146,0.268766,0.000001,8.022104e-07,7.839844e-07,6.001128e-07,fact,logprob_matrix_4.csv,NCT01269/logprob_matrix_4.csv,NCT01269
1269,-0.080891,-9.858122e-05,-2.052711,1.078576,-0.814305,0.930661,-0.368046,1.020148,-0.753044,-4.176092e-04,...,0.185708,0.104739,0.000121,1.020453e-04,8.274469e-05,6.344407e-05,fact,logprob_matrix_4.csv,NCT01270/logprob_matrix_4.csv,NCT01270
1270,-0.135590,-8.344647e-07,-5.784778,2.490962,-1.408896,4.963914,-15.222602,75.433766,-318.002960,-1.032090e-04,...,0.327464,0.236849,0.001694,1.272173e-03,8.499063e-04,4.276398e-04,fact,logprob_matrix_2.csv,NCT01271/logprob_matrix_2.csv,NCT01271
1271,-0.000219,-5.960463e-07,-8.013097,2.813409,-1.062843,6.925861,-41.045082,292.651805,-2026.257809,-2.264971e-06,...,0.005158,0.003996,0.000012,7.046992e-06,4.559846e-06,2.447378e-06,fact,logprob_matrix_3.csv,NCT01272/logprob_matrix_3.csv,NCT01272


In [31]:
# Share of each label among the rows kept in selected_df (proportions sum to 1)
print(selected_df['label'].value_counts(normalize=True))

label
fact             0.709348
hallucination    0.290652
Name: proportion, dtype: float64


In [7]:
import numpy as np
# Scratch check: arithmetic mean of a five-value example vector
row = [-6.88, -100, -11.85, -100, -100]
mean= np.mean(row)
mean

-63.746

In [8]:
# Standard deviation of the same vector; np.std uses ddof=0 (population form) by default
std = np.std(row)
std

44.4297069988088

In [9]:
# z-score of the value -6.88 (the first entry of `row`) using the mean and std from the cells above
(-6.88 - mean) / std

1.2799094083948073

In [6]:
import numpy as np

# Example vector; the third entry is -11.85 (decimal point, not a comma)
row = [-6.88, -100, -11.85, -100, -100]

# Mean and standard deviation (NumPy default, ddof=0) of the whole vector
values = np.asarray(row)
mean = values.mean()
std = values.std()

print(f"Mean: {mean}")
print(f"Std: {std}")

# z-score of the first component relative to the vector's own mean and std
centered_first = row[0] - mean
normalized_value = centered_first / std

print(f"\nOriginal value: {row[0]}")
print(f"Normalized value: {normalized_value}")

Mean: -63.746
Std: 44.4297069988088

Original value: -6.88
Normalized value: 1.2799094083948073
