In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from itertools import combinations

def _log_probs_to_distribution(log_probs):
    """Convert a vector of log-probabilities into a normalized distribution."""
    log_probs = np.asarray(log_probs, dtype=float)
    if log_probs.size == 0:
        # Nothing to normalize; keeps the empty-input result at 0.0.
        return log_probs

    # Subtracting the maximum before exponentiating leaves the normalized
    # result mathematically unchanged, but prevents every entry from
    # underflowing to 0 (which would turn the normalization into 0/0 = NaN)
    # when the log-probabilities are very negative.
    weights = np.exp(log_probs - np.max(log_probs))
    return weights / np.sum(weights)


def compute_kl_divergence(p, q):
    """
    Compute the KL divergence KL(p || q) between two distributions given as
    log-probabilities.

    Both inputs are exponentiated and normalized to sum to 1, smoothed with a
    small epsilon so that no entry is exactly zero, renormalized, and then
    compared with sum(p * log(p / q)).

    Parameters
    ----------
    p, q : array-like
        Log-probabilities (not necessarily normalized) of the same length.
        Anything accepted by ``np.asarray(..., dtype=float)`` works.

    Returns
    -------
    numpy.float64
        The KL divergence of the smoothed distributions.
    """
    # Small epsilon to avoid log(0) and division by zero
    epsilon = 1e-12

    p = _log_probs_to_distribution(p) + epsilon
    q = _log_probs_to_distribution(q) + epsilon

    # Renormalize after adding epsilon
    p = p / np.sum(p)
    q = q / np.sum(q)

    return np.sum(p * np.log(p / q))

def compare_all_models_distributions(base_path, output_path):
    """
    Compare distributions between all possible model pairs (excluding model_5).

    Walks ``<base_path>/model_1`` for CSV files. For every file that also
    exists at the same relative path under each other model directory, and
    whose row count matches across all models, computes a row-wise KL
    divergence over the columns ``lp_1`` .. ``lp_50`` for every model pair.
    The result is model_1's table with one extra ``kl_<a>_vs_<b>`` column per
    pair, written to the same relative path under ``output_path``.

    Files that are missing for some model, have mismatched lengths, or raise
    an error while being processed are reported on stdout and skipped.

    Parameters
    ----------
    base_path : str
        Directory containing the ``model_<i>`` sub-directories.
    output_path : str
        Directory under which the augmented CSV files are written.
    """
    # Define models excluding model_5
    models = [f"model_{i}" for i in range(1, 7) if i != 5]  # model_1,2,3,4,6

    # Get all possible model pairs
    model_pairs = list(combinations(models, 2))

    print(f"Processing models: {models}")
    print(f"Will compute KL divergence for {len(model_pairs)} model pairs:")
    for pair in model_pairs:
        print(f"  {pair[0]} vs {pair[1]}")

    # Collect all CSV files from the model_1 directory
    model1_path = os.path.join(base_path, "model_1")
    csv_files = []
    for root, dirs, files in os.walk(model1_path):
        # Prune hidden directories in place so os.walk does not descend into them
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        csv_files.extend(
            (os.path.join(root, file), file)
            for file in files
            if not file.startswith('.') and file.endswith('.csv')
        )

    print(f"\nFound {len(csv_files)} CSV files to process")
    # Sort files alphabetically
    csv_files.sort()

    # Probability columns are the same for every file, so build the list once
    prob_cols = [f'lp_{i}' for i in range(1, 51)]

    for file_path1, file_name in tqdm(csv_files, desc="Processing files"):
        try:
            relative_path = os.path.relpath(file_path1, model1_path)

            # Load the matching DataFrame of every model
            model_dfs = {}
            for model in models:
                file_path = os.path.join(base_path, model, relative_path)
                if not os.path.exists(file_path):
                    print(f"Skipping {file_name} - no matching file in {model}")
                    break
                model_dfs[model] = pd.read_csv(file_path)

            # The loop above stops early when a model lacks this file
            if len(model_dfs) != len(models):
                continue

            # Verify all DataFrames have the same length
            lengths = {model: len(df) for model, df in model_dfs.items()}
            if len(set(lengths.values())) > 1:
                print(f"Skipping {file_name} - mismatched lengths: {lengths}")
                continue

            # Base DataFrame for output (use model_1's DataFrame)
            output_df = model_dfs["model_1"].copy()

            # Extract each model's log-probability matrix once instead of
            # doing a label lookup per row for every pair.
            prob_matrices = {
                model: df[prob_cols].to_numpy(dtype=float)
                for model, df in model_dfs.items()
            }

            # Compute the row-wise KL divergence for each pair
            for model1, model2 in model_pairs:
                kl_divergences = [
                    compute_kl_divergence(p, q)
                    for p, q in zip(prob_matrices[model1], prob_matrices[model2])
                ]

                # Name the column after the model numbers, e.g. kl_1_vs_2.
                # Taking the suffix after the last underscore (rather than the
                # last character) keeps names unique for multi-digit models.
                suffix1 = model1.rsplit('_', 1)[-1]
                suffix2 = model2.rsplit('_', 1)[-1]
                output_df[f'kl_{suffix1}_vs_{suffix2}'] = kl_divergences

            # Create output directory structure
            output_file_path = os.path.join(output_path, relative_path)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            # Save updated CSV
            output_df.to_csv(output_file_path, index=False)

        except Exception as e:
            # Best-effort batch job: report the failure and move on to the next file
            print(f"\nError processing {file_name}:")
            print(f"Error type: {type(e)}")
            print(f"Error message: {str(e)}")
            continue

if __name__ == "__main__":
    # Input root: contains the model_1 to model_6 directories
    base_path = "Database_free_evaluation/MedQA/1-Run_ensemble/Features/Train"
    # Destination root for the generated CSV files
    output_path = "kl_analysis_Paragraph_title_features/"

    # Launch the pairwise comparison
    compare_all_models_distributions(base_path=base_path, output_path=output_path)

Processing models: ['model_1', 'model_2', 'model_3', 'model_4', 'model_6']
Will compute KL divergence for 10 model pairs:
  model_1 vs model_2
  model_1 vs model_3
  model_1 vs model_4
  model_1 vs model_6
  model_2 vs model_3
  model_2 vs model_4
  model_2 vs model_6
  model_3 vs model_4
  model_3 vs model_6
  model_4 vs model_6

Found 40712 CSV files to process


Processing files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40712/40712 [31:51<00:00, 21.30it/s]
