In [1]:
import pandas as pd
import numpy as np
import math
import os
from tqdm import tqdm

def compute_entropy_from_logprobs(row, lp_cols):
    """
    Compute the entropy (in nats) of the distribution obtained by
    renormalizing the log probabilities stored in ``row[lp_cols]``
    (e.g. columns lp_1..lp_50).

    Args:
        row: a pandas Series (one DataFrame row); anything supporting
            ``row[lp_cols].values`` works.
        lp_cols: list of labels in ``row`` that hold log probabilities.

    Returns:
        The entropy ``-sum(p * log(p + 1e-12))`` of the renormalized
        probabilities. Returns 0.0 when ``lp_cols`` is empty or when every
        log probability is ``-inf`` (no probability mass to normalize).
        NaN inputs propagate to a NaN result.
    """
    logps = row[lp_cols].values.astype(float)

    # No entries at all: there is nothing to normalize.
    if logps.size == 0:
        return 0.0

    max_lp = np.max(logps)

    # Every entry is log(0): the total mass is zero, so report zero entropy.
    if max_lp == -np.inf:
        return 0.0

    # Shift by the maximum before exponentiating. The shift cancels in the
    # normalization below, but it keeps exp() from underflowing to all zeros
    # when every log probability is very negative (which previously made the
    # function report an entropy of 0.0 for such rows).
    ps = np.exp(logps - max_lp)
    p_norm = ps / np.sum(ps)

    # epsilon keeps log() finite for entries whose probability is exactly 0.
    epsilon = 1e-12
    entropy = -np.sum(p_norm * np.log(p_norm + epsilon))

    return entropy

def get_logprob_stats_across_models(csv_path, model_nums=(1, 2, 3, 4, 6)):
    """
    Get the per-row mean and std of gold_lp across the sibling model folders.

    The path is expected to look like
    ".../Paragraph_title_features/model_1/NCT00001959/logprob_matrix_1.csv".
    The first path component starting with "model_" is replaced by
    "model_<n>" for every n in ``model_nums``; each resulting file that
    exists is read and its 'gold_lp' column is collected.

    Args:
        csv_path: '/'-separated path to one model's CSV file.
        model_nums: model numbers whose folders are consulted. Defaults to
            the set that was previously hard-coded: 1, 2, 3, 4, 6.

    Returns:
        (means, stds): two numpy arrays with one entry per row of the first
        file found. std is the population standard deviation (numpy default,
        ddof=0). Rows beyond the first file's length in other files are
        ignored. If files exist but none has a 'gold_lp' column, both arrays
        are filled with NaN.
        (None, None) if no model file exists or if any error occurs
        (unparseable path, unreadable CSV, a file with fewer rows than the
        first one, ...); the error is printed, not raised.
    """
    try:
        # Locate the "model_*" component; everything before it is the base path.
        parts = csv_path.split('/')
        model_idx = next(
            (i for i, part in enumerate(parts) if part.startswith('model_')),
            None,
        )
        if model_idx is None:
            raise ValueError("no 'model_*' component in path")

        base_path = '/'.join(parts[:model_idx])
        trial_folder = parts[model_idx + 1]
        filename = parts[-1]

        n_rows = None        # row count of the first file found
        gold_columns = []    # one float array of length n_rows per model

        for model_num in model_nums:
            model_path = os.path.join(base_path, f"model_{model_num}", trial_folder, filename)
            if not os.path.exists(model_path):
                continue

            df = pd.read_csv(model_path)
            if n_rows is None:
                n_rows = df.shape[0]

            # Files without the column contribute nothing to the statistics.
            if 'gold_lp' not in df.columns:
                continue

            values = df['gold_lp'].to_numpy(dtype=float)
            if values.shape[0] < n_rows:
                raise ValueError(
                    f"{model_path} has {values.shape[0]} rows, expected at least {n_rows}"
                )
            gold_columns.append(values[:n_rows])

        if n_rows is None:
            print(f"No model files found for {csv_path}")  # Debug print
            return None, None

        # No usable column anywhere: statistics are undefined for every row.
        if not gold_columns:
            return np.full(n_rows, np.nan), np.full(n_rows, np.nan)

        # Shape (n_models, n_rows); reduce over the model axis in one pass
        # instead of indexing each DataFrame row by row.
        stacked = np.vstack(gold_columns)
        return stacked.mean(axis=0), stacked.std(axis=0)

    except Exception as e:
        # Best-effort helper: callers treat (None, None) as "stats unavailable".
        print(f"Error getting logprob stats for {csv_path}: {e}")
        print(f"Full path: {csv_path}")  # Debug print
        return None, None

def add_columns(csv_file):
    """
    Make sure a CSV file carries the derived columns Entropy, gold_prob,
    gold_lp_cumsum and gold_lp_normalized, overwriting the original file
    when at least one of them had to be created.

    Returns True when the file was handled (changed or already complete)
    and False when any exception occurred; the error is printed.
    """
    eps = 1e-12  # guards log(0) and division by a zero std

    try:
        frame = pd.read_csv(csv_file)

        # Decide up front which derived columns are absent.
        derived = ('Entropy', 'gold_prob', 'gold_lp_cumsum', 'gold_lp_normalized')
        missing = [name for name in derived if name not in frame.columns]

        if 'Entropy' in missing:
            # Row-wise entropy over the lp_1..lp_50 columns.
            logprob_names = [f"lp_{k}" for k in range(1, 51)]
            frame["Entropy"] = frame.apply(
                lambda record: compute_entropy_from_logprobs(record, logprob_names),
                axis=1,
            )

        # Both columns are (re)computed together if either one is absent.
        if 'gold_prob' in missing or 'gold_lp_cumsum' in missing:
            probs = np.exp(frame["gold_lp"].astype(float))
            frame["gold_prob"] = probs
            # Log of the running total of probabilities.
            frame["gold_lp_cumsum"] = np.log(probs.cumsum() + eps)

        if 'gold_lp_normalized' in missing:
            # Standardize gold_lp with per-row statistics across models.
            means, stds = get_logprob_stats_across_models(csv_file)

            if means is None or stds is None:
                print(f"Could not compute normalization stats for {csv_file}")
                frame['gold_lp_normalized'] = np.nan
            else:
                frame['gold_lp_normalized'] = (frame['gold_lp'] - means) / (stds + eps)

        if missing:
            # Only touch the file on disk when something was added.
            frame.to_csv(csv_file, index=False)
        else:
            print(f"All columns already exist in {csv_file}")
        return True

    except Exception as err:
        print(f"Error processing {csv_file}: {err}")
        return False

def process_all_csv_files(base_folder):
    """
    Walk base_folder recursively, run add_columns on every non-hidden
    .csv file (in sorted path order) and report how many succeeded.

    Hidden directories and hidden files (names starting with '.') are
    ignored. Returns the tuple (successful, failed).
    """
    discovered = []
    for current_dir, subdirs, filenames in os.walk(base_folder):
        # Prune in place so os.walk never descends into hidden directories.
        subdirs[:] = [name for name in subdirs if not name.startswith('.')]

        discovered.extend(
            os.path.join(current_dir, name)
            for name in filenames
            if name.endswith('.csv') and not name.startswith('.')
        )

    # Deterministic, alphabetical processing order.
    discovered.sort()
    print(f"Found {len(discovered)} CSV files to process")

    # Outcome counters keyed by the truthiness of add_columns' result.
    tally = {True: 0, False: 0}

    with tqdm(total=len(discovered), desc="Processing CSV files") as progress:
        for path in discovered:
            tally[bool(add_columns(path))] += 1
            progress.update(1)

    successful = tally[True]
    failed = tally[False]

    # Final summary
    print("\nProcessing complete!")
    print(f"Successfully processed: {successful} files")
    print(f"Failed to process: {failed} files")

    return successful, failed

if __name__ == "__main__":
    # Script entry point: process every CSV under the paragraph-title feature
    # tree. The path is relative, so it is resolved against the current
    # working directory.
    base_folder = "Database_free_evaluation/Clinical_trials/Features/FEATURES_Llama3.3-70B/Paragraph_title_features"
    # Alternative input tree (paragraph-summary features); uncomment to process it instead.
    # base_folder = "Database_free_evaluation/Clinical_trials/Features/FEATURES_Llama3.3-70B/Paragraph_summary_features"
    
    # Returns the counts of files that were handled successfully / that failed.
    successful, failed = process_all_csv_files(base_folder)

Found 236760 CSV files to process


Processing CSV files: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 236760/236760 [1:29:11<00:00, 44.24it/s]


Processing complete!
Successfully processed: 236760 files
Failed to process: 0 files



