In [1]:
import pandas as pd
import numpy as np
import math
import os
from tqdm import tqdm

def _list_csv_files(root_dir):
    """Return the sorted paths of all non-hidden ``.csv`` files under *root_dir*."""
    found = []
    for current_dir, subdirs, filenames in os.walk(root_dir):
        # Prune hidden directories in place so os.walk does not descend into them
        subdirs[:] = [d for d in subdirs if not d.startswith('.')]
        found.extend(
            os.path.join(current_dir, name)
            for name in filenames
            if not name.startswith('.') and name.endswith('.csv')
        )
    return sorted(found)


def _iter_with_progress(items, desc):
    """Wrap *items* in a tqdm progress bar, or return them as-is without tqdm."""
    try:
        from tqdm import tqdm
    except ImportError:
        # The progress bar is purely cosmetic; the check itself must still run
        return items
    return tqdm(items, desc=desc)


def _print_consistency_report(total_files, inconsistent_files, missing_files, error_files):
    """Print the human-readable summary of a consistency check."""
    print("\nCONSISTENCY CHECK REPORT")
    print("=" * 50)

    if inconsistent_files:
        print("\nFiles with inconsistent row counts:")
        print("-" * 40)
        for item in inconsistent_files:
            print(f"\nFile: {item['file']}")
            print(f"Trial: {item['trial']}")
            for model, count in item['counts'].items():
                print(f"{model}: {count} rows")

    if missing_files:
        print("\nMissing files:")
        print("-" * 40)
        for filename, model in missing_files:
            print(f"{filename} not found in {model}")

    if error_files:
        print("\nErrors encountered:")
        print("-" * 40)
        for file_path, error in error_files:
            print(f"{file_path}: {error}")

    print("\nFinal Statistics:")
    print("-" * 40)
    print(f"Total files checked: {total_files}")
    print(f"Files with inconsistent rows: {len(inconsistent_files)}")
    print(f"Missing files: {len(missing_files)}")
    print(f"Files with errors: {len(error_files)}")


def check_all_files_consistency(input_folder, model_nums=(1, 2, 3, 4, 6)):
    """
    Check that every CSV found under model_1 has the same row count in all models.

    The CSV files are discovered by walking ``<input_folder>/model_1``. For each
    one, the file at the same relative path inside ``<input_folder>/model_<n>``
    is loaded for every ``n`` in *model_nums* and the row counts are compared.

    Args:
        input_folder: Path to the main folder containing model_1
        model_nums: Model numbers to compare. The default keeps the original
            hard-coded list (note that it has no model 5).

    Returns:
        A tuple ``(inconsistent_files, missing_files, error_files)``:
        - inconsistent_files: list of dicts with keys ``'file'`` (file name),
          ``'trial'`` (directory of the file relative to the model folder) and
          ``'counts'`` (mapping ``"model_<n>"`` -> row count).
        - missing_files: list of ``(file name, "model_<n>")`` tuples.
        - error_files: list of ``(model_1 csv path, error message)`` tuples.
    """
    inconsistent_files = []
    missing_files = []
    error_files = []

    # model_1 is the reference tree that defines which files are checked
    model1_path = os.path.join(input_folder, "model_1")

    print("Starting consistency check across models...")
    if not os.path.isdir(model1_path):
        # Without this, a wrong input path silently yields a "clean" report
        print(f"Warning: reference directory not found: {model1_path}")

    csv_files = _list_csv_files(model1_path)
    print(f"Found {len(csv_files)} CSV files to process")

    for csv_path in _iter_with_progress(csv_files, "Checking files"):
        # Deliberate per-file boundary: one unreadable file must not abort the run
        try:
            # Locate the file relative to model_1 instead of parsing the path
            # text, so "model_*"-named ancestor folders, nested sub-folders and
            # non-'/' separators are all handled correctly.
            rel_path = os.path.relpath(csv_path, model1_path)
            trial_folder, filename = os.path.split(rel_path)

            row_counts = {}
            for model_num in model_nums:
                model_name = f"model_{model_num}"
                model_path = os.path.join(input_folder, model_name, rel_path)
                if os.path.exists(model_path):
                    row_counts[model_name] = len(pd.read_csv(model_path))
                else:
                    missing_files.append((filename, model_name))
                    # Full path is printed because the report lists only the name
                    print(model_path)

            # More than one distinct count means the models disagree
            if len(set(row_counts.values())) > 1:
                inconsistent_files.append({
                    'file': filename,
                    'trial': trial_folder,
                    'counts': row_counts
                })

        except Exception as e:
            error_files.append((csv_path, str(e)))

    _print_consistency_report(len(csv_files), inconsistent_files, missing_files, error_files)

    return inconsistent_files, missing_files, error_files

# Example usage: run the check on one results directory and keep the three
# returned lists (inconsistent files, missing files, read errors) for inspection.
# NOTE(review): the path below is machine-specific; point it at the folder
# that contains model_1 before re-running.
input_folder = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Forced_inference/Run_ENSEMBLE_Gpt5/Paragraph_title"
inconsistent, missing, errors = check_all_files_consistency(input_folder)

Starting consistency check across models...
Found 1500 CSV files to process


Checking files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1500/1500 [00:27<00:00, 54.67it/s]


CONSISTENCY CHECK REPORT

Final Statistics:
----------------------------------------
Total files checked: 1500
Files with inconsistent rows: 0
Missing files: 0
Files with errors: 0



