In [1]:
import pandas as pd

# Load the per-file prediction table. Judging by the preview below, each row
# carries the GPT label, the predicted label and the two class probabilities.
predictions_path = "testset_final_predictions.csv"
df = pd.read_csv(predictions_path)
df

Unnamed: 0,nct_number,file_path,gpt_label,predicted_label,probability_fact,probability_hallucination
0,NCT00001959,NCT00001959/logprob_matrix_1.csv,fact,fact,0.984,0.016
1,NCT00001959,NCT00001959/logprob_matrix_10.csv,hallucination,fact,0.800,0.200
2,NCT00001959,NCT00001959/logprob_matrix_11.csv,hallucination,hallucination,0.374,0.626
3,NCT00001959,NCT00001959/logprob_matrix_12.csv,hallucination,hallucination,0.448,0.552
4,NCT00001959,NCT00001959/logprob_matrix_13.csv,hallucination,hallucination,0.429,0.571
...,...,...,...,...,...,...
295,NCT03271047,NCT03271047/logprob_matrix_5.csv,fact,fact,0.986,0.014
296,NCT03271047,NCT03271047/logprob_matrix_6.csv,fact,fact,1.000,0.000
297,NCT03271047,NCT03271047/logprob_matrix_7.csv,fact,fact,1.000,0.000
298,NCT03271047,NCT03271047/logprob_matrix_8.csv,fact,fact,1.000,0.000


In [2]:
# Tag every row with the text it was evaluated against.
# Assumption (TODO confirm against whatever produced the CSV): the first
# N_TITLE_ROWS rows are the 'title' evaluations and the remaining rows are the
# 'summary' evaluations; the preview below shows file_path restarting at
# logprob_matrix_1 on row 150, which is consistent with that layout.
N_TITLE_ROWS = 150

# Create a new 'source' column; 'summary' is the default value
df['source'] = 'summary'

# Mark the first N_TITLE_ROWS rows as 'title'. A .loc label slice includes
# BOTH endpoints, so the last label must be N_TITLE_ROWS - 1 (0..149 -> 150 rows).
df.loc[0:N_TITLE_ROWS - 1, 'source'] = 'title'

# Sanity check: show the rows straddling the title/summary boundary
df.iloc[N_TITLE_ROWS - 1:N_TITLE_ROWS + 2]

Unnamed: 0,nct_number,file_path,gpt_label,predicted_label,probability_fact,probability_hallucination,source
149,NCT03271047,NCT03271047/logprob_matrix_9.csv,coverage,hallucination,0.456,0.544,title
150,NCT00001959,NCT00001959/logprob_matrix_1.csv,fact,fact,0.997,0.003,summary
151,NCT00001959,NCT00001959/logprob_matrix_10.csv,fact,fact,0.996,0.004,summary


In [3]:
# JSON file names of the trials under review, one '<NCT id>.json' per trial
files_to_process = [
    'NCT00001959.json',
    'NCT00448136.json',
    'NCT00726986.json',
    'NCT01226485.json',
    'NCT01334723.json',
    'NCT01338987.json',
    'NCT01696968.json',
    'NCT02684058.json',
    'NCT03158220.json',
    'NCT03271047.json',
]

# Report how many prediction rows belong to each trial. A row is counted when
# the trial's NCT id occurs anywhere inside its file_path.
for json_name in files_to_process:
    nct_number = json_name.replace('.json', '')  # strip the extension to get the id
    id_in_path = df['file_path'].str.contains(nct_number)
    print(f"{nct_number}: {len(df.loc[id_in_path])} rows")

NCT00001959: 30 rows
NCT00448136: 30 rows
NCT00726986: 30 rows
NCT01226485: 30 rows
NCT01334723: 30 rows
NCT01338987: 30 rows
NCT01696968: 30 rows
NCT02684058: 30 rows
NCT03158220: 30 rows
NCT03271047: 30 rows


## Percentiles

In [4]:
import pandas as pd
import numpy as np

# Keep only the rows GPT labelled 'coverage' and rank them from the highest to
# the lowest probability_fact, renumbering the index 0..n-1.
df_filtered = df[df['gpt_label'] == 'coverage']
df_sorted = df_filtered.sort_values('probability_fact', ascending=False).reset_index(drop=True)

# Upper edges of the 5-point-wide bins: 5, 10, ..., 95, 100
percentiles = range(5, 105, 5)

# The bins are positional slices of the ranked frame (rank-based, not
# value-based), so neighbouring bins may differ by one row in size.
n_rows = len(df_sorted)
hall_prob = df_sorted['probability_hallucination']

results = []
for upper in percentiles:
    lower = upper - 5
    # First (inclusive) and last (exclusive) row position of this bin
    start_idx = n_rows * lower // 100
    end_idx = n_rows * upper // 100
    bin_probs = hall_prob.iloc[start_idx:end_idx]

    results.append({
        'percentile': f"{lower}-{upper}%",
        'n_samples': len(bin_probs),
        'hallucination_prob_range': f"{bin_probs.min():.3f}-{bin_probs.max():.3f}",
    })

# One row per bin, printed as plain text without the index column
results_df = pd.DataFrame(results)
print("\nProbability Ranges per Percentile:")
print(results_df.to_string(index=False))


Probability Ranges per Percentile:
percentile  n_samples hallucination_prob_range
      0-5%          3              0.002-0.009
     5-10%          3              0.053-0.089
    10-15%          3              0.130-0.143
    15-20%          3              0.154-0.163
    20-25%          3              0.188-0.190
    25-30%          3              0.252-0.292
    30-35%          4              0.322-0.348
    35-40%          3              0.348-0.367
    40-45%          3              0.393-0.398
    45-50%          3              0.422-0.444
    50-55%          3              0.459-0.484
    55-60%          3              0.486-0.505
    60-65%          3              0.544-0.552
    65-70%          4              0.568-0.577
    70-75%          3              0.578-0.585
    75-80%          3              0.586-0.616
    80-85%          3              0.627-0.650
    85-90%          3              0.651-0.663
    90-95%          3              0.698-0.716
   95-100%          4   

In [5]:
# Display the 'coverage'-only rows, ranked by descending probability_fact
df_sorted

Unnamed: 0,nct_number,file_path,gpt_label,predicted_label,probability_fact,probability_hallucination,source
0,NCT01338987,NCT01338987/logprob_matrix_1.csv,coverage,fact,0.998,0.002,title
1,NCT01334723,NCT01334723/logprob_matrix_1.csv,coverage,fact,0.998,0.002,title
2,NCT02684058,NCT02684058/logprob_matrix_1.csv,coverage,fact,0.991,0.009,summary
3,NCT00448136,NCT00448136/logprob_matrix_10.csv,coverage,fact,0.947,0.053,summary
4,NCT00448136,NCT00448136/logprob_matrix_11.csv,coverage,fact,0.919,0.081,summary
...,...,...,...,...,...,...,...
58,NCT01338987,NCT01338987/logprob_matrix_5.csv,coverage,hallucination,0.284,0.716,title
59,NCT03271047,NCT03271047/logprob_matrix_6.csv,coverage,hallucination,0.255,0.745,title
60,NCT03271047,NCT03271047/logprob_matrix_8.csv,coverage,hallucination,0.167,0.833,title
61,NCT00448136,NCT00448136/logprob_matrix_8.csv,coverage,hallucination,0.147,0.853,title


In [6]:
# Number of examples exported from each end of the ranking. The slices, the
# CSV file names and the printed headings are all derived from this value so
# they cannot drift apart.
N_EXAMPLES = 10

# df_sorted is ranked by descending probability_fact, so its head holds the
# coverage rows with the highest fact probability...
top_10 = df_sorted.head(N_EXAMPLES)
top_10.to_csv(f'Coverage_top{N_EXAMPLES}_low_hall.csv', index=False)

# ...and its tail holds the rows with the lowest fact probability.
bottom_10 = df_sorted.tail(N_EXAMPLES)
bottom_10.to_csv(f'Coverage_bot{N_EXAMPLES}_high_hall.csv', index=False)

# Print preview
print(f"\nTop {N_EXAMPLES} examples (highest fact probability):")
print(top_10)
print(f"\nBottom {N_EXAMPLES} examples (lowest fact probability):")
print(bottom_10)


Top 10 examples (highest fact probability):
    nct_number                          file_path gpt_label predicted_label  \
0  NCT01338987   NCT01338987/logprob_matrix_1.csv  coverage            fact   
1  NCT01334723   NCT01334723/logprob_matrix_1.csv  coverage            fact   
2  NCT02684058   NCT02684058/logprob_matrix_1.csv  coverage            fact   
3  NCT00448136  NCT00448136/logprob_matrix_10.csv  coverage            fact   
4  NCT00448136  NCT00448136/logprob_matrix_11.csv  coverage            fact   
5  NCT01226485  NCT01226485/logprob_matrix_14.csv  coverage            fact   
6  NCT00001959  NCT00001959/logprob_matrix_14.csv  coverage            fact   
7  NCT00448136  NCT00448136/logprob_matrix_14.csv  coverage            fact   
8  NCT00448136  NCT00448136/logprob_matrix_10.csv  coverage            fact   
9  NCT01334723   NCT01334723/logprob_matrix_2.csv  coverage            fact   

   probability_fact  probability_hallucination   source  
0             0.998        

## Put everything together for review

In [8]:
# NCT identifiers of the trials under review; each one has a '<id>.json' file
_nct_ids = (
    'NCT00001959',
    'NCT00448136',
    'NCT00726986',
    'NCT01226485',
    'NCT01334723',
    'NCT01338987',
    'NCT01696968',
    'NCT02684058',
    'NCT03158220',
    'NCT03271047',
)
files_to_process = [f"{nct_id}.json" for nct_id in _nct_ids]

In [14]:
import json
import os
import pandas as pd

# Define the folder paths
# input_folder1: per-trial files from which 'Final_text' is read
# input_folder2: per-trial GPT evaluations (factual)
# input_folder3: per-trial GPT evaluations (counterfactual)
input_folder1 = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Files_with_Summary/"
input_folder2 = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Eval_vs_Summary/Paragraph_level/Evaluation_summary_Paragraph_GPT/"
input_folder3 = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Eval_vs_Summary_Counterfactual/Paragraph_level/Evaluation_summary_Paragraph_GPT/"
output_folder = "Coverage_analysis/summary"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)


def _load_json(path):
    """Read and parse one JSON document.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


def _build_review_record(summary_doc, factual_doc, counterfactual_doc):
    """Merge the three per-trial documents into a single review record.

    Args:
        summary_doc: mapping from which 'Final_text' is taken ('' if absent).
        factual_doc: mapping whose keys starting with 'Q' define the questions;
            each such entry supplies 'statement', 'evaluation' and 'label'.
        counterfactual_doc: mapping indexed by the same question keys; each
            entry supplies 'evaluation' and 'label'.

    Returns:
        {'DataBase': <Final_text>, 'questions': {<Q key>: {...}}}. Fields
        missing inside an entry default to ''.

    Raises:
        KeyError: a question key of factual_doc is absent from counterfactual_doc.
    """
    questions = {}
    # factual_doc drives the iteration since it has the Q structure
    for q_key in factual_doc.keys():
        if not q_key.startswith('Q'):  # Only process question entries
            continue
        questions[q_key] = {
            # Data from second file (factual evaluations)
            'model_answer': factual_doc[q_key].get('statement', ''),
            'factual_eval': factual_doc[q_key].get('evaluation', ''),
            'factual_label': factual_doc[q_key].get('label', ''),
            # Data from third file (counterfactual evaluations)
            'counterfactual_eval': counterfactual_doc[q_key].get('evaluation', ''),
            'counterfactual_label': counterfactual_doc[q_key].get('label', ''),
        }
    return {
        'DataBase': summary_doc.get('Final_text', ''),
        'questions': questions,
    }


# Process each file in files_to_process
for file_name in files_to_process:
    input_file_path1 = os.path.join(input_folder1, file_name)
    input_file_path2 = os.path.join(input_folder2, file_name)
    input_file_path3 = os.path.join(input_folder3, file_name)
    output_file_path = os.path.join(output_folder, file_name)

    try:
        # Read all JSON files
        json_data1 = _load_json(input_file_path1)
        json_data2 = _load_json(input_file_path2)
        json_data3 = _load_json(input_file_path3)

        # Extract and combine data
        extracted_data = _build_review_record(json_data1, json_data2, json_data3)

        # Save to output JSON file (opened only once the record is fully built)
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, indent=4)

        print(f"Processed {file_name} successfully - {len(extracted_data['questions'])} questions processed")

    except Exception as e:
        # Best-effort batch: report the failure and continue with the next file.
        # The exception type is printed too, because str(KeyError) is only the
        # bare missing key and would not say what went wrong.
        print(f"Error processing {file_name}: {type(e).__name__}: {e}")

print("\nProcessing complete!")
print(f"Output files saved in: {output_folder}")

# Optional: Display summary of processed files.
# This re-reads whatever is on disk, so an output file left over from an
# earlier run is listed even if this run failed for that trial.
print("\nFiles processed:")
for file_name in files_to_process:
    output_file_path = os.path.join(output_folder, file_name)
    if os.path.exists(output_file_path):
        data = _load_json(output_file_path)
        print(f"{file_name}: {len(data['questions'])} questions processed")
        print(f"Final_text length: {len(data['DataBase'])} characters")

Processed NCT00001959.json successfully - 15 questions processed
Processed NCT00448136.json successfully - 15 questions processed
Processed NCT00726986.json successfully - 15 questions processed
Processed NCT01226485.json successfully - 15 questions processed
Processed NCT01334723.json successfully - 15 questions processed
Processed NCT01338987.json successfully - 15 questions processed
Processed NCT01696968.json successfully - 15 questions processed
Processed NCT02684058.json successfully - 15 questions processed
Processed NCT03158220.json successfully - 15 questions processed
Processed NCT03271047.json successfully - 15 questions processed

Processing complete!
Output files saved in: Coverage_analysis/summary

Files processed:
NCT00001959.json: 15 questions processed
Final_text length: 4446 characters
NCT00448136.json: 15 questions processed
Final_text length: 1696 characters
NCT00726986.json: 15 questions processed
Final_text length: 3737 characters
NCT01226485.json: 15 questions pr