## Add new Paragraph "counterfactual" labels and "Final_label" combinations

In [1]:
import pandas as pd
import os
import json
import numpy as np

def get_label_from_json(trial_name, question_number, label_folder_path):
    """
    Look up the "eval_databases" label of one question of one trial.

    The label is read from ``<label_folder_path>/<trial_name>.json`` under
    the key ``Q<question_number>`` and returned exactly as stored in the
    JSON file (the string "N/A" is returned unchanged, as a string).

    Returns None when ``label_folder_path`` is empty/None, or when the file
    cannot be read or the expected keys are missing (the error is printed).
    """
    # Nothing to look up without a label folder
    if not label_folder_path:
        return None

    trial_file = os.path.join(label_folder_path, f"{trial_name}.json")

    try:
        with open(trial_file, 'r') as handle:
            trial_labels = json.load(handle)
        question_entry = trial_labels[f"Q{question_number}"]
        return question_entry["eval_databases"]
    except Exception as e:
        # Best-effort lookup: report the problem and leave the label empty
        print(f"Error reading JSON for {trial_name}: {e}")
        return None

def extract_trial_and_question(file_path):
    """
    Split a result-file path into its trial name and question number.

    The question number is the integer found between the last underscore of
    the file name and the first dot after it; the trial name is the
    second-to-last '/'-separated component of the path.
    Example: path/to/NCT00001959/logprob_matrix_1.csv -> ("NCT00001959", 1)

    Returns (None, None), after printing the error, if the path cannot be parsed.
    """
    try:
        base_name = os.path.basename(file_path)
        # Text after the final underscore, up to the first dot: "1.csv" -> "1"
        number_text = base_name.rsplit('_', 1)[-1].partition('.')[0]
        question_num = int(number_text)
        # Directory that directly contains the file; adjust if your layout differs
        trial_name = file_path.rsplit('/', 2)[-2]
    except Exception as e:
        print(f"Error extracting trial and question from {file_path}: {e}")
        return None, None
    return trial_name, question_num

def process_csv_and_add_labels(csv_path, label_folder_path, output_path):
    """
    Read CSV, add labels from JSON files, and create Final_label based on combinations

    For every row, the trial name and question number are parsed from the
    'file_path' column and the matching label is looked up in the JSON files
    under label_folder_path. The result is stored in a new 'counterfactual'
    column (None when the path cannot be parsed or no label is found).

    Label combinations:
    - label="fact" & counterfactual="fact" -> Final_label="fact"
    - label="hallucination" & counterfactual="hallucination" -> Final_label="hallucination"
    - label="fact" & counterfactual="hallucination" -> Final_label="error"
    - label="hallucination" & counterfactual="fact" -> Final_label="coverage"
    Any other combination (e.g. a missing or "N/A" label) leaves Final_label as None.

    Args:
        csv_path: CSV file to read; it must contain 'file_path' and 'label' columns.
        label_folder_path: Folder holding one <trial_name>.json file per trial.
        output_path: Where the augmented CSV is written (without the index).

    Returns:
        The augmented DataFrame on success, or None if any step failed
        (the error is printed instead of raised).
    """
    try:
        # Read the CSV file
        print(f"Reading CSV file from: {csv_path}")
        df = pd.read_csv(csv_path)

        # Look up one counterfactual label per row. Only the 'file_path'
        # column is iterated: iterrows() would build a complete row Series
        # for every row just to read this single value.
        print("Processing rows and adding labels...")
        counterfactual_labels = []
        for row_number, file_path in enumerate(df['file_path']):
            trial_name, question_num = extract_trial_and_question(file_path)

            label = None
            if trial_name and question_num:
                # Get label from corresponding JSON
                label = get_label_from_json(trial_name, question_num, label_folder_path)
            counterfactual_labels.append(label)

            # Print progress every 1000 rows
            if row_number % 1000 == 0:
                print(f"Processed {row_number} rows...")

        # dtype=object keeps the labels as plain strings / None
        df['counterfactual'] = pd.Series(counterfactual_labels, index=df.index, dtype=object)

        # Create Final_label column based on combinations
        print("Creating Final_label column...")
        conditions = [
            (df['label'] == 'fact') & (df['counterfactual'] == 'fact'),
            (df['label'] == 'hallucination') & (df['counterfactual'] == 'hallucination'),
            (df['label'] == 'fact') & (df['counterfactual'] == 'hallucination'),
            (df['label'] == 'hallucination') & (df['counterfactual'] == 'fact')
        ]
        choices = ['fact', 'hallucination', 'error', 'coverage']

        # default=None marks rows that match none of the four combinations
        df['Final_label'] = np.select(conditions, choices, default=None)

        # Save the updated DataFrame
        print(f"Saving updated CSV to: {output_path}")
        df.to_csv(output_path, index=False)

        # Print summary
        print("\nProcessing complete!")
        print(f"Total rows processed: {len(df)}")
        print("\nLabel distribution:")
        print(df['label'].value_counts(dropna=False))
        print("\nCounterfactual distribution:")
        print(df['counterfactual'].value_counts(dropna=False))
        print("\nFinal_label distribution:")
        print(df['Final_label'].value_counts(dropna=False))
        print("\nDetailed combination counts:")
        print(pd.crosstab(df['label'], df['counterfactual'], margins=True))

        # Hand the result back so callers can inspect it without re-reading the CSV
        return df

    except Exception as e:
        # Best-effort notebook helper: report the failure instead of raising
        print(f"Error processing CSV: {e}")
        return None

# Example usage: paragraph-level run over the "title" results CSV
if __name__ == "__main__":
    # Define paths (all relative to the notebook's working directory)
    csv_path = "concatenated_results_Paragraph_title_GptO3.csv"  # Input CSV; replace with your CSV path
    # Folder that is searched for one <trial_name>.json label file per trial
    label_folder_path = "Database_dependent_evaluation/Clinical_trials/4-Evaluation/Evaluation_counterfactual/GptO3/Paragraph_level/Evaluation_title_Paragraph_GPT"
    output_path = "concatenated_results_Paragraph_title_COUNTERFACTUAL_GptO3.csv"  # Output CSV; replace with desired output path
    
    # Process the CSV and add labels (writes output_path and prints a summary)
    process_csv_and_add_labels(csv_path, label_folder_path, output_path)

Reading CSV file from: concatenated_results_Paragraph_title_Gpt5.csv
Processing rows and adding labels...
Processed 0 rows...
Processed 1000 rows...
Creating Final_label column...
Saving updated CSV to: concatenated_results_Paragraph_title_COUNTERFACTUAL_Gpt5.csv

Processing complete!
Total rows processed: 1500

Label distribution:
label
hallucination    957
fact             540
NaN                3
Name: count, dtype: int64

Counterfactual distribution:
counterfactual
fact             1100
hallucination     398
N/A                 2
Name: count, dtype: int64

Final_label distribution:
Final_label
coverage         573
fact             524
hallucination    383
error             15
None               5
Name: count, dtype: int64

Detailed combination counts:
counterfactual  N/A  fact  hallucination   All
label                                         
fact              1   524             15   540
hallucination     1   573            383   957
All               2  1097            398  1497

## Add new Sentence "counterfactual" labels and "Final_label" combinations

In [1]:
import pandas as pd
import os
import json
import numpy as np

def get_label_from_json(trial_name, question_number, sentence_number=None, label_folder_path=None):
    """
    Look up the "eval_databases" label stored in ``<trial_name>.json``.

    With a ``sentence_number`` (1-based) the entry ``Q<question_number>``
    must be a list and the label is taken from its matching element.
    Without one, the entry must be a dict (paragraph-level layout) and the
    label is taken from it directly.

    Returns the label as stored in the JSON, or None when no folder is
    given, the layout does not match, the sentence number is out of range,
    or the file cannot be read (the problem is printed in the last three cases
    that print in the code below).
    """
    if not label_folder_path:
        return None

    trial_file = os.path.join(label_folder_path, f"{trial_name}.json")

    try:
        with open(trial_file, 'r') as handle:
            payload = json.load(handle)

        q_key = f"Q{question_number}"
        entry = payload[q_key] if q_key in payload else None

        # Paragraph-level layout: the question entry itself carries the label
        if sentence_number is None:
            if not isinstance(entry, dict):
                return None
            return entry.get("eval_databases")

        # Sentence-level layout: the question entry is a list of sentences
        if not isinstance(entry, list):
            print(f"No sentence-level data found for {trial_name} {q_key}")
            return None

        position = sentence_number - 1  # sentence numbers are 1-based
        if not (0 <= position < len(entry)):
            print(f"Sentence number {sentence_number} out of range for {trial_name} {q_key}")
            return None

        sentence_entry = entry[position]
        if not isinstance(sentence_entry, dict):
            return None
        return sentence_entry.get("eval_databases")

    except Exception as e:
        print(f"Error reading JSON for {trial_name}: {e}")
        return None

def extract_trial_and_question(file_path):
    """
    Parse trial name, question number and sentence number out of a file path.

    File names containing 'sentence' are read as
    ``<a>_<b>_<question>_..._<sentence>.<ext>``; any other file name is read
    in the original format, where the number after the last underscore is
    the question number and there is no sentence number.
    Example: path/to/NCT00001959/logprob_matrix_3_sentence_1.csv -> ("NCT00001959", 3, 1)

    Returns (None, None, None), after printing the error, if parsing fails.
    """
    try:
        base_name = os.path.basename(file_path)
        tokens = base_name.split('_')
        is_sentence_file = 'sentence' in base_name

        # Sentence files keep the question number right after 'matrix';
        # otherwise it is the trailing number of the file name.
        question_token = tokens[2] if is_sentence_file else tokens[-1].partition('.')[0]
        question_num = int(question_token)

        # Trailing number is the sentence index only for sentence files
        sentence_num = int(tokens[-1].partition('.')[0]) if is_sentence_file else None

        # Directory that directly contains the file
        trial_name = file_path.rsplit('/', 2)[-2]
    except Exception as e:
        print(f"Error extracting info from {file_path}: {e}")
        return None, None, None

    return trial_name, question_num, sentence_num


def process_csv_and_add_labels(csv_path, label_folder_path, output_path):
    """
    Read CSV, add labels from JSON files, and create Final_label based on combinations

    For every row, the trial name, question number and sentence number are
    parsed from the 'file_path' column and the matching label is looked up
    in the JSON files under label_folder_path. The result is stored in a new
    'counterfactual' column (None when the path cannot be parsed or no label
    is found).

    Label combinations:
    - label="fact" & counterfactual="fact" -> Final_label="fact"
    - label="hallucination" & counterfactual="hallucination" -> Final_label="hallucination"
    - label="fact" & counterfactual="hallucination" -> Final_label="error"
    - label="hallucination" & counterfactual="fact" -> Final_label="coverage"
    Any other combination (e.g. a missing or "N/A" label) leaves Final_label as None.

    Args:
        csv_path: CSV file to read; it must contain 'file_path' and 'label' columns.
        label_folder_path: Folder holding one <trial_name>.json file per trial.
        output_path: Where the augmented CSV is written (without the index).

    Returns:
        The augmented DataFrame on success, or None if any step failed
        (the error is printed instead of raised).
    """
    try:
        # Read the CSV file
        print(f"Reading CSV file from: {csv_path}")
        df = pd.read_csv(csv_path)

        # Look up one counterfactual label per row. Only the 'file_path'
        # column is iterated: iterrows() would build a complete row Series
        # for every row just to read this single value.
        print("Processing rows and adding labels...")
        counterfactual_labels = []
        for row_number, file_path in enumerate(df['file_path']):
            trial_name, question_num, sentence_num = extract_trial_and_question(file_path)

            label = None
            if trial_name and question_num:
                # Get label from JSON with sentence number
                label = get_label_from_json(trial_name, question_num, sentence_num, label_folder_path)
            counterfactual_labels.append(label)

            # Print progress every 1000 rows
            if row_number % 1000 == 0:
                print(f"Processed {row_number} rows...")

        # dtype=object keeps the labels as plain strings / None
        df['counterfactual'] = pd.Series(counterfactual_labels, index=df.index, dtype=object)

        # Create Final_label column based on combinations
        print("Creating Final_label column...")
        conditions = [
            (df['label'] == 'fact') & (df['counterfactual'] == 'fact'),
            (df['label'] == 'hallucination') & (df['counterfactual'] == 'hallucination'),
            (df['label'] == 'fact') & (df['counterfactual'] == 'hallucination'),
            (df['label'] == 'hallucination') & (df['counterfactual'] == 'fact')
        ]
        choices = ['fact', 'hallucination', 'error', 'coverage']

        # default=None marks rows that match none of the four combinations
        df['Final_label'] = np.select(conditions, choices, default=None)

        # Save the updated DataFrame
        print(f"Saving updated CSV to: {output_path}")
        df.to_csv(output_path, index=False)

        # Print summary
        print("\nProcessing complete!")
        print(f"Total rows processed: {len(df)}")
        print("\nLabel distribution:")
        print(df['label'].value_counts(dropna=False))
        print("\nCounterfactual distribution:")
        print(df['counterfactual'].value_counts(dropna=False))
        print("\nFinal_label distribution:")
        print(df['Final_label'].value_counts(dropna=False))
        print("\nDetailed combination counts:")
        print(pd.crosstab(df['label'], df['counterfactual'], margins=True))

        # Hand the result back so callers can inspect it without re-reading the CSV
        return df

    except Exception as e:
        # Best-effort notebook helper: report the failure instead of raising
        print(f"Error processing CSV: {e}")
        return None

# Example usage: sentence-level run over the "summary" results CSV
if __name__ == "__main__":
    # Define paths
    csv_path = "concatenated_results_Sentence_summary.csv"  # Input CSV; replace with your CSV path
    # Folder that is searched for one <trial_name>.json label file per trial.
    # NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
    label_folder_path = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Inference_files_with_labels_Counterfactual/Sentence_level/Inference_summary_Sentence"
    output_path = "concatenated_results_Sentence_summary_COUNTERFACTUAL.csv"  # Output CSV; replace with desired output path
    
    # Process the CSV and add labels (writes output_path and prints a summary)
    process_csv_and_add_labels(csv_path, label_folder_path, output_path)

Reading CSV file from: concatenated_results_Sentence_summary.csv
Processing rows and adding labels...
Processed 0 rows...
Processed 1000 rows...
Processed 2000 rows...
Processed 3000 rows...
Processed 4000 rows...
Creating Final_label column...
Saving updated CSV to: concatenated_results_Sentence_summary_COUNTERFACTUAL.csv

Processing complete!
Total rows processed: 4911

Label distribution:
label
fact             4175
hallucination     669
NaN                67
Name: count, dtype: int64

Counterfactual distribution:
counterfactual
fact             3691
N/A              1134
hallucination      86
Name: count, dtype: int64

Final_label distribution:
Final_label
fact             3132
None             1150
coverage          550
error              43
hallucination      36
Name: count, dtype: int64

Detailed combination counts:
counterfactual   N/A  fact  hallucination   All
label                                          
fact            1000  3132             43  4175
hallucination     83 

## Add new Claim "counterfactual" labels and "Final_label" combinations

In [1]:
import pandas as pd
import os
import json
import numpy as np

def get_label_from_json(trial_name, question_number, claim_number=None, label_folder_path=None):
    """
    Look up the "eval_databases" label stored in ``<trial_name>.json``.

    With a ``claim_number`` (1-based) the entry ``Q<question_number>`` must
    be a list and the label is taken from its matching element. Without
    one, the entry must be a dict (paragraph-level layout) and the label is
    taken from it directly.

    Returns the label as stored in the JSON, or None when no folder is
    given, the layout does not match, the claim number is out of range, or
    the file cannot be read.
    """
    if not label_folder_path:
        return None

    trial_file = os.path.join(label_folder_path, f"{trial_name}.json")

    try:
        with open(trial_file, 'r') as handle:
            payload = json.load(handle)

        q_key = f"Q{question_number}"
        entry = payload[q_key] if q_key in payload else None

        # Paragraph-level layout: the question entry itself carries the label
        if claim_number is None:
            if not isinstance(entry, dict):
                return None
            return entry.get("eval_databases")

        # Claim-level layout: the question entry is a list of claims
        if not isinstance(entry, list):
            print(f"No claim-level data found for {trial_name} {q_key}")
            return None

        position = claim_number - 1  # claim numbers are 1-based
        if not (0 <= position < len(entry)):
            print(f"claim number {claim_number} out of range for {trial_name} {q_key}")
            return None

        claim_entry = entry[position]
        if not isinstance(claim_entry, dict):
            return None
        return claim_entry.get("eval_databases")

    except Exception as e:
        print(f"Error reading JSON for {trial_name}: {e}")
        return None

def extract_trial_and_question(file_path):
    """
    Parse trial name, question number and claim number out of a file path.

    File names containing 'claim' are read as
    ``<a>_<b>_<question>_..._<claim>.<ext>``; any other file name is read in
    the original format, where the number after the last underscore is the
    question number and there is no claim number.
    Example: path/to/NCT00001959/logprob_matrix_3_claim_1.csv -> ("NCT00001959", 3, 1)

    Returns (None, None, None), after printing the error, if parsing fails.
    """
    try:
        base_name = os.path.basename(file_path)
        tokens = base_name.split('_')
        is_claim_file = 'claim' in base_name

        # Claim files keep the question number right after 'matrix';
        # otherwise it is the trailing number of the file name.
        question_token = tokens[2] if is_claim_file else tokens[-1].partition('.')[0]
        question_num = int(question_token)

        # Trailing number is the claim index only for claim files
        claim_num = int(tokens[-1].partition('.')[0]) if is_claim_file else None

        # Directory that directly contains the file
        trial_name = file_path.rsplit('/', 2)[-2]
    except Exception as e:
        print(f"Error extracting info from {file_path}: {e}")
        return None, None, None

    return trial_name, question_num, claim_num


def process_csv_and_add_labels(csv_path, label_folder_path, output_path):
    """
    Read CSV, add labels from JSON files, and create Final_label based on combinations

    For every row, the trial name, question number and claim number are
    parsed from the 'file_path' column and the matching label is looked up
    in the JSON files under label_folder_path. The result is stored in a new
    'counterfactual' column (None when the path cannot be parsed or no label
    is found).

    Label combinations:
    - label="fact" & counterfactual="fact" -> Final_label="fact"
    - label="hallucination" & counterfactual="hallucination" -> Final_label="hallucination"
    - label="fact" & counterfactual="hallucination" -> Final_label="error"
    - label="hallucination" & counterfactual="fact" -> Final_label="coverage"
    Any other combination (e.g. a missing or "N/A" label) leaves Final_label as None.

    Args:
        csv_path: CSV file to read; it must contain 'file_path' and 'label' columns.
        label_folder_path: Folder holding one <trial_name>.json file per trial.
        output_path: Where the augmented CSV is written (without the index).

    Returns:
        The augmented DataFrame on success, or None if any step failed
        (the error is printed instead of raised).
    """
    try:
        # Read the CSV file
        print(f"Reading CSV file from: {csv_path}")
        df = pd.read_csv(csv_path)

        # Look up one counterfactual label per row. Only the 'file_path'
        # column is iterated: iterrows() would build a complete row Series
        # for every row just to read this single value.
        print("Processing rows and adding labels...")
        counterfactual_labels = []
        for row_number, file_path in enumerate(df['file_path']):
            trial_name, question_num, claim_num = extract_trial_and_question(file_path)

            label = None
            if trial_name and question_num:
                # Get label from JSON with claim number
                label = get_label_from_json(trial_name, question_num, claim_num, label_folder_path)
            counterfactual_labels.append(label)

            # Print progress every 1000 rows
            if row_number % 1000 == 0:
                print(f"Processed {row_number} rows...")

        # dtype=object keeps the labels as plain strings / None
        df['counterfactual'] = pd.Series(counterfactual_labels, index=df.index, dtype=object)

        # Create Final_label column based on combinations
        print("Creating Final_label column...")
        conditions = [
            (df['label'] == 'fact') & (df['counterfactual'] == 'fact'),
            (df['label'] == 'hallucination') & (df['counterfactual'] == 'hallucination'),
            (df['label'] == 'fact') & (df['counterfactual'] == 'hallucination'),
            (df['label'] == 'hallucination') & (df['counterfactual'] == 'fact')
        ]
        choices = ['fact', 'hallucination', 'error', 'coverage']

        # default=None marks rows that match none of the four combinations
        df['Final_label'] = np.select(conditions, choices, default=None)

        # Save the updated DataFrame
        print(f"Saving updated CSV to: {output_path}")
        df.to_csv(output_path, index=False)

        # Print summary
        print("\nProcessing complete!")
        print(f"Total rows processed: {len(df)}")
        print("\nLabel distribution:")
        print(df['label'].value_counts(dropna=False))
        print("\nCounterfactual distribution:")
        print(df['counterfactual'].value_counts(dropna=False))
        print("\nFinal_label distribution:")
        print(df['Final_label'].value_counts(dropna=False))
        print("\nDetailed combination counts:")
        print(pd.crosstab(df['label'], df['counterfactual'], margins=True))

        # Hand the result back so callers can inspect it without re-reading the CSV
        return df

    except Exception as e:
        # Best-effort notebook helper: report the failure instead of raising
        print(f"Error processing CSV: {e}")
        return None

# Example usage: claim-level run over the "title" results CSV
if __name__ == "__main__":
    # Define paths
    csv_path = "csv_proccess_ensemble/concatenated_results_Claim_title.csv"  # Input CSV; replace with your CSV path
    # Folder that is searched for one <trial_name>.json label file per trial.
    # NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
    label_folder_path = "/home/4481281/Clinical_trials/Original_format/Results/Factuality_Eval/Inference_files_with_labels_Counterfactual/Claim_level/Inference_title_Claim"
    output_path = "concatenated_results_Claim_title_COUNTERFACTUAL.csv"  # Output CSV; replace with desired output path
    
    # Process the CSV and add labels (writes output_path and prints a summary)
    process_csv_and_add_labels(csv_path, label_folder_path, output_path)

Reading CSV file from: csv_proccess_ensemble/concatenated_results_Claim_title.csv
Processing rows and adding labels...
Processed 0 rows...
Processed 1000 rows...
Processed 2000 rows...
Processed 3000 rows...
Processed 4000 rows...
Processed 5000 rows...
Processed 6000 rows...
Processed 7000 rows...
Processed 8000 rows...
Processed 9000 rows...
Processed 10000 rows...
Processed 11000 rows...
Processed 12000 rows...
Processed 13000 rows...
Processed 14000 rows...
Processed 15000 rows...
Processed 16000 rows...
Processed 17000 rows...
Processed 18000 rows...
Processed 19000 rows...
Processed 20000 rows...
Processed 21000 rows...
Processed 22000 rows...
Processed 23000 rows...
Processed 24000 rows...
Processed 25000 rows...
Processed 26000 rows...
Processed 27000 rows...
Processed 28000 rows...
Processed 29000 rows...
Processed 30000 rows...
Processed 31000 rows...
Processed 32000 rows...
Processed 33000 rows...
Processed 34000 rows...
Processed 35000 rows...
Processed 36000 rows...
Proces

## Test

In [1]:
import pandas as pd
import os
import json
import numpy as np

# Sanity check: stack the claim-level "title" and "summary" CSVs and count
# how many rows carry a non-missing 'label'.
csv_path1 = "csv_proccess_ensemble/concatenated_results_Claim_title.csv"
csv_path2 = "csv_proccess_ensemble/concatenated_results_Claim_summary.csv"

df1 = pd.read_csv(csv_path1)
df2 = pd.read_csv(csv_path2)


# Row-wise concatenation with a fresh 0..n-1 index
df_combined = pd.concat([df1, df2], axis=0, ignore_index=True)
print(f"Original shape: {df_combined.shape}")

# Keep only rows whose 'label' is not NaN
df_clean = df_combined.dropna(subset=['label'])
print(f"Shape after removing NaN labels: {df_clean.shape}")

print("\nLabel distribution in clean data:")
print(df_clean['label'].value_counts())

Original shape: (116866, 683)
Shape after removing NaN labels: (111120, 683)

Label distribution in clean data:
label
hallucination    58071
fact             53049
Name: count, dtype: int64


In [1]:
import pandas as pd
import os
import json
import numpy as np

# Sanity check: stack the sentence-level "title" and "summary" COUNTERFACTUAL
# CSVs and inspect the 'counterfactual' and 'Final_label' distributions.
csv_path1 = "concatenated_results_Sentence_title_COUNTERFACTUAL.csv"
csv_path2 = "concatenated_results_Sentence_summary_COUNTERFACTUAL.csv"

# NOTE(review): with pandas' default NA parsing, the string "N/A" in a CSV is
# read back as NaN, so "N/A" counterfactual labels count as missing here.
df1 = pd.read_csv(csv_path1)
df2 = pd.read_csv(csv_path2)


# Row-wise concatenation with a fresh 0..n-1 index
df_combined = pd.concat([df1, df2], axis=0, ignore_index=True)
print(f"Original shape: {df_combined.shape}")

# Two independent filters on the combined frame:
#   df_clean       - rows with a non-missing 'counterfactual'
#   df_clean_final - rows with a non-missing 'Final_label'
df_clean = df_combined.dropna(subset=['counterfactual'])
df_clean_final = df_combined.dropna(subset=['Final_label'])

# Only the Final_label-filtered shape is printed
print(f"Shape after removing NaN labels: {df_clean_final.shape}")

print("\nLabel distribution in clean data:")
print(df_clean['counterfactual'].value_counts())
print("\nLabel distribution in final data:")
print(df_clean_final['Final_label'].value_counts())

Original shape: (26556, 685)
Shape after removing NaN labels: (21177, 685)

Label distribution in clean data:
counterfactual
fact             17794
hallucination     4069
Name: count, dtype: int64

Label distribution in final data:
Final_label
coverage         9486
fact             7911
hallucination    3459
error             321
Name: count, dtype: int64


In [2]:
# Share of "coverage" rows (9486) among the 21177 rows that have a Final_label;
# both counts are copied by hand from the output printed by the previous cell.
9486/21177

0.44793880152996174