In [None]:
import pandas as pd
import os

import warnings
import numpy as np
import utils
import matplotlib.pyplot as plt
from seaborn import heatmap
from sklearn.metrics import f1_score, cohen_kappa_score

from sklearn.metrics import confusion_matrix
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests

In [None]:
# Export SVG text as real text elements (not outlines) so that
# Adobe Illustrator recognizes the labels as editable text
plt.rc('svg', fonttype='none')

## Functions

In [None]:
def preprocess_dataframes(dataframes, binary=False):
    """
    Normalize every label DataFrame in ``dataframes`` to 'Yes' / 'Maybe' / 'No'.

    Each frame is modified in place: known spellings are mapped onto the
    canonical labels, missing values become 'No', 'Lung_Opacity' and
    'Enlarged_Cardiomediastinum' are updated through ``utils.update_column``,
    and every value that is still neither 'Yes' nor 'Maybe' is set to 'No'.

    Parameters:
    - dataframes: dict mapping a name to a DataFrame of per-finding labels.
    - binary: if True, 'Maybe' is merged into 'Yes' so only 'Yes'/'No' remain.

    Returns:
    - The same ``dataframes`` dict (its frames were modified in place).
    """
    lung_opacity_conditions = ['Edema', 'Consolidation', 'Pneumonia', 'Lung_Lesion', 'Atelectasis']
    cardiomegaly_conditions = ['Cardiomegaly']

    for df_name, df in dataframes.items():
        # First, replace specific values, to make them all conform to one standard
        df.replace({"yes": "Yes", "no": "No", "None": "No"}, inplace=True)
        # np.nan (not np.NaN): the capitalized alias was removed in NumPy 2.0
        df.replace({np.nan: "No"}, inplace=True)
        df.replace({1: "Yes", "0": "No", -1: "Maybe", "maybe": "Maybe", "1": "Yes", "1.0":"Yes"}, inplace=True)

        # NOTE(review): presumably sets the parent finding when one of the listed
        # findings is present - confirm against utils.update_column
        utils.update_column(df, 'Lung_Opacity', lung_opacity_conditions)
        utils.update_column(df, 'Enlarged_Cardiomediastinum', cardiomegaly_conditions)

        # Then, replace all values that are not Yes or Maybe with No
        for column in df.columns:
            df[column] = df[column].apply(lambda x: "Yes" if x == "Yes" else ("Maybe" if x == "Maybe" else "No"))

        # replace maybe with yes
        if binary:
            df.replace({"Maybe": "Yes"}, inplace=True)

    return dataframes

In [None]:
def calculate_f1_scores(df_ground_truth, df_predictions):
    """
    Calculate individual, average, micro, and macro F1 scores for multi-label classification.

    Only the columns present in both frames are scored. 'Yes' is the positive
    class. Values other than 'Yes'/'No' trigger a warning; they are not
    replaced and map to NaN.

    Parameters:
    - df_ground_truth: DataFrame containing the ground truth labels.
    - df_predictions: DataFrame containing the predicted labels.

    Returns:
    - A tuple containing the dictionary of F1 scores for each label, average F1 score, micro F1 score, and macro F1 score.
    """
    # Warn about values other than 'Yes' or 'No' (nothing is replaced here)
    for df in [df_ground_truth, df_predictions]:
        if not df.isin(['Yes', 'No']).all().all():
            warnings.warn("dataframe contains values other than 'Yes' or 'No'. Please ensure all values are 'Yes' or 'No'.", UserWarning)

    # Restrict both frames to the shared columns, in ground-truth order,
    # and convert 'Yes'/'No' to binary format
    label_to_int = {'Yes': 1, 'No': 0}
    common_columns = df_ground_truth.columns.intersection(df_predictions.columns)
    df_ground_truth_binary = df_ground_truth[common_columns].apply(lambda col: col.map(label_to_int))
    df_predictions_binary = df_predictions[common_columns].apply(lambda col: col.map(label_to_int))

    # Calculate F1 scores for each label (F1 is 0 when a label has no positives)
    f1_scores = {label: f1_score(df_ground_truth_binary[label], df_predictions_binary[label], zero_division=0)
                 for label in common_columns}

    # Unweighted mean of the per-label scores
    average_f1 = sum(f1_scores.values()) / len(f1_scores)

    # Micro and macro F1 on the 2-D label-indicator matrix. Micro F1 pools the
    # true/false positives and false negatives of the positive class over all
    # labels. Flattening the matrix and using average='micro' instead would
    # pool both classes (0 and 1) and return plain accuracy.
    micro_f1 = f1_score(df_ground_truth_binary, df_predictions_binary, average='micro', zero_division=0)
    macro_f1 = f1_score(df_ground_truth_binary, df_predictions_binary, average='macro', zero_division=0)

    return f1_scores, average_f1, micro_f1, macro_f1


In [None]:
def calculate_cohens_kappa(df_ground_truth, df_predictions):
    """
    Calculate Cohen's Kappa per finding and the mean over all findings.

    Labels are encoded as 'No' -> 0, 'Yes' -> 1, 'Maybe' -> 2 and only the
    columns present in both frames are scored.

    Parameters:
    - df_ground_truth: DataFrame containing the ground truth labels.
    - df_predictions: DataFrame containing the predicted labels.

    Returns:
    - A tuple of (dict mapping finding -> kappa, average kappa).
    """
    label_codes = {'Yes': 1, 'Maybe': 2, 'No': 0}

    def encode(frame):
        # Column-wise mapping of the label strings onto their numeric codes
        return frame.apply(lambda column: column.map(label_codes))

    # Keep the shared columns only, in ground-truth order
    shared_columns = df_ground_truth.columns.intersection(df_predictions.columns)
    truth_codes = encode(df_ground_truth)[shared_columns]
    predicted_codes = encode(df_predictions).reindex(columns=truth_codes.columns)

    # One kappa value per finding
    cohens_kappa_scores = {}
    for finding in truth_codes.columns:
        cohens_kappa_scores[finding] = cohen_kappa_score(truth_codes[finding], predicted_codes[finding])

    average_kappa = sum(cohens_kappa_scores.values()) / len(cohens_kappa_scores)

    return cohens_kappa_scores, average_kappa

## Constants


In [None]:
# Directory that holds the inference results of all models
base_path = '/base/path/for/inference_results/'

# Findings that are evaluated; columns not listed here are dropped before scoring
diseases = [
    'Atelectasis',
    'Cardiomegaly',
    'Consolidation',
    'Edema',
    'Lung_Lesion',
    'Lung_Opacity',
    'Pleural_Other',
    'Pleural_Effusion',
    'Pneumonia',
    'Pneumothorax',
    'Support_Devices',
    'Enlarged_Cardiomediastinum',
    'Fracture',
]

In [None]:
# Label files per model. Every path is built from base_path so that changing
# base_path moves all inputs together.
file_paths = {
    'gpt_4': os.path.join(base_path, 'few_shot_positive_gpt_4/gpt_labeled_reports.csv'),
    'gpt_35': os.path.join(base_path, 'few_shot_positive_gpt_35/gpt_labeled_reports.csv'),
    'llama13b': os.path.join(base_path, 'few_shot_positive_llama13b/llm_labeled_reports.csv'),
    'llama70b': os.path.join(base_path, 'few_shot_positive_llama70b/llm_labeled_reports.csv'),
    'mistral7b': os.path.join(base_path, 'few_shot_positive_mistral7b/llm_labeled_reports.csv'),
    'mixtral8x7b': os.path.join(base_path, 'few_shot_positive_mixtral8x7b/llm_labeled_reports.csv'),
    'qwen_72b': os.path.join(base_path, 'few_shot_positive_qwen72b/llm_labeled_reports.csv'),

    # Reference labels and the rule-based / BERT-based baselines
    'ground_truth': os.path.join(base_path, 'groundtruth.csv'),
    'chexbert': os.path.join(base_path, 'chexbert_labeled_mgb.csv'),
    'chexpert': os.path.join(base_path, 'chexpert_labeled_reports_mgb.csv'),
}


## Pre-Processing

In [None]:
dataframes = {name: utils.load_and_preprocess_data(path) for name, path in file_paths.items()}

# Create a raw copy that will not get processed, for further analysis.
# Each frame is copied individually: dict.copy() is shallow, so the in-place
# processing of `dataframes` would otherwise also change the "raw" frames.
dataframes_raw = {name: df.copy() for name, df in dataframes.items()}

In [None]:
# Go through the dataframes and drop all columns that are not in diseases.
# The frames are changed in place so that `dataframes` keeps pointing at them.
for name, df in dataframes.items():
    # Collect the unwanted columns first and drop them in a single call
    # instead of dropping one by one while iterating over df.columns
    extra_columns = [col for col in df.columns if col not in diseases]
    df.drop(columns=extra_columns, inplace=True)
    df.replace(["No Information", "Undefined"], "No", inplace=True)


In [None]:
# Set binary_labels = False to keep "Maybe" as a separate class;
# with True, "Maybe" is merged into "Yes"
binary_labels = True
dataframes = preprocess_dataframes(dataframes, binary=binary_labels)

In [None]:
# Pull the individual label frames out of the dict for convenient access
gpt_4, gpt_35 = dataframes['gpt_4'], dataframes['gpt_35']

llama13b, llama70b = dataframes['llama13b'], dataframes['llama70b']
mistral7b, mixtral8x7b = dataframes['mistral7b'], dataframes['mixtral8x7b']
qwen_72b = dataframes['qwen_72b']

ground_truth, chexbert, chexpert = (
    dataframes[key] for key in ('ground_truth', 'chexbert', 'chexpert')
)


In [None]:
# combine Mixtral, Llama70 and qwen in an ensemble
# NOTE: this only works for the binary use case, there is not a straightforward way
# to apply this to the multi-class case using just three models (potentially each model has a different label)
yes_no_to_int = {'Yes': 1, 'No': 0}

# 'Yes' counts as one vote, 'No' as none
llama70b_numerical = llama70b.apply(lambda x: x.map(yes_no_to_int))
mixtral8x7b_numerical = mixtral8x7b.apply(lambda x: x.map(yes_no_to_int))
qwen_72b_numerical = qwen_72b.apply(lambda x: x.map(yes_no_to_int))

# Number of 'Yes' votes per report and finding
majority_vote_sum = llama70b_numerical + mixtral8x7b_numerical + qwen_72b_numerical

# Majority vote: at least two of the three models must say 'Yes'
model_ensemble = majority_vote_sum.apply(lambda col: col.map(lambda x: 'Yes' if x >= 2 else 'No'))


## Evaluation

In [None]:
# Ground truth labels
y_true = ground_truth

# Prediction DataFrames, keyed by model name (each key listed once)
predictions = {
    'gpt_4': gpt_4,
    'gpt_35': gpt_35,
    'llama13b': llama13b,
    'llama70b': llama70b,
    'mistral7b': mistral7b,
    'mixtral8x7b': mixtral8x7b,
    'qwen_72b': qwen_72b,
    'chexbert': chexbert,
    'chexpert': chexpert,
    'ensemble': model_ensemble
}

In [None]:
# Calculating F1 scores for each model
model_scores = {model_name: calculate_f1_scores(y_true, y_pred) for model_name, y_pred in predictions.items()}

# Collect the scores row-wise: one row per finding plus three summary rows,
# one column per model
scores_data = {}
for model_name, (f1_scores, average_f1, micro_f1, macro_f1) in model_scores.items():
    column_name = f'F1 Score {model_name.title()}'
    model_rows = {**f1_scores, 'Average': average_f1, 'Micro F1': micro_f1, 'Macro F1': macro_f1}
    for row_label, row_value in model_rows.items():
        scores_data.setdefault(row_label, {})[column_name] = row_value

# Column order of the final table
f1_column_order = [
    'F1 Score Chexpert',
    'F1 Score Chexbert',
    'F1 Score Mistral7B',
    'F1 Score Llama13B',
    'F1 Score Mixtral8X7B',
    'F1 Score Llama70B',
    'F1 Score Qwen_72B',
    'F1 Score Ensemble',
    'F1 Score Gpt_35',
    'F1 Score Gpt_4',
]

f1_scores_df = pd.DataFrame(scores_data).T.round(3)
f1_scores_df = f1_scores_df[f1_column_order]
f1_scores_df

In [None]:
# Save the F1 score table next to the inference results
# NOTE(review): the file name says "zero_shot" while the inputs are read from
# "few_shot_positive_*" folders - confirm the intended name
f1_scores_csv = os.path.join(base_path, 'f1_scores_zero_shot_binary.csv')
f1_scores_df.to_csv(f1_scores_csv)

In [None]:
model_kappa_scores = {model_name: calculate_cohens_kappa(y_true, y_pred) for model_name, y_pred in predictions.items()}

# Collect the kappa values row-wise: one row per finding plus the average,
# one column per model
kappa_scores_data = {}
for model_name, (cohens_kappa_scores, average_kappa) in model_kappa_scores.items():
    column_name = f'Cohen\'s Kappa {model_name.title()}'
    model_rows = {**cohens_kappa_scores, 'Average Kappa': average_kappa}
    for row_label, row_value in model_rows.items():
        kappa_scores_data.setdefault(row_label, {})[column_name] = row_value

# Findings become rows; scores are rounded for better readability
kappa_scores_df = pd.DataFrame(kappa_scores_data).T.round(3)

# Order the columns to make it easier to transfer to overleaf.
# NOTE(review): the ensemble column is not listed, so reindex drops it - confirm this is intended
ordered_columns = [
    'Cohen\'s Kappa Chexpert',
    'Cohen\'s Kappa Chexbert',
    'Cohen\'s Kappa Mistral7B',
    'Cohen\'s Kappa Llama13B',
    'Cohen\'s Kappa Mixtral8X7B',
    'Cohen\'s Kappa Llama70B',
    'Cohen\'s Kappa Qwen_72B',
    'Cohen\'s Kappa Gpt_35',
    'Cohen\'s Kappa Gpt_4',
]
kappa_scores_df = kappa_scores_df.reindex(columns=ordered_columns)

kappa_scores_df

In [None]:
# Save the kappa score table next to the inference results
# NOTE(review): the file name says "zero_shot" while the inputs are read from
# "few_shot_positive_*" folders - confirm the intended name
kappa_scores_csv = os.path.join(base_path, 'kappa_scores_zero_shot_mgb.csv')
kappa_scores_df.to_csv(kappa_scores_csv)


## Label Frequency in Groundtruth

In [None]:
# Count how often each label occurs per finding (rows = findings,
# columns = labels); labels that never occur for a finding count as 0
ground_truth_count = (
    ground_truth
    .apply(pd.Series.value_counts)
    .T
    .fillna(0)
    .astype(int)
)

#ground_truth_count.to_csv('ground_truth_count.csv')
ground_truth_count

## Statistics

In [None]:
def _mcnemar_on_table(table, exact_threshold=25):
    """Run McNemar's test on a 2x2 table; use the exact binomial test when there are few discordant pairs."""
    discordant_pairs = table[0][1] + table[1][0]
    return mcnemar(table, exact=bool(discordant_pairs < exact_threshold))


def mcnemar_test_corrected(model1, model2, diseases, ground_truth):
    """
    Compare the correctness of two models with McNemar's test.

    For every finding a 2x2 table of (model1 correct/incorrect) x
    (model2 correct/incorrect) is built, where "correct" means the label
    equals the ground truth. The test is run per finding and once more on the
    sum of all per-finding tables ("All Combined").

    The exact binomial test is used when a table has fewer than 25 discordant
    pairs (one model correct, the other not); otherwise the chi-square
    version is used. The same rule applies to the per-finding tables and to
    the combined table.

    Parameters:
    - model1, model2: DataFrames with the labels predicted by the two models.
    - diseases: iterable of column names (findings) to compare.
    - ground_truth: DataFrame with the reference labels.

    Returns:
    - DataFrame with one row per finding plus an "All Combined" row and the
      columns "Disease", "McNemar Statistic", "P-value" and
      "Bonferroni-corrected P-value". The Bonferroni correction is applied
      over all rows, including the combined one.

    Raises:
    - ValueError: if a compared column of model1 or model2 contains values
      other than 'Yes', 'Maybe' or 'No'.
    """
    valid_values = {"Yes", "No", "Maybe"}

    for model in (model1, model2):
        for col in diseases:
            if not model[col].isin(valid_values).all():
                raise ValueError(f"Column {col} contains values other than 'Yes', 'Maybe' or 'No'.")

    results = []
    # Running sum of the per-finding tables:
    # [[both correct, only model1 correct], [only model2 correct, both incorrect]]
    combined_table = np.zeros((2, 2), dtype=int)

    for disease in diseases:
        model1_correct = model1[disease] == ground_truth[disease]
        model2_correct = model2[disease] == ground_truth[disease]

        contingency_table = np.array([
            [(model1_correct & model2_correct).sum(), (model1_correct & ~model2_correct).sum()],
            [(~model1_correct & model2_correct).sum(), (~model1_correct & ~model2_correct).sum()],
        ])
        combined_table += contingency_table

        result = _mcnemar_on_table(contingency_table)
        results.append({
            "Disease": disease,
            "McNemar Statistic": result.statistic,
            "P-value": result.pvalue
        })

    # Perform McNemar test on the combined contingency table
    print(combined_table.tolist())
    combined_result = _mcnemar_on_table(combined_table)
    results.append({
        "Disease": "All Combined",
        "McNemar Statistic": combined_result.statistic,
        "P-value": combined_result.pvalue
    })

    results_df = pd.DataFrame(results)
    p_values = results_df["P-value"].values
    # multipletests returns (reject, corrected p-values, ...); keep the corrected p-values
    bonferroni_corrected = multipletests(p_values, alpha=0.05, method='bonferroni')
    results_df['Bonferroni-corrected P-value'] = bonferroni_corrected[1]

    return results_df


# Compare every model with GPT-4. Note that 'gpt_4' itself is part of
# `predictions`, so it is also compared with itself.
# The per-model results are collected in a list and concatenated once,
# instead of growing a DataFrame inside the loop.
mcnemar_frames = []

for model_name, model_predictions in predictions.items():

    results_df_mcnemar = mcnemar_test_corrected(gpt_4, model_predictions, diseases, y_true)
    results_df_mcnemar['Model'] = model_name  # Add a column for the model name
    mcnemar_frames.append(results_df_mcnemar)


# ignore_index avoids duplicate indices; the model name then becomes the index
aggregated_results_mcnemar = pd.concat(mcnemar_frames, axis=0, ignore_index=True)
aggregated_results_mcnemar.set_index(["Model"], inplace=True)
total_mcnemar = aggregated_results_mcnemar[aggregated_results_mcnemar["Disease"] == "All Combined"]
total_mcnemar

In [None]:
# Save the combined McNemar results next to the inference results
total_mcnemar_csv = os.path.join(base_path, 'total_mcnemar_few_shot_positive_binary.csv')
total_mcnemar.to_csv(total_mcnemar_csv)

## Radar Plot

In [None]:
# Rows of f1_scores_df that hold summary statistics rather than findings
summary_rows = ['Average', 'Micro F1', 'Macro F1']

# set findings: every row that is not a summary row (no hard-coded row count)
finding_rows = [row for row in f1_scores_df.index if row not in summary_rows]

# replace _ with space in findings for nicer plotting
findings = [finding.replace("_", " ") for finding in finding_rows]

# Relevant models: legend label -> column of f1_scores_df (plotted in this order)
radar_models = {
    'GPT-4': 'F1 Score Gpt_4',
    'QWEN1.5-72B': 'F1 Score Qwen_72B',
    'Mixtral-8x7B': 'F1 Score Mixtral8X7B',
    'Llama2-70B': 'F1 Score Llama70B',
}

num_vars = len(findings)

# Split the circle into even parts and save the angles
angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()

# Complete the loop for the radar chart
angles += angles[:1]


fig, ax = plt.subplots(figsize=(6, 6), subplot_kw=dict(polar=True))
plt.xticks(angles[:-1], findings, color='black', size=8)

ax.set_rlabel_position(0)
plt.yticks(color="grey", size=7)
# Only the range 0.5-1 is shown on the radial axis
plt.ylim(0.5, 1)

for legend_label, score_column in radar_models.items():
    values = f1_scores_df.loc[finding_rows, score_column].tolist()
    # Repeat the first value so that the line closes the loop
    values += values[:1]
    ax.plot(angles, values, linewidth=1, linestyle='solid', label=legend_label)


plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))

plt.savefig(os.path.join(base_path, 'radar_plot_few_shot_positive_binary.svg'), format='svg', bbox_inches='tight', transparent=True)
plt.show()