# Import python libraries

In [None]:
import plotly.io as pio
# Make 'iframe' the default renderer used when plotly figures are shown.
pio.renderers.default = 'iframe'

In [None]:
import pandas as pd
import confidenceinterval
import numpy as np
from scipy.stats import ttest_rel
import scipy.stats as stats


from sklearn.metrics import (roc_auc_score,
                             accuracy_score,
                             classification_report,
                             f1_score,
                             confusion_matrix)

import plotly.express as px
from ast import literal_eval

# add path to access src code
import sys
sys.path.append("../src/")

from CI_utils import get_sens_spec, sensitivity_and_specificity_with_confidence_intervals
from plot_utils import generate_confidence_plot, add_inset

# Load data with answers

In [None]:
# Read the three CSV inputs (the diagnosis table is ';'-separated).
df_answers = pd.read_csv("../data/BCC_Quiz_answers_anon.csv")
df_ai_scores = pd.read_csv("../data/ai_video_scores_anon.csv")
comprehensive_diag = pd.read_csv("../data/simple_diags_CT.csv", sep=";")

# Inner join: only answers whose case_uuid has a matching case_id in the diagnosis table are kept.
df_answers = df_answers.merge(comprehensive_diag, left_on="case_uuid", right_on="case_id", how="inner")

# `all_scores` is stored as the text of a Python literal; parse it back into an object
# (presumably a list of floats — TODO confirm against the CSV).
for scored_df in (df_answers, df_ai_scores):
    scored_df["all_scores"] = scored_df["all_scores"].apply(literal_eval)

# Compute two-sided t-tests for confidence

In [None]:
# Overall change in confidence across all answers between LC-OCT and AI-assisted LC-OCT.
# Each array is ordered by (user, case_uuid); the paired t-test compares them position by position.
# NOTE(review): the pairing is only meaningful if every (user, case_uuid) pair occurs exactly once
# with and once without AI assistance — confirm against the study design.
pair_order = ["user", "case_uuid"]

conf_with_ai = df_answers[df_answers.ai_assistance_present].sort_values(by=pair_order).lcoct_phase_trust_score.values
conf_no_ai = df_answers[~df_answers.ai_assistance_present].sort_values(by=pair_order).lcoct_phase_trust_score.values

t_statistic, p_value = stats.ttest_rel(conf_with_ai, conf_no_ai, alternative='two-sided')

print("conf with ai mean", conf_with_ai.mean())
print("lcoct conf mean", conf_no_ai.mean())
print("T-statistic:", t_statistic)
print("P-value:", p_value)

In [None]:
# Overall change in confidence across all answers between dermoscopy and LC-OCT.
# Only answers given without AI assistance are used; both trust scores come from the same
# rows, so the two arrays are aligned row by row for the paired test.
unaided_sorted = df_answers[~df_answers.ai_assistance_present].sort_values(by=["user", "case_uuid"])
conf_dermo = unaided_sorted.clinical_phase_trust_score.values
conf_lcoct = unaided_sorted.lcoct_phase_trust_score.values

t_statistic, p_value = stats.ttest_rel(conf_lcoct, conf_dermo, alternative='two-sided')

print("Is confidence up with lc-oct?")
print("dermo conf mean", conf_dermo.mean())
print("lcoct conf mean", conf_lcoct.mean())
print("T-statistic:", t_statistic)
print("P-value:", p_value)

# Compute two-sided t-tests for time to diagnostic

In [None]:
# Elapsed time of the LC-OCT phase: AI-assisted answers vs. unassisted answers.
# Each subset is ordered by (user, case_uuid) before the paired test.
pair_order = ["user", "case_uuid"]
ai_mask = df_answers.ai_assistance_present

time_with_ai = df_answers[ai_mask].sort_values(by=pair_order).lcoct_phase_elapsed_time.values
time_no_ai = df_answers[~ai_mask].sort_values(by=pair_order).lcoct_phase_elapsed_time.values

t_statistic, p_value = stats.ttest_rel(time_with_ai, time_no_ai, alternative='two-sided')

print("Is slower with AI?")
print("AI time mean", time_with_ai.mean())
print("Normal time mean", time_no_ai.mean())
print("T-statistic:", t_statistic)
print("P-value:", p_value)

In [None]:
# Elapsed time of the LC-OCT phase vs. the clinical + dermoscopy phase,
# using only the answers given without AI assistance.
unaided_sorted = df_answers[~df_answers.ai_assistance_present].sort_values(by=["user", "case_uuid"])
time_with_lcoct = unaided_sorted.lcoct_phase_elapsed_time.values
time_clinical = unaided_sorted.clinical_phase_elapsed_time.values

# 'two-sided' is scipy's default alternative; it is spelled out to match the other tests.
t_statistic, p_value = stats.ttest_rel(time_clinical, time_with_lcoct, alternative='two-sided')

print("Is slower with clinical data?")
print("time_with_lcoct", time_with_lcoct.mean())
print("time_clinical", time_clinical.mean())
print("T-statistic:", t_statistic)
print("P-value:", p_value)

# Compute AUC score of AI alone

In [None]:
# ROC AUC of the AI score column `max_moving_avg_24` against the BCC ground truth in df_ai_scores.
roc_auc_score(y_true=df_ai_scores.is_diagnostic_bcc, y_score=df_ai_scores.max_moving_avg_24)

In [None]:
# Mean elapsed time and trust score of both phases, split by the AI-assistance flag.
check_cols = [
    "clinical_phase_elapsed_time",
    "clinical_phase_trust_score",
    "lcoct_phase_trust_score",
    "lcoct_phase_elapsed_time",
]
df_answers.groupby("ai_assistance_present")[check_cols].mean()

In [None]:

# Accuracy (with confidence interval) of the LC-OCT answers for every
# (lesion category, AI-assistance flag) combination.
for (cat, is_ai_help), case_df in df_answers.groupby(["categorie", "ai_assistance_present"]):
    # Count the distinct cases directly instead of dividing the row count by a
    # hard-coded number of readers, so the figure stays right if the reader count changes.
    num_cases = case_df.case_uuid.nunique()
    print("-------")
    print("category", cat, "with ai assitance?", is_ai_help, f'({num_cases} cases)' )

    patho_score, patho_inter = confidenceinterval.accuracy_score(y_true=case_df.is_diagnostic_bcc, y_pred=case_df.lcoct_phase_answer)
    print("accuracy_score", patho_score, "CI", patho_inter)

In [None]:
# Create individual (per-user) scores from question-level answers.
# Each user contributes four rows: LC-OCT with AI, LC-OCT without AI,
# and the clinical phase of quiz 1 and of quiz 2.
df_results = []
for user, user_df in df_answers.groupby("user"):
    user_df = user_df.reset_index(drop=True)
    # Separate quiz 1 and quiz 2: a user's first 200 rows are labelled quiz 1, the rest quiz 2.
    # NOTE(review): this relies on each user's rows being stored in answer order with
    # 200 answers per quiz — confirm against the export.
    user_df["quiz_nb"] = "quiz 1"
    user_df.loc[user_df.index >= 200, "quiz_nb"] = "quiz 2"

    # LC-OCT phase, once with and once without AI assistance.
    for ai_help in [True, False]:
        aux_df = user_df[user_df.ai_assistance_present == ai_help].reset_index(drop=True)
        lcoct_auc = roc_auc_score(y_true=aux_df.is_diagnostic_bcc, y_score=aux_df.lcoct_phase_answer)
        # get_sens_spec returns a dict that is merged into the row
        # (called without a column name here; presumably it defaults to the LC-OCT answer — TODO confirm).
        all_scores = get_sens_spec(aux_df)
        f_score = f1_score(y_true=aux_df.is_diagnostic_bcc, y_pred=aux_df.lcoct_phase_answer)

        df_results.append({"user": user, "user_type": aux_df.user_type.values[0], "f1_score": f_score,
                           "auc": lcoct_auc, "ai_help": ai_help, "clinical_phase": False, **all_scores})

    # Clinical (dermoscopy) phase, one row per quiz.
    for quiz in ["quiz 1", "quiz 2"]:
        aux_df = user_df[user_df.quiz_nb == quiz].reset_index(drop=True)
        acc_clinical = get_sens_spec(aux_df, "clinical_phase_answer")
        f_score = f1_score(y_true=aux_df.is_diagnostic_bcc,
                           y_pred=aux_df.clinical_phase_answer)
        # The AUC must be computed from the clinical answers of this quiz; reusing the
        # value left over from the LC-OCT loop above would report an LC-OCT AUC here.
        clinical_auc = roc_auc_score(y_true=aux_df.is_diagnostic_bcc, y_score=aux_df.clinical_phase_answer)

        df_results.append({"user": user, "user_type": aux_df.user_type.values[0], "f1_score": f_score,
                           "auc": clinical_auc, "ai_help": False, "clinical_phase": quiz, **acc_clinical})
df_results = pd.DataFrame(df_results)

In [None]:
# Preview the first three rows of the per-user results table.
df_results.head(3)

In [None]:
def generate_naming(clinical_phase, ai_help):
    """
    Build the display label of a results row from its phase and AI-assistance flag.

    clinical_phase: a falsy value for LC-OCT rows, otherwise a string that is
        appended to "dermoscopic & clinical ".
    ai_help: only consulted for LC-OCT rows; selects the " with ai" / " no ai" suffix.

    Returns the label as a string.
    """
    if clinical_phase:
        return "dermoscopic & clinical " + clinical_phase

    suffix = " with ai" if ai_help else " no ai"
    return "LC-OCT" + suffix
# Label every row with its phase / AI-assistance category.
# The two columns are iterated directly: integer keys such as x[0] on a row Series that is
# indexed by column name rely on the deprecated positional fallback of Series.__getitem__.
df_results["categories"] = [
    generate_naming(clinical_phase, ai_help)
    for clinical_phase, ai_help in zip(df_results["clinical_phase"], df_results["ai_help"])
]

# Sensitivity and specificity across all users and per expertise level for clinical and dermoscopy answers

In [None]:

print("CLINICAL & DERMO scores")
# One (label, answers) pair per expertise level, followed by all readers pooled together.
reader_groups = [(lvl, df_answers[df_answers.user_type == lvl]) for lvl in df_answers.user_type.unique()]
reader_groups.append(("ALL", df_answers))

for group_idx, (group_label, group_answers) in enumerate(reader_groups):
    print(group_label)
    df_clinical_results = group_answers.reset_index(drop=True)
    y_true = df_clinical_results.is_diagnostic_bcc.values.astype(float)
    y_pred = df_clinical_results.clinical_phase_answer.values.astype(float)

    tn, fp, fn, tp = confusion_matrix(y_true=y_true, y_pred=y_pred).ravel()

    # NOTE(review): `alpha=0.95` is passed while the printed label says 95% — confirm the
    # helper interprets alpha as the confidence level.
    sensitivity_point_estimate, specificity_point_estimate, \
        sensitivity_confidence_interval, specificity_confidence_interval \
        = sensitivity_and_specificity_with_confidence_intervals(TP=tp, FP=fp, FN=fn, TN=tn, alpha=0.95)
    print(f"Average sensitivity for clinical : {sensitivity_point_estimate:.3f} ({sensitivity_confidence_interval[0]:.3f} - {sensitivity_confidence_interval[1]:.3f}, 95%)")
    print(f"Average specificity for clinical : {specificity_point_estimate:.3f} ({specificity_confidence_interval[0]:.3f} - {specificity_confidence_interval[1]:.3f}, 95%)")

    # Stored as `acc_point_estimate` so that the `accuracy_score` function imported from
    # sklearn.metrics is not overwritten by a float.
    acc_point_estimate, acc_ci_intervals = confidenceinterval.accuracy_score(y_true=y_true,
                                                                             y_pred=y_pred,
                                                                             confidence_level=0.95)

    print(f"Average accuracy for clinical : {acc_point_estimate:.3f} ({acc_ci_intervals[0]:.3f} - {acc_ci_intervals[1]:.3f}, 95%)")

    # Separator after every expertise level; the pooled "ALL" group comes last and has none.
    if group_idx < len(reader_groups) - 1:
        print("----")

# Sensitivity and specificity across all users and per expertise level for LC-OCT answers

In [None]:

print("LC-OCT scores")
# Only answers given without AI assistance are evaluated in this cell.
answers_no_ai = df_answers[~df_answers.ai_assistance_present]
# One (label, answers) pair per expertise level, followed by all readers pooled together.
reader_groups = [(lvl, answers_no_ai[answers_no_ai.user_type == lvl]) for lvl in df_answers.user_type.unique()]
reader_groups.append(("ALL", answers_no_ai))

for group_idx, (group_label, group_answers) in enumerate(reader_groups):
    print(group_label)
    df_lcoct_results = group_answers.reset_index(drop=True)
    y_true = df_lcoct_results.is_diagnostic_bcc.values.astype(float)
    y_pred = df_lcoct_results.lcoct_phase_answer.values.astype(float)

    tn, fp, fn, tp = confusion_matrix(y_true=y_true, y_pred=y_pred).ravel()

    # NOTE(review): `alpha=0.95` is passed while the printed label says 95% — confirm the
    # helper interprets alpha as the confidence level.
    sensitivity_point_estimate, specificity_point_estimate, \
        sensitivity_confidence_interval, specificity_confidence_interval \
        = sensitivity_and_specificity_with_confidence_intervals(TP=tp, FP=fp, FN=fn, TN=tn, alpha=0.95)
    print(f"Average sensitivity for lc-oct : {sensitivity_point_estimate:.3f} ({sensitivity_confidence_interval[0]:.3f} - {sensitivity_confidence_interval[1]:.3f}, 95%)")
    print(f"Average specificity for lc-oct : {specificity_point_estimate:.3f} ({specificity_confidence_interval[0]:.3f} - {specificity_confidence_interval[1]:.3f}, 95%)")

    # Stored as `acc_point_estimate` so that the `accuracy_score` function imported from
    # sklearn.metrics is not overwritten by a float.
    acc_point_estimate, acc_ci_intervals = confidenceinterval.accuracy_score(y_true=y_true,
                                                                             y_pred=y_pred,
                                                                             confidence_level=0.95)

    print(f"Average accuracy for lc-oct : {acc_point_estimate:.3f} ({acc_ci_intervals[0]:.3f} - {acc_ci_intervals[1]:.3f}, 95%)")

    # Separator after every expertise level; the pooled "ALL" group comes last and has none.
    if group_idx < len(reader_groups) - 1:
        print("----")

# Sensitivity and specificity across all users and per expertise level for AI-assisted LC-OCT answers

In [None]:

# Header names the AI-assisted condition so this output cannot be mistaken for the unassisted LC-OCT cell.
print("AI-assisted LC-OCT scores")
# Only answers given with AI assistance are evaluated in this cell.
answers_with_ai = df_answers[df_answers.ai_assistance_present]
# One (label, answers) pair per expertise level, followed by all readers pooled together.
reader_groups = [(lvl, answers_with_ai[answers_with_ai.user_type == lvl]) for lvl in df_answers.user_type.unique()]
reader_groups.append(("ALL", answers_with_ai))

for group_idx, (group_label, group_answers) in enumerate(reader_groups):
    print(group_label)
    df_lcoct_results = group_answers.reset_index(drop=True)
    y_true = df_lcoct_results.is_diagnostic_bcc.values.astype(float)
    y_pred = df_lcoct_results.lcoct_phase_answer.values.astype(float)

    tn, fp, fn, tp = confusion_matrix(y_true=y_true, y_pred=y_pred).ravel()

    # NOTE(review): `alpha=0.95` is passed while the printed label says 95% — confirm the
    # helper interprets alpha as the confidence level.
    sensitivity_point_estimate, specificity_point_estimate, \
        sensitivity_confidence_interval, specificity_confidence_interval \
        = sensitivity_and_specificity_with_confidence_intervals(TP=tp, FP=fp, FN=fn, TN=tn, alpha=0.95)
    print(f"Average sensitivity for AI-assisted lc-oct : {sensitivity_point_estimate:.3f} ({sensitivity_confidence_interval[0]:.3f} - {sensitivity_confidence_interval[1]:.3f}, 95%)")
    print(f"Average specificity for AI-assisted lc-oct : {specificity_point_estimate:.3f} ({specificity_confidence_interval[0]:.3f} - {specificity_confidence_interval[1]:.3f}, 95%)")

    # Stored as `acc_point_estimate` so that the `accuracy_score` function imported from
    # sklearn.metrics is not overwritten by a float.
    acc_point_estimate, acc_ci_intervals = confidenceinterval.accuracy_score(y_true=y_true,
                                                                             y_pred=y_pred,
                                                                             confidence_level=0.95)

    print(f"Average accuracy for AI-assisted lc-oct : {acc_point_estimate:.3f} ({acc_ci_intervals[0]:.3f} - {acc_ci_intervals[1]:.3f}, 95%)")

    # Separator after every expertise level; the pooled "ALL" group comes last and has none.
    if group_idx < len(reader_groups) - 1:
        print("----")

# Compute intra operator variability

Compute per doctor and per question the average "clinical phase answer":
- if score is 0: the doctor always answered no BCC
- if score is 1: the doctor always answered BCC
- if score is 0.5: the doctor gave different answers in quiz 1 and quiz 2

In [None]:
# One record per (user, case): the mean of that user's clinical-phase answers for the case,
# plus the expertise level taken from the first row of the group.
intra_operator = [
    {
        "user": user_name,
        "case_uuid": case_uuid,
        "variability": pair_answers.clinical_phase_answer.mean(),
        "expertise": pair_answers.user_type.iloc[0],
    }
    for (user_name, case_uuid), pair_answers in df_answers.groupby(["user", "case_uuid"])
]
df_intra_operator = pd.DataFrame(intra_operator)

# Turn the mean into a flag: True exactly when the mean answer is 0.5.
df_intra_operator["variability"] = df_intra_operator["variability"] == 0.5

def mode_agg(series):
    """
    Return the first mode of `series`, or None when the series has no mode
    (i.e. `series.mode()` is empty).
    """
    modes = series.mode()
    if modes.empty:
        return None
    return modes.iloc[0]

# Per user: share of cases flagged as variable, and the user's most frequent expertise label.
df_intra_var = (
    df_intra_operator
    .groupby("user")
    .agg({"variability": "mean", "expertise": mode_agg})
    .reset_index()
)

# Violin (with inner box and all points) of the per-user variability, one colour per expertise level.
expertise_order = ['novices', 'intermediates', 'experts']
fig = px.violin(
    df_intra_var,
    y="variability",
    color="expertise",
    box=True,
    points="all",
    hover_data=["user"],
    category_orders={"expertise": expertise_order},
)

fig.show()
print(f"Global average intra-operator variability: {df_intra_operator.variability.mean():.3f}")

# Compute individual doctors auc score after scaling answers by the trust score

In [None]:
all_indiv_aux_scores = []
users = []
user_types = []

# First pass: answers without AI assistance; second pass: answers with AI assistance.
# The suffix is appended to the user's expertise label to tell the two passes apart.
for with_ai, type_suffix in ((False, ""), (True, " with AI assistance")):
    for user, df_user_answers in df_answers[df_answers.ai_assistance_present == with_ai].groupby("user"):
        df_user_answers = df_user_answers.reset_index(drop=True)
        # Signed answer (+1 for a truthy answer, -1 otherwise) scaled by the trust score,
        # then divided by 20 and shifted by 0.5.
        # NOTE(review): this maps to [0, 1] only if the trust score lies in [0, 10] — confirm.
        signed_answer = df_user_answers["lcoct_phase_answer"].apply(lambda answer: 1 if answer else -1)
        df_user_answers["indiv_probs"] = signed_answer * df_user_answers["lcoct_phase_trust_score"] / 20 + 0.5

        auc_score, (_ci_low, _ci_high) = confidenceinterval.roc_auc_score(
            y_true=df_user_answers.is_diagnostic_bcc.astype(float),
            y_pred=df_user_answers.indiv_probs,
        )
        all_indiv_aux_scores.append(auc_score)
        users.append(user)
        user_types.append(df_user_answers["user_type"][0] + type_suffix)

# The AI on its own is added as one extra row.
ai_auc, _ = confidenceinterval.roc_auc_score(y_true=df_ai_scores.is_diagnostic_bcc,
                                             y_pred=df_ai_scores.max_moving_avg_24)
all_indiv_aux_scores.append(ai_auc)
users.append("AI standalone")
user_types.append("AI standalone")

df_indiv_auc = pd.DataFrame({"user": users, "auc": all_indiv_aux_scores, "expertise": user_types})

# Rank the rows by AUC (1 = lowest) and store the rank as text; the AI row is labelled "AI" instead.
df_indiv_auc = df_indiv_auc.sort_values(by="auc").reset_index(drop=True)
df_indiv_auc["num doctors"] = [str(rank) for rank in range(1, len(users) + 1)]
df_indiv_auc.loc[df_indiv_auc.user == "AI standalone", "num doctors"] = "AI"

# Insert an HTML line break before "with" so the long labels wrap on the plot axis,
# and keep the part before the break as the plain expertise level.
df_indiv_auc["expertise"] = df_indiv_auc["expertise"].apply(lambda label: label.replace("with", "<br> with"))
df_indiv_auc["simple expertise"] = df_indiv_auc["expertise"].apply(lambda label: label.partition(" <br>")[0])

df_indiv_auc["AI_assistance"] = df_indiv_auc.expertise.apply(lambda label: "with AI assistance" in label)


In [None]:

is_ai_row = df_indiv_auc.user == "AI standalone"
standalone_ai_auc = df_indiv_auc[is_ai_row].auc.values[0]

# x-axis order: the three unassisted groups first, then the three AI-assisted groups.
expertise_order = [
    'novices',
    "intermediates",
    "experts",
    'novices <br> with AI assistance',
    'intermediates <br> with AI assistance',
    'experts <br> with AI assistance',
]

# Box plot of the per-reader AUCs; the AI row is left out and drawn as a reference line instead.
fig = px.box(
    df_indiv_auc[~is_ai_row],
    x="expertise",
    y="auc",
    color="simple expertise",
    points="all",
    category_orders={"expertise": expertise_order},
)

# Dashed horizontal line at the standalone AI AUC.
fig.add_hline(y=standalone_ai_auc, line_dash="dash", line_color="orange", annotation_text="")
# Thin vertical line between the third and fourth x categories (unassisted vs. AI-assisted).
fig.add_vline(x=2.5, line_dash="dash", line_color="grey", annotation_text="", line={"width":1})
# Empty trace whose only purpose is to give the AI line a legend entry.
fig.add_scatter(x=[None], y=[None], mode="lines",
                line=dict(color="orange", dash="dash"),
                name=f"AI standalone ({standalone_ai_auc:.3f})")
fig.update_xaxes(tickangle=45)

fig.show()

# Compute voting AUC and accuracy scores per level of expertise with LC-OCT

In [None]:
print("NO AI voting scores")

# Per case: mean ground truth and mean LC-OCT answer (the share of readers answering positively),
# computed on the answers given without AI assistance.
vote_cols = ["is_diagnostic_bcc", "lcoct_phase_answer"]
answers_no_ai = df_answers[~df_answers.ai_assistance_present]

df_voting = answers_no_ai.groupby("case_uuid")[vote_cols].mean().reset_index()
df_voting_experts = answers_no_ai[answers_no_ai.user_type == "experts"].groupby("case_uuid")[vote_cols].mean().reset_index()
df_voting_intermediates = answers_no_ai[answers_no_ai.user_type == "intermediates"].groupby("case_uuid")[vote_cols].mean().reset_index()
df_voting_novices = answers_no_ai[answers_no_ai.user_type == "novices"].groupby("case_uuid")[vote_cols].mean().reset_index()

# The vote share serves as the score for the ROC AUC of each reader group.
voting_tables = [
    ("ALL", df_voting),
    ("experts", df_voting_experts),
    ("intermediates", df_voting_intermediates),
    ("novices", df_voting_novices),
]
for group_label, votes in voting_tables:
    print(group_label)
    print("AUC")
    print(confidenceinterval.roc_auc_score(y_true=votes.is_diagnostic_bcc, y_pred=votes.lcoct_phase_answer))


# Compute two-sided paired t-tests for sensitivity, specificity, accuracy and F1 score per level of expertise between LC-OCT and AI-assisted LC-OCT

In [None]:
# (printed heading, df_results column, whether the two condition means are printed before the test)
paired_metrics = [
    ('sensitivity', "sensitivity", True),
    ('specificity', "specificity", False),
    ('accuracy', "accuracy", True),
    ('f1 score', "f1_score", False),
]

for user_type in df_results.user_type.unique():
    # LC-OCT rows of this expertise level, split by AI assistance; both frames are sorted by
    # user so that row i of each frame belongs to the same reader for the paired test.
    lcoct_rows = df_results[(df_results["clinical_phase"] == False) & (df_results["user_type"] == user_type)]
    df_res_ai = lcoct_rows[lcoct_rows["ai_help"] == True].sort_values(by="user").reset_index(drop=True)
    df_res_no_ai = lcoct_rows[lcoct_rows["ai_help"] == False].sort_values(by="user").reset_index(drop=True)

    print("------------")
    print(user_type, ":", len(df_res_ai))
    for heading, column, show_means in paired_metrics:
        print(heading)
        if show_means:
            print("no ai:", df_res_no_ai[column].mean())
            print("ai:", df_res_ai[column].mean())
        print(ttest_rel(df_res_ai[column],
                        df_res_no_ai[column],
                        axis=0,
                        nan_policy='raise',
                        alternative='two-sided'))

# Compute two-sided paired t-tests for sensitivity, specificity, accuracy and F1 score between LC-OCT and AI-assisted LC-OCT

In [None]:
print("------------")
print("OVERALL")

# All LC-OCT rows regardless of expertise, split by AI assistance; both frames are sorted by
# user so that row i of each frame belongs to the same reader for the paired test.
lcoct_rows = df_results[df_results["clinical_phase"] == False]
df_res_ai = lcoct_rows[lcoct_rows["ai_help"] == True].sort_values(by="user").reset_index(drop=True)
df_res_no_ai = lcoct_rows[lcoct_rows["ai_help"] == False].sort_values(by="user").reset_index(drop=True)

# (printed heading, df_results column, whether the two condition means are printed before the test)
overall_metrics = [
    ('sensitivity', "sensitivity", False),
    ('specificity', "specificity", False),
    ('accuracy', "accuracy", True),
    ('f1 score', "f1_score", False),
]
for heading, column, show_means in overall_metrics:
    print(heading)
    if show_means:
        print("no ai:", df_res_no_ai[column].mean())
        print("ai:", df_res_ai[column].mean())
    print(ttest_rel(df_res_ai[column],
                    df_res_no_ai[column],
                    axis=0,
                    nan_policy='raise',
                    alternative='two-sided'))

# Generate main plot of the paper

In [None]:
# For each expertise level, apply get_sens_spec to all of that level's answers pooled together
# and spread the returned mapping over columns; done once for AI-assisted answers ...
gp_df = df_answers.loc[df_answers.ai_assistance_present].groupby("user_type")
sens_spec_df_ai_2 = (
    gp_df
    .apply(lambda group: pd.Series(get_sens_spec(group)))
    .reset_index()
    .assign(ai_assistance_present=True)
)

# ... and once for answers given without AI assistance.
gp_df = df_answers.loc[~df_answers.ai_assistance_present].groupby("user_type")
sens_spec_df_no_ai_2 = (
    gp_df
    .apply(lambda group: pd.Series(get_sens_spec(group)))
    .reset_index()
    .assign(ai_assistance_present=False)
)

In [None]:
# Add the complement of specificity (the false-positive rate) as its own column.
df_results["1-specificity"] = 1 - df_results["specificity"]

In [None]:
# Both figures are built from the same five tables; only the display options differ.
plot_inputs = (df_results, df_ai_scores, df_answers, sens_spec_df_ai_2, sens_spec_df_no_ai_2)

# Full-size figure with legend.
fig = generate_confidence_plot(*plot_inputs, alpha=0.95, size=1000, showlegend=True)
# Second figure without legend and with an explicit zoom range, passed to add_inset together with the first.
fig_2 = generate_confidence_plot(*plot_inputs, alpha=0.95, size=None, showlegend=False, zoom_range=[-0.4, 1.02])

final_fig = add_inset(fig, fig_2)
final_fig.show()