In [1]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Import
path = r'data/filtered_snomed_charlson_index.csv'
filtered_snomed_charlson_index = pd.read_csv(path, dtype={'code': 'str'})

In [5]:
filtered_snomed_charlson_index

Unnamed: 0,code,STR,comorbid_condition,weight
0,22298006,Myocardial infarction,Myocardial Infarction,1
1,22298006,"Myocardial infarction, NOS",Myocardial Infarction,1
2,22298006,Heart attack,Myocardial Infarction,1
3,22298006,"Heart attack, NOS",Myocardial Infarction,1
4,22298006,Infarction of heart,Myocardial Infarction,1
...,...,...,...,...
657,86406008,HIV infection,HIV/AIDS,6
658,86406008,HIV - Human immunodeficiency virus infection,HIV/AIDS,6
659,86406008,Human immunodeficiency virus infection,HIV/AIDS,6
660,86406008,"Human immunodeficiency virus infection, NOS",HIV/AIDS,6


In [5]:
filtered_snomed_charlson_index.comorbid_condition.value_counts()
filtered_snomed_charlson_index.comorbid_condition.nunique()
filtered_snomed_charlson_index.code.nunique()

Cancer                                         130
Chronic Pulmonary Disease                       65
Connective Tissue Disease Rheumatic Disease     61
Congestive Heart Failure                        50
Moderate or Severe Liver Disease                37
Paraplegia and Hemiplegia                       37
Renal Disease                                   37
Diabetes without Chronic Complications          36
Mild Liver Disease                              33
Peripheral Vascular Disease                     33
Cerebrovascular Disease                         29
Diabetes with Chronic Complications             29
Dementia                                        28
Peptic Ulcer Disease                            24
Myocardial Infarction                           20
Metastatic Carcinoma                             8
HIV/AIDS                                         5
Name: comorbid_condition, dtype: int64

17

113

In [3]:
# Import
path = r'data/cbr_metric_my_method.csv'
my_method = pd.read_csv(path)

path = r'data/cbr_metric_one_hot.csv'
one_hot = pd.read_csv(path)

path = r'data/cbr_metric_rocheteau.csv'
rocheteau = pd.read_csv(path)

In [4]:
# Import
path = r'data/final_problem_dummies.csv'
problem_dummies = pd.read_csv(path)
# Drop columns
problem_dummies.drop(columns=['SUBJECT', 'PROBLEM_DT_TM'], inplace=True)
# Remove prefix
problem_dummies.columns = problem_dummies.columns.str.strip('PROBLEM_')
# Set index
problem_dummies.set_index('new_subject', inplace=True)

In [96]:
# Get charlson_jaccard_index for my_method
my_method_score_list = []

for index, row in my_method.iterrows():
    patient_n = row['0']
    neighbour_n = row['1']
    # Get patient and neighbour problems
    patient_problems = problem_dummies.loc[patient_n][problem_dummies.loc[patient_n] == 1].index.tolist()
    neighbour_problems = problem_dummies.loc[neighbour_n][problem_dummies.loc[neighbour_n] == 1].index.tolist()
    # Get charlson catergories
    patient_charlson = filtered_snomed_charlson_index[filtered_snomed_charlson_index['code'].isin(patient_problems)]
    neighbour_charlson = filtered_snomed_charlson_index[filtered_snomed_charlson_index['code'].isin(neighbour_problems)]
    if patient_charlson.empty:
        patient_charlson = []
    else:
        patient_charlson = patient_charlson['comorbid_condition'].unique().tolist()
    if neighbour_charlson.empty:
        neighbour_charlson = []
    else:
        neighbour_charlson = neighbour_charlson['comorbid_condition'].unique().tolist()
    # If both are empty return 1
    if not patient_charlson and not neighbour_charlson:
        my_method_score_list.append(1)
    else:
        # Work out jaccard similarity
        s1 = set(patient_charlson)
        s2 = set(neighbour_charlson)
        my_method_score_list.append(float(len(s1.intersection(s2)) / len(s1.union(s2))))

In [97]:
len(my_method_score_list)
np.nanmean(my_method_score_list)
np.nanstd(my_method_score_list)

1000

0.8361595238095237

0.344557111884721

In [111]:
# Get jaccard_index for my_method
my_method_jaccard_score_list = []

for index, row in my_method.iterrows():
    patient_n = row['0']
    neighbour_n = row['1']
    # Get patient and neighbour problems
    patient_problems = problem_dummies.loc[patient_n][problem_dummies.loc[patient_n] == 1].index.tolist()
    neighbour_problems = problem_dummies.loc[neighbour_n][problem_dummies.loc[neighbour_n] == 1].index.tolist()
    # Work out jaccard similarity
    s1 = set(patient_problems)
    s2 = set(neighbour_problems)
    my_method_jaccard_score_list.append(float(len(s1.intersection(s2)) / len(s1.union(s2))))

In [112]:
len(my_method_jaccard_score_list)
np.nanmean(my_method_jaccard_score_list)
np.nanstd(my_method_jaccard_score_list)

1000

0.5748233978551922

0.28443555972796036

In [100]:
# Get charlson_jaccard_index for one_hot
one_hot_score_list = []

for index, row in one_hot.iterrows():
    patient_n = row['0']
    neighbour_n = row['1']
    # Get patient and neighbour problems
    patient_problems = problem_dummies.loc[patient_n][problem_dummies.loc[patient_n] == 1].index.tolist()
    neighbour_problems = problem_dummies.loc[neighbour_n][problem_dummies.loc[neighbour_n] == 1].index.tolist()
    # Get charlson catergories
    patient_charlson = filtered_snomed_charlson_index[filtered_snomed_charlson_index['code'].isin(patient_problems)]
    neighbour_charlson = filtered_snomed_charlson_index[filtered_snomed_charlson_index['code'].isin(neighbour_problems)]
    if patient_charlson.empty:
        patient_charlson = []
    else:
        patient_charlson = patient_charlson['comorbid_condition'].unique().tolist()
    if neighbour_charlson.empty:
        neighbour_charlson = []
    else:
        neighbour_charlson = neighbour_charlson['comorbid_condition'].unique().tolist()
    # If both are empty return 1
    if not patient_charlson and not neighbour_charlson:
        one_hot_score_list.append(1)
    else:
        # Work out jaccard similarity
        s1 = set(patient_charlson)
        s2 = set(neighbour_charlson)
        one_hot_score_list.append(float(len(s1.intersection(s2)) / len(s1.union(s2))))

In [101]:
len(one_hot_score_list)
np.nanmean(one_hot_score_list)
np.nanstd(one_hot_score_list)

1000

0.8813833333333332

0.3013014875318223

In [117]:
# Get jaccard_index for one_hot
one_hot_jaccard_score_list = []

for index, row in one_hot.iterrows():
    patient_n = row['0']
    neighbour_n = row['1']
    # Get patient and neighbour problems
    patient_problems = problem_dummies.loc[patient_n][problem_dummies.loc[patient_n] == 1].index.tolist()
    neighbour_problems = problem_dummies.loc[neighbour_n][problem_dummies.loc[neighbour_n] == 1].index.tolist()
    # Work out jaccard similarity
    s1 = set(patient_problems)
    s2 = set(neighbour_problems)
    one_hot_jaccard_score_list.append(float(len(s1.intersection(s2)) / len(s1.union(s2))))

In [118]:
len(one_hot_jaccard_score_list)
np.nanmean(one_hot_jaccard_score_list)
np.nanstd(one_hot_jaccard_score_list)

1000

0.689585351928773

0.19517945439040402

In [102]:
# Get charlson_jaccard_index for rocheteau
rocheteau_score_list = []

for index, row in rocheteau.iterrows():
    patient_n = row['0']
    neighbour_n = row['1']
    # Get patient and neighbour problems
    patient_problems = problem_dummies.loc[patient_n][problem_dummies.loc[patient_n] == 1].index.tolist()
    neighbour_problems = problem_dummies.loc[neighbour_n][problem_dummies.loc[neighbour_n] == 1].index.tolist()
    # Get charlson catergories
    patient_charlson = filtered_snomed_charlson_index[filtered_snomed_charlson_index['code'].isin(patient_problems)]
    neighbour_charlson = filtered_snomed_charlson_index[filtered_snomed_charlson_index['code'].isin(neighbour_problems)]
    if patient_charlson.empty:
        patient_charlson = []
    else:
        patient_charlson = patient_charlson['comorbid_condition'].unique().tolist()
    if neighbour_charlson.empty:
        neighbour_charlson = []
    else:
        neighbour_charlson = neighbour_charlson['comorbid_condition'].unique().tolist()
    # If both are empty return 1
    if not patient_charlson and not neighbour_charlson:
        rocheteau_score_list.append(1)
    else:
        # Work out jaccard similarity
        s1 = set(patient_charlson)
        s2 = set(neighbour_charlson)
        rocheteau_score_list.append(float(len(s1.intersection(s2)) / len(s1.union(s2))))

In [103]:
len(rocheteau_score_list)
np.nanmean(rocheteau_score_list)
np.nanstd(rocheteau_score_list)

1000

0.6448785714285714

0.44609297988911367

In [115]:
# Get jaccard_index for rocheteau
rocheteau_jaccard_score_list = []

for index, row in rocheteau.iterrows():
    patient_n = row['0']
    neighbour_n = row['1']
    # Get patient and neighbour problems
    patient_problems = problem_dummies.loc[patient_n][problem_dummies.loc[patient_n] == 1].index.tolist()
    neighbour_problems = problem_dummies.loc[neighbour_n][problem_dummies.loc[neighbour_n] == 1].index.tolist()
    # Work out jaccard similarity
    s1 = set(patient_problems)
    s2 = set(neighbour_problems)
    rocheteau_jaccard_score_list.append(float(len(s1.intersection(s2)) / len(s1.union(s2))))

In [116]:
len(rocheteau_jaccard_score_list)
np.nanmean(rocheteau_jaccard_score_list)
np.nanstd(rocheteau_jaccard_score_list)

1000

0.4657556637925435

0.3129939022424812

Test stats

In [104]:
from scipy.stats import shapiro 
from scipy.stats import kstest
from scipy.stats import ttest_rel
from scipy.stats import wilcoxon

In [106]:
# Test if normally distributed (wont be)
shapiro(my_method_score_list)
kstest(my_method_score_list, 'norm')
shapiro(one_hot_score_list)
kstest(one_hot_score_list, 'norm')
shapiro(rocheteau_score_list)
kstest(rocheteau_score_list, 'norm')

ShapiroResult(statistic=0.5044409036636353, pvalue=0.0)

KstestResult(statistic=0.638344746068543, pvalue=0.0)

ShapiroResult(statistic=0.425937294960022, pvalue=0.0)

KstestResult(statistic=0.6923447460685429, pvalue=0.0)

ShapiroResult(statistic=0.6812915205955505, pvalue=1.061354667267331e-39)

KstestResult(statistic=0.5, pvalue=1.064517291557782e-231)

In [107]:
import scipy.stats as stats

def test_stats(list1, list2):

    k2, p = stats.wilcoxon(list1, list2)
    alpha = 0.05
    print(p)
    if p < alpha:
        print('Different distribution')
    else:
        print(' Same distribution')

In [108]:
test_stats(my_method_score_list, rocheteau_score_list)

1.5078819993890312e-31
Different distribution


In [109]:
test_stats(my_method_score_list, one_hot_score_list)

2.2527877796978147e-05
Different distribution


In [110]:
test_stats(rocheteau_score_list, one_hot_score_list)

2.133169652205036e-43
Different distribution


Save

In [122]:
# Add columns
my_method['charlson_jaccard_score'] = my_method_score_list
my_method['jaccard_score'] = my_method_jaccard_score_list
one_hot['charlson_jaccard_score'] = one_hot_score_list
one_hot['jaccard_score'] = one_hot_jaccard_score_list
rocheteau['charlson_jaccard_score'] = rocheteau_score_list
rocheteau['jaccard_score'] = rocheteau_jaccard_score_list

In [124]:
# Save
#my_method.to_csv('cbr_my_method_metric_charlson_jaccard.csv')
#one_hot.to_csv('cbr_one_hot_metric_charlson_jaccard.csv')
#rocheteau.to_csv('cbr_rocheteau_metric_charlson_jaccard.csv')