In [2]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score

import gc

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Import
path = r'data/los.csv'
los_labels = pd.read_csv(path)

# Group by and take average
los_labels = los_labels.groupby(['new_subject', 'SUBJECT', 'PROBLEM_DT_TM'])['los_days', 'los_hours', 'los_long'].mean()
los_labels = los_labels.reset_index()[['new_subject', 'los_days', 'los_hours', 'los_long']]

# Import
path = r'data/trimmed_patient_embedding_128d.csv'
patients_embeddings = pd.read_csv(path, index_col=0)

# Fill in nan
los_labels.los_hours.fillna(0, inplace=True)
los_labels.los_long.fillna(0, inplace=True)
# Merge
patients_embeddings = pd.merge(los_labels[['new_subject', 'los_hours', 'los_long']], patients_embeddings.reset_index()).set_index('new_subject', inplace=True)

# Import
path = r'data/final_problem_dummies.csv'
problem_dummies = pd.read_csv(path)

# Drop columns
problem_dummies.drop(columns=['SUBJECT', 'PROBLEM_DT_TM'], inplace=True)
# Remove prefix
problem_dummies.columns = problem_dummies.columns.str.strip('PROBLEM_')
# Merge
problem_dummies = pd.merge(los_labels[['new_subject', 'los_hours', 'los_long']], problem_dummies)
# Set index
problem_dummies.set_index('new_subject', inplace=True)
# Import
path = r'data/final_trimmed_snomed_embedding_128d.csv'
snomed_embeddings = pd.read_csv(path, index_col=0)
# Get list 
snomed_embeddings.index = snomed_embeddings.index.astype(str)
snomed_list = snomed_embeddings.index.tolist()
problem_dummies_list = problem_dummies.columns.tolist()
overlap_list = list(set(snomed_list) & set(problem_dummies_list))
overlap_list = ['los_hours', 'los_long'] + overlap_list
# Filter
problem_dummies = problem_dummies[overlap_list]

In [7]:
del los_labels
del patients_embeddings
del snomed_embeddings
del snomed_list
del problem_dummies_list
del overlap_list
gc.collect()

477

In [4]:
# Drop those with no co-morbidities
drop_index_list = problem_dummies.iloc[:,2:][(problem_dummies.iloc[:,2:] == 0).all(axis=1)].index.to_list()
problem_dummies = problem_dummies[~problem_dummies.index.isin(drop_index_list)]

In [5]:
# Work out count for problems
problem_sum = pd.DataFrame(problem_dummies.iloc[:,2:].sum(axis=0), columns=['Count']).sort_values(by=['Count'], ascending=False)
# Define what is a rare disease
cut_off_list = [45, 8]

In [8]:
# Import
path = r'data/problem_charlson_death.csv'
problem_charlson = pd.read_csv(path, index_col=0)
# Filter 
problem_charlson = problem_charlson.iloc[problem_dummies.index]
problem_charlson = problem_charlson[['charlson_score']]

In [12]:
# Reset index
problem_dummies.reset_index(inplace=True, drop=True)
problem_charlson.reset_index(inplace=True, drop=True)

In [None]:
# Get results

results_df = pd.DataFrame()
x_data = problem_charlson[['charlson_score']]
y_data = problem_dummies.los_long.round()
# Get CV folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
fold_n = 0
for train_index, test_index in cv.split(x_data, y_data):
    fold_n += 1
    x_train  = x_data[x_data.index.isin(list(train_index))]
    y_train  = y_data[y_data.index.isin(list(train_index))]
    x_test  = x_data[x_data.index.isin(list(test_index))]
    y_test  = y_data[y_data.index.isin(list(test_index))]

    # Fit
    LR = LogisticRegression(class_weight='balanced')
    LR.fit(x_train, y_train)

    # Get results
    # AUC
    aucroc = roc_auc_score(y_test, LR.predict(x_test))
    # Accuracy
    accuracy = accuracy_score(y_test, LR.predict(x_test))
    # Precision
    precision = precision_score(y_test, LR.predict(x_test))
    # Recall
    recall = recall_score(y_test, LR.predict(x_test))
    # AUPRC
    auprc = average_precision_score(y_test, LR.predict(x_test))
    # F1
    f1 = f1_score(y_test, LR.predict(x_test))
    # TPR and FPR
    cm = confusion_matrix(y_test, LR.predict(x_test))
    _tp = cm[0, 0]
    _fn = cm[0, 1]
    _fp = cm[1, 0]
    _tn = cm[1, 1]
    tpr = _tp / (_tp + _fn)
    fpr = _fp / (_tn + _fp)

    new_row = {'data': 'dummies', 'fold': fold_n, 'subset':'overall', 'AUROC': aucroc, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'F1': f1, 'AUPRC': auprc, 'TPR': tpr, 'FPR': fpr}
    results_df = results_df.append(new_row, ignore_index=True)

In [14]:
results_df

Unnamed: 0,data,fold,subset,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
0,dummies,1,overall,0.589538,0.680776,0.08884,0.486322,0.150235,0.073011,0.692754,0.513678
1,dummies,2,overall,0.604296,0.689771,0.094671,0.507599,0.15958,0.076626,0.700992,0.492401
2,dummies,3,overall,0.596816,0.682363,0.091616,0.5,0.154857,0.074908,0.693633,0.5
3,dummies,4,overall,0.59977,0.679894,0.092257,0.509091,0.156206,0.075539,0.690449,0.490909
4,dummies,5,overall,0.583316,0.689065,0.087982,0.463636,0.147898,0.072008,0.702996,0.536364
5,dummies,6,overall,0.602164,0.691127,0.094178,0.50152,0.158578,0.076161,0.702809,0.49848
6,dummies,7,overall,0.599521,0.688834,0.093023,0.49848,0.156788,0.075476,0.700562,0.50152
7,dummies,8,overall,0.603749,0.683366,0.093681,0.513678,0.158462,0.076345,0.69382,0.486322
8,dummies,9,overall,0.614322,0.692538,0.098295,0.525836,0.165629,0.079205,0.702809,0.474164
9,dummies,10,overall,0.607869,0.691127,0.096023,0.513678,0.1618,0.077548,0.70206,0.486322


In [15]:
results_df.groupby(['data', 'subset']).mean()
results_df.groupby(['data', 'subset']).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,overall,5.5,0.600136,0.686886,0.093057,0.501984,0.157003,0.075683,0.698289,0.498016


Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,overall,3.02765,0.008833,0.004759,0.003105,0.017181,0.005193,0.002075,0.004983,0.017181


In [None]:
# Get rare 10 fold cv results

results_df = pd.DataFrame()
x_data = problem_charlson[['charlson_score']]
condition_data = problem_dummies.iloc[:,2:]
y_data = problem_dummies.los_long.round()
# Get CV folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
fold_n = 0
for train_index, test_index in cv.split(x_data, y_data):
    fold_n += 1
    x_train  = x_data[x_data.index.isin(list(train_index))]
    y_train  = y_data[y_data.index.isin(list(train_index))]
    x_test  = x_data[x_data.index.isin(list(test_index))]
    y_test  = y_data[y_data.index.isin(list(test_index))]
    condition_test  = condition_data[condition_data.index.isin(list(test_index))]

    # Fit
    LR = LogisticRegression(class_weight='balanced')
    LR.fit(x_train, y_train)

    # Get results for rare diseases 
    for n in cut_off_list:
        # Get filter list 
        filter_list = problem_sum[problem_sum['Count'] > n].index.tolist()
        condition_test2 = condition_test.copy()
        for code in filter_list:
            condition_test2 = condition_test2.loc[condition_test2[code] != 1]
        condition_test_list = condition_test2.index.tolist()
        y_test2 = y_test.loc[condition_test_list]
        x_test2 = x_test.loc[condition_test_list]

        # AUC
        try:
            aucroc = roc_auc_score(y_test2, LR.predict(x_test2))
        except:
            aucroc = np.nan
        # Accuracy
        accuracy = accuracy_score(y_test2, LR.predict(x_test2))
        # Precision
        precision = precision_score(y_test2, LR.predict(x_test2))
        # Recall
        recall = precision_score(y_test2, LR.predict(x_test2))
        # AUPRC
        try:
            auprc = average_precision_score(y_test2, LR.predict(x_test2))
        except:
            auprc = np.nan
        # F1
        f1 = f1_score(y_test2, LR.predict(x_test2))
        # TPR and FPR
        cm = confusion_matrix(y_test2, LR.predict(x_test2))
        try:
            _tp = cm[0, 0]
        except:
            _tp = np.nan
        try:
            _fn = cm[0, 1]
        except:
            _fn = np.nan
        try:
            _fp = cm[1, 0]
        except:
            _fp = np.nan
        try:
            _tn = cm[1, 1]
        except:
            _tn = np.nan
        tpr = _tp / (_tp + _fn)
        fpr = _fp / (_tn + _fp)

        new_row = {'data': 'dummies', 'fold': fold_n, 'subset': n, 'AUROC': aucroc, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'F1': f1, 'AUPRC': auprc, 'TPR': tpr, 'FPR': fpr}
        results_df = results_df.append(new_row, ignore_index=True)

In [17]:
results_df

Unnamed: 0,data,fold,subset,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
0,dummies,1,45,0.481675,0.936387,0.0,0.0,0.0,0.02799,0.963351,1.0
1,dummies,1,8,0.471831,0.905405,0.0,0.0,0.0,0.040541,0.943662,1.0
2,dummies,2,45,0.521401,0.936548,0.071429,0.071429,0.074074,0.035951,0.965879,0.923077
3,dummies,2,8,0.5,0.958904,0.0,0.0,0.0,0.041096,1.0,1.0
4,dummies,3,45,0.556018,0.913738,0.25,0.25,0.181818,0.093222,0.969178,0.857143
5,dummies,3,8,0.484848,0.914286,0.0,0.0,0.0,0.057143,0.969697,1.0
6,dummies,4,45,0.519258,0.926829,0.058824,0.058824,0.068966,0.034712,0.955182,0.916667
7,dummies,4,8,0.487179,0.938272,0.0,0.0,0.0,0.037037,0.974359,1.0
8,dummies,5,45,0.483607,0.924282,0.0,0.0,0.0,0.044386,0.967213,1.0
9,dummies,5,8,0.493827,0.952381,0.0,0.0,0.0,0.035714,0.987654,1.0


In [18]:
results_df.groupby(['data', 'subset']).mean()
results_df.groupby(['data', 'subset']).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,8,5.5,0.500039,0.941825,0.033333,0.033333,0.025,0.045756,0.980078,0.98
dummies,45,5.5,0.517758,0.931526,0.082609,0.082609,0.071887,0.045101,0.966471,0.930955


Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,8,3.02765,0.031745,0.022146,0.105409,0.105409,0.079057,0.027615,0.016455,0.063246
dummies,45,3.02765,0.026616,0.01098,0.081122,0.081122,0.059651,0.019001,0.007667,0.05227
