In [2]:
# Libraries
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 40)
pd.set_option('display.width', 2000)

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [9]:
# Import
path = r'data/slim_problem_dummies_death.csv'
slim_problem_dummies_death = pd.read_csv(path, index_col=0)

# Work out count for problems
problem_sum = pd.DataFrame(slim_problem_dummies_death.iloc[:,2:].sum(axis=0), columns=['Count']).sort_values(by=['Count'], ascending=False)

# Define what is a rare disease
cut_off_list = [45, 8]

In [14]:
# Import
path = r'data/problem_charlson_death.csv'
problem_charlson = pd.read_csv(path, index_col=0)
# Filter 
problem_charlson = problem_charlson.iloc[slim_problem_dummies_death.index]
problem_charlson = problem_charlson[['charlson_score']]
# Reset index
slim_problem_dummies_death.reset_index(inplace=True, drop=True)
problem_charlson.reset_index(inplace=True, drop=True)

# LR

In [None]:
results_df = pd.DataFrame()
x_data = problem_charlson[['charlson_score']]
y_data = slim_problem_dummies_death.death_year_label
# Get CV folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
fold_n = 0
for train_index, test_index in cv.split(x_data, y_data):
    fold_n += 1
    x_train  = x_data[x_data.index.isin(list(train_index))]
    y_train  = y_data[y_data.index.isin(list(train_index))]
    x_test  = x_data[x_data.index.isin(list(test_index))]
    y_test  = y_data[y_data.index.isin(list(test_index))]

    # Fit
    LR = LogisticRegression(class_weight='balanced')
    LR.fit(x_train, y_train)

    # Get results
    # AUC
    aucroc = roc_auc_score(y_test, LR.predict(x_test))
    # Accuracy
    accuracy = accuracy_score(y_test, LR.predict(x_test))
    # Precision
    precision = precision_score(y_test, LR.predict(x_test))
    # Recall
    recall = recall_score(y_test, LR.predict(x_test))
    # AUPRC
    auprc = average_precision_score(y_test, LR.predict(x_test))
    # F1
    f1 = f1_score(y_test, LR.predict(x_test))
    # TPR and FPR
    #fpr, tpr, thresholds = roc_curve(y_test, LR.predict(x_test))
    cm = confusion_matrix(y_test, LR.predict(x_test))
    _tp = cm[0, 0]
    _fn = cm[0, 1]
    _fp = cm[1, 0]
    _tn = cm[1, 1]
    tpr = _tp / (_tp + _fn)
    fpr = _fp / (_tn + _fp)

    new_row = {'data': 'dummies', 'fold': fold_n, 'subset':'overall', 'AUROC': aucroc, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'F1': f1, 'AUPRC': auprc, 'TPR': tpr, 'FPR': fpr}
    results_df = results_df.append(new_row, ignore_index=True)

In [19]:
results_df

Unnamed: 0,data,fold,subset,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
0,dummies,1,overall,0.656873,0.700088,0.050651,0.611354,0.093552,0.040804,0.702393,0.388646
1,dummies,2,overall,0.663736,0.696883,0.05141,0.628821,0.09505,0.041724,0.69865,0.371179
2,dummies,3,overall,0.645588,0.694672,0.048485,0.593886,0.089651,0.039075,0.697289,0.406114
3,dummies,4,overall,0.66031,0.705174,0.051857,0.613043,0.095626,0.041629,0.707577,0.386957
4,dummies,5,overall,0.625806,0.68513,0.044838,0.563319,0.083065,0.036314,0.688294,0.436681
5,dummies,6,overall,0.668568,0.702156,0.052632,0.633188,0.097185,0.042613,0.703947,0.366812
6,dummies,7,overall,0.652461,0.695633,0.049607,0.606987,0.091719,0.040061,0.697936,0.393013
7,dummies,8,overall,0.639701,0.695633,0.04767,0.580786,0.088109,0.0383,0.698616,0.419214
8,dummies,9,overall,0.628784,0.69508,0.045961,0.558952,0.084937,0.036856,0.698616,0.441048
9,dummies,10,overall,0.654787,0.700166,0.050344,0.606987,0.092977,0.040508,0.702586,0.393013


In [20]:
results_df.groupby(['data', 'subset']).mean()
results_df.groupby(['data', 'subset']).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,overall,5.5,0.649661,0.697061,0.049346,0.599732,0.091187,0.039789,0.699591,0.400268


Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,overall,3.02765,0.014437,0.005443,0.00257,0.025309,0.004673,0.002112,0.005143,0.025309


In [None]:
# Get rare 10 fold cv results
results_df = pd.DataFrame()
x_data = problem_charlson[['charlson_score']]
condition_data = slim_problem_dummies_death.iloc[:,2:]
y_data = slim_problem_dummies_death.death_year_label
# Get CV folds
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2)
fold_n = 0
for train_index, test_index in cv.split(x_data, y_data):
    fold_n += 1
    x_train  = x_data[x_data.index.isin(list(train_index))]
    y_train  = y_data[y_data.index.isin(list(train_index))]
    x_test  = x_data[x_data.index.isin(list(test_index))]
    y_test  = y_data[y_data.index.isin(list(test_index))]
    condition_test  = condition_data[condition_data.index.isin(list(test_index))]

    # Fit
    LR = LogisticRegression(class_weight='balanced')
    LR.fit(x_train, y_train)

    # Get results for rare diseases 
    for n in cut_off_list:
        # Get filter list 
        filter_list = problem_sum[problem_sum['Count'] > n].index.tolist()
        condition_test2 = condition_test.copy()
        for code in filter_list:
            condition_test2 = condition_test2.loc[condition_test2[code] != 1]
        condition_test_list = condition_test2.index.tolist()
        y_test2 = y_test.loc[condition_test_list]
        x_test2 = x_test.loc[condition_test_list]

        # AUC
        try:
            aucroc = roc_auc_score(y_test2, LR.predict(x_test2))
        except:
            aucroc = np.nan
        # Accuracy
        accuracy = accuracy_score(y_test2, LR.predict(x_test2))
        # Precision
        precision = precision_score(y_test2, LR.predict(x_test2))
        # Recall
        recall = precision_score(y_test2, LR.predict(x_test2))
        # AUPRC
        try:
            auprc = average_precision_score(y_test2, LR.predict(x_test2))
        except:
            auprc = np.nan
        # F1
        f1 = f1_score(y_test2, LR.predict(x_test2))
        # TPR and FPR
        cm = confusion_matrix(y_test2, LR.predict(x_test2))
        try:
            _tp = cm[0, 0]
        except:
            _tp = np.nan
        try:
            _fn = cm[0, 1]
        except:
            _fn = np.nan
        try:
            _fp = cm[1, 0]
        except:
            _fp = np.nan
        try:
            _tn = cm[1, 1]
        except:
            _tn = np.nan
        tpr = _tp / (_tp + _fn)
        fpr = _fp / (_tn + _fp)

        new_row = {'data': 'dummies', 'fold': fold_n, 'subset': n, 'AUROC': aucroc, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'F1': f1, 'AUPRC': auprc, 'TPR': tpr, 'FPR': fpr}
        results_df = results_df.append(new_row, ignore_index=True)

In [27]:
results_df

Unnamed: 0,data,fold,subset,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
0,dummies,1,45,0.481651,0.952813,0.0,0.0,0.0,0.010889,0.963303,1.0
1,dummies,1,8,,0.989247,0.0,0.0,0.0,-0.0,0.989247,
2,dummies,2,45,0.568436,0.961326,0.058824,0.058824,0.086957,0.019012,0.970205,0.833333
3,dummies,2,8,,0.986667,0.0,0.0,0.0,-0.0,0.986667,
4,dummies,3,45,0.491107,0.965049,0.0,0.0,0.0,0.017476,0.982213,1.0
5,dummies,3,8,0.494565,0.978495,0.0,0.0,0.0,0.010753,0.98913,1.0
6,dummies,4,45,0.482582,0.945783,0.0,0.0,0.0,0.02008,0.965164,1.0
7,dummies,4,8,0.492647,0.943662,0.0,0.0,0.0,0.042254,0.985294,1.0
8,dummies,5,45,0.481633,0.945892,0.0,0.0,0.0,0.018036,0.963265,1.0
9,dummies,5,8,0.492857,0.971831,0.0,0.0,0.0,0.014085,0.985714,1.0


In [28]:
results_df.groupby(['data', 'subset']).mean()
results_df.groupby(['data', 'subset']).std()

Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,8,5.5,0.496249,0.970801,0.0,0.0,0.0,0.01911,0.989742,1.0
dummies,45,5.5,0.52118,0.954285,0.046457,0.046457,0.05501,0.023562,0.970095,0.927734


Unnamed: 0_level_0,Unnamed: 1_level_0,fold,AUROC,accuracy,precision,recall,F1,AUPRC,TPR,FPR
data,subset,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
dummies,8,3.02765,0.003563,0.019697,0.0,0.0,0.0,0.019743,0.008263,0.0
dummies,45,3.02765,0.043248,0.007093,0.05652,0.05652,0.064543,0.011418,0.006676,0.08407
