In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error, f1_score, roc_auc_score, roc_curve, auc, \
    mean_absolute_error, precision_score, recall_score, classification_report, confusion_matrix,\
    balanced_accuracy_score, precision_recall_curve

In [None]:
def evaluate(true_y, predict_y, scores_y):
    """Compute binary-classification metrics for one set of predictions.

    Parameters
    ----------
    true_y : array-like
        Ground-truth labels. The code treats them as integer 0/1 values
        (``labels=[0, 1]`` in the confusion matrix and an XOR flip below).
    predict_y : array-like
        Predicted labels, same length as ``true_y``.
    scores_y : array-like
        Continuous scores for class 1. If any score is negative the scores
        are treated as decision values and negated to rank class 0;
        otherwise they are treated as probabilities and replaced by
        ``1 - score``.

    Returns
    -------
    list
        14 values, in this order: number of test cases, accuracy, balanced
        accuracy, precision (class 1), precision (class 0), recall (class 1),
        recall (class 0), F1 (class 1), F1 (class 0), ROC AUC, false positive
        rate, false negative rate, PR AUC (class 1), PR AUC (class 0).
    """
    true_y = np.asarray(true_y)
    scores_y = np.asarray(scores_y)

    res = [
        true_y.shape[0],                             # test cases
        accuracy_score(true_y, predict_y),           # acc
        balanced_accuracy_score(true_y, predict_y),  # balanced acc
    ]

    # Class-0 precision/recall are derived by hand from the confusion matrix.
    # NOTE(review): when a group lacks one of the classes these denominators
    # are zero; with NumPy integer counts that presumably yields nan plus a
    # RuntimeWarning rather than an exception -- confirm for your versions.
    tn, fp, fn, tp = confusion_matrix(true_y, predict_y, labels=[0, 1]).ravel()
    recall0 = tn / (tn + fp)
    precision0 = tn / (tn + fn)

    res.extend([
        precision_score(true_y, predict_y),
        precision0,
        recall_score(true_y, predict_y),
        recall0,
        f1_score(true_y, predict_y),
        2 * precision0 * recall0 / (precision0 + recall0),
    ])

    # ROC AUC is undefined when only one class is present; scikit-learn
    # signals that with ValueError, which is reported here as 0. Only that
    # error is caught so that interrupts and genuine bugs are not hidden.
    try:
        res.append(roc_auc_score(true_y, scores_y))
    except ValueError:
        res.append(0)

    res.append(fp / (fp + tn))  # false positive rate
    res.append(fn / (fn + tp))  # false negative rate

    # Area under the precision-recall curve with class 1 as the positive class.
    curve_precision, curve_recall, _ = precision_recall_curve(true_y, scores_y)
    res.append(auc(curve_recall, curve_precision))

    # Same curve with class 0 as the positive class: flip the labels and
    # reverse the score ranking.
    true_y_flipped = true_y ^ 1
    if np.any(scores_y < 0):  # negative value, logr decision scores
        scores_y_flipped = np.negative(scores_y)
    else:  # mlp scores
        scores_y_flipped = 1 - scores_y
    curve_precision0, curve_recall0, _ = precision_recall_curve(true_y_flipped, scores_y_flipped)
    res.append(auc(curve_recall0, curve_precision0))

    return res

In [None]:
def produce_all_results(df_results):
    """Evaluate the whole cohort and each demographic subgroup.

    ``df_results`` must hold the columns 'Sex 1', 'Race recode Y 1',
    'Race recode Y 2', 'Race recode Y 4', 'Origin Recode NHIA 1',
    'Age at diagnosis continuous', 'true y', 'predict y' and 'score y'.

    Returns a list with one ``evaluate`` result per group, in this order:
    whole table, male, female, white, black, hispanic, asian, age < 30,
    the six ten-year bands from [30, 40) to [80, 90), and age >= 90.
    """
    # Sex groups: 1 - male, 0 - female.
    sex = df_results['Sex 1']
    sex_groups = [df_results[sex == 1], df_results[sex == 0]]

    # Race groups, in output order: white, black, hispanic, asian.
    race_flags = ('Race recode Y 1', 'Race recode Y 2',
                  'Origin Recode NHIA 1', 'Race recode Y 4')
    race_groups = [df_results[df_results[flag] == 1] for flag in race_flags]

    # Age groups: below 30, half-open decades [lo, lo + 10), then 90 and above.
    age = df_results['Age at diagnosis continuous']
    age_groups = [df_results[age < 30]]
    age_groups.extend(df_results[(age >= lo) & (age < lo + 10)]
                      for lo in range(30, 90, 10))
    age_groups.append(df_results[age >= 90])

    subsets = [df_results] + sex_groups + race_groups + age_groups
    return [evaluate(np.array(subset['true y']),
                     np.array(subset['predict y']),
                     np.array(subset['score y']))
            for subset in subsets]

In [None]:
def produce_all_results_subgroup(dfs):
    """Evaluate each data frame in ``dfs`` and return the results in order.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Each frame must contain the columns 'true y', 'predict y' and
        'score y'.

    Returns
    -------
    list
        One ``evaluate`` result per input frame, in the same order as ``dfs``.
    """
    # Iterate the argument itself; the previous version looped over an
    # undefined name (`group_list`) and never used `dfs`.
    return [evaluate(np.array(g['true y']),
                     np.array(g['predict y']),
                     np.array(g['score y']))
            for g in dfs]

In [None]:
def write_results_to_file(results_list, filename):
    """Write the per-group metric rows to ``filename`` as a labelled CSV.

    ``results_list`` must contain 15 rows (one per group label below), each
    with 14 values in the order of ``metric_labels``.
    """
    group_labels = [
        'Whole',
        'Gender_Male', 'Gender_Female',
        'Ethnicity_White', 'Ethnicity_Black',
        'Ethnicity_Hispanic', 'Ethnicity_Asian',
        'Age<30', '30<=Age<40', '40<=Age<50', '50<=Age<60',
        '60<=Age<70', '70<=Age<80', '80<=Age<90', 'Age>=90',
    ]
    metric_labels = [
        'Test_data', 'Accuracy', 'Balanced_Accuracy',
        'Precision_C1', 'Precision_C0',
        'Recall_C1', 'Recall_C0',
        'F1_C1', 'F1_C0',
        'AUC_ROC', 'FPR', 'FNR',
        'AUC_PR_C1', 'AUC_PR_C0',
    ]
    table = pd.DataFrame(data=results_list, index=group_labels, columns=metric_labels)
    table.to_csv(filename)

In [None]:
def dp_all_units_results(folder_name, cancer, group_name):
    """Collect one group's metrics across all unit counts into a single CSV.

    For each unit count ``n`` the file ``<folder_name>/<n>_u_metrics.csv`` is
    read, the row whose first (unnamed) column equals ``group_name`` is
    selected, and nine metrics are taken from it. The combined table is
    written to ``<folder_name>/all_units_metrics.csv``.

    Parameters
    ----------
    folder_name : str
        Directory holding the per-unit metric files; also the output location.
    cancer : str
        ``'breast'`` swaps the class order so the class-0 metrics are written
        under the ``*_C1`` headers and vice versa; any other value keeps the
        class order as stored.
        NOTE(review): presumably class 0 is the class of interest for the
        breast data -- confirm.
    group_name : str
        Row label to select, e.g. ``'Whole'``.

    Raises
    ------
    IndexError
        If ``group_name`` is not present in one of the metric files.
    """
    unit_counts = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20]

    # Which class's metrics come first depends on the data set.
    class_order = ('C0', 'C1') if cancer == 'breast' else ('C1', 'C0')
    source_columns = [metric + '_' + cls
                      for cls in class_order
                      for metric in ('Recall', 'Precision', 'AUC_PR')]
    source_columns += ['Accuracy', 'Balanced_Accuracy', 'AUC_ROC']

    group = []
    for n in unit_counts:
        df = pd.read_csv('{}/{}_u_metrics.csv'.format(folder_name, n))
        # 'Unnamed: 0' is the name pandas gives to an unlabeled first column,
        # here holding the group labels.
        df1 = df.loc[df['Unnamed: 0'] == group_name]
        group.append([df1[column].iloc[0] for column in source_columns])

    cols = ['Rec_C1', 'Prec_C1', 'PR_C1', 'Rec_C0', 'Prec_C0', 'PR_C0', 'Acc', 'Bal_Acc', 'ROC']
    # Labels are derived from the unit counts so the two cannot drift apart;
    # this also corrects the former '8 Untis' / '16 Untis' misspellings.
    rows = ['{} Unit'.format(n) if n == 1 else '{} Units'.format(n) for n in unit_counts]

    to_write = pd.DataFrame(data=group, index=rows, columns=cols)
    to_write.to_csv(folder_name + '/all_units_metrics.csv')