# Process holdout dataset

## Data process v5

In [1]:
import sys
# Record the interpreter version this notebook was executed with.
print("Python version", sys.version, sep="\n")

Python version
3.6.13 |Anaconda, Inc.| (default, Jun  4 2021, 14:25:59) 
[GCC 7.5.0]


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import pandas as pd
import sklearn
import numpy as np
from numpy import sqrt
from numpy import argmax
from scipy import stats
import os
import pickle
import itertools
from sklearn import linear_model, preprocessing
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split, KFold, GroupKFold
from sklearn.metrics import explained_variance_score, r2_score, mean_squared_error
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import export_graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import precision_recall_fscore_support, auc, average_precision_score, precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import manifold
from sklearn.decomposition import PCA, FastICA, TruncatedSVD, SparsePCA, sparse_encode, FactorAnalysis, KernelPCA
import time

In [3]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)

## Load training and holdout set

In [4]:
# Only the first 100 rows are read: at this stage train_df is used solely
# for its column names (it is reloaded later for modelling).
train_df = pd.read_csv('data/training_removed_missing_v5.csv', nrows=100, index_col='ID')

In [7]:
# Load the raw holdout file, drop the stray 'Unnamed: 0' column, index the
# rows by 'ID', and turn the '*' placeholder into NaN.
holdout_df = (
    pd.read_csv('data/2021_Competition_Holdout.csv', low_memory=False)
    .drop('Unnamed: 0', axis=1)
    .set_index('ID', drop=True)
    .replace({'*': np.nan})
)

In [8]:
train_df.drop('covid_vaccination', axis=1).columns.size

308

In [9]:
# Keep only the training feature columns (target excluded), in the training column order.
holdout_df = holdout_df[train_df.drop('covid_vaccination', axis=1).columns]

In [10]:
holdout_df.shape

(525158, 308)

In [11]:
# Collect (and list) every holdout column whose name contains '_cd'.
cd_set = [name for name in holdout_df.columns.values if '_cd' in name]
for name in cd_set:
    print(name)

cms_orig_reas_entitle_cd
sex_cd
race_cd


In [12]:
# Collect (and list) every training column whose name contains '_id'.
id_set = [name for name in train_df.columns.values if '_id' in name]
for name in id_set:
    print(name)

src_div_id


In [13]:
# Categorical features = every non-numeric column, plus the '_cd' / '_id'
# code columns collected in cd_set and id_set.
# select_dtypes lists the numeric columns directly; the previous describe()
# call computed summary statistics for the whole frame just to get the names.
numeric_columns = holdout_df.select_dtypes(include=[np.number]).columns
categorical_features = (set(holdout_df.columns) - set(numeric_columns)) | set(cd_set) | set(id_set)
len(categorical_features)

84

In [14]:
# Try to cast each candidate column to float; errors='ignore' leaves a
# column unchanged when its values cannot be converted.
for col in categorical_features:
    holdout_df[col] = holdout_df[col].astype(float, errors='ignore')

In [15]:
# Missing 'src_div_id' values become 0, then the column is cast to int
# (errors='ignore' keeps the column as-is if the cast fails).
holdout_df['src_div_id'] = holdout_df['src_div_id'].fillna(0).astype(int, errors='ignore')

In [16]:
# Recompute the categorical feature set after the dtype casts so that columns
# which became numeric drop out of it (the '_cd' / '_id' columns are always kept).
# select_dtypes lists the numeric columns directly; the previous describe()
# call computed summary statistics for the whole frame just to get the names.
numeric_columns = holdout_df.select_dtypes(include=[np.number]).columns
categorical_features = (set(holdout_df.columns) - set(numeric_columns)) | set(cd_set) | set(id_set)
len(categorical_features)

53

## Fill in missing values

In [17]:
holdout_df.isnull().any().sum()

179

In [18]:
# Impute missing values: mode for categorical columns, mean for float columns.
# Columns with gaps that are neither are only reported by name.
# NOTE(review): the fill values are computed from the holdout data itself.
for name in holdout_df.columns:
    column = holdout_df[name]
    if not column.hasnans:
        continue
    if name in categorical_features:
        print('categorical:', name)
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on a column selection, which is not
        # guaranteed to write through to holdout_df.
        holdout_df[name] = column.fillna(column.mode()[0])
    elif column.dtypes == float:
        print('numerical:', name)
        holdout_df[name] = column.fillna(column.mean())
    else:
        print(name)

numerical: atlas_pct_laccess_child15
numerical: atlas_recfacpth14
numerical: atlas_pct_fmrkt_frveg16
numerical: atlas_pct_free_lunch14
numerical: cons_chmi
categorical: mcc_ano_pmpm_ct_t_9-6-3m_b4
numerical: cons_ltmedicr
numerical: rx_gpi4_6110_pmpm_ct
numerical: atlas_pc_snapben15
numerical: credit_bal_nonmtgcredit_60dpd
numerical: atlas_pct_laccess_nhna15
numerical: credit_hh_nonmtgcredit_60dpd
numerical: rx_bh_pmpm_ct_0to3m_b4
numerical: cons_lwcm10
numerical: atlas_fsrpth14
numerical: auth_3mth_dc_home
numerical: atlas_wicspth12
categorical: rx_gpi2_17_pmpm_cost_t_12-9-6m_b4
numerical: cons_hxmioc
numerical: atlas_ghveg_farms12
numerical: credit_hh_bankcardcredit_60dpd
numerical: total_outpatient_allowed_pmpm_cost_6to9m_b4
numerical: cons_cwht
numerical: atlas_netmigrationrate1016
numerical: atlas_pct_laccess_snap15
numerical: atlas_retirement_destination_2015_upda
numerical: atlas_naturalchangerate1016
numerical: atlas_pct_laccess_hisp15
numerical: auth_3mth_dc_no_ref
numerical: 

In [19]:
holdout_df.isnull().any().sum()

0

In [20]:
# Checkpoint: write the holdout data to disk after imputation, before any encoding.
holdout_df.to_csv('data/holdout_removed_missing_v5.csv')

## Transform ordinal features

In [21]:
# Find the ordinal "change" columns: any categorical column that contains the
# level 'No Activity'. ordered_cat collects the column names and
# ordered_cat_set the union of all levels seen in those columns.
ordered_cat_set = set()
ordered_cat = []
for name in categorical_features:
    # Test membership on a Python set rather than on the numpy array from
    # unique(): `'No Activity' in <numeric ndarray>` makes numpy attempt an
    # elementwise string/number comparison and emit a warning. This also
    # computes unique() once per column instead of twice.
    levels = set(holdout_df[name].unique())
    if 'No Activity' in levels:
        ordered_cat_set |= levels
        ordered_cat.append(name)
print(ordered_cat_set)

  after removing the cwd from sys.path.


{'Dec_over_8x', 'Resolved', 'No_Change', 'New', 'Inc_2x-4x', 'Inc_1x-2x', 'Dec_4x-8x', 'Inc_4x-8x', 'Dec_1x-2x', 'Inc_over_8x', 'No Activity', 'Dec_2x-4x'}


In [22]:
len(ordered_cat)

45

In [23]:
len(ordered_cat_set)

12

In [24]:
# Numeric encoding for the ordinal "change" levels found in the ordered_cat
# columns. Increases map to positive values and decreases to the mirrored
# negative values; 'New' and 'Resolved' sit at the extremes (+10 / -10).
# NOTE(review): 'No Activity' is encoded as 0.5, distinct from 'No_Change' (0);
# the rationale for that value is not visible here - confirm it is intended.
ordered_cat_transform_tab = {'Inc_over_8x': 8,
                             'Inc_4x-8x': 6,
                             'Inc_2x-4x': 3,
                             'Inc_1x-2x': 1.5,
                             'No_Change': 0,
                             'Dec_1x-2x': -1.5,
                             'Dec_2x-4x': -3,
                             'Dec_4x-8x': -6,
                             'Dec_over_8x': -8,
                             'Resolved': -10, 
                             'No Activity': 0.5,
                             'New': 10}

In [25]:
# Replace each ordinal level by its numeric code. Series.map yields NaN for
# any level that is missing from ordered_cat_transform_tab.
for name in ordered_cat:
    holdout_df[name] = holdout_df[name].map(ordered_cat_transform_tab)

## Transform categorical features

In [26]:
# One-hot encode the remaining (non-ordinal) categorical columns. Each source
# column is swapped for its dummy columns, including a NaN indicator, and the
# resulting frame shape is printed after every column.
nominal_features = set(categorical_features) - set(ordered_cat)
for name in nominal_features:
    dummies = pd.get_dummies(holdout_df[name], prefix=name, dummy_na=True)
    remaining = holdout_df.drop(name, axis=1)
    holdout_df = pd.concat([remaining, dummies], axis=1)
    print(name, holdout_df.shape)

race_cd (525158, 315)
hedis_dia_hba1c_ge9 (525158, 317)
hum_region (525158, 333)
src_div_id (525158, 339)
sex_cd (525158, 341)
cons_hhcomp (525158, 354)
cons_mobplus (525158, 358)
cms_orig_reas_entitle_cd (525158, 362)


In [27]:
holdout_df.shape

(525158, 362)

## Change column order

In [187]:
# Load the first 20000 rows of the cleaned training data and split them into
# the target (y_train) and the feature matrix (X_train).
train_df = pd.read_csv('data/training_cleaned_v5.csv', index_col='ID', nrows=20000)
y_train = train_df['covid_vaccination']
X_train = train_df.drop(columns=['covid_vaccination'])

In [29]:
X_train.columns.size

362

In [30]:
holdout_df.columns.size

362

In [31]:
set(X_train.columns) - set(holdout_df.columns)

set()

In [32]:
set(X_train.columns) == set(holdout_df.columns)

True

In [33]:
# Put the holdout columns in exactly the same order as the training features.
holdout_df = holdout_df[X_train.columns]

In [34]:
# Save the fully encoded, column-aligned holdout set.
holdout_df.to_csv('data/holdout_cleaned_v5.csv')

# Restart here

## Read in cleaned holdout data

In [44]:
# Disabled by default (`if 0`): switch the condition on to reload the cleaned
# holdout set from disk instead of re-running the processing cells.
if 0:
    holdout_df = pd.read_csv('data/holdout_cleaned_v5.csv', index_col='ID')

In [181]:
X_train.shape

(100, 362)

In [178]:
holdout_df.shape

(525158, 362)

In [167]:
def make_result_csv(fn, probability_holdout, holdout_df):
    """Write the scored and decile-ranked holdout predictions to ``predictions/<fn>``.

    Parameters
    ----------
    fn : str
        File name of the CSV to create inside the ``predictions`` directory.
    probability_holdout : array-like
        One probability per row of ``holdout_df``, in the same row order.
        It is used positionally; any pandas index it carries is ignored.
    holdout_df : pandas.DataFrame
        Holdout data whose index (or a column) is named ``'ID'``.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``SCORE`` (``1 - probability``) and ``RANK``, sorted by
        ``SCORE`` in descending order. ``RANK`` runs from 1 (rows at or above
        the 90% score quantile) to 10 (rows below the 10% quantile).
    """
    # Convert to a plain array so the assignment below is positional. A Series
    # indexed by ID would otherwise be aligned against the fresh 0..n-1 index
    # and silently produce NaN scores.
    probability_holdout = np.asarray(probability_holdout)
    assert probability_holdout.shape[0] == holdout_df.shape[0]

    holdout = pd.DataFrame(holdout_df.reset_index()['ID'])
    holdout['SCORE'] = 1 - probability_holdout
    holdout = holdout.sort_values('SCORE', ascending=False)

    # Score thresholds at the 0%, 10%, ..., 90% quantiles.
    quantile = holdout['SCORE'].quantile(np.arange(0, 1, 0.1))

    # Walk the thresholds from lowest to highest. Every row at or above a
    # threshold is overwritten with the next better (smaller) rank, so each
    # row ends with the rank of the highest threshold it reaches.
    rank = 10
    for t in quantile:
        holdout.loc[holdout['SCORE'] >= t, 'RANK'] = rank
        rank -= 1

    out_path = f'predictions/{fn}'
    # Create the output directory on first use instead of failing in to_csv.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    holdout.to_csv(out_path, index=False)
    print('-'*40)
    print(f'File saved to: predictions/{fn}')
    return holdout

In [194]:
def cal_AI_fairness(X_train, y_train, probability_train, cols, ref_col):
    """Compare each subgroup's AUROC with the AUROC of a reference subgroup.

    A subgroup is the set of rows where an indicator column of ``X_train``
    equals 1. For every column in ``cols`` that has at least one such row, the
    ratio ``min(1, AUROC_subgroup / AUROC_reference)`` is printed and collected.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix holding the indicator columns ``cols`` and ``ref_col``.
    y_train : array-like
        True labels, one per row of ``X_train`` and in the same row order.
    probability_train : array-like
        Predicted scores, one per row of ``X_train`` and in the same row order.
    cols : iterable of str
        Indicator columns defining the subgroups to evaluate.
    ref_col : str
        Indicator column defining the reference subgroup.

    Returns
    -------
    list of float
        One capped ratio per non-empty subgroup, in the order of ``cols``.

    Raises
    ------
    ValueError
        Propagated from ``roc_auc_score``, e.g. when a subgroup contains only
        one class.
    """
    # Work positionally on plain arrays: no dependency on the name of the
    # label Series and no helper DataFrame rebuilt for every column.
    labels = np.asarray(y_train)
    scores = np.asarray(probability_train)

    S0 = None  # reference AUROC, computed once on first use
    fairness = []
    for col in cols:
        in_group = (X_train[col] == 1).values
        if not in_group.any():
            continue

        if S0 is None:
            # Loop-invariant: the reference group does not depend on `col`.
            in_ref = (X_train[ref_col] == 1).values
            S0 = roc_auc_score(labels[in_ref], scores[in_ref])

        Sn = roc_auc_score(labels[in_group], scores[in_group])

        ratio = min(1, Sn/S0)
        print(col, S0, Sn, ratio)
        fairness.append(ratio)
    return fairness

In [195]:
def make_prediction_csv(clf_name):
    """Score the training sample and the holdout set with a saved model.

    Loads ``best_models/<clf_name>_model.pk`` (a pickled ``(classifier,
    threshold)`` pair), prints training diagnostics (fraction predicted above
    the threshold, mean score, AUROC and the mean AI-fairness ratio over the
    sex and race subgroups), then scores the holdout set and writes
    ``predictions/Holdout-<clf_name>.csv`` through ``make_result_csv``.

    Reads the notebook globals ``X_train``, ``y_train`` and ``holdout_df``.

    Parameters
    ----------
    clf_name : str
        Model name, e.g. ``'Random Forest 300x15 v5'``.
    """
    # NOTE(review): unpickling can execute arbitrary code - only load model
    # files produced by this project.
    # The context manager closes the file handle, which was previously leaked.
    with open(f'best_models/{clf_name}_model.pk', "rb") as model_file:
        clf, T_dev = pickle.load(model_file)

    print(clf_name)
    print('-'*40)
    print('Training set:', end='\t')
    probability_train = clf.predict_proba(X_train)[:, 1]
    frac_vacc_train = np.mean(probability_train > T_dev)
    print('Frac vacc:', "%.5f" % frac_vacc_train, 'Avg score:', "%.5f" %np.mean(probability_train))
    Train_AUROC = roc_auc_score(y_train, probability_train)
    print('Train_AUROC:', Train_AUROC)

    # sex_cols used to be an undefined global (NameError on a fresh kernel).
    # The two columns below are the ones listed in the recorded output.
    sex_cols = ['sex_cd_F', 'sex_cd_M']
    race_cols = ['race_cd_0.0', 'race_cd_1.0', 'race_cd_2.0', 'race_cd_3.0', 'race_cd_4.0', 'race_cd_5.0', 'race_cd_6.0']
    fairness = cal_AI_fairness(X_train, y_train, probability_train, sex_cols, 'sex_cd_M') +\
               cal_AI_fairness(X_train, y_train, probability_train, race_cols, 'race_cd_1.0')

    print('AI_fairness:', np.mean(fairness))
    print('-'*40)
    print('Holdout set:', end='\t')
    probability_holdout = clf.predict_proba(holdout_df)[:, 1]
    frac_vacc_holdout = np.mean(probability_holdout > T_dev)
    print('Frac vacc:', "%.5f" % frac_vacc_holdout, 'Avg score:', "%.5f" % np.mean(probability_holdout) )

    fn = f'Holdout-{clf_name}.csv'
    make_result_csv(fn, probability_holdout, holdout_df)

In [196]:
make_prediction_csv('Random Forest 300x15 v5')

Random Forest 300x15 v5
----------------------------------------
Training set:	Frac vacc: 0.42295 Avg score: 0.42345
Train_AUROC: 0.8017837145206113
sex_cd_F 0.805039668305984 0.7983939526952029 0.9917448594492674
sex_cd_M 0.805039668305984 0.805039668305984 1
race_cd_0.0 0.799932659080699 0.7828292058850407 0.9786188837253951
race_cd_1.0 0.799932659080699 0.799932659080699 1
race_cd_2.0 0.799932659080699 0.8187070882893337 1
race_cd_3.0 0.799932659080699 0.7568089430894308 0.9460908171434994
race_cd_4.0 0.799932659080699 0.7683615819209039 0.9605328313559828
race_cd_5.0 0.799932659080699 0.8324277952530741 1
race_cd_6.0 0.799932659080699 0.9841269841269841 1
AI_fairness: 0.9863319324082384
----------------------------------------
Holdout set:	Frac vacc: 0.41876 Avg score: 0.42321
----------------------------------------
File saved to: predictions/Holdout-Random Forest 300x15 v5.csv


In [197]:
make_prediction_csv('Random Forest 200x15 v5')

Random Forest 200x15 v5
----------------------------------------
Training set:	Frac vacc: 0.44030 Avg score: 0.42338
Train_AUROC: 0.8014444708126128
sex_cd_F 0.8045445582087978 0.7981550459489855 0.9920582245015271
sex_cd_M 0.8045445582087978 0.8045445582087978 1
race_cd_0.0 0.7994929341389959 0.785844832312894 0.9829290525990702
race_cd_1.0 0.7994929341389959 0.7994929341389959 1
race_cd_2.0 0.7994929341389959 0.8183656077748765 1
race_cd_3.0 0.7994929341389959 0.7520325203252033 0.9406368564533912
race_cd_4.0 0.7994929341389959 0.7654710287741426 0.9574456459687255
race_cd_5.0 0.7994929341389959 0.8307120388904775 1
race_cd_6.0 0.7994929341389959 0.9841269841269841 1
AI_fairness: 0.9858966421691905
----------------------------------------
Holdout set:	Frac vacc: 0.43656 Avg score: 0.42318
----------------------------------------
File saved to: predictions/Holdout-Random Forest 200x15 v5.csv


In [198]:
make_prediction_csv('Gradient Boost 50x8 v5')

Gradient Boost 50x8 v5
----------------------------------------
Training set:	Frac vacc: 0.44830 Avg score: 0.17365
Train_AUROC: 0.7063274949973939
sex_cd_F 0.7058586186555784 0.705388708786723 0.9993342719683009
sex_cd_M 0.7058586186555784 0.7058586186555784 1
race_cd_0.0 0.7017171725878572 0.7031435620944897 1
race_cd_1.0 0.7017171725878572 0.7017171725878572 1
race_cd_2.0 0.7017171725878572 0.7201672952325693 1
race_cd_3.0 0.7017171725878572 0.6265243902439024 0.8928446028096307
race_cd_4.0 0.7017171725878572 0.6736302719747734 0.9599740440874458
race_cd_5.0 0.7017171725878572 0.7103231341149557 1
race_cd_6.0 0.7017171725878572 0.9206349206349206 1
AI_fairness: 0.9835725465405974
----------------------------------------
Holdout set:	Frac vacc: 0.44884 Avg score: 0.17351
----------------------------------------
File saved to: predictions/Holdout-Gradient Boost 50x8 v5.csv


In [199]:
make_prediction_csv('Gradient Boost 100x8 v5')

Gradient Boost 100x8 v5
----------------------------------------
Training set:	Frac vacc: 0.44875 Avg score: 0.17352
Train_AUROC: 0.7277562942800816
sex_cd_F 0.7291247983435698 0.7254533208209724 0.9949645416930842
sex_cd_M 0.7291247983435698 0.7291247983435698 1
race_cd_0.0 0.7230320263976775 0.7131499588778214 0.9863324622436285
race_cd_1.0 0.7230320263976775 0.7230320263976775 1
race_cd_2.0 0.7230320263976775 0.7527681075331206 1
race_cd_3.0 0.7230320263976775 0.6626016260162602 0.9164208525001356
race_cd_4.0 0.7230320263976775 0.6943896991196952 0.9603858110951387
race_cd_5.0 0.7230320263976775 0.7466399771232486 1
race_cd_6.0 0.7230320263976775 0.9285714285714286 1
AI_fairness: 0.9842337408368874
----------------------------------------
Holdout set:	Frac vacc: 0.44797 Avg score: 0.17323
----------------------------------------
File saved to: predictions/Holdout-Gradient Boost 100x8 v5.csv


In [200]:
make_prediction_csv('Gradient Boost 200x8 v5')

Gradient Boost 200x8 v5
----------------------------------------
Training set:	Frac vacc: 0.44915 Avg score: 0.17316
Train_AUROC: 0.7537582292448548
sex_cd_F 0.756240682155467 0.7506814030906632 0.9926487966120012
sex_cd_M 0.756240682155467 0.756240682155467 1
race_cd_0.0 0.7498224602148217 0.7357671570867221 0.981255158555703
race_cd_1.0 0.7498224602148217 0.7498224602148217 1
race_cd_2.0 0.7498224602148217 0.7768319069722464 1
race_cd_3.0 0.7498224602148217 0.7093495934959348 0.94602340038295
race_cd_4.0 0.7498224602148217 0.7121271843384575 0.9497277317279017
race_cd_5.0 0.7498224602148217 0.7968735106281575 1
race_cd_6.0 0.7498224602148217 0.9285714285714286 1
AI_fairness: 0.9855172319198395
----------------------------------------
Holdout set:	Frac vacc: 0.44885 Avg score: 0.17268
----------------------------------------
File saved to: predictions/Holdout-Gradient Boost 200x8 v5.csv


## race_cd_3.0 has the lowest AI_fairness score