In [None]:
import pandas as pd
import numpy as np
import codecs
import datetime
# Turn off pandas' chained-assignment check for the whole notebook; later cells
# add columns to frames that were derived from other frames.
pd.options.mode.chained_assignment = None

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn import metrics as sm
from sklearn.utils import resample
from imblearn.over_sampling import RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.metrics import average_precision_score
from scipy import interp
from scipy import stats
import itertools

In [None]:
# Load the cleaned training cohort. low_memory=False makes pandas parse the
# file in one pass instead of in chunks.
cohort_df = pd.read_csv(
    r"XXX/covid19_all_train_cleaned.csv",
    low_memory=False,
)

In [None]:
# Index the cohort by patient id; later cells rely on this index when they
# concatenate columns back onto the feature frame.
cohort_df = cohort_df.set_index(keys='PATIENT')

In [None]:
# Recode every cell equal to 999 as 0, in all columns.
# NOTE(review): 999 looks like a missing-value sentinel; confirm no column can
# legitimately hold the value 999.
cohort_df = cohort_df.replace(to_replace=999, value=0)

In [None]:
# Columns holding a single distinct value carry no information; collect their
# names first, then remove them in one drop.
constant_columns = [
    name for name in cohort_df.columns
    if len(cohort_df[name].unique()) == 1
]
cohort_df = cohort_df.drop(columns=constant_columns)
cohort_df.shape

In [None]:
# Columns left out of the univariate screening. covid_status is deliberately
# kept: the screening cell cross-tabulates every remaining column against it.
excluded_from_screening = [
    'train_vet_test', 'covid_hosp', 'death', 'covid_hosp_los',
    'covid_icu', 'covid_icu_los', 'covid_niv',
    'healthcare_total', 'HEALTHCARE_COVERAGE', 'Healthcare_Coverage_quint',
    'HEALTHCARE_EXPENSES', 'Healthcare_Expenses_quint', 'CITY',
]
univar_df = cohort_df.drop(columns=excluded_from_screening)

In [None]:
# Univariate screening: chi-square test of independence (correction=False, so
# no Yates continuity correction) between every column of univar_df and
# covid_status. covid_status itself is among the screened columns.
#
# Rows are gathered in a list and converted to a frame once. DataFrame.append
# was removed in pandas 2.0 and re-copied the whole frame on every iteration.
univar_rows = []
for variable in univar_df.columns:
    contingency = pd.crosstab(univar_df[variable], univar_df.covid_status)
    # Use distinct local names so the `chi2` imported from
    # sklearn.feature_selection is not overwritten by a float.
    chi2_stat, p_value, _dof, _expected = stats.chi2_contingency(
        contingency, correction=False
    )
    univar_rows.append({"Variable": variable, "Chi2": chi2_stat, "p_value": p_value})

# Round once at the end; the final values match rounding inside the loop.
univar = pd.DataFrame(univar_rows, columns=["Variable", "Chi2", "p_value"]).round(2)

In [None]:
# Rank variables by chi-square statistic, largest first, and keep the top 51
# rows.
univar2 = univar.sort_values(by='Chi2', ascending=False).iloc[:51]
univar2

In [None]:
# Build the predictor list from the ranked table. The target (covid_status) was
# screened against itself and normally ranks first, but skipping "row 0" by
# position would leak the target into the features (and drop a real predictor)
# if a tie or sort instability put it anywhere else. Exclude it by name and
# keep at most 50 predictors, which is what the positional skip produced.
select = [name for name in univar2["Variable"] if name != "covid_status"][:50]
select

In [None]:
# Keep only the screened-in predictors, in cohort_df's original column order.
is_selected = cohort_df.columns.isin(select)
data_all = cohort_df.loc[:, is_selected]

In [None]:
# Columns that stay numeric and are never one-hot encoded, however many
# distinct values they hold. (The original tuple listed 'obs_8302_2' twice.)
numeric_columns = {
    'age',
    'enco19_ambulatory', 'enco19_outpatient', 'enco19_urgentcare',
    'enco19_inpatient', 'enco19_emergency',
    'enco1518_wellness', 'enco1518_ambulatory', 'enco1518_outpatient',
    'enco1518_emergency', 'enco1518_inpatient',
    'obs_2708_6', 'obs_29463_7', 'obs_8302_2', 'obs_8310_5',
    'obs_8462_4', 'obs_8480_6', 'obs_8867_4',
    'obs_DALY', 'obs_QALY', 'obs_QOLS',
}

# Every other column with more than two distinct values is categorical.
# Plain `and` / `not in` replace the previous `&` / `~` on Python bools, which
# only gave the right answer through integer arithmetic (`~True == -2`) and is
# deprecated since Python 3.12.
categorical_columns = [
    name for name in data_all.columns
    if name not in numeric_columns and len(data_all[name].unique()) > 2
]

# One get_dummies call yields the same columns in the same order as encoding
# one column at a time, without copying the frame per column.
if categorical_columns:
    data_all = pd.get_dummies(data=data_all, columns=categorical_columns, drop_first=True)

In [None]:
# One-hot encode GENDER; drop_first removes the first level, which becomes the
# reference category.
data_all = pd.get_dummies(data_all, columns=['GENDER'], drop_first=True)

In [None]:
# One-hot encode ETHNICITY; drop_first removes the first level, which becomes
# the reference category.
data_all = pd.get_dummies(data_all, columns=['ETHNICITY'], drop_first=True)

In [None]:
# Append the target and the split flag (in that order) as the two right-most
# columns; rows are aligned on the shared PATIENT index.
target_and_split = cohort_df[["covid_status", "train_vet_test"]]
data_all = pd.concat([data_all, target_and_split], axis=1)

In [None]:
# Model-fitting partition: rows whose train_vet_test flag is 0, with the flag
# column removed so covid_status is the last column.
is_train_row = data_all['train_vet_test'] == 0
covid_train = data_all.loc[is_train_row].drop(columns=['train_vet_test'])
covid_train.shape

In [None]:
# Hold-out partition used to score the tuned model: rows whose train_vet_test
# flag is 1, with the flag column removed so covid_status is the last column.
is_holdout_row = data_all['train_vet_test'] == 1
covid_test = data_all.loc[is_holdout_row].drop(columns=['train_vet_test'])
covid_test.shape

In [None]:
# Split the training partition into a feature matrix and an integer label
# vector. covid_status is the last column, so everything before it is a feature.
column = covid_train.columns
train_data = covid_train.values
covid_train_x = train_data[:, :-1]
covid_train_y_covid = train_data[:, -1].astype('int')
covid_train_y_covid

In [None]:
# Split the hold-out partition the same way: every column except the last is a
# feature, and the last column (covid_status) is the integer label.
column_test = covid_test.columns
test_data = covid_test.values
covid_test_x = test_data[:, :-1]
covid_test_y_covid = test_data[:, -1].astype('int')
covid_test_y_covid

In [None]:
from sklearn.model_selection import RandomizedSearchCV
# Candidate hyperparameter values for the random-forest search.
# n_estimators (100..1000 in steps of 100, plus 10) is built here, but the grid
# below pins the forest size to 200 and does not use it.
n_estimators = np.linspace(start=100, stop=1000, num=10).astype(int).tolist()
n_estimators.append(10)

# Tree depths 10..100 in steps of 10 (None / unlimited depth is not searched).
max_depth = np.linspace(10, 100, num=10).astype(int).tolist()

random_grid = dict(
    n_estimators=[200],
    max_depth=max_depth,
    max_features=["log2", "sqrt"],
)
print(random_grid)

In [None]:
# Randomized hyperparameter search: 10 candidates sampled from random_grid,
# each scored with 5-fold cross-validation on the training partition.
# Both the forest and the candidate sampling are seeded; without this,
# best_params_ and every metric computed from `clf` change on each run.
estimator = RandomForestClassifier(random_state=42)
clf = RandomizedSearchCV(
    estimator,
    random_grid,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
)
clf.fit(covid_train_x, covid_train_y_covid)
clf.best_params_

In [None]:
# Show the per-candidate cross-validation results recorded by the search.
clf.cv_results_

In [None]:
# Hard-label predictions on the hold-out partition, scored with accuracy,
# recall and precision (computed in that order).
y_pred = clf.predict(covid_test_x)
accuracy, recall, precision = (
    score(covid_test_y_covid, y_pred)
    for score in (sm.accuracy_score, sm.recall_score, sm.precision_score)
)

In [None]:
# Show the hold-out accuracy, recall and precision computed in the previous cell.
accuracy, recall, precision

In [None]:
# Hold-out ROC AUC from the second predict_proba column.
# NOTE(review): column 1 is the positive class only if the labels are 0/1 - confirm.
probas_ = clf.predict_proba(covid_test_x)
positive_scores = probas_[:, 1]
fpr, tpr, thresholds = roc_curve(covid_test_y_covid, positive_scores)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# Hold-out average precision, computed from the same second-column scores that
# the ROC cell uses.
auprc = average_precision_score(
    y_true=covid_test_y_covid,
    y_score=probas_[:, 1],
)
auprc

In [None]:
# Feature importances of the best estimator, most important first.
# `column` is covid_train.columns: all features followed by the target
# (covid_status). Only that one trailing name is dropped; slicing off two left
# the name list one shorter than feature_importances_, which makes the
# DataFrame constructor raise.
feature_names = column[:-1]
importances = clf.best_estimator_.feature_importances_
df = (
    pd.DataFrame({'Var': feature_names, 'vip': importances})
    .sort_values(by='vip', ascending=False)
)
df

### Re-run the model using all the training data

In [None]:
# Final-model frame: every row of data_all (both partitions) with the split
# flag removed, leaving covid_status as the last column.
covid_train_all = data_all.drop(columns=['train_vet_test'])
covid_train_all.shape

In [None]:
# Show the full modelling frame (features plus covid_status) for inspection.
covid_train_all

In [None]:
# Feature matrix and integer labels for the final fit on all rows.
# column__total keeps the column names: all features followed by covid_status.
column__total = covid_train_all.columns
covid_train_total = covid_train_all.values
covid_all_x = covid_train_total[:, :-1]
covid_all_y_covid = covid_train_total[:, -1].astype('int')
covid_all_y_covid

In [None]:
# Final model, fitted on every available row.
# NOTE(review): the hyperparameters are hard-coded; confirm they match
# clf.best_params_ from the search above.
# random_state is fixed so the exported probabilities are reproducible.
clf_final = RandomForestClassifier(
    n_estimators=200,
    max_features='log2',
    max_depth=10,
    random_state=42,
)
clf_final.fit(covid_all_x, covid_all_y_covid)

In [None]:
# In-sample probabilities from the final model: store the second predict_proba
# column on covid_train_all, then build a one-column outcome frame tagged
# train_test = 1.
covid_pred_train = clf_final.predict_proba(covid_all_x)
covid_train_all['covid_predict'] = covid_pred_train[:, 1]
train_outcome_covid = covid_train_all[['covid_predict']].assign(train_test=1)
train_outcome_covid

In [None]:
# ROC AUC of the final model on the same rows it was fitted on. This is an
# in-sample figure, so it is optimistic rather than a generalisation estimate.
probas_ = clf_final.predict_proba(covid_all_x)
in_sample_scores = probas_[:, 1]
fpr, tpr, thresholds = roc_curve(covid_all_y_covid, in_sample_scores)
roc_auc = auc(fpr, tpr)
roc_auc

In [None]:
# In-sample average precision of the final model, from the same second-column
# scores as the ROC cell above.
auprc = average_precision_score(
    y_true=covid_all_y_covid,
    y_score=probas_[:, 1],
)
auprc

### Predict the test set

In [None]:
# Load the cleaned external test cohort. low_memory=False makes pandas parse
# the file in one pass instead of in chunks.
test_df = pd.read_csv(
    r"XXX/covid19_all_test_cleaned.csv",
    low_memory=False,
)

In [None]:
# Index the test cohort by patient id, as was done for the training cohort.
test_df = test_df.set_index(keys='PATIENT')

In [None]:
# Keep only the predictors chosen by the univariate screening. (An earlier
# variant filtered on cohort_df.columns instead of `select`.)
is_selected_test = test_df.columns.isin(select)
test_df = test_df.loc[:, is_selected_test]

In [None]:
# Apply the same recoding as for the training cohort: every cell equal to 999
# becomes 0.
test_df = test_df.replace(to_replace=999, value=0)

In [None]:
# Remove cost and location columns if any are still present.
# These columns were excluded before the univariate screening, so they cannot
# be in `select`, and test_df has already been filtered to `select`. With the
# default errors='raise' this drop therefore fails with KeyError;
# errors='ignore' makes it a safe no-op for absent columns.
non_feature_columns = [
    'healthcare_total', 'HEALTHCARE_COVERAGE', 'Healthcare_Coverage_quint',
    'HEALTHCARE_EXPENSES', 'Healthcare_Expenses_quint', 'CITY',
]
test_df = test_df.drop(columns=non_feature_columns, errors='ignore')

In [None]:
# Columns that stay numeric and are never one-hot encoded, however many
# distinct values they hold. (The original tuple listed 'obs_8302_2' twice.)
numeric_columns = {
    'age',
    'enco19_ambulatory', 'enco19_outpatient', 'enco19_urgentcare',
    'enco19_inpatient', 'enco19_emergency',
    'enco1518_wellness', 'enco1518_ambulatory', 'enco1518_outpatient',
    'enco1518_emergency', 'enco1518_inpatient',
    'obs_2708_6', 'obs_29463_7', 'obs_8302_2', 'obs_8310_5',
    'obs_8462_4', 'obs_8480_6', 'obs_8867_4',
    'obs_DALY', 'obs_QALY', 'obs_QOLS',
}

# Every other column with more than two distinct values in the test data is
# treated as categorical. Plain `and` / `not in` replace the previous `&` / `~`
# on Python bools, which only gave the right answer through integer arithmetic
# (`~True == -2`) and is deprecated since Python 3.12.
test_categorical_columns = [
    name for name in test_df.columns
    if name not in numeric_columns and len(test_df[name].unique()) > 2
]

# One get_dummies call yields the same columns in the same order as encoding
# one column at a time, without copying the frame per column.
if test_categorical_columns:
    test_df = pd.get_dummies(data=test_df, columns=test_categorical_columns, drop_first=True)

In [None]:
# One-hot encode GENDER in the test cohort; drop_first removes the first level
# found in the test data.
test_df = pd.get_dummies(test_df, columns=['GENDER'], drop_first=True)

In [None]:
# One-hot encode ETHNICITY in the test cohort; drop_first removes the first
# level found in the test data.
test_df = pd.get_dummies(test_df, columns=['ETHNICITY'], drop_first=True)

In [None]:
# Align the test features with the columns clf_final was fitted on.
# The test dummies were derived from the test data alone, so a category level
# missing from (or only present in) the test cohort, or a different column
# order, would either make predict_proba fail on the feature count or silently
# feed values into the wrong feature. column__total holds the training column
# names: all features followed by covid_status.
train_feature_names = column__total[:-1]
missing_in_test = [name for name in train_feature_names if name not in test_df.columns]
extra_in_test = [name for name in test_df.columns if name not in train_feature_names]
if missing_in_test or extra_in_test:
    # Surface the mismatch instead of fixing it silently.
    # NOTE(review): a level that was the dropped baseline in the test encoding
    # but not in the training encoding cannot be recovered by this step.
    print(f"Feature mismatch vs training: {len(missing_in_test)} missing (filled with 0), "
          f"{len(extra_in_test)} extra (dropped)")

# reindex is a no-op when the columns already match; otherwise it orders them
# like the training matrix, adds absent ones as 0 and drops unknown ones.
test_features = test_df.reindex(columns=train_feature_names, fill_value=0)
covid_test_total = test_features.values
column_test = test_features.columns
covid_test_all_x = covid_test_total[:, :]

In [None]:
# Class probabilities for every test patient from the final model
# (one column per class).
covid_pred_test = clf_final.predict_proba(X=covid_test_all_x)

In [None]:
# Store the second predict_proba column on test_df, then build the one-column
# outcome frame tagged train_test = 0 (the training outcome frame uses 1).
test_df['covid_predict'] = covid_pred_test[:, 1]
test_outcome_covid = test_df[['covid_predict']].assign(train_test=0)
test_outcome_covid