In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
df_train = pd.read_csv('/kaggle/input/widsdatathon2020/training_v2.csv')
df_train.head()

In [None]:
# find the categorical features, binary, and numerical features in the dataset
cat_feature = [i for i in df_train.columns.drop('hospital_death') if (df_train.dtypes[i]=='object')]
bin_feature = [i for i in df_train.columns.drop('hospital_death') if df_train[i].isin([0.0,1.0,'nan']).all()]
num_feature = [i for i in df_train.columns.drop('hospital_death') 
               if i not in set(cat_feature).union(set(bin_feature))
               .union(set(['encounter_id','patient_id','hospital_id']))]

In [None]:
# filter out highly correlated features (pearson correlation>=0.8)
corr_mattrix = df_train[num_feature].corr()
upper = corr_mattrix.where(np.triu(np.ones(corr_mattrix.shape),k=1).astype(bool)).abs()
lower = corr_mattrix.where(np.tril(np.ones(corr_mattrix.shape),k=-1).astype(bool)).abs()
to_drop_col = [i for i in upper.columns if any(upper[i]>=0.80)]
to_drop_ind = [i for i in lower.index if any(lower[i]>=0.80)]
print(len(to_drop_col),len(to_drop_ind),to_drop_col==to_drop_ind)

In [None]:
# separate each highly correlated feature pairs, and find which correlated set of features has less missing values
f_col = [e for e in to_drop_col if e not in to_drop_ind]
f_ind = [e for e in to_drop_ind if e not in to_drop_col]
d = {'f_col': [to_drop_col,df_train[f_col].isnull().sum().mean()],
        'f_ind': [to_drop_ind,df_train[f_ind].isnull().sum().mean()]}
def select_feature(d):
    if d['f_col'][1] >= d['f_ind'][1]:
        return d['f_col'][0]
    else:
        return d['f_ind'][0]
to_drop = select_feature(d)
# only drop those highly correlated set within the pairs of correlation matrix with less missing values as found in the previous step
num_feature = [e for e in num_feature if e not in to_drop]
df_train[num_feature].corr().style.background_gradient('coolwarm')

In [None]:
from sklearn.preprocessing import StandardScaler
df_num = pd.DataFrame(StandardScaler().fit_transform(df_train[num_feature]),columns=num_feature)
df_num.head()

In [None]:
# feature selection by calculating f-value
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
# impute missing values in x with mean of column
X_num = SimpleImputer().fit_transform(df_num.values)
y = df_train.hospital_death.values
f_selector = SelectKBest(score_func=f_classif, k=15).fit(X_num,y)
print('Sorted f-value: ', np.sort(f_selector.scores_))
print('Sorted p-value: ', np.sort(f_selector.pvalues_))
print('Most important features: ', df_num.columns[f_selector.pvalues_<0.05])
print(len(df_num.columns[f_selector.pvalues_<0.05]), len(num_feature))
# select features with p-value < 0.05
num_feature = df_num.columns[f_selector.pvalues_<0.05]

In [None]:
# get dummies of cat_feature and concatenate with bin_feature
df_cat = pd.get_dummies(df_train[cat_feature])
# df_train[bin_feature]
df_cat = pd.concat([df_cat,df_train[bin_feature]],axis=1)
print(df_cat.shape)
df_cat.head()

In [None]:
# select categorical features by calculating chi-square
from sklearn.feature_selection import chi2
X_cat = SimpleImputer(strategy='most_frequent').fit_transform(df_cat.values)
chi_selector = SelectKBest(score_func=chi2,k=15).fit(X_cat,y)
print('Sorted chi2_value: ', np.sort(chi_selector.scores_))
print('Sorted p-value: ', np.sort(chi_selector.pvalues_))
print('The most important feature: ', df_cat.columns[chi_selector.pvalues_<0.05])
print(len(df_cat.columns[chi_selector.pvalues_<0.05]), len(df_cat.columns))
cat_feature = df_cat.columns[chi_selector.pvalues_<0.05]
df_cat = df_cat[cat_feature]
df_cat.head()

In [None]:
# drop rows where cat_features have missing values, the resultant df is 90676X125
df_model = pd.concat([df_train[['encounter_id','hospital_death']],df_num[num_feature],df_cat],axis=1)
df_model.drop(index=df_model.index[df_model[cat_feature].isnull().any(axis=1)],inplace=True)
print('Features that have over 70% of missing values in num_feature: ', num_feature[df_model[num_feature].isnull().sum()/len(df_model[num_feature])>0.7])
df_model.drop(columns=num_feature[df_model[num_feature].isnull().sum()/len(df_model[num_feature])>0.7],inplace=True)
print(df_model.shape)
df_model.head()

In [None]:
# feature selection using embedded method
from sklearn.ensemble import GradientBoostingClassifier
X_model = SimpleImputer().fit_transform(df_model.drop(columns=['encounter_id','hospital_death']).values)
y_model = df_model['hospital_death'].values
gbm_fit = GradientBoostingClassifier(random_state=0).fit(X_model,y_model)
zero_feature = df_model.drop(columns=['encounter_id','hospital_death']).columns[gbm_fit.feature_importances_==0]
print('Features with zero importance: ', zero_feature)
# drop columns in df_model with zero importance
df_model.drop(columns=zero_feature,inplace=True)
print(df_model.shape)
df_model.head()

In [None]:
df_model.drop(columns=['encounter_id','hospital_death']).columns[rfe_fit.support_]

In [None]:
# select features using wrapper method RFE
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeClassifier
X_model = SimpleImputer().fit_transform(df_model.drop(columns=['encounter_id','hospital_death']).values)
rfe_selector = RFECV(estimator=DecisionTreeClassifier(random_state=0),min_features_to_select=15,cv=3,verbose=1,n_jobs=-1)
rfe_fit = rfe_selector.fit(X_model,y_model)
print('Selected features: ', df_model.drop(columns=['encounter_id','hospital_death']).columns[rfe_fit.support_])
print('Feature ranking: ', rfe_fit.ranking_)
X_feature = df_model.drop(columns=['encounter_id','hospital_death']).columns[rfe_fit.support_]

In [None]:
# model selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
X_model = SimpleImputer().fit_transform(df_model[X_feature].values)
best_model = {}
model_name = ['logis','rdf','ada','gb']
models = [LogisticRegression(solver='saga',random_state=0),
          RandomForestClassifier(random_state=0),
          AdaBoostClassifier(random_state=0),
          GradientBoostingClassifier(random_state=0)]
params = [{'penalty': ['l1','l2'],
           'C': np.linspace(0.1,3,5)},
          {'n_estimators': [10,100],
           'max_features': [1,3,'auto']},
          {'n_estimators': [10,100],
           'learning_rate': np.linspace(0.1,3,5)}, 
          {'n_estimators': [10,100], 
           'learning_rate': np.linspace(0.1,3,5), 
           'max_features': [1,3,'auto']}]
for i,j,k in zip(model_name,models,params):
    grid = GridSearchCV(estimator=j,param_grid=k,scoring='roc_auc',cv=3,verbose=1,n_jobs=-1)
    best_model[i] = grid.fit(X_model,y_model)

In [None]:
# find the best model and corresponding parameters
best_estimator = model_name[np.argmax([best_model[i].best_score_ for i in model_name])]
model = best_model[best_estimator].best_estimator_
# model evaluation with cross validation
from sklearn.model_selection import KFold, cross_val_score
kf = KFold(n_splits=10,shuffle=True,random_state=0)
cv_result = cross_val_score(estimator=model, X=X_model, y=y_model, scoring='roc_auc', cv=kf, verbose=1, n_jobs=-1)
print('The selected model is: ', model)
print('The score of the selected model is: ', cv_result.mean())

In [None]:
df_test.columns==df_train.columns

In [None]:
# import unlabeled data
df_test = pd.read_csv('/kaggle/input/widsdatathon2020/unlabeled.csv')
print(df_test.shape)
df_test.head()

In [None]:
df_predict[X_feature].info()

In [None]:
# preprocessing test data before prediction
cat_feature_test = [i for i in df_test.columns.drop('hospital_death') if (df_test.dtypes[i]=='object')]
bin_feature_test = [i for i in df_test.columns.drop('hospital_death') if df_test[i].isin([0.0,1.0,'nan']).all()]
num_feature_test = [i for i in df_test.columns.drop('hospital_death') 
               if i not in set(cat_feature_test).union(set(bin_feature_test))
               .union(set(['encounter_id','patient_id','hospital_id']))]
df_num_test = pd.DataFrame(StandardScaler().fit_transform(df_test[num_feature_test]),columns=num_feature_test)
df_num_test = df_num_test[num_feature]
df_cat_test = pd.concat([pd.get_dummies(df_test[cat_feature_test]),df_test[bin_feature_test]],axis=1)
df_cat_test = df_cat_test[df_cat.columns]
df_predict = pd.concat([df_test[['encounter_id','hospital_death']],df_num_test,df_cat_test],axis=1)
print(df_predict.shape)
df_predict.head()

In [None]:
# predict hospital_death using adaboost and unlabeled data
X_test = SimpleImputer().fit_transform(df_predict[X_feature].values)
y_hat = model.predict_proba(X_test)
df_predict['hospital_death'] = y_hat
print(df_predict.shape)
df_predict.head()

In [None]:
df_predict[['encounter_id','hospital_death']].to_csv('submission_res.csv',index=False,float_format='%.2f')

In [None]:
ls

In [4]:
import pandas as pd
df_pred = pd.read_csv('./submission.csv')
df_pred.head()

Unnamed: 0,encounter_id,hospital_death
0,2,0
1,5,0
2,7,0
3,8,0
4,10,0
