# WiDS Datathon 2021 - Clean Feature Selection

Evaluation metric: area under the Receiver Operating Characteristic (ROC) curve between the predicted and the observed target (`diabetes_mellitus`).

by : Sebastián Uribe Ocampo

## Libraries

In [306]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import seaborn as sns
sns.set_style("whitegrid")
#import plotly.express as px
# pd.options.plotting.backend = "plotly"
pd.set_option('display.max_rows', 60)
pd.set_option('display.max_columns', 150)

In [50]:
from scipy.stats import chi2_contingency

# UTILS

In [51]:
from IPython.display import display_html
def display_side_by_side(*args):
    """Render several DataFrames next to each other in the notebook output.

    Each positional argument must provide ``to_html()`` (e.g. a pandas
    DataFrame). The generated tables are concatenated and displayed as raw
    HTML with an inline display style so they sit on the same row.

    Only opening ``<table`` tags receive the style attribute. Replacing the
    bare word ``table`` would also rewrite the closing ``</table>`` tags and
    any cell text that happens to contain "table".
    """
    tables_html = ''.join(df.to_html() for df in args)
    inline_html = tables_html.replace('<table', '<table style="display:inline"')
    display_html(inline_html, raw=True)

# LOAD DATA

In [109]:
def features_in_category(data_info, category):
    """Return the unique variable names of one data-dictionary category.

    Rows whose "Category" equals ``category`` are ordered by "Data Type" and
    then "Variable Name" before the names are de-duplicated, so the returned
    list is grouped by data type.
    """
    in_category = data_info["Category"] == category
    ordered = data_info[in_category].sort_values(["Data Type", "Variable Name"])
    return ordered["Variable Name"].unique().tolist()


# The data dictionary is a semicolon-separated file describing every variable.
data_info = pd.read_csv("../docs/DataDictionaryWiDS2021.csv",sep=";")

demographic_features = features_in_category(data_info, "demographic")
# icu_admit_type is excluded from the demographic list; list.remove raises
# ValueError if the name is not present.
demographic_features.remove("icu_admit_type")
apache_comorbidity_features = features_in_category(data_info, "APACHE comorbidity")
apache_covariate_features = features_in_category(data_info, "APACHE covariate")
vitals_features = features_in_category(data_info, "vitals")
labs_features = features_in_category(data_info, "labs")
labs_bloodgas_features = features_in_category(data_info, "labs blood gas")

# Data dictionary: variable types

In [53]:
# Count how many data-dictionary rows carry each declared "Data Type".
data_info["Data Type"].value_counts()

numeric    151
binary      15
string       9
integer      6
Name: Data Type, dtype: int64

In [54]:
# Names of the variables whose data-dictionary type is "binary".
is_binary = data_info["Data Type"] == "binary"
data_info.loc[is_binary, "Variable Name"].unique()

array(['elective_surgery', 'readmission_status', 'apache_post_operative',
       'arf_apache', 'gcs_unable_apache', 'intubated_apache',
       'ventilated_apache', 'aids', 'cirrhosis', 'hepatic_failure',
       'immunosuppression', 'leukemia', 'lymphoma',
       'solid_tumor_with_metastasis', 'diabetes_mellitus'], dtype=object)

In [55]:
categorical_data = ['ethnicity', 'gender', 'hospital_admit_source','icu_admit_source', 'icu_stay_type', 'icu_type','apache_2_diagnosis', 'apache_3j_diagnosis']

# Data-dictionary "Data Type" -> dtype handed to read_csv. "binary" and
# "integer" are deliberately absent, so pandas infers those columns itself.
dtype_by_data_type = {"string": "object", "numeric": "float64"}

# One row per variable (the first occurrence wins), computed once instead of
# re-filtering the whole data dictionary several times per variable.
first_rows = data_info.drop_duplicates(subset="Variable Name")

dtype_dict = {}
for var, data_type in zip(first_rows["Variable Name"], first_rows["Data Type"]):
    if var in categorical_data:
        # Categorical variables override whatever type the dictionary declares.
        dtype_dict[var] = "category"
    elif data_type in dtype_by_data_type:
        dtype_dict[var] = dtype_by_data_type[data_type]

# hospital_id gets no explicit dtype; pop with a default never raises.
dtype_dict.pop('hospital_id', None)

In [56]:
# Display the column -> dtype mapping built from the data dictionary.
dtype_dict

{'age': 'float64',
 'bmi': 'float64',
 'ethnicity': 'category',
 'gender': 'category',
 'height': 'float64',
 'hospital_admit_source': 'category',
 'icu_admit_source': 'category',
 'icu_admit_type': 'object',
 'icu_stay_type': 'category',
 'icu_type': 'category',
 'pre_icu_los_days': 'float64',
 'weight': 'float64',
 'albumin_apache': 'float64',
 'apache_2_diagnosis': 'category',
 'apache_3j_diagnosis': 'category',
 'bilirubin_apache': 'float64',
 'bun_apache': 'float64',
 'creatinine_apache': 'float64',
 'fio2_apache': 'float64',
 'glucose_apache': 'float64',
 'heart_rate_apache': 'float64',
 'hematocrit_apache': 'float64',
 'map_apache': 'float64',
 'paco2_apache': 'float64',
 'paco2_for_ph_apache': 'float64',
 'pao2_apache': 'float64',
 'ph_apache': 'float64',
 'resprate_apache': 'float64',
 'sodium_apache': 'float64',
 'temp_apache': 'float64',
 'urineoutput_apache': 'float64',
 'wbc_apache': 'float64',
 'd1_diasbp_invasive_max': 'float64',
 'd1_diasbp_invasive_min': 'float64',
 'd

# DTypes

In [113]:
# Load the training set, applying the dtypes derived from the data dictionary.
diabetes = pd.read_csv(
    "../data/raw/TrainingWiDS2021.csv",
    dtype=dtype_dict,
)

In [58]:
# Check the dtype pandas assigned to each loaded column.
diabetes.dtypes

Unnamed: 0                       int64
encounter_id                     int64
hospital_id                      int64
age                            float64
bmi                            float64
                                ...   
immunosuppression                int64
leukemia                         int64
lymphoma                         int64
solid_tumor_with_metastasis      int64
diabetes_mellitus                int64
Length: 181, dtype: object

In [59]:
# Report the frame shape and the number of distinct encounters and hospitals.
n_encounters = diabetes["encounter_id"].nunique()
n_hospitals = diabetes["hospital_id"].nunique()
print(f"Train File Size   :  {diabetes.shape}")
print(f"# encounter_id    :  {n_encounters}")
print(f"# hospital_id     :  {n_hospitals}")

Train File Size   :  (130157, 181)
# encounter_id    :  130157
# hospital_id     :  204


In [60]:
# Summary of the loaded frame: row/column counts, dtype breakdown and memory usage.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 130157 entries, 0 to 130156
Columns: 181 entries, Unnamed: 0 to diabetes_mellitus
dtypes: category(8), float64(155), int64(18)
memory usage: 172.9 MB


## Clean

In [110]:
import logging
logging.basicConfig(level=logging.DEBUG)
from sklego.pandas_utils import log_step,log_step_extra
from collections import Counter

In [339]:
def count_target(df, **kwargs):
    """Summarise how often each value of ``diabetes_mellitus`` occurs.

    Returns the text ``"Target : "`` followed by the ``Counter`` built from
    ``df["diabetes_mellitus"]``. Extra keyword arguments are accepted and
    ignored.
    """
    target_counts = Counter(df["diabetes_mellitus"])
    return f"Target : {target_counts}"

@log_step_extra(count_target)
def start_pipe(df):
    """Begin the cleaning pipeline on a fresh frame indexed by ``encounter_id``.

    The input frame is left untouched; the returned frame is a copy in which
    ``encounter_id`` has moved from the columns to the index.
    """
    indexed = df.set_index("encounter_id")
    return indexed.copy()

# drop columns
@log_step
def drop_columns(df,perc_miss_col=97617):
    """Drop bookkeeping columns and columns with too few observed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Unnamed: 0", "hospital_id", "hospital_admit_source",
        "readmission_status" and "icu_id"; ``drop`` raises ``KeyError`` if any
        of them is missing.
    perc_miss_col : int
        Despite the name this is an absolute count, not a percentage: the
        minimum number of non-missing values a column needs in order to be
        kept (passed to ``dropna(thresh=...)``). The default 97617 is roughly
        75% of the 130157 training rows reported earlier in the notebook.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    unused_columns = [
        "Unnamed: 0",
        "hospital_id",
        "hospital_admit_source",
        "readmission_status",
        "icu_id",
    ]
    return df.drop(columns=unused_columns).dropna(thresh=perc_miss_col, axis=1)

# drop rows
@log_step
@log_step_extra(count_target)
def drop_rows(df,perc_miss_col=97617,min_non_null=50):
    """Drop rows that have too few observed values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    perc_miss_col : int
        Unused. Kept only so existing positional/keyword callers keep working.
    min_non_null : int
        Minimum number of non-missing values a row needs in order to be kept
        (passed to ``dropna(thresh=...)``). Defaults to 50, the value that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with at least ``min_non_null`` non-missing values.
    """
    return df.dropna(thresh=min_non_null)

# drop outliers
@log_step
@log_step_extra(count_target)
def drop_outliers(df):
    """Keep only rows that pass the age, weight and pre-ICU stay filters.

    A row is kept when ``age >= 18``, ``weight > 38.6`` and
    ``pre_icu_los_days < 30`` all hold. Comparisons against missing values
    evaluate to False, so rows with a missing value in any of these three
    columns are dropped as well. Negative ``pre_icu_los_days`` values pass
    the filter unchanged.
    """
    is_adult = df["age"] >= 18
    above_min_weight = df["weight"] > 38.6
    short_pre_icu_stay = df["pre_icu_los_days"] < 30
    keep = is_adult & above_min_weight & short_pre_icu_stay
    return df[keep]


# TODO: replace negative pre_icu_los_days values with 0 (not implemented yet)

In [340]:
# Run the cleaning steps in order on the raw training frame.
clean_diabetes = (
    diabetes
    .pipe(start_pipe)
    .pipe(drop_columns)
    .pipe(drop_rows)
    .pipe(drop_outliers)
)

[start_pipe(df)] Target : Counter({0: 102006, 1: 28151})
[drop_columns(df)] time=0:00:00.342973 n_obs=130157, n_col=101
[drop_rows(df)] Target : Counter({0: 101820, 1: 28125})
[drop_rows(df)] time=0:00:00.182999 n_obs=129945, n_col=101
[drop_outliers(df)] Target : Counter({0: 94116, 1: 26674})
[drop_outliers(df)] time=0:00:00.105028 n_obs=120790, n_col=101


In [333]:
# clean_diabetes[list(filter(lambda x:x in clean_diabetes.columns,demographic_features))]

## Hold OUT

In [342]:
# Separate the diabetes_mellitus label from the predictor columns.
y = clean_diabetes["diabetes_mellitus"]
X = clean_diabetes.drop(columns="diabetes_mellitus")

In [343]:
from sklearn.model_selection import train_test_split

In [344]:
# Stratified hold-out split: 20% of the rows go to the test set, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [352]:
# Persist each split with its label appended as the last column.
train_frame = pd.concat([X_train, y_train], axis=1)
test_frame = pd.concat([X_test, y_test], axis=1)
train_frame.to_csv("../data/processed/Train.csv")
test_frame.to_csv("../data/processed/Test.csv")

## Feature Selection

In [353]:
# Training features and label side by side in one frame.
df = pd.concat(
    [X_train, y_train],
    axis="columns",
)

In [363]:
# Pearson correlation of each numeric feature with the target (top 50).
# Numeric columns are selected explicitly: from pandas 2.0 on, DataFrame.corr
# no longer drops non-numeric (here: category) columns silently and raises.
df.select_dtypes(include=["number", "bool"]).corr()["diabetes_mellitus"].sort_values(ascending=False).head(50)

diabetes_mellitus           1.000000
d1_glucose_max              0.400148
glucose_apache              0.353342
bmi                         0.163943
d1_bun_max                  0.153020
bun_apache                  0.151339
weight                      0.149271
d1_bun_min                  0.143426
d1_glucose_min              0.139304
d1_creatinine_max           0.129310
d1_creatinine_min           0.127261
creatinine_apache           0.127096
arf_apache                  0.107027
d1_potassium_max            0.093732
age                         0.079158
d1_sysbp_max                0.074782
d1_sysbp_noninvasive_max    0.074721
h1_sysbp_noninvasive_max    0.047823
h1_sysbp_max                0.047403
d1_calcium_max              0.044451
d1_potassium_min            0.033585
h1_sysbp_noninvasive_min    0.028375
d1_sysbp_noninvasive_min    0.027452
d1_sysbp_min                0.027118
gcs_eyes_apache             0.027035
gcs_motor_apache            0.026435
h1_sysbp_min                0.026135
p

In [364]:
# Spearman rank correlation of each numeric feature with the target (top 50).
# Numeric columns are selected explicitly: from pandas 2.0 on, DataFrame.corr
# no longer drops non-numeric (here: category) columns silently and raises.
df.select_dtypes(include=["number", "bool"]).corr(method="spearman")["diabetes_mellitus"].sort_values(ascending=False).head(50)

diabetes_mellitus           1.000000
d1_glucose_max              0.409606
glucose_apache              0.323255
d1_bun_max                  0.180051
bun_apache                  0.179029
bmi                         0.170071
d1_creatinine_max           0.169626
d1_bun_min                  0.162381
creatinine_apache           0.159442
d1_creatinine_min           0.155606
weight                      0.147731
arf_apache                  0.107027
d1_glucose_min              0.101649
d1_potassium_max            0.089729
d1_sysbp_max                0.072501
d1_sysbp_noninvasive_max    0.072425
age                         0.068707
h1_sysbp_noninvasive_max    0.042351
d1_calcium_max              0.042220
h1_sysbp_max                0.042069
gcs_motor_apache            0.029142
pre_icu_los_days            0.026205
d1_platelets_min            0.026062
d1_potassium_min            0.025887
d1_heartrate_min            0.025147
h1_sysbp_noninvasive_min    0.024952
d1_platelets_max            0.024409
g

In [366]:
import phik
from phik import resources, report

In [368]:
%%time
# phik correlation of every column with the target (top 50). Unlike the
# Pearson/Spearman cells, the category columns are scored here too.
# NOTE(review): interval_cols is not passed, so phik guesses the interval
# columns (see the message in the output); the guess includes 0/1 flags such
# as elective_surgery and arf_apache - confirm this is intended.
df.phik_matrix()["diabetes_mellitus"].sort_values(ascending=False).head(50)

interval columns not set, guessing: ['age', 'bmi', 'elective_surgery', 'height', 'pre_icu_los_days', 'weight', 'apache_post_operative', 'arf_apache', 'bun_apache', 'creatinine_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'resprate_apache', 'sodium_apache', 'temp_apache', 'ventilated_apache', 'wbc_apache', 'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min', 'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_max', 'd1_mbp_min', 'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max', 'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_max', 'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'd1_temp_max', 'd1_temp_min', 'h1_diasbp_max', 'h1_diasbp_min', 'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min', 'h1_heartrate_max', 'h1_heartrate_min', 'h1_mbp_max', 'h1_mbp_mi

diabetes_mellitus            1.000000
d1_glucose_max               0.553597
glucose_apache               0.510630
apache_3j_diagnosis          0.327080
apache_2_diagnosis           0.297864
d1_glucose_min               0.272360
d1_bun_max                   0.228022
bun_apache                   0.226986
bmi                          0.221897
d1_bun_min                   0.210021
d1_creatinine_min            0.206200
creatinine_apache            0.204065
d1_creatinine_max            0.201152
weight                       0.196298
arf_apache                   0.167138
age                          0.149470
d1_potassium_max             0.124642
d1_hco3_min                  0.123358
d1_hemaglobin_max            0.111602
d1_hemaglobin_min            0.104049
d1_hematocrit_max            0.099967
d1_hematocrit_min            0.099546
d1_sysbp_max                 0.099306
d1_sysbp_noninvasive_max     0.099267
hematocrit_apache            0.095826
d1_sodium_min                0.090605
h1_diasbp_mi