In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
import seaborn as sns

In [2]:
# Load the training split; the relative path means C2T1_Train.csv must sit in the working directory.
df = pd.read_csv('C2T1_Train.csv')

General insight of data

In [127]:
# Preview the first rows of the training frame.
df.head()

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,5283,48330653,Caucasian,Female,[80-90),?,2,1,4,13,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
1,8499,63555809,Caucasian,Female,[90-100),?,3,3,4,12,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
2,9441,42519137,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
3,20997,89868902,AfricanAmerican,Female,[40-50),?,1,1,7,9,...,No,Steady,No,No,No,No,No,No,Yes,>30
4,28515,82637321,Caucasian,Male,[50-60),?,2,1,2,3,...,No,Steady,No,No,No,No,No,No,Yes,>30


Look at the shape of data

In [128]:
# (number of rows, number of columns) of the training frame.
df.shape

(90766, 50)

Check missing value in dataset

In [129]:
# For every object-typed column, print the fraction of rows whose (stripped) value
# is one of the placeholders used for missing data: "?" or "Unknown/Invalid".
for col in df.columns:
    if df[col].dtype != object:
        continue
    stripped_text = df[col].astype(str).str.strip()
    placeholder_count = int(stripped_text.isin(["?", "Unknown/Invalid"]).sum())
    print(col, placeholder_count / df.shape[0])

race 0.024315272238503403
gender 3.305202388559593e-05
age 0.0
weight 0.9677632593702488
payer_code 0.42670162836304343
medical_specialty 0.48117136372650554
diag_1 0.00018729480201837694
diag_2 0.003701826675186744
diag_3 0.014950532137584557
max_glu_serum 0.0
A1Cresult 0.0
metformin 0.0
repaglinide 0.0
nateglinide 0.0
chlorpropamide 0.0
glimepiride 0.0
acetohexamide 0.0
glipizide 0.0
glyburide 0.0
tolbutamide 0.0
pioglitazone 0.0
rosiglitazone 0.0
acarbose 0.0
miglitol 0.0
troglitazone 0.0
tolazamide 0.0
examide 0.0
citoglipton 0.0
insulin 0.0
glyburide-metformin 0.0
glipizide-metformin 0.0
glimepiride-pioglitazone 0.0
metformin-rosiglitazone 0.0
metformin-pioglitazone 0.0
change 0.0
diabetesMed 0.0
readmitted 0.0


Drop the features with more than 40% missing values

In [3]:
# Remove the three columns reported above with a missing-value share over 40%.
df = df.drop(columns=['weight', 'payer_code', 'medical_specialty'])

Look at nunique values

In [131]:
# Per-column overview: number of distinct values next to the pandas dtype.
unique_counts = df.nunique()
pd.DataFrame({"unique_values": unique_counts, "data_type": df.dtypes})

Unnamed: 0,unique_values,data_type
encounter_id2,90766,int64
patient_nbr2,65765,int64
race,6,object
gender,3,object
age,10,object
admission_type_id,8,int64
discharge_disposition_id,26,int64
admission_source_id,17,int64
time_in_hospital,14,int64
num_lab_procedures,118,int64


Drop the features that contain only one value

In [4]:
# citoglipton and examide are removed from the feature set.
df = df.drop(columns=['citoglipton', 'examide'])

Drop the rows with missing values

In [5]:
# Remove every row in which a diagnosis code, race, or gender holds a missing-value
# placeholder ('?' for diag_1/diag_2/diag_3 and race, 'Unknown/Invalid' for gender).
#
# Rows are selected with a boolean mask and .loc.  The previous version collected index
# *labels* and passed them to .iloc, which is positional: that is only correct while the
# index is a gap-free 0..n-1 range, and picks wrong rows (or raises) otherwise, e.g.
# when the cell is run a second time.
missing_diag = df[['diag_1', 'diag_2', 'diag_3']].eq('?').any(axis=1)
missing_race = df['race'].eq('?')
missing_gender = df['gender'].eq('Unknown/Invalid')
# A filter on discharge_disposition_id == 11 was commented out here and is still not applied.
rows_to_drop = missing_diag | missing_race | missing_gender

# Kept under their original names in case later cells inspect them.
drop_Idx = set(df.index[rows_to_drop])
new_Idx = list(df.index[~rows_to_drop])

df = df.loc[~rows_to_drop]

Merge ID codes that map to the same category into a single code

In [6]:
# Collapse several ID codes of each column onto one representative code:
#   admission_type_id:        8, 6           -> 5
#   discharge_disposition_id: 25, 26         -> 18
#   admission_source_id:      15, 20, 21, 17 -> 9
df['admission_type_id'] = df['admission_type_id'].replace([8, 6], 5)
df['discharge_disposition_id'] = df['discharge_disposition_id'].replace([25, 26], 18)
df['admission_source_id'] = df['admission_source_id'].replace([15, 20, 21, 17], 9)

Use binning to categorize diag_1, diag_2, and diag_3

In [7]:
# In each diagnosis column, any code containing "V" becomes 0 and, after that,
# any remaining code containing "E" becomes 1; all other codes are left untouched.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    v_replaced = df[diag_col].replace(regex=r'(^.*V.*$)', value=0)
    df[diag_col] = v_replaced.replace(regex=r'(^.*E.*$)', value=1)

In [8]:
from sklearn.preprocessing import KBinsDiscretizer

# One shared discretizer (9 quantile bins requested, ordinal output); it is refitted
# for each diagnosis column in turn.
est = KBinsDiscretizer(n_bins=9, encode='ordinal', strategy='quantile')

# Fit on diag_1, replace the column with its bin index, then store it as a category.
diag_1_binned = est.fit(df[['diag_1']]).transform(df[['diag_1']])
df['diag_1'] = diag_1_binned
df['diag_1'] = df['diag_1'].astype('category')

In [9]:
# Refit the shared discretizer on diag_2, replace the column with its bin index,
# then store it as a category.
diag_2_binned = est.fit(df[['diag_2']]).transform(df[['diag_2']])
df['diag_2'] = diag_2_binned
df['diag_2'] = df['diag_2'].astype('category')

In [10]:
# Refit the shared discretizer on diag_3, replace the column with its bin index,
# then store it as a category.  After this cell `est` holds the diag_3 fit.
diag_3_binned = est.fit(df[['diag_3']]).transform(df[['diag_3']])
df['diag_3'] = diag_3_binned
df['diag_3'] = df['diag_3'].astype('category')

Look at unique_values again

In [139]:
# Same per-column overview as before, recomputed on the cleaned frame.
unique_counts_clean = df.nunique()
pd.DataFrame({"unique_values": unique_counts_clean, "data_type": df.dtypes})

Unnamed: 0,unique_values,data_type
encounter_id2,87194,int64
patient_nbr2,62953,int64
race,5,object
gender,2,object
age,10,object
admission_type_id,6,int64
discharge_disposition_id,25,int64
admission_source_id,15,int64
time_in_hospital,14,int64
num_lab_procedures,118,int64


Drop the `metformin-rosiglitazone` feature

In [11]:
# Remove the metformin-rosiglitazone column from the feature set.
df = df.drop(columns=['metformin-rosiglitazone'])

Count readmitted No, >30, <30

Check if it is balanced

In [141]:
# Number of rows per target class (NO, >30, <30) to judge class balance.
df['readmitted'].value_counts()

NO     46929
>30    30372
<30     9893
Name: readmitted, dtype: int64

# Drop rows that fall outside the 1.5 * IQR fences in any of the count features.
# NOTE(review): this cell carries no execution count, and the row totals reported
# further down (87194 rows before the split) suggest it was not applied when the
# recorded results were produced -- confirm before relying on it.
cols = [
    "num_lab_procedures", "num_procedures", "num_medications", "number_outpatient",
    "number_emergency", "number_inpatient", "number_diagnoses",
]
Q1 = df[cols].quantile(0.25)
Q3 = df[cols].quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A row is an outlier as soon as one of its values lies strictly outside the fences.
outlier_rows = (df[cols].lt(lower_fence) | df[cols].gt(upper_fence)).any(axis=1)
df = df[~outlier_rows]

Encode the categorical features as integer codes

In [12]:
# Convert every object-typed column to the pandas category dtype, one column at a time.
for text_col in df.select_dtypes(['object']).columns:
    df[text_col] = df[text_col].astype('category')

In [13]:
# Replace each categorical feature by its integer category code.  The target
# column 'readmitted' is left out and keeps its labels.
encodecols = df.select_dtypes(['category']).drop(columns='readmitted')
for feature in encodecols.columns:
    df[feature] = encodecols[feature].cat.codes

Show the correlation

In [144]:
from matplotlib.colors import ListedColormap

# Light sequential colormap built from a 50-step seaborn palette (HUSL colour input).
my_cmap = ListedColormap(sns.light_palette((250, 100, 50), input="husl", n_colors=50).as_hex())

# Pearson correlation between the numeric columns only.  'readmitted' is still a
# categorical column of string labels at this point; recent pandas versions no longer
# drop such columns silently in DataFrame.corr and raise instead, so they are
# excluded explicitly (this works the same on older pandas versions).
table = df.select_dtypes(include='number').corr(method='pearson')
table.style.background_gradient(cmap=my_cmap, axis = 0)

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-pioglitazone,change,diabetesMed
encounter_id2,1.0,0.524871,0.100175,0.006058,0.063163,-0.164827,-0.130571,-0.018838,-0.071175,-0.035524,-0.017918,0.069102,0.112046,0.085817,0.028199,0.021345,0.072429,0.046741,0.264808,0.03241,0.018108,0.032727,0.024001,0.020123,-0.019571,0.029868,-0.002413,-0.015095,-0.065669,-0.009517,0.045582,-0.054789,-0.001525,-0.000429,-0.00879,-0.017377,0.053236,0.036448,0.00012,0.002193,0.007511,-0.096999,0.054134
patient_nbr2,0.524871,1.0,0.152504,0.00851,0.067137,-0.026754,-0.141802,0.020145,-0.031669,0.004726,-0.020897,0.013642,0.097969,0.059008,0.005191,0.019889,0.042188,0.031858,0.214723,0.013066,0.023855,0.010144,0.049742,0.021239,-0.011609,0.02349,-0.002738,0.002439,-0.049286,-0.003492,0.040026,-0.011206,0.007889,0.005902,-0.00194,-0.015647,0.007456,0.030416,-0.002708,-0.001034,-0.001036,-0.057107,0.023793
race,0.100175,0.152504,1.0,0.068404,0.131236,0.10927,-0.008435,-0.015464,-0.020382,-0.02583,0.029461,0.033258,0.044454,-0.021764,-0.028109,0.025312,0.038165,0.005712,0.072796,-0.015735,-0.005747,0.016378,0.021987,-0.009716,0.006888,0.015243,0.001348,0.020808,0.013349,0.004278,0.024943,0.003727,0.01432,-0.001939,0.002335,0.000668,-0.018284,0.010136,0.003813,0.001348,0.001348,-0.014815,0.003813
gender,0.006058,0.00851,0.068404,1.0,-0.053887,0.015681,-0.021506,-0.012114,-0.02789,-0.00259,0.062285,-0.020991,-0.013016,-0.018666,-0.012992,-0.039879,0.003677,0.003932,-0.003543,0.000927,-0.018245,0.000519,-0.007054,-0.006182,0.004271,0.00069,-0.003135,0.02557,0.026588,-0.001868,0.005328,0.009317,0.013135,1e-06,0.006336,0.009093,0.002138,0.006696,0.003142,-0.003135,-0.003135,-0.012415,0.014328
age,0.063163,0.067137,0.131236,-0.053887,1.0,-0.002695,0.11966,0.04855,0.097866,0.011585,-0.043652,0.020124,0.018863,-0.085258,-0.0489,0.044818,0.105878,0.072592,0.205088,-0.000834,0.057691,-0.059589,0.043726,0.014969,0.012882,0.030222,0.001909,0.040402,0.060465,0.013875,0.009376,-0.000238,0.007827,0.005941,-0.001704,0.006669,-0.020924,-0.004551,0.001564,-0.000261,-0.000261,0.038238,-0.022146
admission_type_id,-0.164827,-0.026754,0.10927,0.015681,-0.002695,1.0,0.048848,-0.247256,-0.008902,-0.181543,0.135876,0.090773,0.020042,-0.015364,-0.039029,-0.009957,-0.02359,-0.016216,-0.118316,-0.104606,0.01777,0.015657,-0.002926,-0.009049,0.007558,-0.000951,-0.002534,0.010123,0.001789,0.004569,0.015508,0.021849,0.004679,0.000848,0.001694,0.005181,-0.018609,-0.001558,-0.003443,-0.002534,0.002734,-0.010194,0.000507
discharge_disposition_id,-0.130571,-0.141802,-0.008435,-0.021506,0.11966,0.048848,1.0,-0.002769,0.168598,0.024582,0.016927,0.11242,-0.014998,-0.027434,0.021595,0.013768,0.02889,0.030444,0.040699,-0.015325,0.006851,-0.008799,-0.008287,-0.011616,0.018852,-0.018295,0.012033,-0.015795,0.041944,0.001497,-0.012689,-0.004931,0.005992,0.004139,0.00448,0.010923,-0.02507,-0.003768,1.5e-05,-0.001807,-0.000489,0.014469,-0.033822
admission_source_id,-0.018838,0.020145,-0.015464,-0.012114,0.04855,-0.247256,-0.002769,1.0,0.000243,0.152994,-0.199706,-0.098276,0.02036,0.067022,0.060632,0.037553,0.019556,0.022309,0.104319,-0.0561,-0.020462,-0.036702,0.008533,-0.013421,-0.003576,-0.023871,0.002063,0.004167,-0.004086,0.003121,-0.018268,-0.018283,0.001321,0.001769,0.003573,0.002635,0.001581,-0.017214,0.000799,0.002063,-0.00506,-0.005425,0.002296
time_in_hospital,-0.071175,-0.031669,-0.020382,-0.02789,0.097866,-0.008902,0.168598,0.000243,1.0,0.315877,0.193938,0.461201,-0.014025,-0.011633,0.071063,-0.108181,0.098501,0.123182,0.2111,-0.037282,-0.028712,-0.008734,0.029144,0.004811,0.003924,0.011532,0.010776,0.00975,0.016225,0.003338,0.006228,0.005734,0.005979,0.003953,0.0037,-0.003499,0.048937,-0.000595,0.002191,-0.002747,0.001761,-0.107881,0.062492
num_lab_procedures,-0.035524,0.004726,-0.02583,-0.00259,0.011585,-0.181543,0.024582,0.152994,0.315877,1.0,0.062136,0.264945,-0.013611,-0.006651,0.039522,-0.057102,0.044459,0.072719,0.139062,0.015465,-0.119412,-0.044756,0.011805,-0.004959,0.001074,-0.000755,0.004284,0.008927,-0.005309,-0.000759,-0.014965,-0.009342,0.000806,-0.003773,0.004071,0.000648,0.034314,-0.012953,-0.00707,-0.000835,-0.003395,-0.055942,0.029051


2 label model

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,OrdinalEncoder

# Features: everything except the target and the two identifier columns.
binary_excluded_cols = ['readmitted', 'encounter_id2', 'patient_nbr2']
xtrain = df.drop(columns=binary_excluded_cols)

# Binary target: both readmission windows ('>30' and '<30') are merged into 'Yes'.
ytrain = df[["readmitted"]].replace({'>30':'Yes','<30':'Yes'})

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    xtrain,
    ytrain,
    test_size=0.2,
    random_state=22,
)

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Random forest for the binary target, wrapped in a single-step pipeline named 'model'.
binary_forest = RandomForestClassifier(
    max_depth=500,
    min_samples_split=50,
    class_weight='balanced',
    random_state=0,
)
pip = Pipeline(steps=[('model', binary_forest)])

In [22]:
# y_train is a one-column DataFrame; flatten it to 1-D labels so the forest is not
# handed a column vector (which scikit-learn converts with a DataConversionWarning).
pip.fit(X_train, np.ravel(y_train))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [23]:
# Score of the fitted pipeline on the held-out split (for a classifier: mean accuracy).
pip.score(X_test, y_test)

0.6430414587992431

In [24]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee that
# `sklearn.metrics` is loaded.  This form still binds the name `sklearn`.
import sklearn.metrics

# Confusion matrix of the binary model on the held-out split
# (rows: true class, columns: predicted class).
y_pred = pip.predict(X_test)
sklearn.metrics.confusion_matrix(y_test,y_pred)

array([[6375, 3048],
       [3177, 4839]], dtype=int64)

3 label model

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,OrdinalEncoder

# Features: everything except the target and the two identifier columns.
multiclass_excluded_cols = ['readmitted', 'encounter_id2', 'patient_nbr2']
xtrain = df.drop(columns=multiclass_excluded_cols)

# Three-class target: the 'readmitted' labels are used as they are.
ytrain = df[["readmitted"]]

# 80/20 hold-out split with a fixed seed (different from the binary experiment's seed).
X_train, X_test, y_train, y_test = train_test_split(
    xtrain,
    ytrain,
    test_size=0.2,
    random_state=42,
)

In [28]:
from sklearn.ensemble import RandomForestClassifier

# Random forest for the three-class target, wrapped in a single-step pipeline named
# 'model'.  This rebinds `pip`, replacing the binary pipeline.
multiclass_forest = RandomForestClassifier(
    max_depth=28,
    min_samples_split=10,
    class_weight='balanced_subsample',
    random_state=0,
)
pip = Pipeline(steps=[('model', multiclass_forest)])

In [29]:
# y_train is a one-column DataFrame; flatten it to 1-D labels so the forest is not
# handed a column vector (which scikit-learn converts with a DataConversionWarning).
pip.fit(X_train, np.ravel(y_train))

  self._final_estimator.fit(Xt, y, **fit_params_last_step)


In [30]:
# Score of the three-class pipeline on the held-out split (for a classifier: mean accuracy).
pip.score(X_test, y_test)

0.5727966053099375

In [31]:
# Confusion matrix of the three-class model on the held-out split.
# The name `sklearn` comes from the import in the earlier confusion-matrix cell.
y_pred = pip.predict(X_test)
sklearn.metrics.confusion_matrix(y_test,y_pred)

array([[ 155,  853,  939],
       [ 220, 2798, 3019],
       [ 133, 2286, 7036]], dtype=int64)

In [32]:
# Importance score of each feature in the fitted forest: one value per column,
# in the column order of the frame the model was fitted on.
pip.named_steps['model'].feature_importances_

array([2.40411625e-02, 1.82550818e-02, 5.39086174e-02, 2.83211824e-02,
       5.32280859e-02, 2.65540163e-02, 6.21180190e-02, 1.04844904e-01,
       4.38366937e-02, 8.96903448e-02, 2.14491641e-02, 1.89406452e-02,
       6.12167959e-02, 6.08845583e-02, 6.26834145e-02, 6.16003895e-02,
       4.34940150e-02, 9.05105919e-03, 2.24822495e-02, 1.52578386e-02,
       4.09957140e-03, 2.07572800e-03, 2.58784086e-04, 9.25808260e-03,
       0.00000000e+00, 1.55660326e-02, 1.39136566e-02, 6.27272776e-05,
       1.01092012e-02, 9.12969311e-03, 8.64018925e-04, 8.52653050e-05,
       7.46746663e-06, 1.33643569e-04, 2.92294113e-02, 1.80029602e-03,
       1.68563826e-05, 1.98365452e-06, 8.41419679e-07, 1.20599431e-02,
       9.46855899e-03])

In [33]:
# Load the file to be labelled; the relative path means C2T1_Test.csv must sit in the working directory.
test = pd.read_csv('C2T1_Test.csv')

In [34]:
# Apply the training-time preprocessing to the frame that has to be labelled.

# Same feature removals as on the training frame.
test = test.drop(['weight', 'payer_code', 'medical_specialty', 'citoglipton', 'examide'], axis=1)

# Missing-value placeholders are imputed with the most frequent real value of the
# column (rows are not dropped here, so every test row keeps a prediction).
# Gender marks missing entries as 'Unknown/Invalid' in this dataset, so that marker
# is handled next to '?'.  The mode is taken over the non-placeholder values only,
# so a placeholder can never be chosen as the fill value.
placeholder_markers = {
    'diag_1': ['?'],
    'diag_2': ['?'],
    'diag_3': ['?'],
    'race': ['?'],
    'gender': ['?', 'Unknown/Invalid'],
}
for column, markers in placeholder_markers.items():
    is_missing = test[column].isin(markers)
    if is_missing.any() and not is_missing.all():
        test.loc[is_missing, column] = test.loc[~is_missing, column].value_counts().idxmax()

# Merge the ID codes exactly as was done for the training frame; without this the
# model sees ID values at prediction time that it never saw during fitting.
test['admission_type_id'] = test['admission_type_id'].replace([8, 6], 5)
test['discharge_disposition_id'] = test['discharge_disposition_id'].replace([25, 26], 18)
test['admission_source_id'] = test['admission_source_id'].replace([15, 20, 21, 17], 9)

# Diagnosis codes: codes containing "V" -> 0, then codes containing "E" -> 1,
# followed by quantile binning into ordinal categories.
# NOTE(review): `est` is refitted on the test values here, so the bin edges differ
# from the ones used on the training frame.  Reusing the training fits would need one
# fitted discretizer per diagnosis column to be kept from the training cells.
for diag_col in ('diag_1', 'diag_2', 'diag_3'):
    test[diag_col] = test[diag_col].replace(regex=r'(^.*V.*$)', value=0)
    test[diag_col] = test[diag_col].replace(regex=r'(^.*E.*$)', value=1)
    est.fit(test[[diag_col]])
    test[diag_col] = est.transform(test[[diag_col]])
    test[diag_col] = test[diag_col].astype('category')

test = test.drop(['metformin-rosiglitazone'], axis = 1)

# Object columns -> category -> integer codes.
# NOTE(review): the codes are derived from the values present in the test file only;
# if a level seen in training is absent here (or the other way round) the integer
# codes will not line up with the training encoding.
for text_col in test.select_dtypes(['object']).columns:
    test[text_col] = test[text_col].astype('category')
for cat_col in test.select_dtypes(['category']).columns:
    test[cat_col] = test[cat_col].cat.codes

In [35]:
# Feature matrix for prediction: drop the target column and the two identifier columns.
x_test = test.drop(columns=['readmitted', 'encounter_id', 'patient_nbr'])

# `pip` is whichever pipeline was fitted last; in top-to-bottom order that is the
# three-class model.
y_pred = pip.predict(x_test)

In [36]:
# Write the identifiers together with the predicted label, one row per test record.
# The output name previously started with a stray space (' C2T1_...'), which creates
# a file that is easy to miss; the leading space is removed.
submission = pd.DataFrame(
    {
        'encounter_id': test['encounter_id'],
        'patient_nbr': test['patient_nbr'],
        'readmitted': y_pred,
    }
)
submission.to_csv('C2T1_Test_Lableled.csv', index=False)