In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# 1. Import raw dataset

In [3]:
df = pd.read_csv('diabetic_data.csv')
df_dtype_object = pd.read_csv('diabetic_data.csv',dtype='object')

## 1.1 Visualize Raw data 

In [4]:
pd.set_option('display.max_columns', 500)
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


# 2. Parsing and Cleaning

## 2.1 Parsing Features

As we can see, **"weight", "payer_code" and "medical_specialty"** is missing alot of the data. It will not be ideal to use these feature. 

**"encounter_id" and "patient_nbr"** are also not an potential predictors.

Also removing all the **24 features for medications, glucose serum test result and A1c test result**, Which we will not study for this project

In [5]:
df_parse = df[[i for i in df.columns if i not in ('weight','payer_code','medical_specialty','encounter_id','patient_nbr')]]
med_24 = df_parse.loc[:,'max_glu_serum':'metformin-pioglitazone'].columns
df_parse = df_parse.drop(med_24, axis=1)
df_parse.head()

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,0,0,0,250.83,?,?,1,No,No,NO
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,0,0,0,276.0,250.01,255,9,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,2,0,1,648.0,250,V27,6,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,0,0,0,8.0,250.43,403,7,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,197.0,157,250,5,Ch,Yes,NO


## 2.2 Removing rows contain missing value

Currently Race and Diag have missing value. However it is ok to only have one diagnosis code. Therefore, we only eliminating row if "race" and "diag_1" is equal to "?"

In [6]:
df_clean = df_parse[df_parse.race != '?']
df_clean = df_clean[df_clean.diag_1 != '?']
df_clean.count()

race                        99474
gender                      99474
age                         99474
admission_type_id           99474
discharge_disposition_id    99474
admission_source_id         99474
time_in_hospital            99474
num_lab_procedures          99474
num_procedures              99474
num_medications             99474
number_outpatient           99474
number_emergency            99474
number_inpatient            99474
diag_1                      99474
diag_2                      99474
diag_3                      99474
number_diagnoses            99474
change                      99474
diabetesMed                 99474
readmitted                  99474
dtype: int64

## 2.3 Redefining Features

### Recategorized Diagnosis Code

![image.png](attachment:image.png)

In [7]:
def replace_diag_code(code):
    if (code <= '459' and code >= '390') or code == '785':
        return 'Circulatory'
    elif (code <= '519' and code >= '460') or code == '786':
        return 'Respiratory'
    elif (code <= '579' and code >= '520') or code == '787':
        return 'Digestive'
    elif (code < '251' and code > '249'):
        return 'Diabetes'
    elif (code <= '999' and code >= '800'):
        return 'Injury'
    elif (code <= '739' and code >= '710'):
        return 'Musculoskeletal'
    elif (code <= '629' and code >= '580') or code == '788':
        return 'Genitourinary'
    elif (code <= '239' and code >= '140'):
        return 'Neoplasms'
    elif code == '?': 
        return None
    else:
        return 'Others'

### Re-mapped Catogory Variables


In [47]:
df_redefine = df_clean.copy()
## outcome
df_redefine.readmitted.replace(['NO','<30','>30'],[0,1,0],inplace=True)

'''
## gender
df_redefine.gender.replace(['Male','Female','Unknown/Invalid'],[0,1,2],inplace=True)

## race
df_redefine.race.replace(['AfricanAmerican','Asian','Caucasian','Hispanic','Other'],[0,1,2,3,4],inplace=True)

## age
df_redefine.age.replace(
    ['[0-10)','[10-20)','[20-30)','[30-40)','[40-50)','[50-60)','[60-70)','[70-80)','[80-90)','[90-100)'],
    [5,15,25,35,45,55,65,75,85,95],
    inplace=True)

## Change of medications
df_redefine.change.replace(['No','Ch'],[0,1],inplace=True)

## Diabetes medications
df_redefine.diabetesMed.replace(['No','Yes'],[0,1],inplace=True)
'''

## diag_1
df_redefine.diag_1 = df_redefine.diag_1.apply(replace_diag_code)

## diag_2
df_redefine.diag_2 = df_redefine.diag_2.apply(replace_diag_code)

## diag_3
df_redefine.diag_3 = df_redefine.diag_3.apply(replace_diag_code)


## Display
df_redefine

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,change,diabetesMed,readmitted
0,Caucasian,Female,[0-10),6,25,1,1,41,0,1,0,0,0,Diabetes,,,1,No,No,0
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,0,0,0,Others,Diabetes,Others,9,Ch,Yes,0
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,2,0,1,Others,Diabetes,Others,6,No,Yes,0
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,0,0,0,Others,Diabetes,Circulatory,7,Ch,Yes,0
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,Neoplasms,Neoplasms,Diabetes,5,Ch,Yes,0
5,Caucasian,Male,[50-60),2,1,2,3,31,6,16,0,0,0,Circulatory,Circulatory,Diabetes,9,No,Yes,0
6,Caucasian,Male,[60-70),3,1,2,4,70,1,21,0,0,0,Circulatory,Circulatory,Others,7,Ch,Yes,0
7,Caucasian,Male,[70-80),1,1,7,5,73,0,12,0,0,0,Circulatory,Respiratory,Diabetes,8,No,Yes,0
8,Caucasian,Female,[80-90),2,1,4,13,68,2,28,0,0,0,Circulatory,Circulatory,Others,8,Ch,Yes,0
9,Caucasian,Female,[90-100),3,3,4,12,33,3,18,0,0,0,Circulatory,Neoplasms,Respiratory,8,Ch,Yes,0


In [48]:
x_raw = df_redefine.iloc[:,:-1]
y_raw = df_redefine.readmitted

# 5. Modeling

In [9]:
from sklearn.ensemble import RandomForestClassifier
#from sklearn.datasets import make_classification
#from sklearn.ensemble import ExtraTreesClassifier
from sklearn.utils import resample
from sklearn.cross_validation import cross_val_score



In [28]:
def print_accuracy_report(classifier, X, y, num_validations=5):
    accuracy = cross_val_score(classifier, 
            X, y, scoring='accuracy', cv=num_validations)
    print ('Accuracy:' + str(accuracy), 'Average Accuracy:' + str(accuracy.mean())) 
    
    AUC = cross_val_score(classifier, 
            X, y, scoring='roc_auc', cv=num_validations)
    print ('CV AUC:' + str(AUC), 'Average AUC:' + str(AUC.mean())) 

## Model with Demographic

In [50]:
features_demograph = ['race','gender','age']
dummy_demograph = ['race','gender','age']
df_x_demograph = pd.get_dummies(x_raw[features_demograph], columns=dummy_demograph,drop_first=True)
df_x_demograph.head()

Unnamed: 0,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Male,gender_Unknown/Invalid,age_[10-20),age_[20-30),age_[30-40),age_[40-50),age_[50-60),age_[60-70),age_[70-80),age_[80-90),age_[90-100)
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0
4,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0


In [51]:
model = RandomForestClassifier(n_estimators = 50)
print_accuracy_report(model, df_x_demograph, y_raw, num_validations=5)

Accuracy:[ 0.88776074  0.88776074  0.88776074  0.88776074  0.88780537] Average Accuracy:0.887769668815
CV AUC:[ 0.51785726  0.50923292  0.51604555  0.52188011  0.5217211 ] Average AUC:0.517347388629


In [54]:
model.fit(df_x_demograph,y_raw)
feature_importances = model.feature_importances_

features_df_demograph = pd.DataFrame({'Features': df_x_demograph.columns, 'Importance Score': feature_importances})
features_df_demograph.sort_values('Importance Score', inplace=True, ascending=False)

features_df_demograph.head()

Unnamed: 0,Features,Importance Score
4,gender_Male,0.177843
10,age_[50-60),0.113241
2,race_Hispanic,0.092796
1,race_Caucasian,0.085226
7,age_[20-30),0.085152


## Model with Diagnosis

In [55]:
features_diag = ['diag_1','diag_2','diag_3']
dummy_diag = ['diag_1','diag_2','diag_3']
df_x_diag = pd.get_dummies(x_raw[features_diag], columns=dummy_diag,drop_first=True)
df_x_diag.head()

Unnamed: 0,diag_1_Diabetes,diag_1_Digestive,diag_1_Genitourinary,diag_1_Injury,diag_1_Musculoskeletal,diag_1_Neoplasms,diag_1_Others,diag_1_Respiratory,diag_2_Diabetes,diag_2_Digestive,diag_2_Genitourinary,diag_2_Injury,diag_2_Musculoskeletal,diag_2_Neoplasms,diag_2_Others,diag_2_Respiratory,diag_3_Diabetes,diag_3_Digestive,diag_3_Genitourinary,diag_3_Injury,diag_3_Musculoskeletal,diag_3_Neoplasms,diag_3_Others,diag_3_Respiratory
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0


In [56]:
model = RandomForestClassifier(n_estimators = 20)
model.fit(df_x_diag,y_raw)

print_accuracy_report(model, df_x_diag, y_raw, num_validations=5)  

Accuracy:[ 0.88760995  0.88760995  0.88730837  0.88735863  0.88720217] Average Accuracy:0.887417815553
CV AUC:[ 0.53110683  0.53860994  0.52276257  0.53046758  0.52292506] Average AUC:0.529174395993


In [23]:
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_x_diag, y_raw, test_size=0.3, random_state=0)
model_diag2 = RandomForestClassifier(n_estimators = 20)
model_diag2.fit(X_train,y_train)
model_diag2.score(X_test,y_test)

0.88637201353751294

In [57]:
feature_importances = model.feature_importances_

features_df_diag = pd.DataFrame({'Features': df_x_diag.columns, 'Importance Score': feature_importances})
features_df_diag.sort_values('Importance Score', inplace=True, ascending=False)

features_df_diag.head()

Unnamed: 0,Features,Importance Score
6,diag_1_Others,0.079597
16,diag_3_Diabetes,0.054499
8,diag_2_Diabetes,0.051869
10,diag_2_Genitourinary,0.050821
14,diag_2_Others,0.044374


## Model with Linear features

In [58]:
features_linear = ['num_lab_procedures','num_procedures','num_medications','number_diagnoses']
df_x_linear = x_raw[features_linear]
df_x_linear.head()

Unnamed: 0,num_lab_procedures,num_procedures,num_medications,number_diagnoses
0,41,0,1,1
1,59,0,18,9
2,11,5,13,6
3,44,1,16,7
4,51,0,8,5


In [59]:
model = RandomForestClassifier(n_estimators = 20)

print_accuracy_report(model, df_x_linear, y_raw, num_validations=5)

Accuracy:[ 0.86132194  0.86016587  0.86338276  0.86282986  0.85930431] Average Accuracy:0.86140094802
CV AUC:[ 0.51041213  0.50831672  0.50895181  0.52035348  0.52257247] Average AUC:0.514121320549


In [60]:
model.fit(df_x_linear,y_raw)
feature_importances = model.feature_importances_

features_df_diag = pd.DataFrame({'Features': df_x_linear.columns, 'Importance Score': feature_importances})
features_df_diag.sort_values('Importance Score', inplace=True, ascending=False)

features_df_diag.head()

Unnamed: 0,Features,Importance Score
0,num_lab_procedures,0.541627
2,num_medications,0.285868
1,num_procedures,0.096119
3,number_diagnoses,0.076387


## Inbalance Data Set, Might be Bias to predict value

In [61]:
y_raw.value_counts()
# 0s Outcome has a way larger weights than 1s

0    88310
1    11164
Name: readmitted, dtype: int64

## Resample

### Up Sample

In [71]:
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = df_redefine[df_redefine.readmitted != 1 ]
df_minority = df_redefine[df_redefine.readmitted == 1 ]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=88310,  # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.readmitted.value_counts()

1    88310
0    88310
Name: readmitted, dtype: int64

In [73]:
features_upsample = ['gender','age','num_lab_procedures','num_procedures','num_medications','diag_1','diag_2','diag_3','number_diagnoses']

dummy_upsample = ['gender', 'age','diag_1','diag_2','diag_3']

df_upsample_x = pd.get_dummies(df_upsampled[features_upsample], columns=dummy_upsample,drop_first=True)

In [74]:
model = RandomForestClassifier(n_estimators = 20)

print_accuracy_report(model, df_upsample_x, df_upsampled.readmitted, num_validations=5)

Accuracy:[ 0.98148568  0.98074963  0.97817348  0.97647492  0.97542747] Average Accuracy:0.978462235307
CV AUC:[ 0.99876774  0.99865424  0.99741433  0.99771845  0.99812945] Average AUC:0.99813684309


In [76]:
# Use higher fold of cross validation

model = RandomForestClassifier(n_estimators = 20)

print_accuracy_report(model, df_upsample_x, df_upsampled.readmitted, num_validations=30)

Accuracy:[ 0.98980978  0.99048913  0.9876019   0.99014946  0.99133832  0.98522418
  0.98692255  0.98947011  0.9889606   0.99031929  0.98709239  0.98590353
  0.98743207  0.98284647  0.98658288  0.9830163   0.98488451  0.98471467
  0.9857337   0.98658288  0.98335032  0.9840299   0.98606864  0.98470948
  0.98691811  0.9841998   0.98521916  0.9840299   0.98284064  0.98369011] Average Accuracy:0.986337693117
CV AUC:[ 0.99957477  0.99957518  0.99933207  0.99938382  0.9997738   0.99893898
  0.99847297  0.99910241  0.99905136  0.99916506  0.99935792  0.99942345
  0.99870211  0.99873193  0.99875668  0.99895381  0.9987831   0.99861909
  0.99933496  0.99898357  0.9982801   0.9984646   0.99879815  0.9988497
  0.99892192  0.99831422  0.99838476  0.99915064  0.99899535  0.99902826] Average AUC:0.998973492412


In [75]:
model.fit(df_upsample_x, df_upsampled.readmitted)
features = df_upsample_x.columns
feature_importances = model.feature_importances_

features_df = pd.DataFrame({'Features': features, 'Importance Score': feature_importances})
features_df.sort_values('Importance Score', inplace=True, ascending=False)

features_df.head()

Unnamed: 0,Features,Importance Score
0,num_lab_procedures,0.258909
2,num_medications,0.197487
1,num_procedures,0.076117
3,number_diagnoses,0.071611
4,gender_Male,0.039147
