In [44]:
from __future__ import division

import pandas as pd
import numpy as np
%matplotlib inline

# model imports
from sklearn.linear_model import SGDRegressor, LogisticRegression
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

# feature selection and preprocessing
from sklearn.feature_selection import SelectFromModel

# cross validation and grid search
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV

# metrics
from sklearn.metrics import classification_report

from sklearn.svm import SVC

In [45]:
df = pd.read_csv("diabetes-training.csv")

In [46]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392.0,8222157.0,Caucasian,Female,[0-10),?,6.0,25.0,1.0,1.0,...,No,No,No,No,No,No,No,No,No,NO
1,149190.0,55629189.0,Caucasian,Female,[10-20),?,1.0,1.0,7.0,3.0,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410.0,86047875.0,AfricanAmerican,Female,[20-30),?,1.0,1.0,7.0,2.0,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364.0,82442376.0,Caucasian,Male,[30-40),?,1.0,1.0,7.0,2.0,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680.0,42519267.0,Caucasian,Male,[40-50),?,1.0,1.0,7.0,1.0,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [47]:
categorical = """
Race
Gender
Age
Admission_source_id
Medical_specialty
Diag_1
Diag_2
Diag_3
Metformin
Repaglinide
Nateglinide
Chlorpropamide
Glimepiride
Acetohexamide
Glipizide
Glyburide
Tolbutamide
Pioglitazone
Rosiglitazone
Acarbose
Miglitol
Troglitazone
Tolazamide
Examide
Citoglipton
Insulin
Glyburide-metformin
Glipizide-metformin
Glimepiride-pioglitazone
Metformin-rosiglitazone
Metformin-pioglitazone
diabetesMed
Change
readmitted
"""

In [48]:
discrete = """
Time_in_hospital
Num_lab_procedures
Num_procedures
Num_medications
number_diagnoses
"""

In [49]:
cat_ft = categorical.split("\n")
cat_ft = [cat.strip() for cat in cat_ft]
cat_ft = [cat.lower() for cat in cat_ft if len(cat) > 0]
dmed = cat_ft.index("diabetesmed")
cat_ft.pop(dmed)
cat_ft.append("diabetesMed")

In [50]:
dis_ft = discrete.split("\n")
dis_ft = [dis.strip() for dis in dis_ft]
dis_ft = [dis.lower() for dis in dis_ft if len(dis) > 0]

In [51]:
# dtype converted dataframe
df_conv = df.copy()

In [52]:
exclusions = """Payer_code
Weight
Number_outpatient
Number_emergency
Number_inpatient
Max_glu_serum
A1Cresult"""

In [53]:
exclusions_arr = exclusions.split('\n')
exclusions_arr = [x.lower().strip() for x in exclusions_arr]
a1c = exclusions_arr.index("a1cresult")
exclusions_arr.pop(a1c)
exclusions_arr.append("A1Cresult")

df_conv.drop(exclusions_arr, axis=1,inplace=True)
df_conv.columns

Index([u'encounter_id', u'patient_nbr', u'race', u'gender', u'age',
       u'admission_type_id', u'discharge_disposition_id',
       u'admission_source_id', u'time_in_hospital', u'medical_specialty',
       u'num_lab_procedures', u'num_procedures', u'num_medications', u'diag_1',
       u'diag_2', u'diag_3', u'number_diagnoses', u'metformin', u'repaglinide',
       u'nateglinide', u'chlorpropamide', u'glimepiride', u'acetohexamide',
       u'glipizide', u'glyburide', u'tolbutamide', u'pioglitazone',
       u'rosiglitazone', u'acarbose', u'miglitol', u'troglitazone',
       u'tolazamide', u'examide', u'citoglipton', u'insulin',
       u'glyburide-metformin', u'glipizide-metformin',
       u'glimepiride-pioglitazone', u'metformin-rosiglitazone',
       u'metformin-pioglitazone', u'change', u'diabetesMed', u'readmitted'],
      dtype='object')

In [54]:
encoded_ft = cat_ft + dis_ft

In [55]:
for col in encoded_ft:
    df_conv[col] = df_conv[col].apply(lambda x: str(x))

In [56]:
prefixes = {col: "_%s_=" % col for col in encoded_ft}
df_encoded = pd.get_dummies(df_conv, columns=encoded_ft, prefix=prefixes)

In [57]:
df_encoded.head()

Unnamed: 0,encounter_id,patient_nbr,admission_type_id,discharge_disposition_id,_race_=_?,_race_=_AfricanAmerican,_race_=_Asian,_race_=_Caucasian,_race_=_Hispanic,_race_=_Other,...,_number_diagnoses_=_16.0,_number_diagnoses_=_2.0,_number_diagnoses_=_3.0,_number_diagnoses_=_4.0,_number_diagnoses_=_5.0,_number_diagnoses_=_6.0,_number_diagnoses_=_7.0,_number_diagnoses_=_8.0,_number_diagnoses_=_9.0,_number_diagnoses_=_nan
0,2278392.0,8222157.0,6.0,25.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,149190.0,55629189.0,1.0,1.0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
2,64410.0,86047875.0,1.0,1.0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,500364.0,82442376.0,1.0,1.0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
4,16680.0,42519267.0,1.0,1.0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


In [58]:
features = [feat for feat in df_encoded.columns if feat not in ['encounter_id', 'patient_nbr', 'admission_type_id', "discharge_disposition_id"]]

In [59]:
x = df_encoded[features].values
y = df_encoded["admission_type_id"].values
y = map(str, y)

In [17]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=1000)
pca.fit(x)
reduced_x = pca.transform(x)

In [18]:
print pca.explained_variance_ratio_.cumsum()[-1]

split = int(len(x)*0.7)
x_train = reduced_x[:split]
x_test = reduced_x[split:]
y_train = y[:split]
y_test = y[split:]

0.990760577469


In [60]:
# without PCA
split = int(len(x)*0.7)
x_train = x[:split]
x_test = x[split:]
y_train = y[:split]
y_test = y[split:]

In [61]:
logreg = LogisticRegression()

In [62]:
logreg.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [63]:
asc_indices = np.argsort(logreg.coef_)
for i in range(8):
    for j in range(10):
        print features[asc_indices[i][-j]]
        
    print "\n"

_diag_1_=_V57
_admission_source_id_=_6.0
_admission_source_id_=_7.0
_admission_source_id_=_5.0
_diag_1_=_342
_diag_1_=_852
_diag_3_=_250.12
_diag_1_=_70
_diag_3_=_685
_diag_2_=_V63


_admission_source_id_=_6.0
_medical_specialty_=_Emergency/Trauma
_admission_source_id_=_9.0
_medical_specialty_=_Pathology
_admission_source_id_=_20.0
_medical_specialty_=_Cardiology-Pediatric
_medical_specialty_=_Surgery-Maxillofacial
_admission_source_id_=_4.0
_diag_2_=_54
_diag_1_=_250.92


_admission_source_id_=_17.0
_medical_specialty_=_Emergency/Trauma
_medical_specialty_=_Hospitalist
_admission_source_id_=_1.0
_diag_2_=_719
_medical_specialty_=_Otolaryngology
_diag_1_=_185
_admission_source_id_=_3.0
_medical_specialty_=_Psychology
_diag_3_=_614


_admission_source_id_=_7.0
_diag_3_=_401
_admission_source_id_=_4.0
_num_lab_procedures_=_39.0
_num_medications_=_18.0
_num_medications_=_7.0
_medical_specialty_=_?
_admission_source_id_=_14.0
_diag_1_=_870
_diag_3_=_873


_diag_1_=_V57
_admission_source_id

In [64]:
prediction = logreg.predict(x_test)
print "Logistic Regression"
print classification_report(y_test, prediction)

Logistic Regression
             precision    recall  f1-score   support

        1.0       0.93      0.93      0.93     18174
        2.0       0.50      0.55      0.52      4786
        3.0       0.75      0.69      0.72      6341
        4.0       0.00      0.00      0.00         3
        5.0       0.03      0.16      0.06        98
        6.0       0.95      0.61      0.74       699
        7.0       0.00      0.00      0.00        11
        8.0       0.67      0.02      0.04        97
        nan       0.00      0.00      0.00         3

avg / total       0.82      0.81      0.81     30212



In [None]:
svc = SVC()
svc.fit(x_train, y_train)
prediction = svc.predict(x_test)
print "Support Vector Classifier"
print classification_report(y_test, prediction)

In [102]:
rf_reg = RandomForestRegressor(n_estimators=10, verbose=2)
rf_reg.fit(x_train, y_train)
prediction = rf_reg.predict(x_test)
print "Random Forest Regressor"
print "root mean squared: %s" % mean_squared_error(y_test, prediction)**(1/2)

building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.7min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 23.7min finished


Random Forest Regressor
root mean squared: 47219210.6696


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


In [65]:
# Testing

test_df = pd.read_csv("diabetes-problem-1.csv")

# dtype converted dataframe
test_df_conv = test_df.copy()

for col in encoded_ft:
    test_df_conv[col] = test_df_conv[col].apply(lambda x: str(x))
    
prefixes = {col: "_%s_=" % col for col in encoded_ft}

test_df_encoded = pd.get_dummies(test_df_conv, columns=encoded_ft, prefix=prefixes)

cols = df_encoded.columns

for i in range(len(cols)):
    if cols[i] not in test_df_encoded.columns:
        test_df_encoded[cols[i]] = [0]*len(test_df_encoded)
    
test_df_encoded = test_df_encoded[cols]
print test_df_encoded.shape

(999, 2708)


In [66]:
x_test = test_df_encoded[features].values

y_pred = logreg.predict(x_test)

In [67]:
output = []
for i in range(len(test_df)):
    output.append( {"encounter_id": test_df.iloc[i].encounter_id, "admission_type_id": y_pred[i]} )
                                                                      
output_df = pd.DataFrame(output)
output_df.to_csv('output1-ch2.csv')