In [342]:
from __future__ import division

import pandas as pd
import numpy as np
%matplotlib inline

# model imports
from sklearn.linear_model import SGDRegressor, LinearRegression
from sklearn.svm import LinearSVR, SVR
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

# feature selection and preprocessing
from sklearn.feature_selection import SelectFromModel

# cross validation and grid search
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.grid_search import GridSearchCV

# metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [343]:
df = pd.read_csv("diabetes-training.csv")

In [344]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392.0,8222157.0,Caucasian,Female,[0-10),?,6.0,25.0,1.0,1.0,...,No,No,No,No,No,No,No,No,No,NO
1,149190.0,55629189.0,Caucasian,Female,[10-20),?,1.0,1.0,7.0,3.0,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410.0,86047875.0,AfricanAmerican,Female,[20-30),?,1.0,1.0,7.0,2.0,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364.0,82442376.0,Caucasian,Male,[30-40),?,1.0,1.0,7.0,2.0,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680.0,42519267.0,Caucasian,Male,[40-50),?,1.0,1.0,7.0,1.0,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [345]:
categorical = """
Race
Gender
Age
Admission_type_id
Discharge_disposition_id
Admission_source_id
Medical_specialty
Diag_1
Diag_2
Diag_3
Metformin
Repaglinide
Nateglinide
Chlorpropamide
Glimepiride
Acetohexamide
Glipizide
Glyburide
Tolbutamide
Pioglitazone
Rosiglitazone
Acarbose
Miglitol
Troglitazone
Tolazamide
Examide
Citoglipton
Insulin
Glyburide-metformin
Glipizide-metformin
Glimepiride-pioglitazone
Metformin-rosiglitazone
Metformin-pioglitazone
diabetesMed
Change
readmitted
"""

In [346]:
discrete = """
Time_in_hospital
Num_lab_procedures
Num_procedures
Num_medications
number_diagnoses
"""

In [347]:
cat_ft = categorical.split("\n")
cat_ft = [cat.strip() for cat in cat_ft]
cat_ft = [cat.lower() for cat in cat_ft if len(cat) > 0]
dmed = cat_ft.index("diabetesmed")
cat_ft.pop(dmed)
cat_ft.append("diabetesMed")

In [348]:
dis_ft = discrete.split("\n")
dis_ft = [dis.strip() for dis in dis_ft]
dis_ft = [dis.lower() for dis in dis_ft if len(dis) > 0]

In [353]:
# dtype converted dataframe
df_conv = df.copy()

In [354]:
exclusions = """Payer_code
Weight
Number_outpatient
Number_emergency
Number_inpatient
Max_glu_serum
A1Cresult"""

In [355]:
exclusions_arr = exclusions.split('\n')
exclusions_arr = [x.lower().strip() for x in exclusions_arr]
a1c = exclusions_arr.index("a1cresult")
exclusions_arr.pop(a1c)
exclusions_arr.append("A1Cresult")

df_conv.drop(exclusions_arr, axis=1,inplace=True)
df_conv.columns

Index([u'encounter_id', u'patient_nbr', u'race', u'gender', u'age',
       u'admission_type_id', u'discharge_disposition_id',
       u'admission_source_id', u'time_in_hospital', u'medical_specialty',
       u'num_lab_procedures', u'num_procedures', u'num_medications', u'diag_1',
       u'diag_2', u'diag_3', u'number_diagnoses', u'metformin', u'repaglinide',
       u'nateglinide', u'chlorpropamide', u'glimepiride', u'acetohexamide',
       u'glipizide', u'glyburide', u'tolbutamide', u'pioglitazone',
       u'rosiglitazone', u'acarbose', u'miglitol', u'troglitazone',
       u'tolazamide', u'examide', u'citoglipton', u'insulin',
       u'glyburide-metformin', u'glipizide-metformin',
       u'glimepiride-pioglitazone', u'metformin-rosiglitazone',
       u'metformin-pioglitazone', u'change', u'diabetesMed', u'readmitted'],
      dtype='object')

In [356]:
encoded_ft = cat_ft + dis_ft

In [357]:
for col in encoded_ft:
    df_conv[col] = df_conv[col].apply(lambda x: str(x))

In [358]:
prefixes = {col: "_%s_=" % col for col in encoded_ft}
df_encoded = pd.get_dummies(df_conv, columns=encoded_ft, prefix=prefixes)

In [359]:
df_encoded.head()

Unnamed: 0,encounter_id,patient_nbr,_race_=_?,_race_=_AfricanAmerican,_race_=_Asian,_race_=_Caucasian,_race_=_Hispanic,_race_=_Other,_race_=_nan,_gender_=_Female,...,_number_diagnoses_=_16.0,_number_diagnoses_=_2.0,_number_diagnoses_=_3.0,_number_diagnoses_=_4.0,_number_diagnoses_=_5.0,_number_diagnoses_=_6.0,_number_diagnoses_=_7.0,_number_diagnoses_=_8.0,_number_diagnoses_=_9.0,_number_diagnoses_=_nan
0,2278392.0,8222157.0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,149190.0,55629189.0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,64410.0,86047875.0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,500364.0,82442376.0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,16680.0,42519267.0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [360]:
features = [feat for feat in df_encoded.columns if feat not in ['encounter_id', 'patient_nbr', 'admission_type_id']]


In [None]:
x = df_encoded[features].values
y = df_encoded["encounter_id"].values

In [264]:
from sklearn import decomposition
pca = decomposition.PCA(n_components=1000)
pca.fit(x)
reduced_x = pca.transform(x)

In [265]:
print pca.explained_variance_ratio_.cumsum()[-1]

split = int(len(x)*0.7)
x_train = reduced_x[:split]
x_test = reduced_x[split:]
y_train = y[:split]
y_test = y[split:]

0.99318735328


In [92]:
# without PCA
split = int(len(x)*0.7)
x_train = x[:split]
x_test = x[split:]
y_train = y[:split]
y_test = y[split:]

In [266]:
linreg = LinearRegression()

In [267]:
linreg.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [268]:
asc_indices = np.argsort(linreg.coef_).tolist()

for i in range(10):
    print features[asc_indices[-i]]

_diag_2_=_250.22
_diag_2_=_250.51
_diag_2_=_478
_diag_2_=_484
_diag_2_=_585
_diag_2_=_453
_diag_2_=_446
_diag_2_=_441
_diag_2_=_250.82
_diag_2_=_512


In [269]:
prediction = linreg.predict(x_test)
print "Linear Regression"
print "root mean squared: %s" % mean_squared_error(y_test, prediction)**(1/2)

Linear Regression
root mean squared: 45905738.1789


In [270]:
svr = SVR(verbose=2)
svr.fit(x_train, y_train)
prediction = svr.predict(x_test)
print "Support Vector Regressor"
print "root mean squared: %s" % mean_squared_error(y_test, prediction)**(1/2)

[LibSVM]Support Vector Regressor
root mean squared: 54595406.2802


In [102]:
rf_reg = RandomForestRegressor(n_estimators=10, verbose=2)
rf_reg.fit(x_train, y_train)
prediction = rf_reg.predict(x_test)
print "Random Forest Regressor"
print "root mean squared: %s" % mean_squared_error(y_test, prediction)**(1/2)

building tree 1 of 10


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.7min remaining:    0.0s


building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10
building tree 9 of 10
building tree 10 of 10


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 23.7min finished


Random Forest Regressor
root mean squared: 47219210.6696


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.2s finished


In [271]:
# Testing

test_df = pd.read_csv("diabetes-problem-2.csv")

# dtype converted dataframe
test_df_conv = test_df.copy()

for col in encoded_ft:
    test_df_conv[col] = test_df_conv[col].apply(lambda x: str(x))
    
prefixes = {col: "_%s_=" % col for col in encoded_ft}


test_df_encoded = pd.get_dummies(test_df_conv, columns=encoded_ft, prefix=prefixes)

cols = df_encoded.columns

for i in range(len(cols)):
    if cols[i] not in test_df_encoded.columns:
        test_df_encoded[cols[i]] = [0]*len(test_df_encoded)
    
test_df_encoded = test_df_encoded[cols]
print test_df_encoded.shape

(64, 2131)


In [272]:
x_test = test_df_encoded[features].values

reduced_x_test = pca.transform(x_test)

y_pred = linreg.predict(reduced_x_test)

In [273]:
patient_encounter_map = {}
for row in df.iterrows():
    if row[1]["patient_nbr"] in patient_encounter_map and row[1]["encounter_id"] > patient_encounter_map[row[1]["patient_nbr"]]:
        patient_encounter_map[row[1]["patient_nbr"]] = row[1]["encounter_id"]
        continue
    patient_encounter_map[row[1]["patient_nbr"]] = row[1]["encounter_id"]    

In [280]:
output = []
for i in range(len(test_df)):
    next_encounter_id = int(round(patient_encounter_map[test_df.iloc[i].patient_nbr]+y_pred[i]))
    output.append( {"patient_nbr": test_df.iloc[i].patient_nbr, "next_encounter_id": next_encounter_id} )
                                                                      
output_df = pd.DataFrame(output)
output_df.to_csv('output.csv')