In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
np.random.seed(0)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 999

In [2]:
df_train = pd.read_csv("C2T1Data//C2T1_Train.csv")
df_train = df_train.replace({'?':np.nan})

In [3]:
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,5283,48330653,Caucasian,Female,[80-90),,2,1,4,13,,,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
1,8499,63555809,Caucasian,Female,[90-100),,3,3,4,12,,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
2,9441,42519137,Caucasian,Male,[40-50),,1,1,7,1,,,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
3,20997,89868902,AfricanAmerican,Female,[40-50),,1,1,7,9,,,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
4,28515,82637321,Caucasian,Male,[50-60),,2,1,2,3,,,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90761,443840309,100162346,AfricanAmerican,Male,[70-80),,1,3,7,3,MC,,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
90762,443840543,74694092,AfricanAmerican,Female,[80-90),,1,4,5,5,MC,,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
90763,443846909,41088659,Caucasian,Male,[70-80),,1,1,7,1,MC,,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO
90764,443849927,31693541,Caucasian,Female,[80-90),,2,3,7,10,MC,Surgery-General,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO


In [4]:
def missing_values_table(df_train):
        mis_val = df_train.isnull().sum()
        mis_val_percent = 100 * df_train.isnull().sum() / len(df_train)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df_train.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns
missing_values_table(df_train)

Your selected dataframe has 50 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
weight,87840,96.8
medical_specialty,43674,48.1
payer_code,38730,42.7
race,2207,2.4
diag_3,1357,1.5
diag_2,336,0.4
diag_1,17,0.0


In [5]:
#removing every column with more than 40% of missing values
df_train = df_train.drop(columns=['weight', 'medical_specialty', 'payer_code'])

In [6]:
#removing the rows that have missing values in column 'race'
df_train = df_train.dropna(subset=['race', 'diag_3', 'diag_1', 'diag_2'])

In [7]:
#replacing values in columns to a legit string to avoid any errors
df_train.A1Cresult = df_train.A1Cresult.replace({'>8':'Greater than 8', '>7':'Greater than 7'})

In [8]:
df_train.gender.value_counts()

Female             46955
Male               40239
Unknown/Invalid        1
Name: gender, dtype: int64

In [9]:
#removing the Undknow/Invalid enter in the gender #it would have been of no use!
df_train = df_train[df_train.gender != 'Unknown/Invalid']

In [10]:
#convert values starting with 'V' and 'E' to a numeric value so that Binning can be used on the whole column in diag_1
df_train.loc[df_train['diag_1'].str.contains('V',na=False), 'diag_1'] = 1000
df_train.loc[df_train['diag_1'].str.contains('E',na=False), 'diag_1'] = 1300
df_train.diag_1.value_counts()

428       6001
414       5836
786       3523
410       3183
486       3012
427       2403
491       1910
715       1809
434       1793
682       1783
780       1743
996       1725
276       1642
1000      1436
38        1436
250.8     1407
599       1384
584       1291
250.6     1050
518        945
577        915
820        909
493        872
435        861
562        841
574        818
250.7      764
296        762
440        750
560        745
433        695
998        662
250.13     630
722        627
578        556
507        539
250.02     532
789        488
530        467
453        461
403        434
250.11     432
8          430
535        412
402        404
295        390
415        378
997        375
162        373
458        371
724        361
250.82     354
250.12     338
285        320
733        310
278        306
401        302
590        298
592        295
482        292
280        280
197        277
558        276
153        269
824        258
569        250
426       

In [11]:
#convert values starting with 'V' and 'E' to a numeric value so that Binning can be used on the whole column in diag_2
df_train.loc[df_train['diag_2'].str.contains('V',na=False), 'diag_2'] = 1000
df_train.loc[df_train['diag_2'].str.contains('E',na=False), 'diag_2'] = 1300
df_train.diag_2.value_counts()

428       5756
276       5663
250       4886
427       4271
401       3248
599       2870
496       2819
414       2360
403       2306
411       2287
250.02    1754
707       1718
585       1655
1000      1529
584       1471
250.01    1376
491       1315
285       1292
425       1275
780       1265
682       1227
486       1226
518       1196
424        911
413        906
250.6      803
493        763
1300       630
305        592
786        558
280        517
998        487
410        477
511        463
785        460
38         423
996        393
571        362
272        361
197        352
440        350
560        349
162        349
577        344
295        342
41         337
997        336
788        335
453        304
789        304
574        297
578        297
648        292
402        288
287        279
198        274
426        265
530        255
799        253
278        245
303        243
70         240
438        231
730        228
250.41     217
342        210
790       

In [12]:
#applying binning to diag_1 by taking its float value into bin_diag and then binning in values of bin_diag in bin_diag_1
#binning intervals are selected from 
bins = [1, 140, 240, 280, 290, 320, 390, 460, 520, 580, 630, 680, 710, 740, 760, 780, 800, 1000, 1200, 1400]
df_train['bin_diag'] = df_train.diag_1.astype('float')
df_train['bin_diag2'] = df_train.diag_2.astype('float')
df_train['bin_diag_1'] = pd.cut(df_train['bin_diag'], bins=bins, right=False)
df_train['bin_diag_2'] = pd.cut(df_train['bin_diag2'], bins=bins, right=False)
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,bin_diag,bin_diag2,bin_diag_1,bin_diag_2
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,398.00,427.0,"[390, 460)","[390, 460)"
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,434.00,198.0,"[390, 460)","[140, 240)"
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,197.00,157.0,"[140, 240)","[140, 240)"
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,250.70,403.0,"[240, 280)","[390, 460)"
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,414.00,411.0,"[390, 460)","[390, 460)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90761,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,0,0,0,250.13,291,458,9,,Greater than 8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,250.13,291.0,"[240, 280)","[290, 320)"
90762,443840543,74694092,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,560.00,276.0,"[520, 580)","[240, 280)"
90763,443846909,41088659,Caucasian,Male,[70-80),1,1,7,1,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO,38.00,590.0,"[1, 140)","[580, 630)"
90764,443849927,31693541,Caucasian,Female,[80-90),2,3,7,10,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,996.00,285.0,"[800, 1000)","[280, 290)"


In [13]:
#now labeling all of the categorical features/columns using sklearn library (LabelEncoder)
#better coding practice can be followed to implement the below code - I would optimize this after completing the code!

lb_make = LabelEncoder()
df_train['race_code'] = lb_make.fit_transform(df_train['race'])
df_train['gender_code'] = lb_make.fit_transform(df_train['gender'])
df_train['max_glu_serum_code'] = lb_make.fit_transform(df_train['max_glu_serum'])
df_train['age_code'] = lb_make.fit_transform(df_train['age'])
df_train['A1Cresult_code'] = lb_make.fit_transform(df_train['A1Cresult'])
df_train['insulin_code'] = lb_make.fit_transform(df_train['insulin'])
df_train['change_code'] = lb_make.fit_transform(df_train['change'])
df_train['diabetesMed_code'] = lb_make.fit_transform(df_train['diabetesMed'])
df_train['bin_diag_1_code'] = lb_make.fit_transform(df_train['bin_diag_1'])
df_train['bin_diag_2_code'] = lb_make.fit_transform(df_train['bin_diag_2'])
df_train['num_diag'] = df_train['number_diagnoses']
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,bin_diag,bin_diag2,bin_diag_1,bin_diag_2,race_code,gender_code,max_glu_serum_code,age_code,A1Cresult_code,insulin_code,change_code,diabetesMed_code,bin_diag_1_code,bin_diag_2_code,num_diag
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,398.00,427.0,"[390, 460)","[390, 460)",2,0,2,8,2,2,0,1,6,6,8
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,434.00,198.0,"[390, 460)","[140, 240)",2,0,2,9,2,2,0,1,6,1,8
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,197.00,157.0,"[140, 240)","[140, 240)",2,1,2,4,2,2,0,1,1,1,5
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,250.70,403.0,"[240, 280)","[390, 460)",0,0,2,4,2,2,1,1,2,6,9
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,414.00,411.0,"[390, 460)","[390, 460)",2,1,2,5,2,2,1,1,6,6,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90761,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,0,0,0,250.13,291,458,9,,Greater than 8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,250.13,291.0,"[240, 280)","[290, 320)",0,1,2,7,1,0,0,1,2,4,9
90762,443840543,74694092,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,560.00,276.0,"[520, 580)","[240, 280)",0,0,2,8,2,2,1,1,8,2,9
90763,443846909,41088659,Caucasian,Male,[70-80),1,1,7,1,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO,38.00,590.0,"[1, 140)","[580, 630)",2,1,2,7,2,0,0,1,0,9,13
90764,443849927,31693541,Caucasian,Female,[80-90),2,3,7,10,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,996.00,285.0,"[800, 1000)","[280, 290)",2,0,2,8,2,3,0,1,15,3,9


In [14]:
#df_train['truth_le'] = lb_make.fit_transform(df_train['readmitted'])
#df_train.truth_le.value_counts()

df_train['truth_le'] = df_train.readmitted.replace({'NO':'1', '>30':'3', '<30':'2'})
df_train.truth_le.value_counts()

1    46929
3    30372
2     9893
Name: truth_le, dtype: int64

In [15]:
df_train.truth_le = df_train.truth_le.astype('int32')

In [16]:
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,bin_diag,bin_diag2,bin_diag_1,bin_diag_2,race_code,gender_code,max_glu_serum_code,age_code,A1Cresult_code,insulin_code,change_code,diabetesMed_code,bin_diag_1_code,bin_diag_2_code,num_diag,truth_le
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,398.00,427.0,"[390, 460)","[390, 460)",2,0,2,8,2,2,0,1,6,6,8,1
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,434.00,198.0,"[390, 460)","[140, 240)",2,0,2,9,2,2,0,1,6,1,8,1
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,197.00,157.0,"[140, 240)","[140, 240)",2,1,2,4,2,2,0,1,1,1,5,1
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,250.70,403.0,"[240, 280)","[390, 460)",0,0,2,4,2,2,1,1,2,6,9,3
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,414.00,411.0,"[390, 460)","[390, 460)",2,1,2,5,2,2,1,1,6,6,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90761,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,0,0,0,250.13,291,458,9,,Greater than 8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,250.13,291.0,"[240, 280)","[290, 320)",0,1,2,7,1,0,0,1,2,4,9,3
90762,443840543,74694092,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO,560.00,276.0,"[520, 580)","[240, 280)",0,0,2,8,2,2,1,1,8,2,9,1
90763,443846909,41088659,Caucasian,Male,[70-80),1,1,7,1,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO,38.00,590.0,"[1, 140)","[580, 630)",2,1,2,7,2,0,0,1,0,9,13,1
90764,443849927,31693541,Caucasian,Female,[80-90),2,3,7,10,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,996.00,285.0,"[800, 1000)","[280, 290)",2,0,2,8,2,3,0,1,15,3,9,1


In [17]:
new_df = pd.concat((df_train.iloc[:,5:15], df_train.iloc[:, 51:62]), axis = 1)
df_col_fnl = new_df.columns
trial_two = new_df.copy()
df_col_fnl #not the best of the naming conventions - but its okay, we don't need this much.

Index(['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'race_code', 'gender_code', 'max_glu_serum_code',
       'age_code', 'A1Cresult_code', 'insulin_code', 'change_code',
       'diabetesMed_code', 'bin_diag_1_code', 'bin_diag_2_code', 'num_diag'],
      dtype='object')

In [18]:
trial_two

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,race_code,gender_code,max_glu_serum_code,age_code,A1Cresult_code,insulin_code,change_code,diabetesMed_code,bin_diag_1_code,bin_diag_2_code,num_diag
0,2,1,4,13,68,2,28,0,0,0,2,0,2,8,2,2,0,1,6,6,8
1,3,3,4,12,33,3,18,0,0,0,2,0,2,9,2,2,0,1,6,1,8
2,1,1,7,1,51,0,8,0,0,0,2,1,2,4,2,2,0,1,1,1,5
3,1,1,7,9,47,2,17,0,0,0,0,0,2,4,2,2,1,1,2,6,9
4,2,1,2,3,31,6,16,0,0,0,2,1,2,5,2,2,1,1,6,6,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90761,1,3,7,3,51,0,16,0,0,0,0,1,2,7,1,0,0,1,2,4,9
90762,1,4,5,5,33,3,18,0,0,1,0,0,2,8,2,2,1,1,8,2,9
90763,1,1,7,1,53,0,9,1,0,0,2,1,2,7,2,0,0,1,0,9,13
90764,2,3,7,10,45,2,21,0,0,1,2,0,2,8,2,3,0,1,15,3,9


In [19]:
labels = np.array(df_train['truth_le'])
labels

array([1, 1, 1, ..., 1, 1, 1])

In [20]:
features_list = list(trial_two.columns)
features_list

['admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'race_code',
 'gender_code',
 'max_glu_serum_code',
 'age_code',
 'A1Cresult_code',
 'insulin_code',
 'change_code',
 'diabetesMed_code',
 'bin_diag_1_code',
 'bin_diag_2_code',
 'num_diag']

In [21]:
trial_two = np.array(trial_two)

In [35]:
train_features, test_features, train_labels, test_labels = train_test_split(trial_two, labels, test_size = 0.25, random_state = 100, stratify = labels)

In [23]:
train_features

array([[ 3,  1,  1, ...,  8,  7,  9],
       [ 1,  3,  7, ..., 16,  2,  9],
       [ 3,  3,  1, ...,  2, 12,  9],
       ...,
       [ 5,  3,  1, ...,  6,  7,  9],
       [ 1,  3,  7, ...,  2,  6,  9],
       [ 3,  1,  1, ...,  6,  6,  9]], dtype=int64)

In [24]:
test_features.shape

(21799, 21)

In [25]:
train_labels.shape

(65395,)

In [26]:
test_labels.shape

(21799,)

In [36]:
#Train the random forest classifier
rand_clas = RandomForestClassifier(n_jobs=2, random_state=0, n_estimators=100, bootstrap = True)
rand_clas.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [28]:
rand_clas.predict(test_features)

array([1, 1, 1, ..., 3, 3, 1])

In [37]:
predictions = rand_clas.predict(test_features)

In [30]:
#predictions = rand_clas.predict(train_features)
#predictions.shape

In [38]:
errors = abs(predictions - test_labels)
print('Mean Absolute Error:', round(np.mean(errors), 2), '??')

Mean Absolute Error: 0.73 ??


In [32]:
#performance matrics
#mape = 100* (errors/test_labels)
#accuracy = 100- np.mean(mape)

#print('Accuracy:', round(accuracy, 2), '%')

In [42]:
pd.crosstab(test_labels, predictions, rownames=['Actual'], colnames=['Predicted'])

Predicted,1,2,3
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,9718,28,1987
2,1482,61,930
3,4685,70,2838


In [41]:
precision_recall_fscore_support(test_labels, predictions, average='weighted')

(0.5445693796038367, 0.5787880177989816, 0.5321549595227049, None)