In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support
from sklearn.impute import SimpleImputer
np.random.seed(0)
pd.options.display.max_columns = 500
pd.options.display.max_rows = 999

In [2]:
#loading the dataset in pandas dataframe
#replacing all '?' values with NaN
df = pd.read_csv("C2T1Data//C2T1_Train.csv")
df = df.replace({'?':np.nan})

In [3]:
#checking for duplicate values, since a single patient can have multiple visits
df.duplicated(subset='patient_nbr2').sum()

25001

In [4]:
#checking missing values in the dataset
def missing_values_table(df):
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        return mis_val_table_ren_columns
missing_values_table(df)

Your selected dataframe has 50 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
weight,87840,96.8
medical_specialty,43674,48.1
payer_code,38730,42.7
race,2207,2.4
diag_3,1357,1.5
diag_2,336,0.4
diag_1,17,0.0


In [5]:
#removing every column with more than 90% of missing values
#df_train = df_train.drop(columns=['weight', 'medical_specialty', 'payer_code'])
df = df.drop(columns=['weight'])

In [6]:
#removing the rows that have missing values in column 'race'
df = df.dropna(subset=['race', 'diag_3', 'diag_1', 'diag_2'])

In [8]:
#imputing most frequent values in the missing cells
imp = SimpleImputer(strategy = "most_frequent")
df_train = pd.DataFrame(
     imp.fit_transform(df), columns=df.columns
).astype(df.dtypes.to_dict()).copy()
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,MC,InternalMedicine,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,MC,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,MC,InternalMedicine,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,MC,InternalMedicine,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,MC,InternalMedicine,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87190,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,MC,InternalMedicine,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
87191,443840543,74694092,AfricanAmerican,Female,[80-90),1,4,5,5,MC,InternalMedicine,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,NO
87192,443846909,41088659,Caucasian,Male,[70-80),1,1,7,1,MC,InternalMedicine,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,NO
87193,443849927,31693541,Caucasian,Female,[80-90),2,3,7,10,MC,Surgery-General,45,2,21,0,0,1,996,285,998,9,,,No,No,No,No,No,No,Steady,No,No,Steady,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO


In [9]:
#dropping duplicate values in the dataset
df_train = df_train.drop_duplicates(subset='patient_nbr2')

In [10]:
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,MC,InternalMedicine,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,MC,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,MC,InternalMedicine,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,MC,InternalMedicine,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,MC,InternalMedicine,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87185,443834831,140199364,Other,Female,[60-70),1,1,7,2,MD,InternalMedicine,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
87187,443835101,120975184,Caucasian,Female,[80-90),1,1,7,5,MC,InternalMedicine,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
87188,443835539,86472113,Caucasian,Male,[80-90),1,1,7,1,MC,InternalMedicine,1,0,15,3,0,0,435,784,250,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
87190,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,MC,InternalMedicine,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30


In [12]:
for x in df_train.columns:
    print(df_train[x].value_counts())

165283839    1
118527495    1
75716463     1
35304963     1
78595581     1
            ..
123769209    1
115532151    1
31538373     1
100462965    1
41289729     1
Name: encounter_id2, Length: 62954, dtype: int64
57727004     1
7873745      1
23520371     1
69393614     1
1756364      1
            ..
104654390    1
20774453     1
24704564     1
88399409     1
5638145      1
Name: patient_nbr2, Length: 62954, dtype: int64
Caucasian          48435
AfricanAmerican    11634
Hispanic            1363
Other               1063
Asian                459
Name: race, dtype: int64
Female             33569
Male               29384
Unknown/Invalid        1
Name: gender, dtype: int64
[70-80)     16259
[60-70)     14141
[50-60)     10972
[80-90)     10303
[40-50)      5943
[30-40)      2321
[90-100)     1698
[20-30)       928
[10-20)       327
[0-10)         62
Name: age, dtype: int64
1    32384
3    12094
2    11379
6     4250
5     2572
8      249
7       17
4        9
Name: admission_type_id, dtyp

None    52004
>8       5267
Norm     3205
>7       2478
Name: A1Cresult, dtype: int64
No        49935
Steady    11921
Up          721
Down        377
Name: metformin, dtype: int64
No        62112
Steady      751
Up           61
Down         30
Name: repaglinide, dtype: int64
No        62517
Steady      413
Up           16
Down          8
Name: nateglinide, dtype: int64
No        62890
Steady       59
Up            4
Down          1
Name: chlorpropamide, dtype: int64
No        59659
Steady     2961
Up          208
Down        126
Name: glimepiride, dtype: int64
No        62953
Steady        1
Name: acetohexamide, dtype: int64
No        54904
Steady     7182
Up          532
Down        336
Name: glipizide, dtype: int64
No        55950
Steady     6068
Up          554
Down        382
Name: glyburide, dtype: int64
No        62937
Steady       17
Name: tolbutamide, dtype: int64
No        58347
Steady     4378
Up          157
Down         72
Name: pioglitazone, dtype: int64
No        58897
St

In [None]:
rosiglitazone, pioglitazone, glyburide, glipizide, glimepiride, metformin

In [13]:
#replacing values in columns to a legit string to avoid any errors
df_train.A1Cresult = df_train.A1Cresult.replace({'>8':'Greater than 8', '>7':'Greater than 7'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [15]:
df_train.gender.value_counts()

Female    33569
Male      29384
Name: gender, dtype: int64

In [14]:
#removing the Undknow/Invalid enter in the gender #it would have been of no use!
df_train = df_train[df_train.gender != 'Unknown/Invalid']

In [16]:
#convert values starting with 'V' and 'E' to a numeric value so that Binning can be used on the whole column in diag_1
df_train.loc[df_train['diag_1'].str.contains('V',na=False), 'diag_1'] = 1000
df_train.loc[df_train['diag_1'].str.contains('E',na=False), 'diag_1'] = 1300

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [17]:
#convert values starting with 'V' and 'E' to a numeric value so that Binning can be used on the whole column in diag_2
df_train.loc[df_train['diag_2'].str.contains('V',na=False), 'diag_2'] = 1000
df_train.loc[df_train['diag_2'].str.contains('E',na=False), 'diag_2'] = 1300

In [18]:
#applying binning to diag_1 by taking its float value into bin_diag and then binning in values of bin_diag in bin_diag_1
#binning intervals are selected from ICD-9-CM codes
bins = [1, 140, 240, 280, 290, 320, 390, 460, 520, 580, 630, 680, 710, 740, 760, 780, 800, 1000, 1200, 1400]
df_train['bin_diag'] = df_train.diag_1.astype('float')
df_train['bin_diag2'] = df_train.diag_2.astype('float')
df_train['bin_diag_1'] = pd.cut(df_train['bin_diag'], bins=bins, right=False)
df_train['bin_diag_2'] = pd.cut(df_train['bin_diag2'], bins=bins, right=False)
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,bin_diag,bin_diag2,bin_diag_1,bin_diag_2
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,MC,InternalMedicine,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,398.00,427.0,"[390, 460)","[390, 460)"
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,MC,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,434.00,198.0,"[390, 460)","[140, 240)"
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,MC,InternalMedicine,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,197.00,157.0,"[140, 240)","[140, 240)"
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,MC,InternalMedicine,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,250.70,403.0,"[240, 280)","[390, 460)"
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,MC,InternalMedicine,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,414.00,411.0,"[390, 460)","[390, 460)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87185,443834831,140199364,Other,Female,[60-70),1,1,7,2,MD,InternalMedicine,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,996.00,585.0,"[800, 1000)","[580, 630)"
87187,443835101,120975184,Caucasian,Female,[80-90),1,1,7,5,MC,InternalMedicine,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,292.00,8.0,"[290, 320)","[1, 140)"
87188,443835539,86472113,Caucasian,Male,[80-90),1,1,7,1,MC,InternalMedicine,1,0,15,3,0,0,435,784,250,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,435.00,784.0,"[390, 460)","[780, 800)"
87190,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,MC,InternalMedicine,51,0,16,0,0,0,250.13,291,458,9,,Greater than 8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,250.13,291.0,"[240, 280)","[290, 320)"


In [23]:
#now labeling all of the categorical features/columns using sklearn library (LabelEncoder)
#better coding practice can be followed to implement the below code - I would optimize this after completing the code!

lb_make = LabelEncoder()
df_train['payer_code_1'] = lb_make.fit_transform(df_train['payer_code'])
df_train['race_code'] = lb_make.fit_transform(df_train['race'])
df_train['gender_code'] = lb_make.fit_transform(df_train['gender'])
df_train['max_glu_serum_code'] = lb_make.fit_transform(df_train['max_glu_serum'])
df_train['age_code'] = lb_make.fit_transform(df_train['age'])
df_train['A1Cresult_code'] = lb_make.fit_transform(df_train['A1Cresult'])
df_train['insulin_code'] = lb_make.fit_transform(df_train['insulin'])
df_train['change_code'] = lb_make.fit_transform(df_train['change'])
df_train['diabetesMed_code'] = lb_make.fit_transform(df_train['diabetesMed'])
df_train['bin_diag_1_code'] = lb_make.fit_transform(df_train['bin_diag_1'])
df_train['bin_diag_2_code'] = lb_make.fit_transform(df_train['bin_diag_2'])
df_train['num_diag'] = df_train['number_diagnoses']
df_train['rosiglitazone_code'] = lb_make.fit_transform(df_train['rosiglitazone'])
df_train['pioglitazone_code'] =lb_make.fit_transform( df_train['pioglitazone'])
df_train['glyburide_code'] = lb_make.fit_transform(df_train['glyburide'])
df_train['glipizide_code'] = lb_make.fit_transform(df_train['glipizide'])
df_train['metformin_code'] = lb_make.fit_transform(df_train['metformin'])
df_train['glimepiride_code'] = lb_make.fit_transform(df_train['glimepiride'])

df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/panda

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,bin_diag,bin_diag2,bin_diag_1,bin_diag_2,payer_code_1,race_code,gender_code,max_glu_serum_code,age_code,A1Cresult_code,insulin_code,change_code,diabetesMed_code,bin_diag_1_code,bin_diag_2_code,num_diag,rosiglitazone_code,pioglitazone_code,glyburide_code,glipizide_code,metformin_code,glimepiride_code,truth_le
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,MC,InternalMedicine,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,398.00,427.0,"[390, 460)","[390, 460)",6,2,0,2,8,2,2,0,1,6,6,8,1,1,1,2,1,1,1
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,MC,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,434.00,198.0,"[390, 460)","[140, 240)",6,2,0,2,9,2,2,0,1,6,1,8,2,1,1,1,1,1,1
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,MC,InternalMedicine,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,197.00,157.0,"[140, 240)","[140, 240)",6,2,1,2,4,2,2,0,1,1,1,5,1,1,1,2,1,1,1
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,MC,InternalMedicine,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,250.70,403.0,"[240, 280)","[390, 460)",6,0,0,2,4,2,2,1,1,2,6,9,1,1,1,1,1,1,3
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,MC,InternalMedicine,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,414.00,411.0,"[390, 460)","[390, 460)",6,2,1,2,5,2,2,1,1,6,6,9,1,1,1,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87185,443834831,140199364,Other,Female,[60-70),1,1,7,2,MD,InternalMedicine,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,996.00,585.0,"[800, 1000)","[580, 630)",7,4,0,2,6,2,2,1,1,15,9,9,1,1,1,1,1,1,3
87187,443835101,120975184,Caucasian,Female,[80-90),1,1,7,5,MC,InternalMedicine,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,292.00,8.0,"[290, 320)","[1, 140)",6,2,0,2,8,2,3,0,1,4,0,9,1,1,1,1,1,1,1
87188,443835539,86472113,Caucasian,Male,[80-90),1,1,7,1,MC,InternalMedicine,1,0,15,3,0,0,435,784,250,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,435.00,784.0,"[390, 460)","[780, 800)",6,2,1,2,8,2,3,0,1,6,14,7,1,1,1,1,1,1,1
87190,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,MC,InternalMedicine,51,0,16,0,0,0,250.13,291,458,9,,Greater than 8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,250.13,291.0,"[240, 280)","[290, 320)",6,0,1,2,7,1,0,0,1,2,4,9,1,1,1,1,2,1,3


In [20]:
df_train['truth_le'] = df_train.readmitted.replace({'NO':'1', '>30':'3', '<30':'2'})
df_train.truth_le.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


1    37396
3    19872
2     5685
Name: truth_le, dtype: int64

In [21]:
df_train.truth_le = df_train.truth_le.astype('int32')

In [24]:
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,bin_diag,bin_diag2,bin_diag_1,bin_diag_2,payer_code_1,race_code,gender_code,max_glu_serum_code,age_code,A1Cresult_code,insulin_code,change_code,diabetesMed_code,bin_diag_1_code,bin_diag_2_code,num_diag,rosiglitazone_code,pioglitazone_code,glyburide_code,glipizide_code,metformin_code,glimepiride_code,truth_le
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,MC,InternalMedicine,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,398.00,427.0,"[390, 460)","[390, 460)",6,2,0,2,8,2,2,0,1,6,6,8,1,1,1,2,1,1,1
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,MC,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,434.00,198.0,"[390, 460)","[140, 240)",6,2,0,2,9,2,2,0,1,6,1,8,2,1,1,1,1,1,1
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,MC,InternalMedicine,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,197.00,157.0,"[140, 240)","[140, 240)",6,2,1,2,4,2,2,0,1,1,1,5,1,1,1,2,1,1,1
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,MC,InternalMedicine,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,250.70,403.0,"[240, 280)","[390, 460)",6,0,0,2,4,2,2,1,1,2,6,9,1,1,1,1,1,1,3
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,MC,InternalMedicine,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,414.00,411.0,"[390, 460)","[390, 460)",6,2,1,2,5,2,2,1,1,6,6,9,1,1,1,1,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87185,443834831,140199364,Other,Female,[60-70),1,1,7,2,MD,InternalMedicine,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,996.00,585.0,"[800, 1000)","[580, 630)",7,4,0,2,6,2,2,1,1,15,9,9,1,1,1,1,1,1,3
87187,443835101,120975184,Caucasian,Female,[80-90),1,1,7,5,MC,InternalMedicine,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,292.00,8.0,"[290, 320)","[1, 140)",6,2,0,2,8,2,3,0,1,4,0,9,1,1,1,1,1,1,1
87188,443835539,86472113,Caucasian,Male,[80-90),1,1,7,1,MC,InternalMedicine,1,0,15,3,0,0,435,784,250,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,435.00,784.0,"[390, 460)","[780, 800)",6,2,1,2,8,2,3,0,1,6,14,7,1,1,1,1,1,1,1
87190,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,MC,InternalMedicine,51,0,16,0,0,0,250.13,291,458,9,,Greater than 8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,250.13,291.0,"[240, 280)","[290, 320)",6,0,1,2,7,1,0,0,1,2,4,9,1,1,1,1,2,1,3


In [48]:
new_df = pd.concat((df_train.iloc[:,5:9], df_train.iloc[:,11:17], df_train.iloc[:,20:21], df_train.iloc[:, 54:71]), axis = 1)
df_col_fnl = new_df.columns
trial_two = new_df.copy()
df_col_fnl #not the best of the naming conventions - but its okay, we don't need this much.

Index(['admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses', 'race_code', 'gender_code',
       'max_glu_serum_code', 'age_code', 'A1Cresult_code', 'insulin_code',
       'change_code', 'diabetesMed_code', 'bin_diag_1_code', 'bin_diag_2_code',
       'num_diag', 'rosiglitazone_code', 'pioglitazone_code', 'glyburide_code',
       'glipizide_code', 'metformin_code', 'glimepiride_code'],
      dtype='object')

In [49]:
trial_two

Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_code,gender_code,max_glu_serum_code,age_code,A1Cresult_code,insulin_code,change_code,diabetesMed_code,bin_diag_1_code,bin_diag_2_code,num_diag,rosiglitazone_code,pioglitazone_code,glyburide_code,glipizide_code,metformin_code,glimepiride_code
0,2,1,4,13,68,2,28,0,0,0,8,2,0,2,8,2,2,0,1,6,6,8,1,1,1,2,1,1
1,3,3,4,12,33,3,18,0,0,0,8,2,0,2,9,2,2,0,1,6,1,8,2,1,1,1,1,1
2,1,1,7,1,51,0,8,0,0,0,5,2,1,2,4,2,2,0,1,1,1,5,1,1,1,2,1,1
3,1,1,7,9,47,2,17,0,0,0,9,0,0,2,4,2,2,1,1,2,6,9,1,1,1,1,1,1
4,2,1,2,3,31,6,16,0,0,0,9,2,1,2,5,2,2,1,1,6,6,9,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87185,1,1,7,2,46,6,17,1,1,1,9,4,0,2,6,2,2,1,1,15,9,9,1,1,1,1,1,1
87187,1,1,7,5,76,1,22,0,1,0,9,2,0,2,8,2,3,0,1,4,0,9,1,1,1,1,1,1
87188,1,1,7,1,1,0,15,3,0,0,7,2,1,2,8,2,3,0,1,6,14,7,1,1,1,1,1,1
87190,1,3,7,3,51,0,16,0,0,0,9,0,1,2,7,1,0,0,1,2,4,9,1,1,1,1,2,1


In [27]:
labels = np.array(df_train['truth_le'])
labels

array([1, 1, 1, ..., 1, 3, 1])

In [28]:
features_list = list(trial_two.columns)
features_list

['admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses',
 'race_code',
 'gender_code',
 'max_glu_serum_code',
 'age_code',
 'A1Cresult_code',
 'insulin_code',
 'change_code',
 'diabetesMed_code',
 'bin_diag_1_code',
 'bin_diag_2_code',
 'num_diag',
 'rosiglitazone_code',
 'pioglitazone_code',
 'glyburide_code',
 'glipizide_code',
 'metformin_code',
 'glimepiride_code']

In [29]:
trial_two = np.array(trial_two)

In [30]:
train_features, test_features, train_labels, test_labels = train_test_split(trial_two, labels, test_size = 0.25, random_state = 42, stratify = labels)

In [31]:
train_features

array([[ 3,  1,  7, ...,  1,  1,  1],
       [ 3,  1,  7, ...,  1,  1,  1],
       [ 2,  1,  7, ...,  1,  1,  1],
       ...,
       [ 5,  1, 17, ...,  1,  1,  1],
       [ 1,  1,  7, ...,  1,  1,  1],
       [ 5,  1, 17, ...,  1,  1,  1]], dtype=int64)

In [32]:
test_features.shape

(15739, 28)

In [33]:
train_labels.shape

(47214,)

In [34]:
test_labels.shape

(15739,)

In [35]:
#Train the random forest classifier
rand_clas = RandomForestClassifier(n_jobs=2, random_state=0, n_estimators=500, bootstrap = True)
rand_clas.fit(train_features, train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [36]:
predictions = rand_clas.predict(test_features)

In [132]:
#predictions = rand_clas.predict(train_features)
#predictions.shape

In [37]:
errors = abs(predictions - test_labels)
print('Mean Absolute Error:', round(np.mean(errors), 2),'??')

Mean Absolute Error: 0.7 ??


In [134]:
#performance matrics
#mape = 100* (errors/test_labels)
#accuracy = 100- np.mean(mape)

#print('Accuracy:', round(accuracy, 2), '%')

In [38]:
pd.crosstab(test_labels, predictions, rownames=['Actual'], colnames=['Predicted'])

Predicted,1,2,3
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,8483,4,863
2,1105,5,311
3,3897,2,1069


In [39]:
precision_recall_fscore_support(test_labels, predictions, average='micro')

(0.6072177393735307, 0.6072177393735307, 0.6072177393735307, None)

In [40]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(n_estimators = 100),
                                sampling_strategy='all', n_estimators = 10,
                                replacement=True,
                                random_state=0)

#Train the classifier.
bbc.fit(train_features, train_labels)
pred_bbc = bbc.predict(test_features)

In [41]:
pd.crosstab(test_labels, pred_bbc, rownames=['Actual'], colnames=['Predicted'])

Predicted,1,2,3
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5971,568,2811
2,559,230,632
3,2126,395,2447


In [139]:
precision_recall_fscore_support(test_labels, pred_bbc, average='micro')

(0.5503526272317174, 0.5503526272317174, 0.5503526272317174, None)

In [42]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

#Create an object of the classifier.
bbc = BalancedBaggingClassifier(base_estimator=RandomForestClassifier(n_estimators = 100),
                                sampling_strategy='not majority', n_estimators = 10,
                                replacement=True,
                                random_state=0)

#y_train = credit_df['Class']
#X_train = credit_df.drop(['Class'], axis=1, inplace=False)

#Train the classifier.
bbc.fit(train_features, train_labels)
pred_bbc = bbc.predict(test_features)

In [43]:
pd.crosstab(test_labels, pred_bbc, rownames=['Actual'], colnames=['Predicted'])

Predicted,1,2,3
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,9346,3,1
2,1405,10,6
3,4941,14,13


In [44]:
precision_recall_fscore_support(test_labels, pred_bbc, average='micro')

(0.5952728890018426, 0.5952728890018426, 0.5952728890018426, None)

In [45]:
df_train['truth_le_1'] = df_train.readmitted.replace({'NO':'1', '>30':'2', '<30':'2'})
df_train.truth_le_1.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


1    37396
2    25557
Name: truth_le_1, dtype: int64

In [46]:
df_train.truth_le = df_train.truth_le.astype('int32')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [47]:
df_train

Unnamed: 0,encounter_id2,patient_nbr2,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,bin_diag,bin_diag2,bin_diag_1,bin_diag_2,payer_code_1,race_code,gender_code,max_glu_serum_code,age_code,A1Cresult_code,insulin_code,change_code,diabetesMed_code,bin_diag_1_code,bin_diag_2_code,num_diag,rosiglitazone_code,pioglitazone_code,glyburide_code,glipizide_code,metformin_code,glimepiride_code,truth_le,truth_le_1
0,5283,48330653,Caucasian,Female,[80-90),2,1,4,13,MC,InternalMedicine,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,398.00,427.0,"[390, 460)","[390, 460)",6,2,0,2,8,2,2,0,1,6,6,8,1,1,1,2,1,1,1,1
1,8499,63555809,Caucasian,Female,[90-100),3,3,4,12,MC,InternalMedicine,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,434.00,198.0,"[390, 460)","[140, 240)",6,2,0,2,9,2,2,0,1,6,1,8,2,1,1,1,1,1,1,1
2,9441,42519137,Caucasian,Male,[40-50),1,1,7,1,MC,InternalMedicine,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO,197.00,157.0,"[140, 240)","[140, 240)",6,2,1,2,4,2,2,0,1,1,1,5,1,1,1,2,1,1,1,1
3,20997,89868902,AfricanAmerican,Female,[40-50),1,1,7,9,MC,InternalMedicine,47,2,17,0,0,0,250.7,403,996,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,250.70,403.0,"[240, 280)","[390, 460)",6,0,0,2,4,2,2,1,1,2,6,9,1,1,1,1,1,1,3,2
4,28515,82637321,Caucasian,Male,[50-60),2,1,2,3,MC,InternalMedicine,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,414.00,411.0,"[390, 460)","[390, 460)",6,2,1,2,5,2,2,1,1,6,6,9,1,1,1,1,1,1,3,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87185,443834831,140199364,Other,Female,[60-70),1,1,7,2,MD,InternalMedicine,46,6,17,1,1,1,996,585,403,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30,996.00,585.0,"[800, 1000)","[580, 630)",7,4,0,2,6,2,2,1,1,15,9,9,1,1,1,1,1,1,3,2
87187,443835101,120975184,Caucasian,Female,[80-90),1,1,7,5,MC,InternalMedicine,76,1,22,0,1,0,292,8,304,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,292.00,8.0,"[290, 320)","[1, 140)",6,2,0,2,8,2,3,0,1,4,0,9,1,1,1,1,1,1,1,1
87188,443835539,86472113,Caucasian,Male,[80-90),1,1,7,1,MC,InternalMedicine,1,0,15,3,0,0,435,784,250,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO,435.00,784.0,"[390, 460)","[780, 800)",6,2,1,2,8,2,3,0,1,6,14,7,1,1,1,1,1,1,1,1
87190,443840309,100162346,AfricanAmerican,Male,[70-80),1,3,7,3,MC,InternalMedicine,51,0,16,0,0,0,250.13,291,458,9,,Greater than 8,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30,250.13,291.0,"[240, 280)","[290, 320)",6,0,1,2,7,1,0,0,1,2,4,9,1,1,1,1,2,1,3,2


In [50]:
labels_1 = np.array(df_train['truth_le_1'])
labels_1

array(['1', '1', '1', ..., '1', '2', '1'], dtype=object)

In [52]:
train_features_1, test_features_1, train_labels_1, test_labels_1 = train_test_split(trial_two, labels_1, test_size = 0.25, random_state = 42, stratify = labels_1)

In [53]:
#Train the random forest classifier
rand_clas = RandomForestClassifier(n_jobs=2, random_state=0, n_estimators=500, bootstrap = True)
rand_clas.fit(train_features_1, train_labels_1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=2,
                       oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [56]:
predictions_1 = rand_clas.predict(test_features_1)

In [57]:
pd.crosstab(test_labels_1, predictions_1, rownames=['Actual'], colnames=['Predicted'])

Predicted,1,2
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1
1,7666,1683
2,4168,2222


In [None]:
l