## Import the needed Libraries

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# data partition
from sklearn.model_selection import train_test_split

#filter methods
# spearman 
# chi-square
import scipy.stats as stats
from scipy.stats import chi2_contingency

#wrapper methods
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import RFE


# embedded methods
from sklearn.linear_model import LassoCV

import warnings
warnings.filterwarnings('ignore')

## Import the dataset

In [59]:
train = pd.read_csv('/Users/tomasverissimo/Desktop/Master/1st Year/1st Semester/Machine Learning/Project/project_data/train.csv')
train.head()

Unnamed: 0,encounter_id,country,patient_id,race,gender,age,weight,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,...,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary,readmitted_multiclass
0,533253,USA,70110,Caucasian,Female,[70-80),?,?,0,0,...,276,466,8,,,No,No,[],No,>30 days
1,426224,USA,29775006,AfricanAmerican,Male,[50-60),?,?,0,0,...,785,162,9,,,No,Yes,['insulin'],No,No
2,634063,USA,80729253,Caucasian,Female,[60-70),?,?,0,0,...,135,250,6,,,Ch,Yes,"['glimepiride', 'insulin']",No,No
3,890610,USA,2919042,AfricanAmerican,Male,[60-70),?,MC,0,0,...,562,455,5,,,No,No,[],No,No
4,654194,USA,84871971,Caucasian,Female,[70-80),?,HM,1,0,...,599,428,9,,,No,No,[],No,>30 days


## See the data

In [23]:
train.set_index('patient_id', inplace = True)

In [24]:
train['readmitted_binary'].unique()

array(['No', 'Yes'], dtype=object)

In [25]:
train.describe(include = 'all').T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
encounter_id,71236.0,,,,548798.623716,259704.723154,100000.0,323118.5,548370.0,774058.5,999980.0
country,71236.0,1.0,USA,71236.0,,,,,,,
race,67682.0,6.0,Caucasian,50693.0,,,,,,,
gender,71236.0,3.0,Female,38228.0,,,,,,,
age,67679.0,10.0,[70-80),17359.0,,,,,,,
weight,71236.0,10.0,?,68990.0,,,,,,,
payer_code,71236.0,18.0,?,28201.0,,,,,,,
outpatient_visits_in_previous_year,71236.0,,,,0.369588,1.287469,0.0,0.0,0.0,0.0,42.0
emergency_visits_in_previous_year,71236.0,,,,0.196249,0.910854,0.0,0.0,0.0,0.0,76.0
inpatient_visits_in_previous_year,71236.0,,,,0.640154,1.267271,0.0,0.0,0.0,1.0,21.0


In [26]:
train.dtypes

encounter_id                              int64
country                                  object
race                                     object
gender                                   object
age                                      object
weight                                   object
payer_code                               object
outpatient_visits_in_previous_year        int64
emergency_visits_in_previous_year         int64
inpatient_visits_in_previous_year         int64
admission_type                           object
medical_specialty                        object
average_pulse_bpm                         int64
discharge_disposition                    object
admission_source                         object
length_of_stay_in_hospital                int64
number_lab_tests                          int64
non_lab_procedures                        int64
number_of_medications                     int64
primary_diagnosis                        object
secondary_diagnosis                     

In [60]:
non_metric_features = ['country', 'race', 'gender', 'age', 'payer_code', 'admission_type', 'medical_specialty', 'discharge_disposition', 'admission_source', 'primary_diagnosis', 'secondary_diagnosis', 'additional_diagnosis', 'change_in_meds_during_hospitalization', 'prescribed_diabetes_meds', 'medication']
metric_features = train.columns.drop(non_metric_features).tolist()

## Treatment of missing values

In [28]:
train_central = train.copy()

In [29]:
train_central.isna().sum()

encounter_id                                 0
country                                      0
race                                      3554
gender                                       0
age                                       3557
weight                                       0
payer_code                                   0
outpatient_visits_in_previous_year           0
emergency_visits_in_previous_year            0
inpatient_visits_in_previous_year            0
admission_type                            3706
medical_specialty                            0
average_pulse_bpm                            0
discharge_disposition                     2590
admission_source                          4718
length_of_stay_in_hospital                   0
number_lab_tests                             0
non_lab_procedures                           0
number_of_medications                        0
primary_diagnosis                            0
secondary_diagnosis                          0
additional_di

In [30]:
train_central.replace('?', np.nan, inplace = True)
train_central.replace('[]', np.nan, inplace = True)


In [31]:
train_central.drop(['weight','glucose_test_result', 'a1c_test_result'], axis=1, inplace=True)

In [32]:
train_central.drop(['readmitted_multiclass'], axis=1, inplace=True)

In [33]:
train_central

Unnamed: 0_level_0,encounter_id,country,race,gender,age,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,...,non_lab_procedures,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70110,533253,USA,Caucasian,Female,[70-80),,0,0,2,Emergency,...,0,20,515,276,466,8,No,No,,No
29775006,426224,USA,AfricanAmerican,Male,[50-60),,0,0,0,Emergency,...,0,25,38,785,162,9,No,Yes,['insulin'],No
80729253,634063,USA,Caucasian,Female,[60-70),,0,0,1,,...,1,22,534,135,250,6,Ch,Yes,"['glimepiride', 'insulin']",No
2919042,890610,USA,AfricanAmerican,Male,[60-70),MC,0,0,1,Emergency,...,2,9,569,562,455,5,No,No,,No
84871971,654194,USA,Caucasian,Female,[70-80),HM,1,0,0,Elective,...,1,15,715,599,428,9,No,No,,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24531381,660160,USA,AfricanAmerican,Female,[80-90),MC,0,1,0,Elective,...,1,18,453,786,250,9,Ch,Yes,"['metformin', 'glyburide', 'insulin']",No
4663818,826429,USA,AfricanAmerican,Female,[70-80),,0,0,0,Urgent,...,0,9,157,197,V66,6,Ch,Yes,"['metformin', 'glyburide']",No
23397147,332030,USA,Caucasian,Female,[60-70),,0,2,2,,...,0,24,428,491,276,9,Ch,Yes,"['glyburide', 'insulin']",Yes
52161750,757560,USA,Caucasian,Male,[60-70),BC,0,0,2,Emergency,...,1,13,820,157,250.8,5,Ch,Yes,"['glyburide', 'insulin']",No


In [34]:
medians = train_central.median(numeric_only = True)
medians
train_central.fillna(medians, inplace = True)

In [35]:
modes = train_central[non_metric_features].mode().loc[0]

In [36]:
train_central.fillna(modes, inplace= True)

In [37]:
# Seeing rows with NaNs
nans_index = train_central.isna().any(axis=1)
train_central[nans_index] #this function is like a filter, only shows the ones that match the condition of the variable, 
#i.e. that have missing values

Unnamed: 0_level_0,encounter_id,country,race,gender,age,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,...,non_lab_procedures,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [38]:
train_central

Unnamed: 0_level_0,encounter_id,country,race,gender,age,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,admission_type,...,non_lab_procedures,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication,readmitted_binary
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70110,533253,USA,Caucasian,Female,[70-80),MC,0,0,2,Emergency,...,0,20,515,276,466,8,No,No,['insulin'],No
29775006,426224,USA,AfricanAmerican,Male,[50-60),MC,0,0,0,Emergency,...,0,25,38,785,162,9,No,Yes,['insulin'],No
80729253,634063,USA,Caucasian,Female,[60-70),MC,0,0,1,Emergency,...,1,22,534,135,250,6,Ch,Yes,"['glimepiride', 'insulin']",No
2919042,890610,USA,AfricanAmerican,Male,[60-70),MC,0,0,1,Emergency,...,2,9,569,562,455,5,No,No,['insulin'],No
84871971,654194,USA,Caucasian,Female,[70-80),HM,1,0,0,Elective,...,1,15,715,599,428,9,No,No,['insulin'],No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24531381,660160,USA,AfricanAmerican,Female,[80-90),MC,0,1,0,Elective,...,1,18,453,786,250,9,Ch,Yes,"['metformin', 'glyburide', 'insulin']",No
4663818,826429,USA,AfricanAmerican,Female,[70-80),MC,0,0,0,Urgent,...,0,9,157,197,V66,6,Ch,Yes,"['metformin', 'glyburide']",No
23397147,332030,USA,Caucasian,Female,[60-70),MC,0,2,2,Emergency,...,0,24,428,491,276,9,Ch,Yes,"['glyburide', 'insulin']",Yes
52161750,757560,USA,Caucasian,Male,[60-70),BC,0,0,2,Emergency,...,1,13,820,157,250.8,5,Ch,Yes,"['glyburide', 'insulin']",No


In [61]:
train_central.isna().sum()

encounter_id             0
race                     0
gender                   0
age                      0
payer_code               0
outpatient_visits        0
emergency_visits         0
inpatient_visits         0
admission_type           0
medical_specialty        0
average_pulse_bpm        0
discharge_disposition    0
admission_source         0
length_of_stay           0
number_lab_tests         0
non_lab_procedures       0
number_of_medications    0
primary_diagnosis        0
secondary_diagnosis      0
additional_diagnosis     0
number_diagnoses         0
change_in_meds           0
diabetes_meds            0
medication               0
readmitted_binary        0
dtype: int64

## Substitute column names

In [41]:
new_column_names = {'outpatient_visits_in_previous_year': 'outpatient_visits', 
                    'emergency_visits_in_previous_year': 'emergency_visits',
                    'inpatient_visits_in_previous_year': 'inpatient_visits',
                    'change_in_meds_during_hospitalization': 'change_in_meds',
                    'length_of_stay_in_hospital': 'length_of_stay',
                    'prescribed_diabetes_meds': 'diabetes_meds'
                   }
train_central = train_central.rename(columns=new_column_names)

In [42]:
train_central.head()

Unnamed: 0_level_0,encounter_id,country,race,gender,age,payer_code,outpatient_visits,emergency_visits,inpatient_visits,admission_type,...,non_lab_procedures,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,change_in_meds,diabetes_meds,medication,readmitted_binary
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70110,533253,USA,Caucasian,Female,[70-80),MC,0,0,2,Emergency,...,0,20,515,276,466,8,No,No,['insulin'],No
29775006,426224,USA,AfricanAmerican,Male,[50-60),MC,0,0,0,Emergency,...,0,25,38,785,162,9,No,Yes,['insulin'],No
80729253,634063,USA,Caucasian,Female,[60-70),MC,0,0,1,Emergency,...,1,22,534,135,250,6,Ch,Yes,"['glimepiride', 'insulin']",No
2919042,890610,USA,AfricanAmerican,Male,[60-70),MC,0,0,1,Emergency,...,2,9,569,562,455,5,No,No,['insulin'],No
84871971,654194,USA,Caucasian,Female,[70-80),HM,1,0,0,Elective,...,1,15,715,599,428,9,No,No,['insulin'],No


In [62]:
train_central.drop('country', axis = 1, inplace = True)

KeyError: "['country'] not found in axis"

## Categorical Data Encoding

In [54]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
label_encoder.fit(train_central['age'])
train_central['age'] = label_encoder.transform(train_central['age'])
train_central

Unnamed: 0_level_0,encounter_id,race,gender,age,payer_code,outpatient_visits,emergency_visits,inpatient_visits,admission_type,medical_specialty,...,non_lab_procedures,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,change_in_meds,diabetes_meds,medication,readmitted_binary
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
70110,533253,Caucasian,Female,7,MC,0,0,2,Emergency,Family/GeneralPractice,...,0,20,515,276,466,8,No,No,['insulin'],No
29775006,426224,AfricanAmerican,Male,5,MC,0,0,0,Emergency,InternalMedicine,...,0,25,38,785,162,9,No,Yes,['insulin'],No
80729253,634063,Caucasian,Female,6,MC,0,0,1,Emergency,Family/GeneralPractice,...,1,22,534,135,250,6,Ch,Yes,"['glimepiride', 'insulin']",No
2919042,890610,AfricanAmerican,Male,6,MC,0,0,1,Emergency,InternalMedicine,...,2,9,569,562,455,5,No,No,['insulin'],No
84871971,654194,Caucasian,Female,7,HM,1,0,0,Elective,InternalMedicine,...,1,15,715,599,428,9,No,No,['insulin'],No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24531381,660160,AfricanAmerican,Female,8,MC,0,1,0,Elective,InternalMedicine,...,1,18,453,786,250,9,Ch,Yes,"['metformin', 'glyburide', 'insulin']",No
4663818,826429,AfricanAmerican,Female,7,MC,0,0,0,Urgent,InternalMedicine,...,0,9,157,197,V66,6,Ch,Yes,"['metformin', 'glyburide']",No
23397147,332030,Caucasian,Female,6,MC,0,2,2,Emergency,InternalMedicine,...,0,24,428,491,276,9,Ch,Yes,"['glyburide', 'insulin']",Yes
52161750,757560,Caucasian,Male,6,BC,0,0,2,Emergency,Emergency/Trauma,...,1,13,820,157,250.8,5,Ch,Yes,"['glyburide', 'insulin']",No


In [55]:
from sklearn.preprocessing import OrdinalEncoder
enc1 = OrdinalEncoder() 

In [56]:
numerical_columns = train_central.select_dtypes(include=np.number).columns
categorical_columns = pd.DataFrame(train_central.drop(numerical_columns, axis=1))

In [57]:
enc1.fit(categorical_columns)
train_encoded = pd.DataFrame(enc1.transform(categorical_columns), columns = categorical_columns.columns)
train_encoded.set_index(train.index, inplace=True)
train_encoded

Unnamed: 0_level_0,race,gender,payer_code,admission_type,medical_specialty,discharge_disposition,admission_source,primary_diagnosis,secondary_diagnosis,additional_diagnosis,change_in_meds,diabetes_meds,medication,readmitted_binary
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
70110,2.0,0.0,7.0,1.0,10.0,1.0,1.0,337.0,122.0,291.0,1.0,0.0,92.0,0.0
29775006,0.0,1.0,7.0,1.0,17.0,4.0,12.0,229.0,485.0,24.0,1.0,1.0,92.0,0.0
80729253,2.0,0.0,7.0,1.0,10.0,1.0,1.0,355.0,10.0,80.0,0.0,1.0,25.0,0.0
2919042,0.0,1.0,7.0,1.0,17.0,1.0,9.0,379.0,345.0,281.0,1.0,0.0,92.0,0.0
84871971,2.0,0.0,6.0,0.0,17.0,11.0,5.0,488.0,377.0,257.0,1.0,0.0,92.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24531381,0.0,0.0,7.0,0.0,17.0,1.0,5.0,289.0,486.0,80.0,0.0,1.0,144.0,0.0
4663818,0.0,0.0,7.0,6.0,17.0,1.0,5.0,24.0,39.0,741.0,0.0,1.0,155.0,0.0
23397147,2.0,0.0,7.0,1.0,17.0,23.0,1.0,267.0,297.0,131.0,0.0,1.0,73.0,1.0
52161750,2.0,1.0,0.0,1.0,7.0,3.0,1.0,557.0,22.0,103.0,0.0,1.0,73.0,0.0


In [58]:
train_encoded.isna().sum()

race                     0
gender                   0
payer_code               0
admission_type           0
medical_specialty        0
discharge_disposition    0
admission_source         0
primary_diagnosis        0
secondary_diagnosis      0
additional_diagnosis     0
change_in_meds           0
diabetes_meds            0
medication               0
readmitted_binary        0
dtype: int64