In [532]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import scipy.stats as stats
from scipy.stats import pointbiserialr
from scipy.stats import chi2_contingency
#wrapper methods
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import f1_score


from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#scaler methods
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# embedded methods
from sklearn.linear_model import LassoCV

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingClassifier

# Importing the datasets

In [533]:
sample_submission = pd.read_csv('https://raw.githubusercontent.com/Sebastiao199/Project_Group18_ML/main/sample_submission.csv')
sample_submission.head()

Unnamed: 0,encounter_id,readmitted_binary
0,499502,No
1,447319,No
2,309126,Yes
3,181183,Yes
4,359339,No


In [534]:
test = pd.read_csv('https://raw.githubusercontent.com/Sebastiao199/Project_Group18_ML/main/test.csv', na_values=['?', ''], index_col='encounter_id')
test.head()

Unnamed: 0_level_0,country,patient_id,race,gender,age,weight,payer_code,outpatient_visits_in_previous_year,emergency_visits_in_previous_year,inpatient_visits_in_previous_year,...,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds_during_hospitalization,prescribed_diabetes_meds,medication
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
499502,USA,103232799,Caucasian,Male,[80-90),,HM,0,0,0,...,14,491.0,414.0,250,6,,,No,Yes,['metformin']
447319,USA,93395304,Caucasian,Male,[20-30),,HM,0,0,1,...,7,250.13,70.0,794,7,>300,,No,No,[]
309126,USA,6281586,AfricanAmerican,Male,[30-40),,,0,0,0,...,12,786.0,250.6,536,6,,,No,Yes,['insulin']
181183,USA,67381308,Caucasian,Male,[50-60),,BC,0,0,0,...,16,820.0,873.0,E884,9,,,Ch,Yes,"['metformin', 'glyburide', 'insulin']"
359339,USA,71670204,Caucasian,Male,[60-70),,,0,0,0,...,10,599.0,427.0,414,9,,,No,Yes,['metformin']


In [535]:
test.drop(['country','weight'], axis=1, inplace=True)

In [536]:
test.isna().sum()

patient_id                                   0
race                                      2191
gender                                       0
age                                       1531
payer_code                               12055
outpatient_visits_in_previous_year           0
emergency_visits_in_previous_year            0
inpatient_visits_in_previous_year            0
admission_type                            1585
medical_specialty                        15027
average_pulse_bpm                            0
discharge_disposition                     1101
admission_source                          2063
length_of_stay_in_hospital                   0
number_lab_tests                             0
non_lab_procedures                           0
number_of_medications                        0
primary_diagnosis                            5
secondary_diagnosis                         96
additional_diagnosis                       415
number_diagnoses                             0
glucose_test_

In [537]:
test['race'].fillna('Unknown', inplace=True)
test['medical_specialty'].fillna('Unknown', inplace=True)
test['admission_source'].fillna('Unknown', inplace=True)
test['payer_code'].fillna('None', inplace=True)
test['glucose_test_result'].fillna('None', inplace=True)
test['a1c_test_result'].fillna('None', inplace=True)
test['admission_type'].fillna('Unknown', inplace=True)
test['age'].fillna('Unknown', inplace=True)
test['discharge_disposition'].fillna('Unknown', inplace=True)
test['primary_diagnosis'].fillna('Unknown', inplace=True)
test['secondary_diagnosis'].fillna('Unknown', inplace=True)
test['additional_diagnosis'].fillna('Unknown', inplace=True)

In [538]:
test.isna().sum()

patient_id                               0
race                                     0
gender                                   0
age                                      0
payer_code                               0
outpatient_visits_in_previous_year       0
emergency_visits_in_previous_year        0
inpatient_visits_in_previous_year        0
admission_type                           0
medical_specialty                        0
average_pulse_bpm                        0
discharge_disposition                    0
admission_source                         0
length_of_stay_in_hospital               0
number_lab_tests                         0
non_lab_procedures                       0
number_of_medications                    0
primary_diagnosis                        0
secondary_diagnosis                      0
additional_diagnosis                     0
number_diagnoses                         0
glucose_test_result                      0
a1c_test_result                          0
change_in_m

In [539]:
new_column_names = {'outpatient_visits_in_previous_year': 'outpatient_visits', 
                    'emergency_visits_in_previous_year': 'emergency_visits',
                    'inpatient_visits_in_previous_year': 'inpatient_visits',
                    'change_in_meds_during_hospitalization': 'change_in_meds',
                    'length_of_stay_in_hospital': 'length_of_stay',
                    'prescribed_diabetes_meds': 'diabetes_meds'
                   }
test = test.rename(columns=new_column_names)

In [540]:
test['length_of_stay']

encounter_id
499502     3
447319    10
309126     2
181183     4
359339     1
          ..
451150     4
549298     2
327586     2
351214    14
914270     3
Name: length_of_stay, Length: 30530, dtype: int64

In [541]:
test['race'].value_counts()

race
Caucasian          21617
AfricanAmerican     5535
Unknown             2191
Hispanic             563
Other                437
Asian                187
Name: count, dtype: int64

In [542]:
unknown = train[train['race']== 'Unknown']['patient_id'].unique()

known_race = train[train['patient_id'].isin(unknown) & train['race']!= 'Unknown']

In [543]:
def fill_race(df):
    for i, row in df.iterrows():
        if row['race'] == 'Unknown':
            # Check if there are any other rows with the same 'person_id' and 'race' not equal to 'Unknown'
            other_rows = df[(df['patient_id'] == row['patient_id']) & (df['race'] != 'Unknown')]
            if not other_rows.empty:
                # If such a row is found, replace the 'race' of the current row with the 'race' of the found row
                df.at[i, 'race'] = other_rows.iloc[0]['race']
    return df

fill_race(test)

Unnamed: 0_level_0,patient_id,race,gender,age,payer_code,outpatient_visits,emergency_visits,inpatient_visits,admission_type,medical_specialty,...,number_of_medications,primary_diagnosis,secondary_diagnosis,additional_diagnosis,number_diagnoses,glucose_test_result,a1c_test_result,change_in_meds,diabetes_meds,medication
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
499502,103232799,Caucasian,Male,[80-90),HM,0,0,0,Emergency,Unknown,...,14,491,414,250,6,,,No,Yes,['metformin']
447319,93395304,Caucasian,Male,[20-30),HM,0,0,1,Emergency,Unknown,...,7,250.13,70,794,7,>300,,No,No,[]
309126,6281586,AfricanAmerican,Male,[30-40),,0,0,0,Emergency,Pulmonology,...,12,786,250.6,536,6,,,No,Yes,['insulin']
181183,67381308,Caucasian,Male,[50-60),BC,0,0,0,Not Available,Unknown,...,16,820,873,E884,9,,,Ch,Yes,"['metformin', 'glyburide', 'insulin']"
359339,71670204,Caucasian,Male,[60-70),,0,0,0,Emergency,InternalMedicine,...,10,599,427,414,9,,,No,Yes,['metformin']
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451150,85707450,Caucasian,Male,[70-80),HM,0,0,0,Emergency,Unknown,...,11,584,276,276,9,,,No,No,[]
549298,328887,Caucasian,Male,[60-70),,0,0,4,Emergency,Family/GeneralPractice,...,19,428,250.83,276,9,,,No,Yes,['insulin']
327586,21915459,Caucasian,Female,[80-90),,0,0,0,Emergency,Nephrology,...,22,996,403,250,6,,,No,Yes,['insulin']
351214,17823546,Caucasian,Male,Unknown,MC,0,0,0,Urgent,InternalMedicine,...,21,153,780,285,5,,>7,Ch,Yes,"['glipizide', 'pioglitazone', 'insulin']"


In [544]:
replace_race = {
    'Unknown': 'Other',
    'Hispanic': 'Other',
    'Asian': 'Other'}

test['race'] = train['race'].replace(replace_race)

In [545]:
test['race'].value_counts()

race
Caucasian          21617
AfricanAmerican     5535
Other               1187
Name: count, dtype: int64

In [546]:
test['medication'] = test['medication'].str.replace('[]', 'no medication')

In [547]:
test['number_prescriptions'] = test['medication'].apply(lambda x: 0 if x=='no medication' else len((x.replace("[", "").replace("]", "")).split(',')))

In [548]:
#no medication = 0 | medication taken=1
test['medication'] = np.where(test['medication']== 'no medication',0,1)

In [549]:
test['medication'].value_counts(normalize=True)

medication
1    0.76885
0    0.23115
Name: proportion, dtype: float64

In [550]:
# pd.set_option('display.max_colwidth', None)
test[test['number_prescriptions']==5]['medication']

encounter_id
990126    1
252924    1
707897    1
915560    1
353917    1
965253    1
763262    1
658521    1
821858    1
410586    1
954243    1
319387    1
991505    1
760667    1
440154    1
413789    1
182499    1
Name: medication, dtype: int64

In [551]:
data = test['age']
df = pd.DataFrame(data)

# Extract the lower and upper age values as separate columns
df[['Lower_Age', 'Upper_Age']] = df['age'].str.extract(r'\[(\d+)-(\d+)\)')

df['Lower_Age'] = pd.to_numeric(df['Lower_Age'], errors='coerce', downcast='integer')
df['Upper_Age'] = pd.to_numeric(df['Upper_Age'], errors='coerce', downcast='integer')


df['Midpoint_Age'] = average_age = np.where(df['Lower_Age'].notna() & df['Upper_Age'].notna(),
                       (df['Lower_Age'] + df['Upper_Age']) / 2,
                       np.nan)

test['Midpoint_Age'] = df['Midpoint_Age']
test.drop('age', axis=1, inplace=True)

In [552]:
test['gender'].value_counts()

gender
Female    16480
Male      14050
Name: count, dtype: int64

In [553]:
#Female = 0 Male = 1 and the Unknown/Invalid becomes Female because it is the mode
test['gender_binary'] = np.where(test['gender']== 'Male',1,0)
test.drop('gender', axis=1, inplace=True)

In [554]:
#No = 0 Yes = 1
test['diabetes_meds_binary'] = np.where(test['diabetes_meds']== 'Yes',1,0)
test.drop('diabetes_meds', axis=1, inplace=True)

In [555]:
#no change = 0 change = 1
test['change_in_meds_binary'] = np.where(test['change_in_meds']== 'Ch',1,0)
test.drop('change_in_meds', axis=1, inplace=True)

In [556]:
n_categorical_columns= len(test.select_dtypes(exclude=np.number).columns)
categorical_columns = list(test.select_dtypes(exclude=np.number).columns)

print('There are',n_categorical_columns, 'columns with categorical values:')
print()
print(categorical_columns)

There are 11 columns with categorical values:

['race', 'payer_code', 'admission_type', 'medical_specialty', 'discharge_disposition', 'admission_source', 'primary_diagnosis', 'secondary_diagnosis', 'additional_diagnosis', 'glucose_test_result', 'a1c_test_result']


In [557]:
value_counts = test['payer_code'].value_counts(normalize=True)
replace_payer_code = value_counts[value_counts < 0.05].index

test['payer_code'] = test['payer_code'].replace(replace_payer_code, 'Other_code')
test['payer_code'].value_counts(normalize=True)

payer_code
None          0.394858
MC            0.319555
Other_code    0.221553
HM            0.064035
Name: proportion, dtype: float64

In [558]:
test["admission_type"].value_counts()

admission_type
Emergency        16248
Elective          5658
Urgent            5456
Unknown           1585
Not Available     1465
Not Mapped         106
Trauma Center        8
Newborn              4
Name: count, dtype: int64

In [559]:
test[test["admission_type"]=='Newborn'][['Midpoint_Age',"admission_type"]]

Unnamed: 0_level_0,Midpoint_Age,admission_type
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1
783279,85.0,Newborn
221436,65.0,Newborn
366149,65.0,Newborn
276317,65.0,Newborn


In [560]:
test[test["admission_type"]=='Trauma Center'][['Midpoint_Age',"admission_type"]]

Unnamed: 0_level_0,Midpoint_Age,admission_type
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1
403796,65.0,Trauma Center
192244,95.0,Trauma Center
231841,65.0,Trauma Center
438710,55.0,Trauma Center
249717,85.0,Trauma Center
990687,45.0,Trauma Center
777137,15.0,Trauma Center
733152,55.0,Trauma Center


In [561]:
value_counts = test['admission_type'].value_counts(normalize=True)
replace_admission_type = value_counts[value_counts < 0.05].index

test['admission_type'] = test['admission_type'].replace('Unknown', 'Other_type')
test['admission_type'] = test['admission_type'].replace(replace_admission_type, 'Other_type')
test['admission_type'].value_counts(normalize=True)

admission_type
Emergency     0.532198
Elective      0.185326
Urgent        0.178709
Other_type    0.103767
Name: proportion, dtype: float64

In [562]:
surgery = test[test['medical_specialty'].str.contains('Surgery|Surgeon|Reconstructive')]
test.loc[surgery.index, 'medical_specialty'] = 'Surgery'

pediatric = test[test['medical_specialty'].str.contains('Pediatrics|Pediatric')]
test.loc[pediatric.index, 'medical_specialty'] = 'Pediatric'

test['medical_specialty'].value_counts(normalize=True)

medical_specialty
Unknown                                 0.492204
InternalMedicine                        0.142254
Emergency/Trauma                        0.073567
Family/GeneralPractice                  0.072814
Surgery                                 0.063020
Cardiology                              0.053587
Nephrology                              0.015624
Orthopedics                             0.014609
Radiologist                             0.010580
Pulmonology                             0.008516
Psychiatry                              0.008385
Urology                                 0.006780
ObstetricsandGynecology                 0.006649
Gastroenterology                        0.005470
Pediatric                               0.005241
PhysicalMedicineandRehabilitation       0.003636
Oncology                                0.003439
Neurology                               0.002129
Hematology/Oncology                     0.001769
Endocrinology                           0.001277
Ot

In [563]:
value_counts = test['medical_specialty'].value_counts(normalize=True)
replace_medical_specialty = value_counts[value_counts < 0.05].index

test['medical_specialty'] = test['medical_specialty'].replace('Unknown', 'Other_specialty')
test['medical_specialty'] = test['medical_specialty'].replace(replace_medical_specialty, 'Other_specialty')
test['medical_specialty'].value_counts(normalize=True)

medical_specialty
Other_specialty           0.594759
InternalMedicine          0.142254
Emergency/Trauma          0.073567
Family/GeneralPractice    0.072814
Surgery                   0.063020
Cardiology                0.053587
Name: proportion, dtype: float64

In [564]:
test['medical_specialty'] = test['medical_specialty'].str.strip().str.replace('/', '_')

In [565]:
test['discharge_disposition'].value_counts(normalize=True)

discharge_disposition
Discharged to home                                                                                           0.588863
Discharged/transferred to SNF                                                                                0.136718
Discharged/transferred to home with home health service                                                      0.127645
Unknown                                                                                                      0.036063
Discharged/transferred to another short term hospital                                                        0.020963
Discharged/transferred to another rehab fac including rehab units of a hospital .                            0.019653
Expired                                                                                                      0.016607
Discharged/transferred to another type of inpatient care institution                                         0.011857
Not Mapped                        

In [566]:
expired = test[test['discharge_disposition'].str.contains('Expired')]
test.loc[expired.index, 'discharge_disposition'] = 'Expired'

hospice = test[test['discharge_disposition'].str.contains('Hospice')]
test.loc[hospice.index, 'discharge_disposition'] = 'Hospice'

another_medical_facility = ['Discharged/transferred to SNF', 'Discharged/transferred to another short term hospital', 'Discharged/transferred to another rehab fac including rehab units of a hospital', 'Discharged/transferred to another type of inpatient care institution', 'Discharged/transferred to a long term care hospital', 'Discharged/transferred to a federal health care facility','Neonate discharged to another hospital for neonatal aftercare']
test['discharge_disposition'] = test['discharge_disposition'].replace(another_medical_facility, 'Transferred_to_another_medical_facility')

outpatient = ['Discharged/transferred/referred to this institution for outpatient services','Discharged/transferred/referred another institution for outpatient services']
test['discharge_disposition'] = test['discharge_disposition'].replace(outpatient, 'Discharged_transferred_referred_another_this_institution_for_outpatient_services')

home = test[test['discharge_disposition'].str.contains('home')]
test.loc[home.index, 'discharge_disposition'] = 'Discharged_to_home'

other = ['Discharged/transferred to another rehab fac including rehab units of a hospital','Discharged/transferred to ICF','Hospice',
         'Left AMA','Discharged/transferred to a long term care hospital.','Discharged/transferred/referred to a psychiatric hospital of psychiatric distinct part unit of a hospital',
         'Discharged/transferred within this institution to Medicare approved swing bed','Discharged/transferred to a nursing facility certified under Medicaid but not certified under Medicare.',
         'Discharged/transferred/referred another/this institution for outpatient services',
         'Discharged/transferred to a federal health care facility.','Discharged/transferred to another rehab fac including rehab units of a hospital .','Unknown',
         'Admitted as an inpatient to this hospital', 'Still patient or expected to return for outpatient services']

test['discharge_disposition'] = test['discharge_disposition'].replace(other, 'Other')


test['discharge_disposition'] = test['discharge_disposition'].replace('Not Mapped', 'Unknown')

test['discharge_disposition'].value_counts(normalize=True)

discharge_disposition
Discharged_to_home                                                                  0.717393
Transferred_to_another_medical_facility                                             0.169669
Other                                                                               0.085752
Expired                                                                             0.016705
Unknown                                                                             0.010154
Discharged_transferred_referred_another_this_institution_for_outpatient_services    0.000328
Name: proportion, dtype: float64

In [567]:
unknown = [' Not Mapped',' Not Available']
test['admission_source'] = test['admission_source'].replace(unknown, 'Unknown')
test['admission_source'].value_counts(normalize=True)

admission_source
 Emergency Room                                               0.562561
 Physician Referral                                           0.291091
Unknown                                                       0.070554
Transfer from a hospital                                      0.031346
 Transfer from another health care facility                   0.022994
Clinic Referral                                               0.010645
 Transfer from a Skilled Nursing Facility (SNF)               0.008516
HMO Referral                                                  0.001900
 Court/Law Enforcement                                        0.000164
 Transfer from hospital inpt/same fac reslt in a sep claim    0.000131
Normal Delivery                                               0.000033
 Transfer from critial access hospital                        0.000033
 Extramural Birth                                             0.000033
Name: proportion, dtype: float64

In [568]:
test['admission_source'].unique()

array([' Emergency Room', ' Physician Referral', 'Unknown',
       ' Transfer from another health care facility',
       'Transfer from a hospital',
       ' Transfer from a Skilled Nursing Facility (SNF)',
       'Clinic Referral', ' Court/Law Enforcement', 'HMO Referral',
       ' Transfer from hospital inpt/same fac reslt in a sep claim',
       'Normal Delivery', ' Transfer from critial access hospital',
       ' Extramural Birth'], dtype=object)

In [569]:
test['admission_source'] = test['admission_source'].str.strip().str.replace(' ', '_')

In [570]:
test['admission_source'].unique()

array(['Emergency_Room', 'Physician_Referral', 'Unknown',
       'Transfer_from_another_health_care_facility',
       'Transfer_from_a_hospital',
       'Transfer_from_a_Skilled_Nursing_Facility_(SNF)',
       'Clinic_Referral', 'Court/Law_Enforcement', 'HMO_Referral',
       'Transfer_from_hospital_inpt/same_fac_reslt_in_a_sep_claim',
       'Normal_Delivery', 'Transfer_from_critial_access_hospital',
       'Extramural_Birth'], dtype=object)

In [571]:
value_counts = test['admission_source'].value_counts(normalize=True)
replace_admission_source = value_counts[value_counts < 0.05].index

test['admission_source'] = test['admission_source'].replace(replace_admission_source, 'Other_source')
test['admission_source'] = test['admission_source'].replace('Unknown', 'Other_source')
test['admission_source'].value_counts(normalize=True)

admission_source
Emergency_Room        0.562561
Physician_Referral    0.291091
Other_source          0.146348
Name: proportion, dtype: float64

In [572]:
def diagosis_types_division(x):
    try:
        x = int(x)
        if x <= 139 and x>=1:
            return 'Infectious_and_parasitic_diseases'
        elif x <= 239 and x>=140:
            return 'Neoplasms'
        elif x <= 279 and x>=240:
            return 'Endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders'
        elif x <= 289 and x>=280:
            return 'Diseases_of_the_blood_and_blood_forming_organs'
        elif x <= 319 and x>=290:
            return 'Mental_disorders'
        elif x <= 389 and x>=320:
            return 'Diseases_of_the_nervous_system_and_sense_organs'
        elif x <= 459 and x>=390:
            return 'Diseases_of_the_circulatory_system'
        elif x <= 519 and x>=460:
            return 'Diseases_of_the_respiratory_system'
        elif x <= 579 and x>=520:
            return 'Diseases_of_the_digestive_system'
        elif x <= 629 and x>=580:
            return 'Diseases_of_the_genitourinary_system'
        elif x <= 679 and x>=630:
            return 'Complications_of_pregnancy_childbirth_and_the_puerperium'
        elif x <= 709 and x>=680:
            return 'Diseases_of_the_skin_and_subcutaneous_tissue'
        elif x <= 739 and x>=710:
            return 'Diseases_of_the_musculoskeletal_system_and_connective_tissue'
        elif x <= 759 and x>=740:
            return 'Congenital_anomalies'
        elif x <= 779 and x>=760:
            return 'Certain_conditions_originating_in_the_perinatal_period'
        elif x <= 799 and x>=780:
            return 'Symptoms_signs_and_ill_defined_conditions'
        elif x <= 999 and x>=800:
            return 'Injury_and_poisoning'
        else:
            return 'External_causes_of_injury_and_supplemental_classification'
    except ValueError:
        return 'External_causes_of_injury_and_supplemental_classification'
    
    
test['primary_diagnosis_types'] = test['primary_diagnosis'].apply(diagosis_types_division)
test['secondary_diagnosis_types'] = test['secondary_diagnosis'].apply(diagosis_types_division)
test['additional_diagnosis_types'] = test['additional_diagnosis'].apply(diagosis_types_division)
test.drop(['primary_diagnosis','secondary_diagnosis','additional_diagnosis'], axis=1, inplace=True)

In [573]:
value_counts = test['primary_diagnosis_types'].value_counts(normalize=True)
replace_primary_diagnosis = value_counts[value_counts < 0.05].index

test['primary_diagnosis_types'] = test['primary_diagnosis_types'].replace(replace_primary_diagnosis, 'Other_type')
test['primary_diagnosis_types'].value_counts(normalize=True)

primary_diagnosis_types
Diseases_of_the_circulatory_system                           0.298493
Other_type                                                   0.266459
Diseases_of_the_respiratory_system                           0.102064
External_causes_of_injury_and_supplemental_classification    0.100950
Diseases_of_the_digestive_system                             0.089486
Symptoms_signs_and_ill_defined_conditions                    0.073993
Injury_and_poisoning                                         0.068556
Name: proportion, dtype: float64

In [574]:
value_counts = test['secondary_diagnosis_types'].value_counts(normalize=True)
replace_primary_diagnosis = value_counts[value_counts < 0.05].index

test['secondary_diagnosis_types'] = test['secondary_diagnosis_types'].replace(replace_primary_diagnosis, 'Other_type')
test['secondary_diagnosis_types'].value_counts(normalize=True)

secondary_diagnosis_types
Diseases_of_the_circulatory_system                                     0.306780
Other_type                                                             0.279758
Endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders    0.140550
Diseases_of_the_respiratory_system                                     0.100197
External_causes_of_injury_and_supplemental_classification              0.094792
Diseases_of_the_genitourinary_system                                   0.077923
Name: proportion, dtype: float64

In [575]:
value_counts = test['additional_diagnosis_types'].value_counts(normalize=True)
replace_primary_diagnosis = value_counts[value_counts < 0.05].index

test['additional_diagnosis_types'] = test['additional_diagnosis_types'].replace(replace_primary_diagnosis, 'Other_type')
test['additional_diagnosis_types'].value_counts(normalize=True)

additional_diagnosis_types
Diseases_of_the_circulatory_system                                     0.293449
Other_type                                                             0.252964
Endocrine_nutritional_and_metabolic_diseases_and_immunity_disorders    0.205306
External_causes_of_injury_and_supplemental_classification              0.120439
Diseases_of_the_respiratory_system                                     0.066983
Diseases_of_the_genitourinary_system                                   0.060858
Name: proportion, dtype: float64

In [576]:
test['glucose_test_result'].value_counts(normalize=True)

glucose_test_result
None    0.945693
Norm    0.025909
>300    0.014314
>200    0.014085
Name: proportion, dtype: float64

In [577]:
test['a1c_test_result'].value_counts(normalize=True)

a1c_test_result
None    0.832886
>8      0.082247
Norm    0.048706
>7      0.036161
Name: proportion, dtype: float64

In [578]:
test.columns

Index(['patient_id', 'race', 'payer_code', 'outpatient_visits',
       'emergency_visits', 'inpatient_visits', 'admission_type',
       'medical_specialty', 'average_pulse_bpm', 'discharge_disposition',
       'admission_source', 'length_of_stay', 'number_lab_tests',
       'non_lab_procedures', 'number_of_medications', 'number_diagnoses',
       'glucose_test_result', 'a1c_test_result', 'medication',
       'number_prescriptions', 'Midpoint_Age', 'gender_binary',
       'diabetes_meds_binary', 'change_in_meds_binary',
       'primary_diagnosis_types', 'secondary_diagnosis_types',
       'additional_diagnosis_types'],
      dtype='object')

In [579]:
type(test['Midpoint_Age'].iloc[0])

numpy.float64

In [580]:
def label_pulse(row):
    if row['Midpoint_Age']==5.0:
        if row['average_pulse_bpm'] >= 70 and row['average_pulse_bpm'] <= 130:
            return 1
        else:
            return 0
    elif row['Midpoint_Age']<= 45.0 and row['Midpoint_Age'] >= 25.0:
        if row['average_pulse_bpm'] >= 60 and row['average_pulse_bpm'] <= 110:
            return 1
        else:
            return 0
    elif row['Midpoint_Age']==55.0:
        if row['average_pulse_bpm'] >= 60 and row['average_pulse_bpm'] <= 100:
            return 1
        else:
            return 0
    elif row['Midpoint_Age']>=65.0:
        if row['average_pulse_bpm'] >= 60 and row['average_pulse_bpm'] <= 95:
            return 1
        else:
            return 0
    else:
        if row['average_pulse_bpm'] >= 60 and row['average_pulse_bpm'] <= 100:
            return 1
        else:
            return 0

test['is_normal_pulse'] = test.apply(lambda row: label_pulse(row), axis=1)

In [581]:
test['is_normal_pulse'].value_counts(normalize=True)

is_normal_pulse
0    0.511464
1    0.488536
Name: proportion, dtype: float64

In [582]:
test['diabetes_meds_binary'].value_counts()

diabetes_meds_binary
1    23473
0     7057
Name: count, dtype: int64

In [583]:
test['change_in_meds_binary'].value_counts()

change_in_meds_binary
0    16429
1    14101
Name: count, dtype: int64

In [584]:
test['non_lab_procedures'].value_counts(normalize=True)

non_lab_procedures
0    0.459220
1    0.203374
2    0.125156
3    0.092663
6    0.048346
4    0.041009
5    0.030233
Name: proportion, dtype: float64

In [591]:
test.drop(['patient_id'], axis=1, inplace=True)

## Encoding categrical variables

In [592]:
enc1 = OrdinalEncoder() 

In [593]:
columns_to_encode = ['race', 'payer_code', 'admission_type', 'medical_specialty','discharge_disposition', 'admission_source',
                    'primary_diagnosis_types', 'secondary_diagnosis_types', 'additional_diagnosis_types', 'glucose_test_result', 
                     'a1c_test_result']

other_columns_test = pd.DataFrame(test.drop(columns_to_encode, axis=1))

In [594]:
# Perform one-hot encoding on the specified columns
test_encoded = pd.get_dummies(test[columns_to_encode])
test_encoded = test_encoded.astype(int)

test_encoded = pd.concat([test_encoded, other_columns_test], axis=1)
test_encoded.head()

Unnamed: 0_level_0,race_AfricanAmerican,race_Caucasian,race_Other,payer_code_HM,payer_code_MC,payer_code_None,payer_code_Other_code,admission_type_Elective,admission_type_Emergency,admission_type_Other_type,...,non_lab_procedures,number_of_medications,number_diagnoses,medication,number_prescriptions,Midpoint_Age,gender_binary,diabetes_meds_binary,change_in_meds_binary,is_normal_pulse
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
499502,0,1,0,1,0,0,0,0,1,0,...,0,14,6,1,1,85.0,1,1,0,0
447319,0,1,0,1,0,0,0,0,1,0,...,1,7,7,0,0,25.0,1,0,0,1
309126,1,0,0,0,0,1,0,0,1,0,...,3,12,6,1,1,35.0,1,1,0,1
181183,0,1,0,0,0,0,1,0,0,1,...,1,16,9,1,3,55.0,1,1,1,0
359339,0,1,0,0,0,1,0,0,1,0,...,0,10,9,1,1,65.0,1,1,0,0


## Imputing age

In [595]:
known_age = test_encoded[test_encoded['Midpoint_Age'].notnull()]
unknown_age = test_encoded[test_encoded['Midpoint_Age'].isnull()]

# Reset indices
known_age = known_age.reset_index(drop=True)
unknown_age = unknown_age.reset_index(drop=True)

# Create the training data for the model
X = known_age.drop(['Midpoint_Age'], axis=1)
y = known_age['Midpoint_Age']

# Fit the model
regr = RandomForestRegressor(random_state=0, n_estimators=200, n_jobs=-1)
regr.fit(X, y)

# Predict the missing ages
predicted_ages = regr.predict(unknown_age.drop(['Midpoint_Age'], axis=1))

# Fill in the missing values using the original indices
test_encoded.loc[test_encoded['Midpoint_Age'].isnull(), 'Midpoint_Age'] = predicted_ages

In [596]:
test_encoded.isna().sum()

race_AfricanAmerican     0
race_Caucasian           0
race_Other               0
payer_code_HM            0
payer_code_MC            0
                        ..
Midpoint_Age             0
gender_binary            0
diabetes_meds_binary     0
change_in_meds_binary    0
is_normal_pulse          0
Length: 69, dtype: int64

In [597]:
test_encoded.head()

Unnamed: 0_level_0,race_AfricanAmerican,race_Caucasian,race_Other,payer_code_HM,payer_code_MC,payer_code_None,payer_code_Other_code,admission_type_Elective,admission_type_Emergency,admission_type_Other_type,...,non_lab_procedures,number_of_medications,number_diagnoses,medication,number_prescriptions,Midpoint_Age,gender_binary,diabetes_meds_binary,change_in_meds_binary,is_normal_pulse
encounter_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
499502,0,1,0,1,0,0,0,0,1,0,...,0,14,6,1,1,85.0,1,1,0,0
447319,0,1,0,1,0,0,0,0,1,0,...,1,7,7,0,0,25.0,1,0,0,1
309126,1,0,0,0,0,1,0,0,1,0,...,3,12,6,1,1,35.0,1,1,0,1
181183,0,1,0,0,0,0,1,0,0,1,...,1,16,9,1,3,55.0,1,1,1,0
359339,0,1,0,0,0,1,0,0,1,0,...,0,10,9,1,1,65.0,1,1,0,0
