In [None]:
from warnings import filterwarnings
filterwarnings(action='ignore')

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import plotly.express as px

import scipy.stats as stats
from scipy.stats import zscore

from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder
from sklearn.preprocessing import PolynomialFeatures,PowerTransformer,StandardScaler

from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector

from sklearn.metrics import classification_report,accuracy_score,precision_score,confusion_matrix
from sklearn.metrics import recall_score,f1_score,balanced_accuracy_score
from sklearn.metrics import precision_recall_curve

from sklearn.model_selection import StratifiedKFold,cross_val_score,train_test_split
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.impute import KNNImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,BaggingClassifier,VotingClassifier

from imblearn.over_sampling import SMOTE,SMOTEN,SMOTENC

import re

In [None]:
# Load the raw encounter table from a hard-coded Google Drive path
# (presumably a Colab Drive mount — the path must exist for this cell to run).
data = pd.read_csv('/content/drive/MyDrive/PROJECT/diabetic_data_original.csv')

In [None]:
# Work on a copy so the frame exactly as loaded stays available in `data`.
df = data.copy()

In [None]:
# Read the three id -> description lookup sheets in one pass over the workbook.
# Passing a list as sheet_name makes read_excel return {sheet name: DataFrame},
# so the file is opened and parsed once instead of three times.
mapping_sheets = pd.read_excel(
    '/content/drive/MyDrive/PROJECT/Mapping.xlsx',
    sheet_name=['Admission_type_id', 'Discharge_disposition', 'Admission_source_id'],
)
admission_type = mapping_sheets['Admission_type_id']
discharge_disposition = mapping_sheets['Discharge_disposition']
admission_source = mapping_sheets['Admission_source_id']

In [None]:
# Lookup tables (numeric id -> description text), one per mapping sheet.
# When an id occurs more than once, the last description wins.
mapping_1 = discharge_disposition.set_index('discharge_disposition_id')['description'].to_dict()
mapping_2 = admission_type.set_index('admission_type_id')['description'].to_dict()
mapping_3 = admission_source.set_index('admission_source_id')['description'].to_dict()

In [None]:
# Swap each numeric id column for its description text. Ids that are absent
# from the lookup become missing; the result uses pandas' 'string' dtype.
for id_column, id_lookup in (
    ('discharge_disposition_id', mapping_1),
    ('admission_type_id', mapping_2),
    ('admission_source_id', mapping_3),
):
    df[id_column] = df[id_column].map(id_lookup).astype('string')

In [None]:
# Treat the '?' placeholder as missing: every cell equal to '?' becomes NaN so
# that isnull()/fillna() and the imputer further down can see it.
df = df.replace('?',np.nan)

In [None]:
# Collapse the three outcome labels to a binary flag: only '<30' counts as
# 'Yes'; '>30' and 'NO' both become 'No'. Any value outside the dict becomes NaN.
# NOTE: from here on the column holds 'Yes'/'No', not the raw labels, so any
# later mapping keyed on '<30' / '>30' / 'NO' will no longer match.
df['readmitted'] = df['readmitted'].map({'<30':'Yes','>30':'No','NO':'No'})

In [None]:
# Spell out the diagnosis column names; all later cells use the long names.
df = df.rename(columns={'diag_1':'diagnosis_1','diag_2':'diagnosis_2','diag_3':'diagnosis_3'})

In [None]:
# Preview the first rows after id mapping, '?' replacement and renaming.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,,Not Mapped,Physician Referral,1,...,No,No,No,No,No,No,No,No,No,No
1,149190,55629189,Caucasian,Female,[10-20),,Emergency,Discharged to home,Emergency Room,3,...,No,Up,No,No,No,No,No,Ch,Yes,No
2,64410,86047875,AfricanAmerican,Female,[20-30),,Emergency,Discharged to home,Emergency Room,2,...,No,No,No,No,No,No,No,No,Yes,No
3,500364,82442376,Caucasian,Male,[30-40),,Emergency,Discharged to home,Emergency Room,2,...,No,Up,No,No,No,No,No,Ch,Yes,No
4,16680,42519267,Caucasian,Male,[40-50),,Emergency,Discharged to home,Emergency Room,1,...,No,Steady,No,No,No,No,No,Ch,Yes,No


In [None]:
# Patients who have visited multiple times
# (value_counts is computed once and reused for both the filter and the index)

visit_counts = df['patient_nbr'].value_counts()
both_multiple_visit = list(visit_counts[visit_counts > 1].index)
is_multi_visit = df['patient_nbr'].isin(both_multiple_visit)

# Patients who have not filled 'race' information & visited multiple times

race_multiple_but_null = df[is_multi_visit & df['race'].isnull()]
race_multiple_but_null_unique = race_multiple_but_null['patient_nbr'].unique()

# Patients who have filled 'race' information & visited multiple times

race_multiple_but_filled = df[is_multi_visit & df['race'].notna()]
race_multiple_but_filled_unique = race_multiple_but_filled['patient_nbr'].unique()

# Patients appearing in both groups: race is missing on at least one visit and
# present on another, so it can be copied across. Membership is tested against
# a set; testing against the ndarray would rescan it for every patient.
filled_patient_ids = set(race_multiple_but_filled_unique)
patients_imputation = [patient_nbr for patient_nbr in race_multiple_but_null_unique
                       if patient_nbr in filled_patient_ids]

print("Patients who have not filled 'race' in one of the visits among multiple visits :",
     len(patients_imputation))

Patients who have not filled 'race' in one of the visits among multiple visits : 127


In [None]:
# Rows that carry a race value, restricted to the patients in
# patients_imputation (those who also have a visit with race missing).
race_imputation_df = race_multiple_but_filled[race_multiple_but_filled['patient_nbr'].isin(patients_imputation)]
# patient_nbr -> race lookup. If a patient has several such rows, to_dict()
# keeps the race from the last one.
race_imputation_dict = race_imputation_df.set_index('patient_nbr')['race'].to_dict()

def bivariate_imputation(row,map_dict):
    """Fill a missing 'race' on one row from a per-patient lookup.

    Parameters
    ----------
    row : mapping-like with 'race' and 'patient_nbr' entries (e.g. a DataFrame
        row passed by ``df.apply(..., axis=1)``). It is modified in place.
    map_dict : dict mapping patient_nbr -> race.

    Returns
    -------
    The same ``row`` object; 'race' is overwritten only when it was missing
    and the patient is present in ``map_dict``.
    """
    # Nothing to do when the row already has a race value.
    if not pd.isna(row['race']):
        return row
    patient_id = row['patient_nbr']
    if patient_id in map_dict:
        row['race'] = map_dict[patient_id]
    return row

In [None]:
# Number of rows with a missing 'race' before the patient-level imputation

df['race'].isnull().sum()

2273

In [None]:
# Row-wise pass over the whole frame: bivariate_imputation fills a missing
# 'race' from race_imputation_dict when the patient is in it.
# NOTE(review): apply(axis=1) rebuilds every row, which is slow on a frame of
# this size and may alter column dtypes; later cells may rely on that side
# effect — confirm before replacing it with a vectorized fillna.
df = df.apply(bivariate_imputation,axis=1,args=[race_imputation_dict])

In [None]:
# Checking null values after imputation

df['race'].isnull().sum()

# The missing count drops from 2273 to 2129, i.e. 144 rows were filled
# (they belong to the 127 patients found earlier).
# The remaining nulls are left for the KNN imputer.

2129

In [None]:
# Relabel the target as 'Readmitted' (within 30 days) vs 'Not readmitted'.
# An earlier cell has already recoded the raw labels to 'Yes'/'No', so a mapping
# keyed only on '<30' / '>30' / 'NO' would turn the whole column into NaN.
# Both spellings are accepted, which also makes the cell safe to run on the
# raw column.
readmitted_labels = {
    '<30': 'Readmitted', '>30': 'Not readmitted', 'NO': 'Not readmitted',
    'Yes': 'Readmitted', 'No': 'Not readmitted',
}
df['readmitted'] = df['readmitted'].map(readmitted_labels)

In [None]:
# Integer-encode the target. LabelEncoder numbers the sorted distinct labels,
# so with both labels present 'Not readmitted' -> 0 and 'Readmitted' -> 1.
# The fitted encoder is not kept, so the codes cannot be inverted later.
df['readmitted'] = LabelEncoder().fit_transform(df['readmitted'])

In [None]:
# Fill missing gender with the most frequent value. Series.mode() returns a
# Series; passing it straight to fillna aligns on index labels and would fill
# at most the row labelled 0, so take the scalar first mode instead.
df['gender'] = df['gender'].fillna(df['gender'].mode().iloc[0])

In [None]:
# List every column before deciding which ones to drop.
df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'diagnosis_1', 'diagnosis_2', 'diagnosis_3', 'number_diagnoses',
       'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide',
       'glyburide', 'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose',
       'miglitol', 'troglitazone', 'tolazamide', 'examide', 'citoglipton',
       'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [None]:
# Identifier / sparse columns that are dropped regardless of the medicine columns.
drop_list_no_medicine = ['encounter_id', 'patient_nbr', 'weight', 'payer_code']

# Full drop list: the columns above followed by the excluded medicine columns.
drop_list = drop_list_no_medicine + [
    'acetohexamide',
    'troglitazone',
    'examide',
    'citoglipton',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]

In [None]:
# Remove the columns in drop_list. Re-running this cell raises a KeyError
# because the columns are already gone.
df = df.drop(columns=drop_list)

In [None]:
def admission_source_encoding(x):
    """Bucket an admission-source description into three groups.

    ``str(x)`` is split on whitespace. If the word 'Referral' is present the
    result is 'Referral'; otherwise, if the word 'Emergency' is present, the
    result is 'Emergency'; anything else (including missing values) is 'other'.
    Matching is case-sensitive and on whole words.
    """
    words = str(x).split()
    # Order matters: 'Referral' takes precedence over 'Emergency'.
    for keyword in ('Referral', 'Emergency'):
        if keyword in words:
            return keyword
    return 'other'
# Replace each admission-source description with its bucket from
# admission_source_encoding.
df.admission_source_id = df.admission_source_id.map(admission_source_encoding)

In [None]:
# Size of each admission-source bucket after encoding.
df.admission_source_id.value_counts()

Emergency    57494
Referral     30856
other        13416
Name: admission_source_id, dtype: int64

In [None]:
# Frequency of each admission type, with missing values counted as their own row.
df.admission_type_id.value_counts(dropna=False)

Emergency        53990
Elective         18869
Urgent           18480
NaN               5291
Not Available     4785
Not Mapped         320
Trauma Center       21
Newborn             10
Name: admission_type_id, dtype: int64

In [None]:
# Number of rows with a missing admission type.
df.admission_type_id.isnull().sum()

5291

In [None]:
# Overwrite the entries that are already null with np.nan.
# NOTE(review): presumably this normalises the missing-value marker (e.g. pd.NA
# left over from the 'string' dtype) before admission_type_encoding runs —
# confirm that it is still needed.
df.loc[df.admission_type_id.isnull(),'admission_type_id'] = np.nan

In [None]:
def admission_type_encoding(x):
    """Bucket an admission-type description.

    Returns
    -------
    'Not mentioned'
        when ``x`` is missing (NaN, None or pd.NA).
    'others'
        for 'Not Available', 'Not Mapped', 'Trauma Center' and 'Newborn'.
    x
        unchanged for every other value.
    """
    # The missing check must come first: `pd.NA in [...]` raises TypeError,
    # because comparing pd.NA with a string has no definite truth value.
    if pd.isna(x):
        return 'Not mentioned'
    if x in ('Not Available','Not Mapped','Trauma Center','Newborn'):
        return 'others'
    return x
# Replace each admission type with its bucket from admission_type_encoding.
df.admission_type_id = df.admission_type_id.map(admission_type_encoding)

In [None]:
# Size of each admission-type bucket after encoding.
df.admission_type_id.value_counts()

Emergency        53990
Elective         18869
Urgent           18480
Not mentioned     5291
others            5136
Name: admission_type_id, dtype: int64

In [None]:
# Frequency of each discharge-disposition description before grouping.
df.discharge_disposition_id.value_counts()

Discharged to home                                                                                           60234
Discharged/transferred to SNF                                                                                13954
Discharged/transferred to home with home health service                                                      12902
Discharged/transferred to another short term hospital                                                         2128
Discharged/transferred to another rehab fac including rehab units of a hospital .                             1993
Expired                                                                                                       1642
Discharged/transferred to another type of inpatient care institution                                          1184
Not Mapped                                                                                                     989
Discharged/transferred to ICF                                                   

In [None]:
# Cast to plain Python strings. Missing values turn into ordinary text here,
# so isnull() no longer flags them in this column.
df.discharge_disposition_id = df.discharge_disposition_id.astype(str)

In [None]:
def dishcharge_encoding(x):
    """Collapse a discharge-disposition description into a coarse category.

    Matching is case-insensitive and is done on ``str(x)``.

    Returns
    -------
    np.nan
        when the description mentions 'hospice' or 'expired'.
    'Discharge'
        when it contains 'discharged to home'.
    'Transfer'
        when it contains 'transfer'.
    'Other'
        for everything else.
    """
    text = str(x).lower()
    # let us remove patients who are transferred to 'Hospice' or 'expired'
    # The text has been lower-cased, so the keywords must be lower-case too;
    # a capitalised 'Expired' can never match.
    if 'hospice' in text or 'expired' in text:
        return np.nan
    if 'discharged to home' in text:
        return 'Discharge'
    if 'transfer' in text:
        return 'Transfer'
    return 'Other'
# Replace each discharge description with its category from dishcharge_encoding.
df.discharge_disposition_id = df.discharge_disposition_id.apply(dishcharge_encoding)

In [None]:
# Converting all strings to -1 to make attribute homogeneous
# Converting these columns to float64 datatype

diagnosis = ['diagnosis_1','diagnosis_2','diagnosis_3']
for diag in diagnosis:
    # Codes matching the pattern (a letter followed by at least one more
    # character) cannot be cast to float. na=False keeps missing values out of
    # the mask so they stay NaN.
    has_letter_code = df[diag].astype('string').str.contains('[A-Za-z].+[0-9]*', na=False)
    # Assign through df.loc in a single step: the chained form
    # df[diag].loc[mask] = ... may write to a temporary copy and leave df unchanged.
    df.loc[has_letter_code, diag] = -1
    df[diag] = df[diag].astype(np.float64)

In [None]:
def diagnosis_encoding(x):
    """Map a numeric diagnosis code to a diagnosis-group label.

    Parameters
    ----------
    x : number or missing value. -1 is the placeholder used for non-numeric codes.

    Returns
    -------
    np.nan when ``x`` is missing; otherwise the label of the first group whose
    inclusive range contains ``x`` or whose extra single code equals ``x``;
    'other' when no group matches (this includes -1).
    """
    if pd.isna(x):
        return np.nan

    # (label, low, high, extra codes) — bounds are inclusive on both ends.
    # NOTE(review): the 'Diabetes' range runs up to and including 251 —
    # confirm whether 251 is meant to be part of this group.
    code_groups = (
        ('Neoplasms', 140, 239, ()),
        ('circulatory', 390, 459, (785,)),
        ('Respiratory', 460, 519, (786,)),
        ('Digestive', 520, 579, (787,)),
        ('Diabetes', 250, 251, ()),
        ('Injury', 800, 1000, ()),
        ('Musculoskeletal', 710, 739, ()),
        ('Genitourinary', 580, 629, (788,)),
    )
    for label, low, high, extra_codes in code_groups:
        if low <= x <= high or x in extra_codes:
            return label
    return 'other'

In [None]:
# Preview only: show the encoded diagnosis_3 values without assigning them,
# so df is left unchanged by this cell.
df['diagnosis_3'].apply(diagnosis_encoding)

0                 NaN
1               other
2               other
3         circulatory
4            Diabetes
             ...     
101761    circulatory
101762      Digestive
101763          other
101764         Injury
101765      Digestive
Name: diagnosis_3, Length: 101766, dtype: object

In [None]:
# Apply diagnosis_encoding to the primary, secondary and additional secondary
# diagnosis columns, replacing each numeric code with its group label.

diagnosis = ['diagnosis_1','diagnosis_2','diagnosis_3']

for diag_column in diagnosis:
    df[diag_column] = df[diag_column].map(diagnosis_encoding)

In [None]:
# The ten most frequent medical specialties (missing values are not counted).
df.medical_specialty.value_counts().nlargest(10)

InternalMedicine              14635
Emergency/Trauma               7565
Family/GeneralPractice         7440
Cardiology                     5352
Surgery-General                3099
Nephrology                     1613
Orthopedics                    1400
Orthopedics-Reconstructive     1233
Radiologist                    1140
Pulmonology                     871
Name: medical_specialty, dtype: int64

In [None]:
def medical_speciality_encoding(x):
    """Group a medical-specialty name into a small set of categories.

    Returns
    -------
    'Missing'
        when ``x`` is missing.
    x
        unchanged when it is exactly one of 'InternalMedicine',
        'Emergency/Trauma', 'Family/GeneralPractice' or 'Cardiology'.
    'Surgery'
        when the name contains 'Surgery'.
    'Others'
        for every other specialty.
    """
    if pd.isna(x):
        return 'Missing'
    # Compare by equality. `x in 'SomeLiteral'` asks whether x is a substring
    # of the literal, and with the literal misspelled 'Family/GeneralPractise'
    # every 'Family/GeneralPractice' row fell through to 'Others'.
    kept_as_is = ('InternalMedicine', 'Emergency/Trauma',
                  'Family/GeneralPractice', 'Cardiology')
    if x in kept_as_is:
        return x
    if 'Surgery' in x:
        return 'Surgery'
    return 'Others'
# Replace each specialty with its group from medical_speciality_encoding.
df.medical_specialty = df.medical_specialty.apply(medical_speciality_encoding)

In [None]:
# Size of each specialty group after encoding, with nulls counted too.
df.medical_specialty.value_counts(dropna=False)

Missing             49949
Others              19234
InternalMedicine    14635
Emergency/Trauma     7565
Cardiology           5352
Surgery              5031
Name: medical_specialty, dtype: int64

In [None]:
# Columns that remain after the drop and the recoding steps.
df.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'medical_specialty', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'diagnosis_1', 'diagnosis_2', 'diagnosis_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin', 'change', 'diabetesMed',
       'readmitted'],
      dtype='object')

In [None]:
# Same admission-type bucket counts as displayed right after the encoding step.
df.admission_type_id.value_counts()

Emergency        53990
Elective         18869
Urgent           18480
Not mentioned     5291
others            5136
Name: admission_type_id, dtype: int64

## Final features

In [None]:
# Categorical patient / encounter descriptors (ordinal-encoded further down).
patient_info = ['race', 'gender', 'age', 'admission_type_id',
                'discharge_disposition_id','admission_source_id',
                'medical_specialty','diagnosis_1','diagnosis_2', 'diagnosis_3',
                'max_glu_serum', 'A1Cresult','change', 'diabetesMed']
print("patient_information features : ",len(patient_info))

# Same four columns as the first entries of drop_list; kept here for reference.
drop_list_info = ['encounter_id', 'patient_nbr','weight','payer_code']

# Numeric count / duration columns, used without encoding.
patient_info_numeric = ['time_in_hospital','num_lab_procedures',
                        'num_procedures','num_medications',
                        'number_outpatient','number_emergency',
                        'number_inpatient','number_diagnoses']
print("Numeric features : ",len(patient_info_numeric))

# Medication columns that are kept (ordinal-encoded further down).
features_medicine = ['metformin','repaglinide', 'nateglinide', 
                     'chlorpropamide', 'glimepiride', 'glipizide', 
                     'glyburide', 'tolbutamide','pioglitazone', 
                     'rosiglitazone', 'acarbose', 'miglitol','tolazamide',
                     'insulin','glyburide-metformin', 'glipizide-metformin']
print("Appropriate medicine features : ",len(features_medicine))                  

# Medication columns that were dropped (the medicine part of drop_list).
drop_list_medicine = ['acetohexamide','troglitazone','examide','citoglipton',
                      'glimepiride-pioglitazone','metformin-rosiglitazone',
                      'metformin-pioglitazone']

patient_information features :  14
Numeric features :  8
Appropriate medicine features :  16


In [None]:
# Column transformers that ordinal-encode the listed columns and pass all other
# columns through unchanged. The step is named 'labelencoder' although the
# estimator is an OrdinalEncoder. Neither object is used in the cells shown here.
trf1 = ColumnTransformer([('labelencoder',OrdinalEncoder(),patient_info)],remainder='passthrough')

trf2 = ColumnTransformer([('labelencoder',OrdinalEncoder(),features_medicine)],remainder='passthrough')

## Final Encoding

In [None]:
# Ordinal-encode the categorical patient columns; the result is a plain array
# with one column per entry of patient_info. The fitted encoder is kept so the
# learned categories remain available.
patient_info_encoder = OrdinalEncoder()
encoding1 = patient_info_encoder.fit_transform(df[patient_info])

In [None]:
# Sanity check: (rows, number of patient_info columns).
encoding1 .shape

(101766, 14)

In [None]:
# Ordinal-encode the kept medication columns with a separate encoder.
medicine_encoder = OrdinalEncoder()
encoding2 = medicine_encoder.fit_transform(df[features_medicine])

In [None]:
# Sanity check: (rows, number of features_medicine columns).
encoding2.shape

(101766, 16)

In [None]:
# Assemble the modelling matrix: ordinal codes for the categorical and
# medication columns plus the numeric columns taken straight from df.
# The encoders return bare arrays, so the frames built from them are given
# df's index explicitly. Without it they get a fresh 0..n-1 index, and
# concat(axis=1) would misalign rows (introducing NaNs) whenever df's index is
# not the default range.
final_df = pd.concat(
    [
        pd.DataFrame(encoding1, columns=patient_info, index=df.index),
        pd.DataFrame(encoding2, columns=features_medicine, index=df.index),
        df[patient_info_numeric],
    ],
    axis=1,
)

In [None]:
# Sanity check: row count unchanged, 14 + 16 + 8 = 38 columns.
final_df.shape

(101766, 38)

In [None]:
# One-hot encoded version of the same feature set; drop_first=True omits the
# first level of every categorical column.
# NOTE(review): not referenced again in the cells shown here.
df_with_dummies = pd.get_dummies(pd.concat([df[patient_info],df[features_medicine],df[patient_info_numeric]],axis=1),drop_first=True)

## KNN-Imputer

In [None]:
# KNN imputer that fills each missing entry from its 25 nearest rows.
Imputer = KNNImputer(n_neighbors=25)

In [None]:
# Impute the remaining NaNs in the encoded matrix and restore the column names
# (fit_transform returns a bare array; the new frame gets a default index).
# Imputed category codes can be fractional, which is why later cells round them.
df_modeling = pd.DataFrame(Imputer.fit_transform(final_df),columns=final_df.columns)

In [None]:
# Display the imputed frame (pandas truncates the output to head and tail rows).
df_modeling

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,medical_specialty,diagnosis_1,diagnosis_2,diagnosis_3,...,glyburide-metformin,glipizide-metformin,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,2.0,0.0,0.0,2.0,1.0,1.0,4.0,0.0,6.84,6.04,...,1.0,0.0,1.0,41.0,0.0,1.0,0.0,0.0,0.0,1.0
1,2.0,0.0,1.0,1.0,0.0,0.0,3.0,8.0,0.00,8.00,...,1.0,0.0,3.0,59.0,0.0,18.0,0.0,0.0,0.0,9.0
2,0.0,0.0,2.0,1.0,0.0,0.0,3.0,8.0,0.00,8.00,...,1.0,0.0,2.0,11.0,5.0,13.0,2.0,0.0,1.0,6.0
3,2.0,1.0,3.0,1.0,0.0,0.0,3.0,8.0,0.00,7.00,...,1.0,0.0,2.0,44.0,1.0,16.0,0.0,0.0,0.0,7.0
4,2.0,1.0,4.0,1.0,0.0,0.0,3.0,5.0,5.00,0.00,...,1.0,0.0,1.0,51.0,0.0,8.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,0.0,1.0,7.0,1.0,2.0,0.0,3.0,0.0,8.00,7.00,...,1.0,0.0,3.0,51.0,0.0,16.0,0.0,0.0,0.0,9.0
101762,0.0,0.0,8.0,1.0,2.0,2.0,3.0,1.0,8.00,1.00,...,1.0,0.0,5.0,33.0,3.0,18.0,0.0,0.0,1.0,9.0
101763,2.0,1.0,7.0,1.0,0.0,0.0,3.0,8.0,2.00,8.00,...,1.0,0.0,1.0,53.0,0.0,9.0,1.0,0.0,0.0,13.0
101764,2.0,0.0,8.0,3.0,2.0,0.0,5.0,3.0,8.00,3.00,...,1.0,0.0,10.0,45.0,2.0,21.0,0.0,0.0,1.0,9.0


In [None]:
# Persist the imputed matrix to Drive. Written with the default index=True, so
# the row index is saved as an extra unnamed first column.
df_modeling.to_csv('/content/drive/MyDrive/PROJECT/KNN25_imputed.csv')

In [None]:
# Round the (possibly fractional) imputed codes to whole numbers and count
# how many rows fall on each medical_specialty code.
df_modeling.medical_specialty.round(0).value_counts()

2.0    47880
3.0    30823
1.0    12387
0.0     5520
4.0     5156
Name: medical_specialty, dtype: int64

In [None]:
# Same check for diagnosis_1: rounded code frequencies after imputation.
df_modeling.diagnosis_1.round(0).value_counts()

7.0    30439
8.0    18157
6.0    14427
1.0     9475
0.0     8772
3.0     6977
2.0     5117
4.0     4962
5.0     3440
Name: diagnosis_1, dtype: int64

In [None]:
!pip install fancyimpute

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
Collecting knnimpute>=0.1.0
  Downloading knnimpute-0.1.0.tar.gz (8.3 kB)
Collecting nose
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[K     |████████████████████████████████| 154 kB 11.3 MB/s 
Building wheels for collected packages: fancyimpute, knnimpute
  Building wheel for fancyimpute (setup.py) ... [?25l[?25hdone
  Created wheel for fancyimpute: filename=fancyimpute-0.7.0-py3-none-any.whl size=29899 sha256=ac8f63aea4805bed899a2970acb7dcbacba17ec61741b1246586ef04177a094a
  Stored in directory: /root/.cache/pip/wheels/e3/04/06/a1a7d89ef4e631ce6268ea2d8cde04f7290651c1ff1025ce68
  Building wheel for knnimpute (setup.py) ... [?25l[?25hdone
  Created wheel for knnimpute: filename=knnimpute-0.1.0-py3-none-any.whl size=11353 sha256=f6d36e0fbbac7ce977d1afe763ec129d7d3fb0c4b1fa827205359ea0b7e57f7e
  Stored in dir

In [None]:
# A bare `from fancyimpute import` with no name is a SyntaxError and stops a
# Run-All. Import the package as a whole until the needed imputer is chosen.
# TODO(review): name the specific class; nothing from fancyimpute is used in
# the cells shown here.
import fancyimpute

In [None]:
# Column names the KNN imputer saw during fit, in order.
Imputer.feature_names_in_

array(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id',
       'medical_specialty', 'diagnosis_1', 'diagnosis_2', 'diagnosis_3',
       'max_glu_serum', 'A1Cresult', 'change', 'diabetesMed', 'metformin',
       'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
       'glipizide', 'glyburide', 'tolbutamide', 'pioglitazone',
       'rosiglitazone', 'acarbose', 'miglitol', 'tolazamide', 'insulin',
       'glyburide-metformin', 'glipizide-metformin', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient',
       'number_diagnoses'], dtype=object)