In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy.stats import entropy
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, MinMaxScaler

In [8]:
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

# ID to description

In [279]:
data = pd.read_csv('./IDs_mapping.csv')

In [280]:
data

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped
8,,
9,discharge_disposition_id,description


In [214]:
data = data.dropna()

In [215]:
data = data.reset_index().drop('index', axis=1)

In [148]:
data[~data.description.str.contains('Not|Invalid')]

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
5,7,Trauma Center
7,discharge_disposition_id,description
8,1,Discharged to home
9,2,Discharged/transferred to another short term h...
10,3,Discharged/transferred to SNF
11,4,Discharged/transferred to ICF


In [193]:
id2desc = {'admission_type_id': {}, 'discharge_disposition_id': {}, 'admission_source_id': {}}

In [245]:
a = data.iloc[:7].to_dict('records')

In [248]:
b = data.iloc[8:37].to_dict('records')

In [250]:
c = data.iloc[38:].to_dict('records')

In [251]:
id2desc['admission_type_id'] = {d['admission_type_id']: d['description'] for d in a}
id2desc['discharge_disposition_id'] = {d['admission_type_id']: d['description'] for d in b}
id2desc['admission_source_id'] = {d['admission_type_id']: d['description'] for d in c}

In [267]:
with open('./id2desc.json', 'w') as f:
    json.dump(id2desc, f)

In [269]:
with open('./id2desc.json', 'r') as f:
    id2de = json.load(f)

In [278]:
id2de['discharge_disposition_id'].get('1', None)

'Discharged to home'

# Data

In [9]:
data2 = pd.read_csv('./diabetic_data.csv')

In [10]:
data2 = data2.drop(['weight', 'payer_code', 'medical_specialty', 'encounter_id', 'patient_nbr'], axis=1)

In [11]:
num_col = []
cat_col = []

In [12]:
for col in data2.columns:
    if data2[col].dtype == 'int64':
        num_col.append(col)
    else:
        cat_col.append(col)

In [13]:
num_col

['admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses']

In [14]:
cat_col

['race',
 'gender',
 'age',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed',
 'readmitted']

In [15]:
col_miss = []

In [16]:
for col in cat_col:
    if data2[col].str.contains('\?').any():
        col_miss.append(col)

In [17]:
col_miss

['race', 'diag_1', 'diag_2', 'diag_3']

In [18]:
idx_miss = eval('|'.join([f"data2.{c}.str.contains('\?')" for c in col_miss]))

In [19]:
data2 = data2[~idx_miss]

In [20]:
low_ent_col = []

In [21]:
for c in data2.columns:

    value,counts = np.unique(data2[c], return_counts=True)
    ent = entropy(counts)
#     print(f'entropy of {c} is {ent}')
    if ent < 1e-1:
        low_ent_col.append(c)
    
#     if len(data2[c].value_counts())==1:
#         print(c)

In [22]:
low_ent_col

['repaglinide',
 'nateglinide',
 'chlorpropamide',
 'acetohexamide',
 'tolbutamide',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone']

In [23]:
data2 = data2.drop(low_ent_col, axis=1)

In [24]:
data2.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [25]:
data2

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,change,diabetesMed,readmitted
1,Caucasian,Female,[10-20),1,1,7,3,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,Up,Ch,Yes,>30
2,AfricanAmerican,Female,[20-30),1,1,7,2,11,5,13,2,0,1,648,250,V27,6,,,No,No,Steady,No,No,No,No,No,Yes,NO
3,Caucasian,Male,[30-40),1,1,7,2,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,Up,Ch,Yes,NO
4,Caucasian,Male,[40-50),1,1,7,1,51,0,8,0,0,0,197,157,250,5,,,No,No,Steady,No,No,No,Steady,Ch,Yes,NO
5,Caucasian,Male,[50-60),2,1,2,3,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,Steady,No,Yes,>30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,[70-80),1,3,7,3,51,0,16,0,0,0,250.13,291,458,9,,>8,Steady,No,No,No,No,No,Down,Ch,Yes,>30
101762,AfricanAmerican,Female,[80-90),1,4,5,5,33,3,18,0,0,1,560,276,787,9,,,No,No,No,No,No,No,Steady,No,Yes,NO
101763,Caucasian,Male,[70-80),1,1,7,1,53,0,9,1,0,0,38,590,296,13,,,Steady,No,No,No,No,No,Down,Ch,Yes,NO
101764,Caucasian,Female,[80-90),2,3,7,10,45,2,21,0,0,1,996,285,998,9,,,No,No,Steady,No,Steady,No,Up,Ch,Yes,NO


In [600]:
numerical = ['time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']
ordinal = ['age', 'diag_1',
       'diag_2', 'diag_3', 'max_glu_serum', 'A1Cresult']
nominal = ['race', 'gender', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id','change', 'diabetesMed','metformin', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin']

In [602]:
data2[ordinal]

Unnamed: 0,age,diag_1,diag_2,diag_3,max_glu_serum,A1Cresult
1,1,165,110,146,,
2,2,497,109,883,,
3,3,613,128,288,,
4,4,66,36,109,,
5,5,296,293,109,,
...,...,...,...,...,...,...
101761,7,116,180,336,,>8
101762,8,422,165,600,,
101763,7,267,450,185,,
101764,8,776,174,778,,


In [603]:
data2[nominal]

Unnamed: 0,race,gender,admission_type_id,discharge_disposition_id,admission_source_id,change,diabetesMed,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin
1,Caucasian,Female,1,1,7,Ch,Yes,No,No,No,No,No,No,Up
2,AfricanAmerican,Female,1,1,7,No,Yes,No,No,Steady,No,No,No,No
3,Caucasian,Male,1,1,7,Ch,Yes,No,No,No,No,No,No,Up
4,Caucasian,Male,1,1,7,Ch,Yes,No,No,Steady,No,No,No,Steady
5,Caucasian,Male,2,1,2,No,Yes,No,No,No,No,No,No,Steady
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,1,3,7,Ch,Yes,Steady,No,No,No,No,No,Down
101762,AfricanAmerican,Female,1,4,5,No,Yes,No,No,No,No,No,No,Steady
101763,Caucasian,Male,1,1,7,Ch,Yes,Steady,No,No,No,No,No,Down
101764,Caucasian,Female,2,3,7,Ch,Yes,No,No,Steady,No,Steady,No,Up


In [604]:
data2[numerical]

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
1,3,59,0,18,0,0,0,9
2,2,11,5,13,2,0,1,6
3,2,44,1,16,0,0,0,7
4,1,51,0,8,0,0,0,5
5,3,31,6,16,0,0,0,9
...,...,...,...,...,...,...,...,...
101761,3,51,0,16,0,0,0,9
101762,5,33,3,18,0,0,1,9
101763,1,53,0,9,1,0,0,13
101764,10,45,2,21,0,0,1,9


In [26]:
le = LabelEncoder()

In [33]:
data2.readmitted = le.fit_transform(data2.readmitted)

In [36]:
label = data2.readmitted

In [40]:
label.to_csv('./label.csv', index=None)

In [542]:
data2.age = le.fit_transform(data2.age)

In [544]:
le.fit(pd.concat([data2[c] for c in ordinal[-3:]]))

In [545]:
len(le.classes_)

912

In [547]:
for c in ordinal[-3:]:
    data2[c] = le.transform(data2[c])

In [590]:
glu = {'None':0, 'Norm':1, '>200':2, '>300':3}
A1C = {'>7':2, '>8':3, 'None':0, 'Norm':1}

In [605]:
data2.max_glu_serum = data2.max_glu_serum.map(glu)

In [606]:
data2.A1Cresult = data2.A1Cresult.map(A1C)

In [607]:
data3 = pd.get_dummies(data2, columns=nominal)

In [618]:
data3

Unnamed: 0,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,gender_Unknown/Invalid,admission_type_id_1,admission_type_id_2,admission_type_id_3,admission_type_id_4,admission_type_id_5,admission_type_id_6,admission_type_id_7,admission_type_id_8,discharge_disposition_id_1,discharge_disposition_id_2,discharge_disposition_id_3,discharge_disposition_id_4,discharge_disposition_id_5,discharge_disposition_id_6,discharge_disposition_id_7,discharge_disposition_id_8,discharge_disposition_id_9,discharge_disposition_id_10,discharge_disposition_id_11,discharge_disposition_id_12,discharge_disposition_id_13,discharge_disposition_id_14,discharge_disposition_id_15,discharge_disposition_id_16,discharge_disposition_id_17,discharge_disposition_id_18,discharge_disposition_id_19,discharge_disposition_id_20,discharge_disposition_id_22,discharge_disposition_id_23,discharge_disposition_id_24,discharge_disposition_id_25,discharge_disposition_id_27,discharge_disposition_id_28,admission_source_id_1,admission_source_id_2,admission_source_id_3,admission_source_id_4,admission_source_id_5,admission_source_id_6,admission_source_id_7,admission_source_id_8,admission_source_id_9,admission_source_id_10,admission_source_id_11,admission_source_id_13,admission_source_id_14,admission_source_id_17,admission_source_id_20,admission_source_id_22,admission_source_id_25,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,metformin_Down,metformin_No,metformin_Steady,metformin_Up,glimepiride_Down,glimepiride_No,glimepiride_Steady,glimepiride_Up,glipizide_Down,glipizide_No,glipizide_Steady,glipizide_Up,glyburide_Down,glyburide_No,glyburide_Steady,glyburide_Up,pioglitazone_Down,pioglitazone_No,pioglitazone_Steady,pioglitazone_Up,rosiglitazone_Down,rosiglitazone_No,rosiglitazone_Steady,rosiglitazone_Up,insulin_Down,insulin_No,insulin_Steady,insulin_Up
1,1,3,59,0,18,0,0,0,165,110,146,9,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1
2,2,2,11,5,13,2,0,1,497,109,883,6,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0
3,3,2,44,1,16,0,0,0,613,128,288,7,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1
4,4,1,51,0,8,0,0,0,66,36,109,5,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0
5,5,3,31,6,16,0,0,0,296,293,109,9,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,7,3,51,0,16,0,0,0,116,180,336,9,0,3,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0
101762,8,5,33,3,18,0,0,1,422,165,600,9,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,1,0
101763,7,1,53,0,9,1,0,0,267,450,185,13,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0,0,0
101764,8,10,45,2,21,0,0,1,776,174,778,9,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,1


In [620]:
data2.to_csv('./processed_data.csv')

In [613]:
scaler = MinMaxScaler()

In [617]:
scaler.fit_transform(data3).shape

(98053, 105)