In [499]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from scipy.stats import entropy
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

In [385]:
# Notebook display settings: show at most 10 rows of any frame (longer frames
# are truncated with "..."), but never hide columns.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)

# ID to description

In [279]:
# Load the id -> description lookup file. It holds more than one table stacked
# vertically, so the header row of a later table is read as an ordinary data
# row (see the 'discharge_disposition_id, description' row in the output below).
data = pd.read_csv('./IDs_mapping.csv')

In [280]:
# Inspect the raw mapping table (truncated to 10 rows by the display option).
data

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
4,5,Not Available
5,6,
6,7,Trauma Center
7,8,Not Mapped
8,,
9,discharge_disposition_id,description


In [214]:
# Drop every row with a missing value: this removes the blank separator rows
# between the stacked tables as well as ids that have no description.
data = data.dropna()

In [215]:
# Renumber rows 0..n-1 after the dropna so positional slicing is contiguous;
# the old index is discarded rather than kept as a column.
data = data.reset_index(drop=True)

In [148]:
# View only: hide rows whose description contains 'Not' or 'Invalid'
# (the pattern is a regex alternation). `data` itself is not modified.
data.loc[~data['description'].str.contains('Not|Invalid')]

Unnamed: 0,admission_type_id,description
0,1,Emergency
1,2,Urgent
2,3,Elective
3,4,Newborn
5,7,Trauma Center
7,discharge_disposition_id,description
8,1,Discharged to home
9,2,Discharged/transferred to another short term h...
10,3,Discharged/transferred to SNF
11,4,Discharged/transferred to ICF


In [193]:
# One (initially empty) id -> description lookup per id column.
id2desc = {
    section: {}
    for section in ('admission_type_id', 'discharge_disposition_id', 'admission_source_id')
}

In [245]:
# Positions 0-6 of the cleaned table are the admission_type_id entries
# (position 7 is the embedded 'discharge_disposition_id' header row, see the
# filtered view above). The cut-off is hard-coded, so it is only valid after
# the dropna + index reset cells have run on the file as shipped.
a = data.iloc[:7].to_dict('records')

In [248]:
# Positions 8-36: rows following the embedded 'discharge_disposition_id'
# header at position 7. Hard-coded bounds.
# NOTE(review): presumably position 37 is the next embedded header row - confirm.
b = data.iloc[8:37].to_dict('records')

In [250]:
# Everything from position 38 onwards; position 37 is skipped.
# NOTE(review): presumably these are the admission_source_id rows and position
# 37 is their embedded header - confirm against the csv.
c = data.iloc[38:].to_dict('records')

In [251]:
# Fill the three lookups. Every record is keyed by 'admission_type_id' because
# that is the column name pandas took from the first header line of the file,
# regardless of which sub-table the record came from.
id2desc.update(
    admission_type_id={rec['admission_type_id']: rec['description'] for rec in a},
    discharge_disposition_id={rec['admission_type_id']: rec['description'] for rec in b},
    admission_source_id={rec['admission_type_id']: rec['description'] for rec in c},
)

In [267]:
# Persist the lookup tables as JSON next to the notebook.
with open('./id2desc.json', 'w') as f:
    f.write(json.dumps(id2desc))

In [269]:
# Read the lookup tables back to check the round trip.
with open('./id2desc.json', 'r') as f:
    id2de = json.loads(f.read())

In [278]:
# JSON object keys are always strings, so ids are looked up as '1', not 1.
# .get() returns None for an unknown id.
id2de['discharge_disposition_id'].get('1')

'Discharged to home'

# Data

In [520]:
# Load the encounter-level table.
# NOTE(review): later cells rely on the literal string 'None' being a category
# of max_glu_serum / A1Cresult. pandas >= 2.0 parses 'None' as NaN by default,
# which would need keep_default_na=False here - confirm against the pandas
# version in use.
data2 = pd.read_csv('./diabetic_data.csv')

In [521]:
# Remove the columns that are not used in the rest of the notebook.
data2 = data2.drop(
    columns=[
        'weight',
        'payer_code',
        'medical_specialty',
        'encounter_id',
        'patient_nbr',
        'readmitted',
    ]
)

In [522]:
# Column-name accumulators, filled by the dtype loop in the next cell.
num_col, cat_col = [], []

In [523]:
# Sort every column into num_col (int64 dtype) or cat_col (anything else).
for col in data2.columns:
    (num_col if data2[col].dtype == 'int64' else cat_col).append(col)

In [524]:
# Integer-typed columns. Note that the three *_id columns are integer-coded
# categories; they are listed under `nominal` further down.
num_col

['admission_type_id',
 'discharge_disposition_id',
 'admission_source_id',
 'time_in_hospital',
 'num_lab_procedures',
 'num_procedures',
 'num_medications',
 'number_outpatient',
 'number_emergency',
 'number_inpatient',
 'number_diagnoses']

In [525]:
# All remaining (non-int64) columns.
cat_col

['race',
 'gender',
 'age',
 'diag_1',
 'diag_2',
 'diag_3',
 'max_glu_serum',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'acetohexamide',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone',
 'change',
 'diabetesMed']

In [526]:
# Names of the columns that contain the '?' marker (filled by the next cell).
col_miss = []

In [527]:
# Collect the non-integer columns in which at least one value contains '?'.
for col in cat_col:
    # regex=False matches the question mark literally; the former pattern '\?'
    # is an invalid string escape (a warning today, an error in future Python).
    # na=False counts NaN cells as "no marker" so .any() sees plain booleans.
    if data2[col].str.contains('?', regex=False, na=False).any():
        col_miss.append(col)

In [528]:
# Columns in which the '?' marker was found.
col_miss

['race', 'diag_1', 'diag_2', 'diag_3']

In [529]:
# Row mask: True where any `col_miss` column contains the '?' marker.
# Built by OR-ing the per-column masks directly instead of eval() on generated
# source text: the eval form fails for column names that are not valid Python
# identifiers, raises SyntaxError when `col_miss` is empty, and embeds the
# invalid escape '\?'. Starting from an all-False Series makes the empty case
# a no-op; regex=False matches '?' literally and na=False keeps the mask
# strictly boolean.
idx_miss = pd.Series(False, index=data2.index)
for c in col_miss:
    idx_miss |= data2[c].str.contains('?', regex=False, na=False)

In [530]:
# Keep only the rows that are not flagged in idx_miss.
data2 = data2.loc[~idx_miss]

In [531]:
# Names of near-constant columns (filled by the entropy loop in the next cell).
low_ent_col = []

In [532]:
# Flag columns whose value distribution is almost degenerate: compute the
# Shannon entropy (scipy default: natural log) of the value counts and keep
# the column name when it falls below 0.1.
for c in data2.columns:
    counts = np.unique(data2[c], return_counts=True)[1]
    if entropy(counts) < 1e-1:
        low_ent_col.append(c)

In [533]:
# Columns whose entropy fell below the threshold.
low_ent_col

['repaglinide',
 'nateglinide',
 'chlorpropamide',
 'acetohexamide',
 'tolbutamide',
 'acarbose',
 'miglitol',
 'troglitazone',
 'tolazamide',
 'examide',
 'citoglipton',
 'glyburide-metformin',
 'glipizide-metformin',
 'glimepiride-pioglitazone',
 'metformin-rosiglitazone',
 'metformin-pioglitazone']

In [534]:
# Remove the near-constant columns collected in low_ent_col.
data2 = data2.drop(columns=low_ent_col)

In [535]:
# Columns that remain after the drops above.
data2.columns

Index(['race', 'gender', 'age', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'glimepiride', 'glipizide', 'glyburide', 'pioglitazone',
       'rosiglitazone', 'insulin', 'change', 'diabetesMed'],
      dtype='object')

In [536]:
# Feature groups used by the encoding cells below.

# Count-like integer features.
numerical = [
    'time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications',
    'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses',
]

# Label-encoded features. diag_1..diag_3 MUST stay the last three entries:
# the shared-LabelEncoder cells select them positionally via `ordinal[-3:]`.
# With 'max_glu_serum' and 'A1Cresult' appended at the end, that slice picked
# diag_3, max_glu_serum and A1Cresult instead, leaving diag_1/diag_2 un-encoded
# and overwriting the two lab-result columns.
ordinal = ['age', 'max_glu_serum', 'A1Cresult', 'diag_1', 'diag_2', 'diag_3']

# One-hot encoded features (includes the integer-coded *_id columns).
nominal = [
    'race', 'gender', 'admission_type_id', 'discharge_disposition_id',
    'admission_source_id', 'change', 'diabetesMed', 'metformin', 'glimepiride',
    'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'insulin',
]

In [565]:
# Sanity check: the three groups together should account for every column of
# data2 (28 columns are listed by data2.columns above). A different number in
# the saved output means this cell was last run before the lists were edited.
len(numerical) + len(ordinal) + len(nominal)

25

In [537]:
# Preview the columns grouped as ordinal.
data2[ordinal]

Unnamed: 0,age,diag_1,diag_2,diag_3
1,[10-20),276,250.01,255
2,[20-30),648,250,V27
3,[30-40),8,250.43,403
4,[40-50),197,157,250
5,[50-60),414,411,250
...,...,...,...,...
101761,[70-80),250.13,291,458
101762,[80-90),560,276,787
101763,[70-80),38,590,296
101764,[80-90),996,285,998


In [538]:
# Preview the columns grouped as nominal.
data2[nominal]

Unnamed: 0,race,gender,admission_type_id,discharge_disposition_id,admission_source_id,change,diabetesMed,metformin,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin
1,Caucasian,Female,1,1,7,Ch,Yes,No,No,No,No,No,No,Up
2,AfricanAmerican,Female,1,1,7,No,Yes,No,No,Steady,No,No,No,No
3,Caucasian,Male,1,1,7,Ch,Yes,No,No,No,No,No,No,Up
4,Caucasian,Male,1,1,7,Ch,Yes,No,No,Steady,No,No,No,Steady
5,Caucasian,Male,2,1,2,No,Yes,No,No,No,No,No,No,Steady
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101761,AfricanAmerican,Male,1,3,7,Ch,Yes,Steady,No,No,No,No,No,Down
101762,AfricanAmerican,Female,1,4,5,No,Yes,No,No,No,No,No,No,Steady
101763,Caucasian,Male,1,1,7,Ch,Yes,Steady,No,No,No,No,No,Down
101764,Caucasian,Female,2,3,7,Ch,Yes,No,No,Steady,No,Steady,No,Up


In [540]:
# Preview the columns grouped as numerical.
data2[numerical]

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient
1,3,59,0,18,0,0,0
2,2,11,5,13,2,0,1
3,2,44,1,16,0,0,0
4,1,51,0,8,0,0,0
5,3,31,6,16,0,0,0
...,...,...,...,...,...,...,...
101761,3,51,0,16,0,0,0
101762,5,33,3,18,0,0,1
101763,1,53,0,9,1,0,0
101764,10,45,2,21,0,0,1


In [541]:
# A single LabelEncoder instance, re-fitted by several cells below; after each
# fit, `le.classes_` reflects only the most recent one.
le = LabelEncoder()

In [542]:
# Replace the age labels with integer codes. LabelEncoder assigns codes in
# sorted order of the distinct labels.
data2['age'] = le.fit_transform(data2['age'])

In [544]:
# Fit one encoder on the pooled values of the last three `ordinal` columns so
# that an identical value gets the same integer in each of them.
# The selection is positional: it relies on the intended columns being the
# final three entries of `ordinal`.
pooled_values = pd.concat([data2[name] for name in ordinal[-3:]])
le.fit(pooled_values)

In [545]:
# Number of distinct values seen by the most recent fit of `le`.
len(le.classes_)

912

In [547]:
# Encode each of the last three `ordinal` columns with the encoder that was
# fitted on their pooled values (positional selection, same as in the fit).
data2 = data2.assign(**{c: le.transform(data2[c]) for c in ordinal[-3:]})

In [558]:
# One-hot encode the nominal columns; all other columns are carried over as is.
# NOTE(review): the glu / A1C mappings further down are only displayed, never
# written back to data2, so data3 holds max_glu_serum and A1Cresult in their
# un-mapped form - confirm this is intended.
data3 = pd.get_dummies(data2, columns=nominal)

In [569]:
# List the distinct max_glu_serum labels (fit returns the encoder itself).
# This re-fits `le`, replacing whatever it was fitted on before.
le.fit(data2['max_glu_serum']).classes_

array(['>200', '>300', 'None', 'Norm'], dtype=object)

In [570]:
# List the distinct A1Cresult labels (fit returns the encoder itself).
# This re-fits `le`, replacing whatever it was fitted on before.
le.fit(data2['A1Cresult']).classes_

array(['>7', '>8', 'None', 'Norm'], dtype=object)

In [590]:
# Hand-written codes for the two lab-result columns, replacing the
# alphabetical order a LabelEncoder would give: 0 for 'None', 1 for 'Norm',
# then the two elevated levels in increasing order.
glu = {
    'None': 0,
    'Norm': 1,
    '>200': 2,
    '>300': 3,
}
A1C = {
    'None': 0,
    'Norm': 1,
    '>7': 2,
    '>8': 3,
}

In [591]:
# Preview only: distribution of max_glu_serum after applying the `glu` codes.
# The result is not stored back into data2.
data2['max_glu_serum'].map(glu).value_counts()

0    92845
1     2532
2     1449
3     1227
Name: max_glu_serum, dtype: int64

In [592]:
# Preview only: distribution of A1Cresult after applying the `A1C` codes.
# The result is not stored back into data2.
data2['A1Cresult'].map(A1C).value_counts()

0    81860
3     7631
1     4854
2     3708
Name: A1Cresult, dtype: int64