In [101]:
import pandas as pd
import numpy as np

In [176]:
# Read the cleaned diabetes dataset into the working frame `csv`.
# NOTE(review): recent pandas releases appear to treat the literal string
# 'None' as a missing value by default - confirm that the 'None' category of
# max_glu_serum / A1Cresult survives loading with the installed version.
csv = pd.read_csv(filepath_or_buffer='Diabetes_clean.csv')

In [177]:
# load mapping
# `dicts` maps a column name to its {raw category value -> integer code} table.
dicts = dict()

# race: codes 1..n, numbered in order of first appearance in the data so the
# numbering is identical on every run (iteration order of a set of strings
# is not stable between interpreter sessions).
races = csv['race'].drop_duplicates().tolist()
dicts['race'] = {label: code for code, label in enumerate(races, start=1)}
raced = dicts['race']

# gender
dicts['gender'] = {
    'Female': 2,
    'Male': 1,
    'Unknown/Invalid': 0,
}
genderd = dicts['gender']

# age: ten-year brackets coded 1..10 in ascending order
dicts['age'] = {
    '[0-10)': 1,
    '[10-20)': 2,
    '[20-30)': 3,
    '[30-40)': 4,
    '[40-50)': 5,
    '[50-60)': 6,
    '[60-70)': 7,
    '[70-80)': 8,
    '[80-90)': 9,
    '[90-100)': 10,
}
aged = dicts['age']

# max_glu_serum
dicts['max_glu_serum'] = {
    'None': 0,
    'Norm': 1,
    '>200': 2,
    '>300': 3,
}
mgsd = dicts['max_glu_serum']

# A1Cresult
dicts['A1Cresult'] = {
    'None': 0,
    'Norm': 1,
    '>7': 2,
    '>8': 3,
}
A1Cd = dicts['A1Cresult']

# change
dicts['change'] = {
    'No': 1,
    'Ch': 2,
}
chd = dicts['change']

# diabetesMed
dicts['diabetesMed'] = {
    'No': 1,
    'Yes': 2,
}
dmd = dicts['diabetesMed']

# readmitted
dicts['readmitted'] = {
    'NO': 1,
    '<30': 2,
    '>30': 3,
}
red = dicts['readmitted']

# others: generic table; the encoding cell falls back to it for every column
# that has no dedicated entry in `dicts`
dicts['other'] = {
    'No': 0,
    'Down': 1,
    'Steady': 2,
    'Up': 3,
}
otherd = dicts['other']

# diags: one shared code table (0..n-1) built from the distinct values of ALL
# three diagnose columns. The previous version concatenated diagnose2 twice and
# never looked at diagnose3, so values occurring only there received no code.
diags = pd.concat(
    [csv['diagnose1'], csv['diagnose2'], csv['diagnose3']],
    ignore_index=True,
).drop_duplicates().tolist()
dicts['diagnose'] = {value: code for code, value in enumerate(diags)}
diagd = dicts['diagnose']

# The three diagnose columns deliberately share the very same dict object.
dicts['diagnose1'] = diagd
dicts['diagnose2'] = diagd
dicts['diagnose3'] = diagd

In [178]:
# remove irrelevant columns
csv = csv.drop(columns=['diag_1', 'diag_2', 'diag_3', 'diag1', 'diag2', 'diag3'])

In [179]:
# update values to numeric
# Columns skipped by the loop below because they are kept exactly as loaded
# (presumably already numeric - verify against the CSV).
keepcols = ['encounter_id', 'patient_nbr', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'num_lab_procedures', 'num_procedures',
       'num_medications', 'number_outpatient', 'number_emergency',
       'number_inpatient', 'number_diagnoses']
# Columns skipped here as well; they are one-hot encoded in a later cell.
onehotcols = ['race', 'gender', 'change', 'diabetesMed', 'diagnose1', 'diagnose2', 'diagnose3']

for c in csv.columns:
    print(c)
    if c in keepcols or c in onehotcols:
        continue
    # Columns without a dedicated table use the generic No/Down/Steady/Up codes.
    dic = dicts.get(c, dicts['other'])

    # Vectorised lookup: values missing from the table come back as NaN.
    # No table contains NaN as a code, so NaN here always means "unmapped".
    codes = csv[c].map(dic)
    unmapped = codes.isna()
    if unmapped.any():
        # Same exception type as a plain dict lookup would raise, but naming
        # the column and the offending values.
        bad_values = csv.loc[unmapped, c].drop_duplicates().tolist()
        raise KeyError('column {!r} contains values with no code: {!r}'.format(c, bad_values))
    csv[c] = codes.astype('int64')


encounter_id
patient_nbr
race
gender
age
admission_type_id
discharge_disposition_id
admission_source_id
time_in_hospital
num_lab_procedures
num_procedures
num_medications
number_outpatient
number_emergency
number_inpatient
number_diagnoses
max_glu_serum
A1Cresult
metformin
repaglinide
nateglinide
chlorpropamide
glimepiride
acetohexamide
glipizide
glyburide
tolbutamide
pioglitazone
rosiglitazone
acarbose
miglitol
troglitazone
tolazamide
insulin
glyburide.metformin
glipizide.metformin
glimepiride.pioglitazone
metformin.rosiglitazone
metformin.pioglitazone
change
diabetesMed
readmitted
diagnose1
diagnose2
diagnose3


In [180]:
# update one hot coding
onehotcols = ['race', 'gender', 'change', 'diabetesMed', 'diagnose1', 'diagnose2', 'diagnose3']

# One indicator frame per categorical column, each prefixed with the name of
# the column it was derived from.
dfs = [pd.get_dummies(csv[col], prefix=col, drop_first=False) for col in onehotcols]

# Replace the raw categorical columns with their indicator columns: the
# remaining original columns come first, then the indicator frames in
# `onehotcols` order.
csv = pd.concat([csv.drop(onehotcols, axis=1)] + dfs, axis=1)

In [181]:
# Adjust output: move the 'readmitted' column to the last position
leading_cols = [col for col in csv.columns if col != 'readmitted']
csv = csv[leading_cols + ['readmitted']]

In [186]:
# Drop encounter_id, patient_nbr and glimepiride.pioglitazone, then write the
# result to disk without the index column.
csv = csv.drop(columns=['encounter_id', 'patient_nbr', 'glimepiride.pioglitazone'])
csv.to_csv(path_or_buf='hospital_ready.csv', index=False)