In [1]:
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_columns', 51)

In [3]:
data = pd.read_csv("diabetic_data.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            10176

In [5]:
one_patient = data[data['patient_nbr'].isin([23398488])].sort_values('encounter_id')
one_patient

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
36822,113382426,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,10,MC,Nephrology,29,2,29,2,1,0,428,403,466,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
37957,117803256,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,1,MC,Nephrology,43,1,12,2,3,1,584,518,250.42,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30
38165,118554540,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,8,MC,Nephrology,44,1,24,2,3,2,428,585,518,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
38790,120572160,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,2,MC,Nephrology,35,1,13,2,4,4,514,403,250.4,4,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
39573,123050358,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,2,MC,Nephrology,39,3,23,2,4,5,996,403,250.4,8,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
41335,127929642,23398488,AfricanAmerican,Male,[50-60),?,1,4,7,2,MC,Pulmonology,41,1,12,2,6,6,511,780,250.4,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
41715,128980494,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,3,MC,Nephrology,44,2,11,1,6,7,416,786,535,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
41979,129618780,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,3,MC,Nephrology,36,1,23,1,6,8,482,403,428,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30
42537,131125500,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,1,MC,Nephrology,43,1,7,1,6,9,518,403,V15,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
42994,132599958,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,2,MC,Nephrology,45,1,15,1,7,10,276,403,V45,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30


In [6]:
data['change'].value_counts()

No    54755
Ch    47011
Name: change, dtype: int64

In [7]:
data['time_in_hospital'].describe()

count    101766.000000
mean          4.395987
std           2.985108
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          14.000000
Name: time_in_hospital, dtype: float64

In [8]:
data['number_diagnoses'].describe()

count    101766.000000
mean          7.422607
std           1.933600
min           1.000000
25%           6.000000
50%           8.000000
75%           9.000000
max          16.000000
Name: number_diagnoses, dtype: float64

In [9]:
data['number_emergency'].describe()

count    101766.000000
mean          0.197836
std           0.930472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          76.000000
Name: number_emergency, dtype: float64

In [10]:
data.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [11]:
data['race'].unique()

array(['Caucasian', 'AfricanAmerican', '?', 'Other', 'Asian', 'Hispanic'],
      dtype=object)

In [12]:
plot = [go.Bar(
        x = data['race'].unique(),
        y = data['race'].value_counts()
    )]

layout = go.Layout(
    title='Race',
    xaxis= dict(
            title= 'Rasa'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [13]:
plot = [go.Bar(
        x = ['Caucasian', 'AfricanAmerican', 'Other'],
        y = [data['race'].value_counts()['Caucasian'], data['race'].value_counts()['AfricanAmerican'], 
            data['race'].value_counts()['?'] + data['race'].value_counts()['Other'] + 
            data['race'].value_counts()['Asian'] + data['race'].value_counts()['Hispanic']]
    )]

layout = go.Layout(
    title='Race',
    xaxis= dict(
            title= 'Rasa'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [14]:
data['metformin'].value_counts()

No        81778
Steady    18346
Up         1067
Down        575
Name: metformin, dtype: int64

In [15]:
# metformin, metformin-pioglitazone
medications = data.loc[:,"metformin":"metformin-pioglitazone"]

traces = []
plot = []

def gety():
    meds = {'No':[], 'Steady':[], 'Up':[], 'Down':[]}
    
    for state in medications['metformin'].unique():
        for column in medications:
            try:
                # print("OK: " + state)
                meds[state].append(medications[column].value_counts()[state])
            except KeyError:
                meds[state].append(0)
    return meds

for state in medications['metformin'].unique():
    tmp = gety()
    traces.append(go.Bar(
        x = list(medications),
        y = tmp[state],
        name = state
    ))

plot = [traces[0], traces[1], traces[2],traces[3]]
    
layout = go.Layout(
    title='Medications',
    xaxis= dict(
            title= 'Konkretny liek'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [16]:
plot = [traces[1], traces[2],traces[3]]
layout = go.Layout(
    title='Medications',
    xaxis= dict(
            title= 'Konkretny liek'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [17]:
plot = [go.Bar(
        x = data['readmitted'].unique(),
        y = data['readmitted'].value_counts()
    )]

layout = go.Layout(
    title='Readmitance',
    xaxis= dict(
            title= 'Prijaty alebo neprijaty naspat (viac alebo menej ako 30 dni)'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [18]:
plot = [go.Bar(
        x = ['No', 'Yes'],
        y = [data['readmitted'].value_counts()['NO'], data['readmitted'].value_counts()['>30'] +
            data['readmitted'].value_counts()['<30']]
    )]

layout = go.Layout(
    title='Readmitance',
    xaxis= dict(
            title= 'Prijaty alebo neprijaty naspat'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [19]:
plot = [go.Bar(
        x = data['num_procedures'].value_counts().index,
        y = data['num_procedures'].value_counts()
    )]

layout = go.Layout(
    title='Number of procedures',
    xaxis= dict(
            title= 'Pocet procedur'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [20]:
number_of_med = data.sort_values('num_medications')

plot = [go.Bar(
        x = number_of_med['num_medications'].value_counts().index,
        y = number_of_med['num_medications'].value_counts()
    )]

layout = go.Layout(
    title='Number of medications',
    xaxis= dict(
            title= 'Pocet predpisanych liekov'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [21]:
plot = [go.Bar(
        x = number_of_med['medical_specialty'].value_counts().index,
        y = number_of_med['medical_specialty'].value_counts()
    )]

layout = go.Layout(
    title='medical_specialty',
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [4]:
data = data.drop(columns=['weight', 'admission_type_id', 'discharge_disposition_id', 'admission_source_id', 'payer_code', 'medical_specialty'])

In [5]:
data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),1,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),3,59,0,18,0,0,0,276,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),2,11,5,13,2,0,1,648,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),2,44,1,16,0,0,0,8,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,51,0,8,0,0,0,197,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),3,31,6,16,0,0,0,414,411,250,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),4,70,1,21,0,0,0,414,411,V45,7,,,Steady,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),5,73,0,12,0,0,0,428,492,250,8,,,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),13,68,2,28,0,0,0,398,427,38,8,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),12,33,3,18,0,0,0,434,198,486,8,,,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [6]:
# medications
# No = -1
# Steady = 0
# Down = 1
# Up = 2

data = data.replace(to_replace='No', value=-1)
data = data.replace(to_replace='Steady', value=0)
data = data.replace(to_replace='Down', value=1)
data = data.replace(to_replace='Up', value=2)

In [7]:
# change
# No = 0
# Ch = 1

# diabetesMed
# No = 0
# Yes = 1
data = data.replace(to_replace='Ch', value=1)
data.loc[data['change'] < 0, 'change'] = 0
data.loc[data['diabetesMed'] == 'Yes', 'diabetesMed'] = 1
data.loc[data['diabetesMed'] < 0, 'diabetesMed'] = 0

In [8]:
# gender
# Male = 0
# Female = 1

data = data.replace(to_replace='Male', value=0)
data = data.replace(to_replace='Female', value=1)

In [9]:
# race

# Caucasian = 0
# African-American = 1
# Asian, Hispanic, Other = 2

data = data.replace(to_replace='Caucasian', value=0)
data = data.replace(to_replace='AfricanAmerican', value=1)
data = data.replace(to_replace=['Hispanic', 'Asian', 'Other'] , value=2)

In [10]:
data = data.assign(diag_1_139 =pd.Series())
data = data.assign(diag_140_239 =pd.Series())
data = data.assign(diag_240_279 =pd.Series())
data = data.assign(diag_280_289 =pd.Series())
data = data.assign(diag_290_319 =pd.Series())
data = data.assign(diag_320_389 =pd.Series())
data = data.assign(diag_390_459 =pd.Series())
data = data.assign(diag_460_519 =pd.Series())
data = data.assign(diag_520_579 =pd.Series())
data = data.assign(diag_580_629 =pd.Series())
data = data.assign(diag_630_679 =pd.Series())
data = data.assign(diag_680_709 =pd.Series())
data = data.assign(diag_710_739 =pd.Series())
data = data.assign(diag_740_759 =pd.Series())
data = data.assign(diag_760_779 =pd.Series())
data = data.assign(diag_780_799 =pd.Series())
data = data.assign(diag_800_999 =pd.Series())
data = data.assign(diag_EV =pd.Series())

In [12]:
data = data.fillna(0)
data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,diag_1_139,diag_140_239,diag_240_279,diag_280_289,diag_290_319,diag_320_389,diag_390_459,diag_460_519,diag_520_579,diag_580_629,diag_630_679,diag_680_709,diag_710_739,diag_740_759,diag_760_779,diag_780_799,diag_800_999,diag_EV
0,2278392,8222157,0,1,[0-10),1,41,0,1,0,0,0,250.83,?,?,1,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,0,0,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,149190,55629189,0,1,[10-20),3,59,0,18,0,0,0,276,250.01,255,9,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,>30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,64410,86047875,1,1,[20-30),2,11,5,13,2,0,1,648,250,V27,6,,,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,0,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,500364,82442376,0,0,[30-40),2,44,1,16,0,0,0,8,250.43,403,7,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16680,42519267,0,0,[40-50),1,51,0,8,0,0,0,197,157,250,5,,,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,35754,82637451,0,0,[50-60),3,31,6,16,0,0,0,414,411,250,9,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,0,1,>30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,55842,84259809,0,0,[60-70),4,70,1,21,0,0,0,414,411,V45,7,,,0,-1,-1,-1,0,-1,-1,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,63768,114882984,0,0,[70-80),5,73,0,12,0,0,0,428,492,250,8,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,0,1,>30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,12522,48330783,0,1,[80-90),13,68,2,28,0,0,0,398,427,38,8,,,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,15738,63555939,0,1,[90-100),12,33,3,18,0,0,0,434,198,486,8,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
data

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,...,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted,diag_1_139,diag_140_239,diag_240_279,diag_280_289,diag_290_319,diag_320_389,diag_390_459,diag_460_519,diag_520_579,diag_580_629,diag_630_679,diag_680_709,diag_710_739,diag_740_759,diag_760_779,diag_780_799,diag_800_999,diag_EV
0,2278392,8222157,0,1,[0-10),1,41,0,1,0,0,0,250.83,?,?,1,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,0,0,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,149190,55629189,0,1,[10-20),3,59,0,18,0,0,0,276,250.01,255,9,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,>30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,64410,86047875,1,1,[20-30),2,11,5,13,2,0,1,648,250,V27,6,,,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,0,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,500364,82442376,0,0,[30-40),2,44,1,16,0,0,0,8,250.43,403,7,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,16680,42519267,0,0,[40-50),1,51,0,8,0,0,0,197,157,250,5,,,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,35754,82637451,0,0,[50-60),3,31,6,16,0,0,0,414,411,250,9,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,0,1,>30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,55842,84259809,0,0,[60-70),4,70,1,21,0,0,0,414,411,V45,7,,,0,-1,-1,-1,0,-1,-1,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,63768,114882984,0,0,[70-80),5,73,0,12,0,0,0,428,492,250,8,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,0,1,>30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,12522,48330783,0,1,[80-90),13,68,2,28,0,0,0,398,427,38,8,,,-1,-1,-1,-1,-1,-1,0,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,15738,63555939,0,1,[90-100),12,33,3,18,0,0,0,434,198,486,8,,,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,1,1,NO,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
data = data.replace(to_replace='?', value=0)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 62 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null int64
gender                      101766 non-null object
age                         101766 non-null object
time_in_hospital            101766 non-null int64
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            101766 non-null int64
diag_1                      101766 non-null object
diag_2                      101766 non-null object
diag_3                      101766 non-null object
number_diagnoses            101766 non-null int64
max_glu_serum               101766 non-null object
A1Cresult                   10176

In [15]:
for index, row in data.iterrows():
    diag = [row['diag_1'], row['diag_2'], row['diag_3']]
    for d in diag:
        if isinstance(d, str) and (d.startswith('V') or d.startswith('E')):
            data.at[index, 'diag_EV'] = data.at[index, 'diag_EV'] + 1
        elif 1 <= float(d) <= 139:
            data.at[index, 'diag_1_139'] = data.at[index, 'diag_1_139'] + 1
        elif 140 <= float(d) <= 239:
            data.at[index, 'diag_140_239'] = data.at[index, 'diag_140_239'] + 1
        elif 240 <= float(d) <= 279:
            data.at[index, 'diag_240_279'] = data.at[index, 'diag_240_279'] + 1
        elif 280 <= float(d) <= 289:
            data.at[index, 'diag_280_289'] = data.at[index, 'diag_280_289'] + 1
        elif 290 <= float(d) <= 319:
            data.at[index, 'diag_290_319'] = data.at[index, 'diag_290_319'] + 1
        elif 320 <= float(d) <= 389:
            data.at[index, 'diag_320_389'] = data.at[index, 'diag_320_389'] + 1
        elif 390 <= float(d) <= 459:
            data.at[index, 'diag_390_459'] = data.at[index, 'diag_390_459'] + 1
        elif 460 <= float(d) <= 519:
            data.at[index, 'diag_460_519'] = data.at[index, 'diag_460_519'] + 1
        elif 520 <= float(d) <= 579:
            data.at[index, 'diag_520_579'] = data.at[index, 'diag_520_579'] + 1
        elif 580 <= float(d) <= 629:
            data.at[index, 'diag_580_629'] = data.at[index, 'diag_580_629'] + 1
        elif 630 <= float(d) <= 679:
            data.at[index, 'diag_630_679'] = data.at[index, 'diag_630_679'] + 1
        elif 680 <= float(d) <= 709:
            data.at[index, 'diag_680_709'] = data.at[index, 'diag_680_709'] + 1
        elif 710 <= float(d) <= 739:
            data.at[index, 'diag_710_739'] = data.at[index, 'diag_710_739'] + 1
        elif 740 <= float(d) <= 759:
            data.at[index, 'diag_740_759'] = data.at[index, 'diag_740_759'] + 1
        elif 760 <= float(d) <= 779:
            data.at[index, 'diag_760_779'] = data.at[index, 'diag_760_779'] + 1
        elif 780 <= float(d) <= 799:
            data.at[index, 'diag_780_799'] = data.at[index, 'diag_780_799'] + 1
        elif 800 <= float(d) <= 999:
            data.at[index, 'diag_800_999'] = data.at[index, 'diag_800_999'] + 1
    # break

In [61]:
data['age'].value_counts()

[70-80)     26068
[60-70)     22483
[50-60)     17256
[80-90)     17197
[40-50)      9685
[30-40)      3775
[90-100)     2793
[20-30)      1657
[10-20)       691
[0-10)        161
Name: age, dtype: int64

In [16]:
data = data.replace(to_replace='[0-10)', value=0)

In [17]:
data = data.replace(to_replace='[0-10)', value=0)
data = data.replace(to_replace='[10-20)', value=1)
data = data.replace(to_replace='[20-30)', value=2)
data = data.replace(to_replace='[30-40)', value=3)
data = data.replace(to_replace='[40-50)', value=4)
data = data.replace(to_replace='[50-60)', value=5)
data = data.replace(to_replace='[60-70)', value=6)
data = data.replace(to_replace='[70-80)', value=7)
data = data.replace(to_replace='[80-90)', value=8)
data = data.replace(to_replace='[90-100)', value=9)


In [18]:
data = data.replace(to_replace='NO', value=0)
data = data.replace(to_replace='>30', value=1)
data = data.replace(to_replace='<30', value=1)

In [19]:
data = data.drop(columns=['max_glu_serum', 'A1Cresult', 'diag_1', 'diag_2', 'diag_3'])

In [20]:
data = data.drop(columns=['patient_nbr'])

In [21]:
data = data.set_index('encounter_id')

In [24]:
data = data.drop(data[data['gender'] == 'Unknown/Invalid'].index)
data['gender'].value_counts()

1    54708
0    47055
Name: gender, dtype: int64

In [25]:
data_normalized = data.copy()

In [26]:
data_normalized[list(data_normalized)] = data_normalized[list(data_normalized)].apply(pd.to_numeric)

scaler = preprocessing.MinMaxScaler()
scaled_values = scaler.fit_transform(data_normalized) 
data_normalized.loc[:,:] = scaled_values

In [35]:
train, test = train_test_split(data_normalized, test_size=0.2)

In [38]:
train_target = train['readmitted']
test_target = test['readmitted']

In [40]:
train = train.drop(columns=['readmitted'])
test = test.drop(columns=['readmitted'])

In [41]:
gnb = GaussianNB()
y_pred = gnb.fit(train, train_target).predict(test)


In [43]:
from sklearn import metrics

In [44]:
print(metrics.classification_report(test_target, y_pred))

             precision    recall  f1-score   support

        0.0       0.58      0.87      0.70     10944
        1.0       0.65      0.28      0.39      9409

avg / total       0.61      0.60      0.56     20353



In [42]:
(y_pred == test_target).value_counts()

True     12155
False     8198
Name: readmitted, dtype: int64