In [2]:
import pandas as pd
import plotly.plotly as py
import plotly.graph_objs as go
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

In [2]:
# Allow up to 51 columns in rendered frames (the dataset summarised below has 50).
pd.options.display.max_columns = 51

In [3]:
# Load the raw dataset; the path is relative to the notebook's working directory.
# NOTE(review): the previews further down show '?' as a placeholder in several
# columns (e.g. weight, payer_code); it is read here as an ordinary string.
data = pd.read_csv(filepath_or_buffer="diabetic_data.csv")

In [4]:
# Print the frame's schema summary: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
encounter_id                101766 non-null int64
patient_nbr                 101766 non-null int64
race                        101766 non-null object
gender                      101766 non-null object
age                         101766 non-null object
weight                      101766 non-null object
admission_type_id           101766 non-null int64
discharge_disposition_id    101766 non-null int64
admission_source_id         101766 non-null int64
time_in_hospital            101766 non-null int64
payer_code                  101766 non-null object
medical_specialty           101766 non-null object
num_lab_procedures          101766 non-null int64
num_procedures              101766 non-null int64
num_medications             101766 non-null int64
number_outpatient           101766 non-null int64
number_emergency            101766 non-null int64
number_inpatient            10176

In [5]:
# Every row recorded for patient 23398488, ordered by encounter id.
one_patient = (
    data
    .loc[data['patient_nbr'].eq(23398488)]
    .sort_values(by='encounter_id')
)
one_patient

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
36822,113382426,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,10,MC,Nephrology,29,2,29,2,1,0,428,403,466,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
37957,117803256,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,1,MC,Nephrology,43,1,12,2,3,1,584,518,250.42,6,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30
38165,118554540,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,8,MC,Nephrology,44,1,24,2,3,2,428,585,518,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
38790,120572160,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,2,MC,Nephrology,35,1,13,2,4,4,514,403,250.4,4,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
39573,123050358,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,2,MC,Nephrology,39,3,23,2,4,5,996,403,250.4,8,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,>30
41335,127929642,23398488,AfricanAmerican,Male,[50-60),?,1,4,7,2,MC,Pulmonology,41,1,12,2,6,6,511,780,250.4,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
41715,128980494,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,3,MC,Nephrology,44,2,11,1,6,7,416,786,535,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
41979,129618780,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,3,MC,Nephrology,36,1,23,1,6,8,482,403,428,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,<30
42537,131125500,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,1,MC,Nephrology,43,1,7,1,6,9,518,403,V15,5,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Down,No,No,No,No,No,Ch,Yes,<30
42994,132599958,23398488,AfricanAmerican,Male,[50-60),?,1,1,7,2,MC,Nephrology,45,1,15,1,7,10,276,403,V45,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30


In [6]:
# Frequency of each value in the 'change' column.
data.loc[:, 'change'].value_counts()

No    54755
Ch    47011
Name: change, dtype: int64

In [7]:
# Summary statistics for the 'time_in_hospital' column.
data.loc[:, 'time_in_hospital'].describe()

count    101766.000000
mean          4.395987
std           2.985108
min           1.000000
25%           2.000000
50%           4.000000
75%           6.000000
max          14.000000
Name: time_in_hospital, dtype: float64

In [8]:
# Summary statistics for the 'number_diagnoses' column.
data.loc[:, 'number_diagnoses'].describe()

count    101766.000000
mean          7.422607
std           1.933600
min           1.000000
25%           6.000000
50%           8.000000
75%           9.000000
max          16.000000
Name: number_diagnoses, dtype: float64

In [9]:
# Summary statistics for the 'number_emergency' column.
data.loc[:, 'number_emergency'].describe()

count    101766.000000
mean          0.197836
std           0.930472
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max          76.000000
Name: number_emergency, dtype: float64

In [10]:
# Preview the first five rows of the raw frame.
data.head(n=5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [4]:
# Frequency of each value in the 'max_glu_serum' column.
data.loc[:, 'max_glu_serum'].value_counts()

None    96420
Norm     2597
>200     1485
>300     1264
Name: max_glu_serum, dtype: int64

In [12]:
# Bar chart: number of rows per value of the 'race' column.
#
# value_counts() orders its result by descending frequency, whereas unique()
# lists values in order of first appearance. Pairing the two can attach a
# label to the wrong bar, so both axes are taken from the same tally.
race_counts = data['race'].value_counts()

plot = [go.Bar(
        x = race_counts.index,
        y = race_counts
    )]

layout = go.Layout(
    title='Race',
    xaxis= dict(
            title= 'Rasa'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

# NOTE(review): every figure in this notebook is sent with filename='oznal2';
# confirm that sharing one hosted file name between all charts is intended.
py.iplot(fig, filename='oznal2')

In [13]:
# Bar chart of 'race' reduced to three groups: the two named categories plus
# an "Other" bucket that sums '?', 'Other', 'Asian' and 'Hispanic'.
race_counts = data['race'].value_counts()
other_total = (
    race_counts['?'] + race_counts['Other'] +
    race_counts['Asian'] + race_counts['Hispanic']
)

plot = [go.Bar(
    x=['Caucasian', 'AfricanAmerican', 'Other'],
    y=[race_counts['Caucasian'], race_counts['AfricanAmerican'], other_total],
)]

layout = go.Layout(
    title='Race',
    xaxis={'title': 'Rasa'},
    yaxis={'title': 'Pocet navstiev'},
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [14]:
# Frequency of each value in the 'metformin' column.
data.loc[:, 'metformin'].value_counts()

No        81778
Steady    18346
Up         1067
Down        575
Name: metformin, dtype: int64

In [15]:
# Medication columns: the label slice running from "metformin" through
# "metformin-pioglitazone" (both ends included).
medications = data.loc[:, "metformin":"metformin-pioglitazone"]

# Accumulators populated further down: one bar trace per state, and the
# list of traces handed to the figure.
traces, plot = [], []

def gety(frame=None):
    """Count, for every column, how many rows carry each state value.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame whose columns are tallied. It must contain a ``'metformin'``
        column; the distinct values of that column are the states counted.
        When omitted, the notebook-level ``medications`` frame is used.

    Returns
    -------
    dict
        Maps each state to a list holding one count per column of ``frame``,
        in column order. The keys 'No', 'Steady', 'Up' and 'Down' are always
        present; a state that does not occur in ``frame['metformin']`` keeps
        an empty list.
    """
    if frame is None:
        frame = medications

    meds = {'No': [], 'Steady': [], 'Up': [], 'Down': []}

    # Tally each column once instead of once per (state, column) pair.
    column_counts = [frame[column].value_counts() for column in frame]

    for state in frame['metformin'].unique():
        # setdefault keeps a state outside the four preset keys from raising.
        state_counts = meds.setdefault(state, [])
        for counts in column_counts:
            # A state that never occurs in a column contributes 0.
            state_counts.append(counts.get(state, 0))
    return meds

# gety() scans every medication column, and its result does not depend on the
# loop variable, so it is computed once up front rather than once per state.
state_counts = gety()

# One bar trace per state, in the order the states first appear in 'metformin'.
for state in medications['metformin'].unique():
    traces.append(go.Bar(
        x = list(medications),
        y = state_counts[state],
        name = state
    ))

# Plot every trace built above instead of indexing four fixed positions.
plot = list(traces)

layout = go.Layout(
    title='Medications',
    xaxis= dict(
            title= 'Konkretny liek'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [16]:
# Same chart without the 'No' trace. The trace is picked out by its name
# rather than by list position, so the result does not depend on 'No' being
# the first value encountered in the 'metformin' column.
plot = [trace for trace in traces if trace['name'] != 'No']
layout = go.Layout(
    title='Medications',
    xaxis= dict(
            title= 'Konkretny liek'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [17]:
# Bar chart: number of rows per value of the 'readmitted' column.
#
# Labels and heights both come from one value_counts() result. unique()
# returns values in order of first appearance, which is not guaranteed to
# match the frequency order that value_counts() uses.
readmitted_counts = data['readmitted'].value_counts()

plot = [go.Bar(
        x = readmitted_counts.index,
        y = readmitted_counts
    )]

layout = go.Layout(
    title='Readmitance',
    xaxis= dict(
            title= 'Prijaty alebo neprijaty naspat (viac alebo menej ako 30 dni)'
        ),
        yaxis= dict(
            title= 'Pocet navstiev'
        ),
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [18]:
# Collapse the three 'readmitted' labels into a two-way split:
# 'NO' on one side, '>30' and '<30' summed on the other.
readmitted_counts = data['readmitted'].value_counts()
not_readmitted = readmitted_counts['NO']
readmitted_any = readmitted_counts['>30'] + readmitted_counts['<30']

plot = [go.Bar(
    x=['No', 'Yes'],
    y=[not_readmitted, readmitted_any],
)]

layout = go.Layout(
    title='Readmitance',
    xaxis={'title': 'Prijaty alebo neprijaty naspat'},
    yaxis={'title': 'Pocet navstiev'},
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [19]:
# Bar chart: number of rows for each distinct 'num_procedures' value.
procedure_counts = data['num_procedures'].value_counts()

plot = [go.Bar(x=procedure_counts.index, y=procedure_counts)]

layout = go.Layout(
    title='Number of procedures',
    xaxis={'title': 'Pocet procedur'},
    yaxis={'title': 'Pocet navstiev'},
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [20]:
# Frame ordered by 'num_medications'; the next chart reads from it as well.
number_of_med = data.sort_values(by='num_medications')

# Bar chart: number of rows for each distinct 'num_medications' value.
medication_counts = number_of_med['num_medications'].value_counts()

plot = [go.Bar(x=medication_counts.index, y=medication_counts)]

layout = go.Layout(
    title='Number of medications',
    xaxis={'title': 'Pocet predpisanych liekov'},
    yaxis={'title': 'Pocet navstiev'},
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')

In [21]:
# Bar chart: number of rows per 'medical_specialty' value.
specialty_counts = number_of_med['medical_specialty'].value_counts()

plot = [go.Bar(x=specialty_counts.index, y=specialty_counts)]

layout = go.Layout(
    title='medical_specialty',
    yaxis={'title': 'Pocet navstiev'},
)

fig = go.Figure(data=plot, layout=layout)

py.iplot(fig, filename='oznal2')