## Logistic regression with Statsmodels

ในบทนี้เราจะประมาณการแบบจำลอง logistic regression โดยข้อมูลที่ใช้ในที่นี้คือปัจจัยที่เกี่ยวข้องกับการเกิดโรคหัวใจ (ไฟล์ heart.csv) หากเป็นโรคหัวใจ y=1 แต่ถ้าไม่เป็นโรคหัวใจ y=0 เราจะใช้ Statsmodels ในการประมาณการ โดยจะมีการแบ่งข้อมูลออกเป็น training set และ testing set เพื่อดูว่า model ที่เราใช้แม่นยำพอที่จะนำไปใช้กับข้อมูลอื่นได้หรือไม่ และเราจะตรวจสอบความแม่นยำของ model โดยใช้ confusion matrix

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the CSV from the author's local path into a DataFrame and preview
# the first five rows.
csv_path = 'C:\\Users\\LENOVO\\Desktop\\ML2\\files for example\\heart.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Print a summary of the frame: index range, each column's non-null count
# and dtype, and the memory footprint.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [4]:
# Count how many rows fall in each class of the `target` column.
data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [5]:
# One-hot encode each categorical column. Every resulting frame has one
# indicator column per level, named "<column>_<level>".
categorical_columns = ['cp', 'thal', 'slope', 'exang', 'fbs', 'restecg', 'ca']
a, b, c, d, e, f, g = (
    pd.get_dummies(data[column], prefix=column)
    for column in categorical_columns
)

In [6]:
# Append the indicator columns to the right of the original columns, then
# preview the widened frame.
dummy_frames = [a, b, c, d, e, f, g]
frames = [data, *dummy_frames]
data = pd.concat(frames, axis='columns')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,ca_0,ca_1,ca_2,ca_3,ca_4
0,63,1,3,145,233,1,0,150,0,2.3,...,0,1,1,0,0,1,0,0,0,0
1,37,1,2,130,250,0,1,187,0,3.5,...,1,0,0,1,0,1,0,0,0,0
2,41,0,1,130,204,0,0,172,0,1.4,...,1,0,1,0,0,1,0,0,0,0
3,56,1,1,120,236,0,1,178,0,0.8,...,1,0,0,1,0,1,0,0,0,0
4,57,0,0,120,354,0,1,163,1,0.6,...,1,0,0,1,0,1,0,0,0,0


In [7]:
# Drop the raw categorical columns (now replaced by indicators) together
# with each one's `*_0` indicator -- presumably so level 0 acts as the
# reference category for the regression.
raw_categoricals = ['cp', 'thal', 'slope', 'exang', 'fbs', 'restecg', 'ca']
baseline_dummies = ['thal_0', 'cp_0', 'slope_0', 'exang_0', 'fbs_0', 'restecg_0', 'ca_0']
data = data.drop(raw_categoricals + baseline_dummies, axis=1)
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,slope_1,slope_2,exang_1,fbs_1,restecg_1,restecg_2,ca_1,ca_2,ca_3,ca_4
0,63,1,145,233,150,2.3,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,37,1,130,250,187,3.5,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,41,0,130,204,172,1.4,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,56,1,120,236,178,0.8,1,1,0,0,...,0,1,0,0,1,0,0,0,0,0
4,57,0,120,354,163,0.6,1,0,0,0,...,0,1,1,0,1,0,0,0,0,0


In [8]:
# Separate the frame into the feature columns (everything except `target`)
# and the label values as a NumPy array.
x_data = data.drop(columns=['target'])
y = data['target'].values

In [9]:
# Min-max scale every feature column: (value - column min) / (column max - column min).
#
# The column-wise reductions are taken with DataFrame.min()/max() rather than
# np.min()/np.max(): the NumPy wrappers forward axis=None, which pandas >= 2.0
# interprets as "reduce over both axes" and returns one scalar, so the scaling
# would silently stop being per-column.
# NOTE: a column whose min equals its max yields NaN (0 / 0) here, exactly as
# in the original expression.
column_min = x_data.min()
column_max = x_data.max()
x = (x_data - column_min) / (column_max - column_min)

In [10]:
from patsy import dmatrices
import statsmodels.api as sm

In [11]:
# Show the current column labels of the frame.
data.columns

Index(['age', 'sex', 'trestbps', 'chol', 'thalach', 'oldpeak', 'target',
       'cp_1', 'cp_2', 'cp_3', 'thal_1', 'thal_2', 'thal_3', 'slope_1',
       'slope_2', 'exang_1', 'fbs_1', 'restecg_1', 'restecg_2', 'ca_1', 'ca_2',
       'ca_3', 'ca_4'],
      dtype='object')

In [18]:
# Build the response and design matrices from a patsy formula: `target` on
# the left, every predictor column on the right. This rebinds `y` and `x`.
# The adjacent literals below concatenate to a single formula string.
formula = (
    'target ~ age + sex + trestbps + chol + thalach + oldpeak'
    ' + cp_1 + cp_2 + cp_3'
    ' + thal_1 + thal_2 +thal_3'
    ' + slope_1 +slope_2'
    '+exang_1'
    ' + fbs_1'
    ' + restecg_1 + restecg_2'
    ' + ca_1 + ca_2 + ca_3 + ca_4'
)
y, x = dmatrices(formula, data=data, return_type='dataframe')

In [19]:
# Set up the logistic regression of y on x, fit it, and display the
# estimation summary.
logit = sm.Logit(y, x)
model = logit.fit()
model.summary()

Optimization terminated successfully.
         Current function value: 0.296420
         Iterations 8


0,1,2,3
Dep. Variable:,target,No. Observations:,303.0
Model:,Logit,Df Residuals:,280.0
Method:,MLE,Df Model:,22.0
Date:,"Thu, 11 Jun 2020",Pseudo R-squ.:,0.5699
Time:,16:25:32,Log-Likelihood:,-89.815
converged:,True,LL-Null:,-208.82
,,LLR p-value:,3.556e-38

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.1790,3.705,0.048,0.961,-7.084,7.442
age,0.0278,0.025,1.094,0.274,-0.022,0.078
sex,-1.8623,0.571,-3.262,0.001,-2.981,-0.743
trestbps,-0.0262,0.012,-2.191,0.028,-0.050,-0.003
chol,-0.0043,0.004,-1.011,0.312,-0.013,0.004
thalach,0.0201,0.012,1.691,0.091,-0.003,0.043
oldpeak,-0.3972,0.242,-1.639,0.101,-0.872,0.078
cp_1,0.8647,0.578,1.496,0.135,-0.268,1.998
cp_2,2.0032,0.529,3.784,0.000,0.966,3.041


In [20]:
# Show the fitted coefficients, indexed by term name.
model.params

Intercept    0.179045
age          0.027819
sex         -1.862297
trestbps    -0.026162
chol        -0.004291
thalach      0.020055
oldpeak     -0.397174
cp_1         0.864708
cp_2         2.003186
cp_3         2.417107
thal_1       2.637558
thal_2       2.367747
thal_3       0.915115
slope_1     -0.775084
slope_2      0.689965
exang_1     -0.779111
fbs_1        0.445666
restecg_1    0.460582
restecg_2   -0.714204
ca_1        -2.342301
ca_2        -3.483178
ca_3        -2.247144
ca_4         1.267961
dtype: float64

In [21]:
# Predicted probability for every row of x, the same design matrix that was
# passed to sm.Logit.
model.predict(x)

0      0.946400
1      0.887347
2      0.979347
3      0.966351
4      0.946653
5      0.633714
6      0.860821
7      0.841934
8      0.878080
9      0.974239
10     0.799186
11     0.994308
12     0.937702
13     0.847819
14     0.995619
15     0.983381
16     0.997903
17     0.973918
18     0.706970
19     0.898066
20     0.279483
21     0.964881
22     0.874437
23     0.753993
24     0.836212
25     0.659764
26     0.973516
27     0.979300
28     0.926247
29     0.949459
         ...   
273    0.293628
274    0.018110
275    0.054061
276    0.005194
277    0.764472
278    0.689750
279    0.008487
280    0.110205
281    0.084919
282    0.492350
283    0.547752
284    0.017283
285    0.000629
286    0.723196
287    0.437107
288    0.005182
289    0.012103
290    0.144656
291    0.024592
292    0.022783
293    0.531958
294    0.443001
295    0.003161
296    0.863400
297    0.007729
298    0.326674
299    0.678378
300    0.004742
301    0.008562
302    0.722244
Length: 303, dtype: float64

In [22]:
# Turn predicted probabilities into class labels: 0 when the probability is
# below 0.5, otherwise 1. Written as "not below 0.5" so that anything that
# fails the `< .5` comparison maps to 1.
probabilities = model.predict(x)
below_half = probabilities < .5
P = (~below_half).astype('int64')

In [23]:
# Display the 0/1 predicted labels.
P

0      1
1      1
2      1
3      1
4      1
5      1
6      1
7      1
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     1
20     0
21     1
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     1
      ..
273    0
274    0
275    0
276    0
277    1
278    1
279    0
280    0
281    0
282    0
283    1
284    0
285    0
286    1
287    0
288    0
289    0
290    0
291    0
292    0
293    1
294    0
295    0
296    1
297    0
298    0
299    1
300    0
301    0
302    1
Length: 303, dtype: int64

In [24]:
# Place the predicted labels next to the data as a `predicted` column and
# preview the result.
predicted = P.rename('predicted')
dt = pd.concat([data, predicted], axis=1)
dt.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,slope_2,exang_1,fbs_1,restecg_1,restecg_2,ca_1,ca_2,ca_3,ca_4,predicted
0,63,1,145,233,150,2.3,1,0,0,1,...,0,0,1,0,0,0,0,0,0,1
1,37,1,130,250,187,3.5,1,0,1,0,...,0,0,0,1,0,0,0,0,0,1
2,41,0,130,204,172,1.4,1,1,0,0,...,1,0,0,0,0,0,0,0,0,1
3,56,1,120,236,178,0.8,1,1,0,0,...,1,0,0,1,0,0,0,0,0,1
4,57,0,120,354,163,0.6,1,0,0,0,...,1,1,0,1,0,0,0,0,0,1


In [25]:
# Show the model's prediction table. Per the statsmodels documentation,
# entry [i, j] counts observations whose actual class is i and whose
# predicted class is j.
model.pred_table()

array([[115.,  23.],
       [ 12., 153.]])

In [26]:
# Unpack the 2x2 prediction table row by row: the first row holds the
# actual-0 counts (TN, FP), the second row the actual-1 counts (FN, TP).
(TN, FP), (FN, TP) = model.pred_table()

In [27]:
# Accuracy = correctly classified observations / all observations.
# The variable's spelling is kept as-is in case later cells refer to it.
correct = TP + TN
total = correct + FP + FN
Acurracy = correct / total
Acurracy

0.8844884488448845