## Logistic regression with Scikit-learn

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data=pd.read_csv('https://github.com/Siriratkant/Logistic-regression/raw/master/heart.csv')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age         303 non-null int64
sex         303 non-null int64
cp          303 non-null int64
trestbps    303 non-null int64
chol        303 non-null int64
fbs         303 non-null int64
restecg     303 non-null int64
thalach     303 non-null int64
exang       303 non-null int64
oldpeak     303 non-null float64
slope       303 non-null int64
ca          303 non-null int64
thal        303 non-null int64
target      303 non-null int64
dtypes: float64(1), int64(13)
memory usage: 33.2 KB


In [4]:
data.target.value_counts()

1    165
0    138
Name: target, dtype: int64

In [5]:
a = pd.get_dummies(data['cp'], prefix = "cp")
b = pd.get_dummies(data['thal'], prefix = "thal")
c = pd.get_dummies(data['slope'], prefix = "slope")
d = pd.get_dummies(data['exang'], prefix = "exang")
e = pd.get_dummies(data['fbs'], prefix = "fbs")
f =  pd.get_dummies(data['restecg'], prefix = "restecg")
g = pd.get_dummies(data['ca'], prefix = "ca")

In [6]:
frames = [data, a, b, c,d,e,f,g]
data = pd.concat(frames, axis = 1)
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,...,fbs_0,fbs_1,restecg_0,restecg_1,restecg_2,ca_0,ca_1,ca_2,ca_3,ca_4
0,63,1,3,145,233,1,0,150,0,2.3,...,0,1,1,0,0,1,0,0,0,0
1,37,1,2,130,250,0,1,187,0,3.5,...,1,0,0,1,0,1,0,0,0,0
2,41,0,1,130,204,0,0,172,0,1.4,...,1,0,1,0,0,1,0,0,0,0
3,56,1,1,120,236,0,1,178,0,0.8,...,1,0,0,1,0,1,0,0,0,0
4,57,0,0,120,354,0,1,163,1,0.6,...,1,0,0,1,0,1,0,0,0,0


In [7]:
data = data.drop(columns = ['cp', 'thal', 'slope','exang','fbs','restecg','ca','thal_0',  'cp_0','slope_0','exang_0', 'fbs_0','restecg_0','ca_0'])
data.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,target,cp_1,cp_2,cp_3,...,slope_1,slope_2,exang_1,fbs_1,restecg_1,restecg_2,ca_1,ca_2,ca_3,ca_4
0,63,1,145,233,150,2.3,1,0,0,1,...,0,0,0,1,0,0,0,0,0,0
1,37,1,130,250,187,3.5,1,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2,41,0,130,204,172,1.4,1,1,0,0,...,0,1,0,0,0,0,0,0,0,0
3,56,1,120,236,178,0.8,1,1,0,0,...,0,1,0,0,1,0,0,0,0,0
4,57,0,120,354,163,0.6,1,0,0,0,...,0,1,1,0,1,0,0,0,0,0


In [8]:
y = data.target.values
x_data = data.drop(['target'], axis = 1)

In [9]:
x = (x_data - np.min(x_data)) / (np.max(x_data) - np.min(x_data)).values

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [11]:
data.columns

Index(['age', 'sex', 'trestbps', 'chol', 'thalach', 'oldpeak', 'target',
       'cp_1', 'cp_2', 'cp_3', 'thal_1', 'thal_2', 'thal_3', 'slope_1',
       'slope_2', 'exang_1', 'fbs_1', 'restecg_1', 'restecg_2', 'ca_1', 'ca_2',
       'ca_3', 'ca_4'],
      dtype='object')

In [12]:
x_train, x_test, y_train, y_test = train_test_split (x,y,test_size = 0.2,random_state = 0)

In [13]:
x_train.head()

Unnamed: 0,age,sex,trestbps,chol,thalach,oldpeak,cp_1,cp_2,cp_3,thal_1,...,slope_1,slope_2,exang_1,fbs_1,restecg_1,restecg_2,ca_1,ca_2,ca_3,ca_4
74,0.291667,0.0,0.264151,0.19863,0.717557,0.032258,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
153,0.770833,0.0,0.490566,0.347032,0.618321,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
64,0.604167,1.0,0.433962,0.194064,0.717557,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
296,0.708333,0.0,0.283019,0.1621,0.496183,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
287,0.583333,1.0,0.566038,0.242009,0.709924,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
y_train

array([1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0],
      dtype=int64)

In [15]:
model = LogisticRegression()
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [16]:
model.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
predicted= model.predict(x_test)
predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [18]:
model.intercept_

array([0.45003291])

In [19]:
model.coef_

array([[ 0.09079978, -1.07854089, -0.56694737, -0.28993139,  1.16999714,
        -0.96972888,  0.64179615,  1.2543071 ,  1.0661575 ,  0.18391441,
         0.91220986, -0.49069007, -0.36563243,  0.45227016, -0.72833766,
        -0.0596302 ,  0.23135564, -0.32019191, -1.36474396, -1.78187169,
        -1.77044347,  0.35870165]])

## 3.การตรวจสอบความแม่นยำของ model ด้วย confusion matix


อันนี้เป็นการวัดความแม่นยำของ model เลย โดยเราได้นำ model ที่เราได้จากจากการประมาณการกับข้อมูล training set แล้วนำ model นี้มาใช้กับข้อมูล testing set 

$$
\begin{bmatrix}
x_{0,0} & x_{0,1} \\
x_{1,0} & x_{1,1}
\end{bmatrix}
=
\begin{bmatrix}
TN&FP \\
FN&TP
\end{bmatrix}
$$

<table>
<tr>
    <td></td>
    <td>predicted values</td>
    <td>predicted values</td>
</tr>
<tr>
    <td>actual values</td>
    <td>TN</td>
    <td>FP</td>
</tr>
<tr>
    <td>actual values</td>
    <td>FN</td>
    <td>TP</td>
</tr>
<table>

* **หมายเหตุ** ตารางจะต่างจากที่เรียน ขึ้นอยู่กับ library ที่เราใช้ โดยของ scikit-learn จะมีการสลับตำแหน่งแต่ว่าความหมายยังคงเดิม

In [20]:
from sklearn import metrics

In [21]:
metrics.confusion_matrix(y_test,predicted)

array([[24,  3],
       [ 5, 29]], dtype=int64)

In [22]:
predicted

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1], dtype=int64)

In [23]:
y_test

array([0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1], dtype=int64)

**หากต้องการคำนวณเอง สามารถเขียน code ตามด้านล่างได้เลย**

In [24]:
TN, FP, FN, TP = metrics.confusion_matrix(y_test, predicted).ravel()
TN

24

$$\mathrm {Accuracy}={\frac {\mathrm {TP} +\mathrm {TN} }{\mathrm {TP} +\mathrm {TN} +\mathrm {FP} +\mathrm {FN} }}$$

In [25]:
Acurracy=(TP+TN)/(TP+TN+FP+FN)
Acurracy

0.8688524590163934

 $$\mathrm {Precision}={\frac {\mathrm {TP}  }{\mathrm {TP} +\mathrm {FP} }}$$

In [26]:
Precision =(TP)/(TP+FP)
Precision

0.90625

 $$\mathrm {Recall}={\frac {\mathrm {TP}  }{\mathrm {TP} +\mathrm {FN} }}$$

In [27]:
Recall = (TP)/(TP+FN)
Recall

0.8529411764705882

$$\mathrm {F-measure}={\frac {\mathrm {2 * Recall * Precision}  }{\mathrm {Recall+Precision}  }}$$

In [28]:
F_measure = 2 * ( Recall*Precision) / (Recall+Precision)
F_measure

0.8787878787878787

**หากต้องการให้แสดงผลทั้งหมด สามารถเขียน code ตามด้านล่างได้เลย**

In [29]:
print(metrics.classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.91      0.85      0.88        34

   micro avg       0.87      0.87      0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



**หากต้องการแสดงทีละค่า สามารถเขียน code ตามด้านล่างได้เลย**

In [30]:
metrics.accuracy_score(y_test, predicted)

0.8688524590163934

In [31]:
metrics.precision_score(y_test, predicted)

0.90625

In [32]:
metrics.recall_score(y_test, predicted)

0.8529411764705882

In [33]:
metrics.f1_score(y_test, predicted)

0.8787878787878787