### LogisticRegression 邏輯斯迴歸模型

In [101]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [102]:
df = pd.read_csv('diabetes.csv')
df

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [103]:
X = df.drop(columns=['Outcome'])
y = df[['Outcome']]

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 33)

### 各參數的詳細解釋：

1. **`max_iter=500`**：
   - 設定最大迭代次數。

2. **`random_state=33`**：
   - 設定隨機數種子，以便在每次執行時獲得相同的隨機結果。

3. **`C=1`**：
   - `C` 值越小，正則化強度越大，即懲罰越強。（Ｃ與懲罰項為倒數）
	- `C` 值越大，正則化強度越小，即懲罰越弱。（Ｃ與懲罰項為倒數）
   - 過擬合的情況下，應該將 C 設得較小 ; 欠擬合的情況下，應該將 C 設得較大。
   - 默認值是 1。

4. **`tol=0.0001`**：
   - 設定優化過程的收斂容忍度，當優化過程的損失函數變化小於 `tol` 時，停止迭代。這是模型訓練停止的條件之一。
   - 默認值是 `1e-4` (0.0001)。

In [105]:
lr = LogisticRegression(max_iter=500, random_state= 33, C=1, tol= 0.0001) 
#lr = linear_model.LogisticRegression(max_iter=500, random_state= 33, C=1, tol= 0.0001)

In [106]:
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
train_score = lr.score(X_train, y_train)
test_score = lr.score(X_test, y_test)
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix

  y = column_or_1d(y, warn=True)


array([[131,  15],
       [ 41,  44]])

In [107]:
# y_test 預測結果
y_pred

array([0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0])

In [108]:
# [預測無糖尿病機率 , 預測有糖尿病機率]
lr.predict_proba(X_test)

array([[0.65677676, 0.34322324],
       [0.91546885, 0.08453115],
       [0.79080657, 0.20919343],
       [0.92330269, 0.07669731],
       [0.41094306, 0.58905694],
       [0.67508234, 0.32491766],
       [0.79277132, 0.20722868],
       [0.12759053, 0.87240947],
       [0.8362284 , 0.1637716 ],
       [0.75257184, 0.24742816],
       [0.36095596, 0.63904404],
       [0.44034899, 0.55965101],
       [0.49427789, 0.50572211],
       [0.9203006 , 0.0796994 ],
       [0.23060676, 0.76939324],
       [0.57035766, 0.42964234],
       [0.69065612, 0.30934388],
       [0.93492149, 0.06507851],
       [0.71716056, 0.28283944],
       [0.88065211, 0.11934789],
       [0.90866085, 0.09133915],
       [0.49246661, 0.50753339],
       [0.86543199, 0.13456801],
       [0.75034984, 0.24965016],
       [0.79487062, 0.20512938],
       [0.42925331, 0.57074669],
       [0.86947162, 0.13052838],
       [0.68230098, 0.31769902],
       [0.07458832, 0.92541168],
       [0.2203759 , 0.7796241 ],
       [0.

In [109]:
print(confusion_matrix) #混淆矩陣
print(f'train score :{train_score}')
print(f'test score :{test_score}')

[[131  15]
 [ 41  44]]
train score :0.7783985102420856
test score :0.7575757575757576


# 模糊矩陣
|         | 預測 NO | 預測 YES |
|---------|----------|-----------|
| 實際 NO | TN       | FP        |
| 實際 YES| FN       | TP        |

### 數據解讀
- Accuracy(準確率) :有多少比率的個體被分類正確
    -   Accuracy = (TP + TN) / (TP + FP + TN + FN)
- Recall（召回率）：表示模型對正類檢測的靈敏度，越高越能識別真實正類。
    -   Recall/Sensitivity = TP / (TP + FN)
- Precision（精確度）：表示模型預測正類的準確度，越高越少出現假陽性。
    -   Precision = TP / (TP + FP)
- F1 Score ：提供了精確率和召回率之間的平衡，是綜合評估模型性能的重要指標 ; 1表示模型輸出的最好，0表示最差。

In [110]:
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
score = {
    "Metric": ["Accuracy", "Recall", "Precision", "F1 Score"],
    "Value": [accuracy, recall, precision, f1]
}
df = pd.DataFrame(score)
df

Unnamed: 0,Metric,Value
0,Accuracy,0.757576
1,Recall,0.517647
2,Precision,0.745763
3,F1 Score,0.611111


## 多元分類
以iris data 示範

In [111]:
from sklearn import datasets

In [112]:
iris = datasets.load_iris()

In [113]:
X = iris.data
y = iris.target

In [114]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state= 33)

In [115]:
model = LogisticRegression()
#model = linear_model.LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [116]:
model.predict(X_test)

array([1, 1, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0, 2, 1, 2, 1, 1, 0, 1, 2, 0, 0,
       2, 0, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 0, 1, 1, 1, 0,
       0])

In [117]:
model.predict_proba(X_test)

array([[2.14338914e-02, 9.09727007e-01, 6.88391020e-02],
       [8.21231015e-03, 8.89557866e-01, 1.02229824e-01],
       [9.64167943e-01, 3.58315648e-02, 4.92343253e-07],
       [3.68871404e-03, 7.93536924e-01, 2.02774361e-01],
       [1.56903277e-04, 2.84752551e-01, 7.15090545e-01],
       [4.39628456e-06, 1.18729529e-02, 9.88122651e-01],
       [9.43089731e-01, 5.69087321e-02, 1.53728277e-06],
       [9.77923779e-01, 2.20760121e-02, 2.08478136e-07],
       [1.68102884e-04, 1.79044572e-01, 8.20787325e-01],
       [1.92675286e-05, 4.49978114e-02, 9.54982921e-01],
       [8.17699515e-06, 2.75659422e-02, 9.72425881e-01],
       [9.80576610e-01, 1.94232466e-02, 1.43805796e-07],
       [3.97792813e-05, 3.86056306e-02, 9.61354590e-01],
       [4.46362242e-03, 7.75112413e-01, 2.20423965e-01],
       [2.32745523e-04, 1.64586589e-01, 8.35180666e-01],
       [1.04641611e-02, 9.17426514e-01, 7.21093247e-02],
       [2.61366404e-03, 5.06835068e-01, 4.90551268e-01],
       [9.71267566e-01, 2.87321

In [118]:
model.score(X_train, y_train)

0.9619047619047619

In [119]:
model.score(X_test, y_test)

0.9333333333333333