In [20]:
# Load the iris dataset bundled with seaborn and separate the feature
# columns from the target column.
import seaborn as sns

iris = sns.load_dataset('iris')
y = iris['species']               # target: species name of each sample
X = iris.drop(columns='species')  # features: every remaining column

In [21]:
# Encode the species names as integer class labels.
from sklearn.preprocessing import LabelEncoder

classle = LabelEncoder()
y_label = classle.fit_transform(
    iris['species'].values
)

In [22]:
# Split the full dataset into training data and test data.
from sklearn.model_selection import train_test_split

# 30% of the samples are held out for testing; stratifying on the labels keeps
# the class proportions the same in both parts, and the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_label,
    test_size=0.3,
    random_state=1,
    stratify=y_label,
)

In [23]:
# Apply logistic regression.
# Logistic regression uses the feature variables to estimate the probability
# that a sample belongs to each category of the target variable.
#
# Parameter reference for the LogisticRegression class:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
from sklearn.linear_model import LogisticRegression

# C = 1/λ, where λ is the regularization strength.
# 'ovr' is short for one-vs-rest: a sigmoid probability is computed for each
# category separately, using binary cross-entropy as the loss function.
Logit = LogisticRegression(
    C=1e2,
    multi_class='ovr',
    random_state=1,
    max_iter=200,
)
Logit.fit(X_train, y_train)

y_train_pred = Logit.predict(X_train)
y_test_pred = Logit.predict(X_test)
# predict_proba returns the predicted probabilities rather than the class.
y_test_pred_proba = Logit.predict_proba(X_test)

# Preview the first five predicted classes, then their probabilities.
for preview in (y_test_pred, y_test_pred_proba):
    print(preview[:5])

[2 0 0 1 1]
[[1.58069040e-07 8.08355630e-02 9.19164279e-01]
 [9.80729791e-01 1.92702092e-02 1.44032616e-16]
 [8.83660214e-01 1.16339786e-01 1.09742723e-16]
 [1.10125474e-05 6.10622960e-01 3.89366027e-01]
 [1.69869263e-04 9.94696666e-01 5.13346497e-03]]


In [24]:
# Accuracy score on the training data first, then on the test data.
from sklearn.metrics import accuracy_score

for true_labels, predicted_labels in ((y_train, y_train_pred), (y_test, y_test_pred)):
    print(accuracy_score(true_labels, predicted_labels))

0.9523809523809523
0.9777777777777777


In [25]:
# Summarize the test-set predictions as a confusion matrix
# (called as confusion_matrix(true labels, predicted labels)).
from sklearn.metrics import confusion_matrix

test_confusion = confusion_matrix(y_test, y_test_pred)
print(test_confusion)

[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]


In [26]:
!pip install joblib



You should consider upgrading via the 'D:\python37_pro\studyPython\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [27]:
# Save the fitted logistic regression model to disk, load it back, and check
# that the reloaded model reproduces the test-set results.
import joblib

model_path = './save.pkl'
joblib.dump(Logit, model_path)

# NOTE: joblib.load is pickle-based, so it should only be used on files from a
# trusted source; here the file was written by the line just above.
logit_from_joblib = joblib.load(model_path)
logit_pred = logit_from_joblib.predict(X_test)

# Accuracy, confusion matrix, then the raw predictions of the reloaded model.
for report in (
    accuracy_score(y_test, logit_pred),
    confusion_matrix(y_test, logit_pred),
    logit_pred,
):
    print(report)

0.9777777777777777
[[15  0  0]
 [ 0 15  0]
 [ 0  1 14]]
[2 0 0 1 1 1 2 1 2 0 0 2 0 1 0 1 2 1 1 2 2 0 1 2 1 1 1 2 0 2 0 0 1 1 2 2 0
 0 0 1 2 2 1 0 0]


In [33]:
# Train logistic regression on the wine dataset, which has 3 target classes
# and 13 feature variables.
import pandas as pd
import numpy as np

# Load the data. The file has no header row, and the target variable
# (column 0) is already stored as class labels.
# The file is fetched over HTTPS instead of plain HTTP so that the transfer is
# encrypted and integrity-protected.
dat_wine = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data',
    header=None,
)
print(dat_wine.shape)
dat_wine.head()

(178, 14)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [34]:
# Data preprocessing: give the 14 unnamed columns descriptive names.
# NOTE(review): 'alchohol' and 'OD208' look like misspellings (presumably
# 'alcohol' and 'OD280'); they are kept unchanged because other cells may
# refer to the columns by these exact labels - confirm before renaming.
wine_column_names = [
    'class label',
    'alchohol',
    'malic acid',
    'ash',
    'alcalinity of ash',
    'magnesium',
    'total phenols',
    'flavanoids',
    'nonflavanoid phenols',
    'proanthocyanins',
    'color intensity',
    'hue',
    'OD208',
    'proline',
]
dat_wine.columns = wine_column_names

# Show which distinct class labels occur, then preview the renamed frame.
print('class label:', np.unique(dat_wine['class label']))
dat_wine.head()

class label: [1 2 3]


Unnamed: 0,class label,alchohol,malic acid,ash,alcalinity of ash,magnesium,total phenols,flavanoids,nonflavanoid phenols,proanthocyanins,color intensity,hue,OD208,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [36]:
# Split the full wine dataset into training data and test data.
from sklearn.model_selection import train_test_split

# Column 0 holds the class label; all remaining columns are the features.
y = dat_wine.iloc[:, 0].values
X = dat_wine.iloc[:, 1:].values

# Hold out 30% for testing, stratified on the class label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [43]:
# Logistic regression models for comparing the regularization method (L1/L2)
# and the regularization strength (λ). scikit-learn takes C = 1/λ.
from sklearn.linear_model import LogisticRegression

# random_state is fixed because the liblinear solver shuffles the data while
# fitting; without a seed, repeated runs are not guaranteed to reproduce the
# same coefficients. The value 1 matches the seed used for the data split.
lr2_10 = LogisticRegression(penalty='l2', C=10.0, solver='liblinear', random_state=1) # L2 with C(=1/λ)=10
lr2_1 = LogisticRegression(penalty='l2', C=1.0, solver='liblinear', random_state=1) # L2 with C(=1/λ)=1
lr2_1_0 = LogisticRegression(penalty='l2', C=0.1, solver='liblinear', random_state=1) # L2 with C(=1/λ)=0.1

lr1_10 = LogisticRegression(penalty='l1', C=10.0, solver='liblinear', random_state=1) # L1 with C(=1/λ)=10
lr1_1 = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', random_state=1) # L1 with C(=1/λ)=1
lr1_1_0 = LogisticRegression(penalty='l1', C=0.1, solver='liblinear', random_state=1) # L1 with C(=1/λ)=0.1

In [44]:
# Fit each model and report its accuracy on the training and test data while
# varying the regularization method and strength.
# Each entry is (model, penalty label, λ label). Because C = 1/λ, the models
# built with C = 10, 1 and 0.1 are reported as λ = 0.1, 1 and 10.
wine_fit_plan = [
    # l2
    (lr2_10, 'L2', '0.1'),
    (lr2_1, 'L2', '1'),
    (lr2_1_0, 'L2', '10'),
    # l1
    (lr1_10, 'L1', '0.1'),
    (lr1_1, 'L1', '1'),
    (lr1_1_0, 'L1', '10'),
]

for wine_clf, penalty_label, lambda_label in wine_fit_plan:
    wine_clf.fit(X_train, y_train)
    print(f'Training accuracy with {penalty_label} and λ={lambda_label}:', wine_clf.score(X_train, y_train))
    print(f'Test accuracy with {penalty_label} and λ={lambda_label}:', wine_clf.score(X_test, y_test))

Training accuracy with L2 and λ=0.1: 0.9919354838709677
Test accuracy with L2 and λ=0.1: 0.9259259259259259
Training accuracy with L2 and λ=1: 0.9838709677419355
Test accuracy with L2 and λ=1: 0.9444444444444444
Training accuracy with L2 and λ=10: 0.9758064516129032
Test accuracy with L2 and λ=10: 0.9259259259259259
Training accuracy with L1 and λ=0.1: 1.0
Test accuracy with L1 and λ=0.1: 0.9259259259259259
Training accuracy with L1 and λ=1: 0.9838709677419355
Test accuracy with L1 and λ=1: 0.9074074074074074
Training accuracy with L1 and λ=10: 0.9354838709677419
Test accuracy with L1 and λ=10: 0.8888888888888888


In [48]:
# How the estimated parameters change with the regularization strength under
# L2 regularization.
# intercept_ holds the bias values and coef_ holds the coefficient estimates.
# e.g. for lr2_10 the first class has bias -0.96 and feature coefficients
# [-0.83, 1.44, 1.46, -1.27, ..., 0.62, 0.0267].
l2_models = (lr2_10, lr2_1, lr2_1_0)

# All intercepts first, then all coefficient matrices, in the same model order.
for l2_model in l2_models:
    print(l2_model.intercept_)
for l2_model in l2_models:
    print(l2_model.coef_)

[-0.96454004  0.79517128 -0.22390284]
[-0.16655119  0.26945612 -0.07960783]
[-0.02988443  0.05684191 -0.02613036]
[[-8.35657273e-01  1.44151237e+00  1.46573599e+00 -1.27575538e+00
   2.09584320e-02  9.54431103e-01  3.00225662e+00 -7.24587224e-02
  -1.22899327e+00 -5.54451221e-01 -1.84235878e-02  6.28631543e-01
   2.67981202e-02]
 [ 1.25372415e+00 -2.37060814e+00 -6.32120704e-01  7.45500101e-01
  -1.79307209e-02 -9.68782363e-01  1.10891910e+00  1.49498820e+00
   1.47342900e+00 -2.50773519e+00  1.88932025e+00 -6.34375922e-01
  -2.26698547e-02]
 [-7.02362330e-01  1.19097868e+00  5.88119730e-01  4.03797101e-02
   7.27058253e-02 -3.70623869e-01 -3.76293283e+00 -6.02754264e-01
  -7.46353010e-01  1.33639049e+00 -1.36735133e+00 -2.11442109e+00
   2.99230394e-03]]
[[-3.58983951e-01  4.10637468e-01  3.53872093e-01 -7.42979348e-01
  -3.36933384e-02  5.44683231e-01  1.12999767e+00 -5.75279878e-02
   5.03329500e-02 -1.89873021e-01  7.55818481e-03  6.14639418e-01
   2.00108373e-02]
 [ 5.83717383e-01

In [49]:
# How the estimated parameters change with the regularization strength under
# L1 regularization (the lr1_* models are the ones built with penalty='l1').
l1_models = (lr1_10, lr1_1, lr1_1_0)

# All intercepts first, then all coefficient matrices, in the same model order.
for l1_model in l1_models:
    print(l1_model.intercept_)
for l1_model in l1_models:
    print(l1_model.coef_)

[0.         6.63925143 0.        ]
[0. 0. 0.]
[0. 0. 0.]
[[-1.00915559e+00  2.39128963e+00  1.63865735e-01 -1.85617225e+00
   8.31208799e-02  0.00000000e+00  7.12844184e+00  0.00000000e+00
  -2.97399035e+00 -8.16410340e-01  0.00000000e+00  0.00000000e+00
   3.48564193e-02]
 [ 4.60354831e-01 -2.09680190e+00 -2.77308296e+00  7.01076999e-01
   1.19093769e-02 -1.86380904e+00  1.41930837e+00  9.83877929e+00
   2.17737451e+00 -2.62066195e+00  5.04500247e+00 -8.07626704e-02
  -2.31029889e-02]
 [-3.49921641e-01  1.43751100e+00  0.00000000e+00  3.04559704e-02
   1.22761800e-01  0.00000000e+00 -8.09845707e+00  0.00000000e+00
   0.00000000e+00  1.29724905e+00 -2.00584971e+00 -3.73961668e+00
  -5.38523694e-04]]
[[-2.33783117e-02  8.12566493e-02  0.00000000e+00 -7.05407439e-01
  -4.60035510e-02  0.00000000e+00  1.97046425e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   1.76173129e-02]
 [ 6.17449638e-01 -1.24483377e+00  0.00000000e+00  4.31297431e-01
   2.273