In [1]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

cancer = load_breast_cancer()

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

scaler = StandardScaler()
data_scaled = scaler.fit_transform(cancer.data)

X_train, X_test, y_train, y_test = train_test_split(
    data_scaled,
    cancer.target,
    test_size=0.3,
    random_state=0
)

In [3]:
lr_clf = LogisticRegression() # 기본 solver는 lbfgs
lr_clf.fit(X_train, y_train)

LogisticRegression()

# 평가

In [4]:
from sklearn.metrics import accuracy_score, roc_auc_score

lr_pred = lr_clf.predict(X_test)
print("Accuracy : {:.3f}, ROC_AUC : {:.3f}".format(accuracy_score(y_test, lr_pred), roc_auc_score(y_test, lr_pred)))

Accuracy : 0.977, ROC_AUC : 0.972


In [5]:
# solver에 따른 성능 변화 측정
solvers = ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga']

for solver in solvers:
  lr_clf = LogisticRegression(solver=solver, max_iter=600) # max_iter : 최적화 횟수
  lr_clf.fit(X_train, y_train)

  lr_pred = lr_clf.predict(X_test)

  print(solver)
  print("Accuracy : {:.3f}, ROC_AUC : {:.3f}".format(accuracy_score(y_test, lr_pred), roc_auc_score(y_test, lr_pred)))
  print()

lbfgs
Accuracy : 0.977, ROC_AUC : 0.972

liblinear
Accuracy : 0.982, ROC_AUC : 0.979

newton-cg
Accuracy : 0.977, ROC_AUC : 0.972

sag
Accuracy : 0.982, ROC_AUC : 0.979

saga
Accuracy : 0.982, ROC_AUC : 0.979



In [6]:
# GridSearch
from sklearn.model_selection import GridSearchCV

params={'solver':['liblinear', 'lbfgs'],
        'penalty':['l2', 'l1'],
        'C':[0.01, 0.1, 1, 1, 5, 10]}

grid_clf = GridSearchCV(
    LogisticRegression(),
    param_grid=params,
    scoring='accuracy',
    cv=3
)

grid_clf.fit(data_scaled, cancer.target)

18 fits failed out of a total of 72.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/envs/ml-env/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 681, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/envs/ml-env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1461, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/opt/anaconda3/envs/ml-env/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 447, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 pe

GridSearchCV(cv=3, estimator=LogisticRegression(),
             param_grid={'C': [0.01, 0.1, 1, 1, 5, 10], 'penalty': ['l2', 'l1'],
                         'solver': ['liblinear', 'lbfgs']},
             scoring='accuracy')

In [7]:
print('최적 하이퍼 파라미터:{}, 최적 평균 정확도:{:.3f}'.format(grid_clf.best_params_, grid_clf.best_score_))

최적 하이퍼 파라미터:{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}, 최적 평균 정확도:0.979


### 다중 분류 전략
1. OVR (One Vs Rest) - OVA (One Vs All) ★★★★★
  - 클래스가 K개 존재하는 경우(K는 3이상) 1개의 클래스를 제외한 나머지 다른 클래스들을 K개 만들어 각각의 이진 분류를 수행
  - 이진 분류에대한 확률을 구하고, 그 확률이 제일 높은 상황을 최종 클래스로 판별
    - 0번 클래스에 대해서 `[1, 2]`클래스를 묶어준다. -> `0 vs [1, 2]`
    - 1번 클래스에 대해서 `[0, 2]`클래스를 묶어준다. -> `1 vs [0, 2]`
    - 2번 클래스에 대해서 `[0, 1]`클래스를 묶어준다. -> `2 vs [0, 1]`

2. OVO ( One Vs One )
- 하나씩 하나씩 비교하는 분류기(모델)를 여러 개 만든다.
- `0 vs 1`, `0 vs 2`, `1 vs 2`
* `0 vs 1`, `0 vs 2`, `0 vs 3`, `1 vs 2`, `1 vs 3`, `2 vs 3`
  * 분류기 개수 K개의 클래스가 있다고 가정: $K\times \frac{K-1}{2}$ 개의 분류기 생성

**보통 OVR 방법을 더 선호한다**