In [1]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()


In [3]:
from sklearn.preprocessing import StandardScaler
cancer_std = StandardScaler().fit_transform(cancer.data)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_std, cancer.target, stratify=cancer.target, test_size = 0.2, random_state=2022
)


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier


In [8]:
lrc = LogisticRegression(random_state=2022)
svc = SVC(random_state=2022)
knn = KNeighborsClassifier()


In [9]:
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    estimators=[('LRC', lrc), ('SVC', svc), ('KNN',knn)], voting = 'hard'
)

In [11]:
voc.fit(X_train, y_train)
voc.score(X_test, y_test)

1.0

In [12]:
# 개별 분류기의 성능
lrc.fit(X_train, y_train)
svc.fit(X_train, y_train)
knn.fit(X_train, y_train)
lrc.score(X_train, y_train), svc.score(X_train, y_train), knn.score(X_train, y_train)

(0.9868131868131869, 0.9824175824175824, 0.9714285714285714)

In [13]:
lrc.predict_proba(X_test[:5])

array([[9.99999999e-01, 1.45395340e-09],
       [1.29048390e-02, 9.87095161e-01],
       [9.99988662e-01, 1.13377133e-05],
       [5.23798806e-03, 9.94762012e-01],
       [9.99999953e-01, 4.74465654e-08]])

In [14]:
# SVC는 probability 옵션 따로 줘야함!
svc2 = SVC(probability=True, random_state=2022)
svc2.fit(X_train, y_train)
svc2.predict_proba(X_test[:5])

array([[9.99896554e-01, 1.03445598e-04],
       [7.53631647e-06, 9.99992464e-01],
       [9.99957780e-01, 4.22200830e-05],
       [1.11084633e-05, 9.99988892e-01],
       [9.99216287e-01, 7.83713010e-04]])

In [15]:
knn.predict_proba(X_test[-5:])

array([[0.8, 0.2],
       [1. , 0. ],
       [1. , 0. ],
       [0. , 1. ],
       [0. , 1. ]])

In [18]:
voc2 = VotingClassifier(
    estimators=[('LRC', lrc), ('SVC', svc2), ('KNN', knn)], voting='soft'
)

In [19]:
voc2.fit(X_train, y_train)
voc2.score(X_test, y_test)

1.0

In [20]:

voc2.predict_proba(X_test[:5])

array([[9.99965518e-01, 3.44823508e-05],
       [4.30412510e-03, 9.95695875e-01],
       [9.99982147e-01, 1.78525988e-05],
       [1.74969884e-03, 9.98250301e-01],
       [9.99738747e-01, 2.61253486e-04]])

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [26]:
from sklearn.model_selection import GridSearchCV
params = {
    'LRC__C' : [0.1, 1, 10, 20],
    'SVC__C' : [0.1, 1, 10]
}
voc2_grid = GridSearchCV(
    voc2, params, scoring='accuracy', cv=5
)
voc2_grid.fit(X_train, y_train)
print(voc2_grid.best_score_, voc2_grid.best_params_)
voc2_best = voc2_grid.best_estimator_
voc2_best.score(X_test, y_test)

0.9780219780219781 {'LRC__C': 10, 'SVC__C': 1}


1.0

In [29]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2022, n_jobs=-1)
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [31]:
params = {
    'n_estimators' : [10, 100, 1000]
}
rfc_grid = GridSearchCV(
    rfc, params, scoring='accuracy', cv = 5 
)
rfc_grid.fit(X_train, y_train)
print(rfc_grid.best_score_, rfc_grid.best_params_)
rfc_best = rfc_grid.best_estimator_
rfc_best.score(X_test, y_test)

0.9494505494505494 {'n_estimators': 1000}


1.0