# 분류 알고리즘 정확도 경진대회

#### load_digits, load_breast_cancer
- test_size = 0.2, random_state = 2021

In [1]:
from sklearn.datasets import load_digits
from sklearn.datasets import load_breast_cancer
digits = load_digits()
cancer = load_breast_cancer()

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

lr = LogisticRegression()
svc = SVC()
knn = KNeighborsClassifier()

In [3]:
from sklearn.metrics import accuracy_score

### Digits

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
digits_scaled = scaler.fit_transform(digits.data)

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    digits_scaled, digits.target, test_size=0.2, random_state=2021
)

In [6]:
from sklearn.ensemble import VotingClassifier

vo_clf = VotingClassifier(
    estimators=[('LR', lr), ('SVC', svc), ('KNN', knn)], voting='hard'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test,pred)
print(f'앙상블 학습의 정확도: {acc:.4f}')

앙상블 학습의 정확도: 0.9778


In [7]:
classifiers = [lr,svc,knn]
for classifier in classifiers:
    classifier.fit(X_train,y_train)
    pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, pred)
    class_name = classifier.__class__.__name__
    print(f'{class_name} 정확도: {acc:.4f}')

LogisticRegression 정확도: 0.9528
SVC 정확도: 0.9778
KNeighborsClassifier 정확도: 0.9889


#### KNeighborsClassifier 가 앙상블보다 정확함. 따라서 knn 이용 최적파라미터

In [8]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [9]:
knn.fit(X_train, y_train)
pred = knn.predict(X_test)
acc = accuracy_score(y_test, pred)
print( f'KNN의 정확도 : {acc:.4f}')

KNN의 정확도 : 0.9889


In [10]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
Knn_clf = KNeighborsClassifier(n_jobs=1)
param_distribs = { 'n_neighbors': randint(low=1, high=50), 'leaf_size': randint(low=1, high=50), 'p': randint(low=1, high=50)}
random_cv = RandomizedSearchCV(Knn_clf, param_distributions=param_distribs, n_iter=100, cv=5, n_jobs=1,scoring='accuracy', refit=True)
random_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {random_cv.best_score_:.4f}')
print('최적 파라미터: ', random_cv.best_params_)

최고 평균 정확도: 0.9777
최적 파라미터:  {'leaf_size': 6, 'n_neighbors': 8, 'p': 4}


In [11]:
best = random_cv.best_estimator_
pred = best.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 Kneighbor 정확도: {acc:.4f}')

최적 파라미터 Kneighbor 정확도: 0.9861


In [19]:
from sklearn.model_selection import GridSearchCV
params = {
    'leaf_size': [28,30,32],
    'n_neighbors': [4,5,6],
    'p': [1,2,3]
}
Knn_clf = KNeighborsClassifier(n_jobs=1)
grid_cv = GridSearchCV(Knn_clf, param_grid=params, cv=3, n_jobs=1,verbose=1,scoring='accuracy', refit=True)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
최고 평균 정확도: 0.9737
최적 파라미터:  {'leaf_size': 28, 'n_neighbors': 6, 'p': 2}
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed:    2.5s finished


In [20]:
best = grid_cv.best_estimator_
pred = best.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 Kneighbor 정확도: {acc:.4f}')

최적 파라미터 Kneighbor 정확도: 0.9825


## Digits : Kneighbor분류기로 'leaf_size': 6, 'n_neighbors': 8, 'p': 4 일때  정확도 0.9861

### Breast Cancer

In [41]:
scaler = MinMaxScaler()
cancer_scaled = scaler.fit_transform(cancer.data)
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled, cancer.target, test_size=0.2, random_state=2021
)

In [42]:
vo_clf = VotingClassifier(
    estimators=[('LR', lr), ('SVC', svc), ('KNN', knn)], voting='hard'
)
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
acc = accuracy_score(y_test,pred)
print(f'앙상블 학습의 정확도: {acc:.4f}')

앙상블 학습의 정확도: 0.9825


In [43]:
classifiers = [lr,svc,knn]
for classifier in classifiers:
    classifier.fit(X_train,y_train)
    pred = classifier.predict(X_test)
    acc = accuracy_score(y_test, pred)
    class_name = classifier.__class__.__name__
    print(f'{class_name} 정확도: {acc:.4f}')

LogisticRegression 정확도: 0.9561
SVC 정확도: 0.9912
KNeighborsClassifier 정확도: 0.9825


#### SVC 가 앙상블보다 정확함. 따라서 svc 이용 최적파라미터

In [44]:
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

In [45]:
svc.fit(X_train, y_train)
pred = svc.predict(X_test)
acc = accuracy_score(y_test, pred)
print( f'SVC 정확도 : {acc:.4f}')

SVC 정확도 : 0.9912


In [52]:
svc_clf = SVC()
param_distribs = {'C':[0.01,0.1,1.0,10,100], 'degree': randint(low=1, high=500), 'cache_size': randint(low=1, high=500)}
random_cv = RandomizedSearchCV(svc_clf, param_distributions=param_distribs, n_iter=100, cv=5, n_jobs=1,scoring='accuracy', refit=True)
random_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {random_cv.best_score_:.4f}')
print('최적 파라미터: ', random_cv.best_params_)

최고 평균 정확도: 0.9714
최적 파라미터:  {'C': 10, 'cache_size': 133, 'degree': 384}


In [53]:
best = random_cv.best_estimator_
pred = best.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 Kneighbor 정확도: {acc:.4f}')

최적 파라미터 Kneighbor 정확도: 0.9825


In [39]:
params = {
    'degree': [35,40,45],
    'cache_size': [110,120,130,140,150]
}
svc_clf = SVC()
grid_cv = GridSearchCV(svc_clf, param_grid=params, cv=3, n_jobs=1,verbose=1,scoring='accuracy', refit=True)
grid_cv.fit(X_train,y_train)
print(f'최고 평균 정확도: {grid_cv.best_score_:.4f}')
print('최적 파라미터: ', grid_cv.best_params_)

Fitting 3 folds for each of 15 candidates, totalling 45 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
최고 평균 정확도: 0.9649
최적 파라미터:  {'cache_size': 110, 'degree': 35}
[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:    0.4s finished


In [40]:
best = grid_cv.best_estimator_
pred = best.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'최적 파라미터 Kneighbor 정확도: {acc:.4f}')

최적 파라미터 Kneighbor 정확도: 0.9912


## Breast Cancer : SVC 분류기로 'cache_size': 110, 'degree': 35 일때  정확도 0.9912