# Multiclass SVM 구현

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Load the IRIS dataset and split it into features and target.
iris = sns.load_dataset('iris')
# First four columns are the features to learn from; the last column is the target label.
X, y = iris.iloc[:, :4], iris.iloc[:, -1]
print(y)

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object


In [4]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=48,
)

In [5]:
def standardization(train, test):
    """Standardize ``train`` and ``test`` with statistics learned from ``train`` only.

    A ``StandardScaler`` is fitted on ``train`` and then applied to both
    inputs, so no information from ``test`` influences the scaling.

    Parameters
    ----------
    train : array-like
        Training features used to fit the scaler.
    test : array-like
        Test features, transformed with the already-fitted scaler.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled)`` as returned by ``scaler.transform``.
    """
    fitted_scaler = StandardScaler().fit(train)
    return fitted_scaler.transform(train), fitted_scaler.transform(test)


# NOTE: this overwrites X_train / X_test with their standardized versions.
X_train, X_test = standardization(X_train, X_test)

In [6]:
# Inspect the standardized training features (output of standardization()).
X_train

array([[ 0.78522493,  0.32015325,  0.77221097,  1.04726529],
       [-0.26563371, -1.29989934,  0.0982814 , -0.11996537],
       [ 0.43493872,  0.78302542,  0.94069336,  1.43634218],
       [-0.84944407,  0.78302542, -1.24957775, -1.28719604],
       [-0.38239578, -1.7627715 ,  0.15444219,  0.13941922],
       [ 0.55170079, -0.374155  ,  1.05301496,  0.7878807 ],
       [ 0.31817664, -0.14271892,  0.65988937,  0.7878807 ],
       [ 0.20141457, -0.374155  ,  0.43524618,  0.39880381],
       [-1.66677857, -0.14271892, -1.36189934, -1.28719604],
       [-0.14887164, -0.60559109,  0.21060299,  0.13941922],
       [-0.14887164, -1.06846325, -0.12636179, -0.24965767],
       [ 0.31817664, -0.60559109,  0.15444219,  0.13941922],
       [ 0.66846286, -0.83702717,  0.88453256,  0.91757299],
       [ 0.0846525 , -0.14271892,  0.77221097,  0.7878807 ],
       [-0.49915786, -0.14271892,  0.43524618,  0.39880381],
       [-0.26563371, -0.60559109,  0.65988937,  1.04726529],
       [ 2.18636979,  1.

In [7]:
# Inspect the test features, scaled with the scaler fitted on the training set.
X_test

array([[-0.14887164, -0.374155  ,  0.26676379,  0.13941922],
       [ 0.31817664, -0.60559109,  0.54756778,  0.00972692],
       [ 0.31817664, -1.06846325,  1.05301496,  0.26911151],
       [-1.5500165 , -1.7627715 , -1.36189934, -1.15750374],
       [ 0.0846525 ,  0.32015325,  0.60372857,  0.7878807 ],
       [ 0.78522493, -0.14271892,  0.99685416,  0.7878807 ],
       [-0.84944407,  1.70876975, -1.24957775, -1.15750374],
       [ 0.20141457, -0.14271892,  0.60372857,  0.7878807 ],
       [-0.38239578,  2.63451409, -1.30573855, -1.28719604],
       [-0.38239578, -1.29989934,  0.15444219,  0.13941922],
       [ 0.66846286,  0.08871717,  0.99685416,  0.7878807 ],
       [-0.38239578,  1.0144615 , -1.36189934, -1.28719604],
       [-0.49915786,  0.78302542, -1.13725615, -1.28719604],
       [ 0.43493872, -0.60559109,  0.60372857,  0.7878807 ],
       [ 0.55170079, -1.7627715 ,  0.37908538,  0.13941922],
       [ 0.55170079,  0.55158933,  0.54756778,  0.52849611],
       [-1.19973028,  0.

### One-Against-Rest

In [21]:
class OneAgainstRestSVM:
    """Multiclass SVM built from one binary ``SVC`` per class (one-against-rest).

    ``fit`` one-hot encodes the labels and trains one binary ``SVC`` per
    encoded column (that class against all the others). ``predict`` assigns
    each sample to the class whose binary classifier returns the largest
    ``decision_function`` value.

    Parameters
    ----------
    C : float, default=1.0
        Forwarded to every underlying ``SVC`` as ``C``.
    kernel : str, default='rbf'
        Forwarded to every underlying ``SVC`` as ``kernel``.
    gamma : str or float, default='scale'
        Forwarded to every underlying ``SVC`` as ``gamma``.

    Attributes
    ----------
    svms : list
        Binary classifiers from the most recent ``fit`` call, in the same
        order as the columns of ``y_encoded``.
    y_encoded : pandas.DataFrame or None
        One-hot encoding of the labels passed to the most recent ``fit``.
    num_classes : int
        Number of columns of ``y_encoded`` (0 before fitting).
    y_pred : pandas.DataFrame or list
        Result of the most recent ``predict`` call (empty list before that).
    """

    def __init__(self, C=1.0, kernel='rbf', gamma='scale'):
        self.svms = []
        self.y_pred = []
        self.num_classes = 0
        self.y_encoded = None
        self.C = C
        self.kernel = kernel
        self.gamma = gamma

    # get_params, set_params and __sklearn_clone__ exist for scikit-learn
    # compatibility (e.g. so the estimator can be used with GridSearchCV).
    # Reference: https://scikit-learn.org/stable/developers/develop.html#rolling-your-own-estimator

    def get_params(self, deep=True):
        """Return the hyper-parameters as a dict (``deep`` is accepted but unused)."""
        return {"C": self.C, "kernel": self.kernel, "gamma": self.gamma}

    def set_params(self, **parameters):
        """Set each given keyword as an attribute on the estimator and return ``self``.

        No validation is performed on the parameter names.
        """
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def __sklearn_clone__(self):
        """Return a new, unfitted estimator with the same hyper-parameters.

        Returning ``self`` here would make every "clone" share one mutable
        object, so fitted state would leak between cross-validation folds.
        """
        return type(self)(**self.get_params())

    def fit(self, X_train, y_train):
        """Train one binary ``SVC`` per class and return ``self``.

        Any state left over from a previous ``fit``/``predict`` is discarded
        first, so refitting never mixes old and new classifiers.

        Parameters
        ----------
        X_train : array-like
            Training features, passed unchanged to ``SVC.fit``.
        y_train : array-like
            Class labels; one-hot encoded with ``pandas.get_dummies``.

        Returns
        -------
        OneAgainstRestSVM
            The fitted estimator itself.
        """
        y_encoded = pd.get_dummies(y_train)    # one-hot encoding
        num_classes = y_encoded.shape[1]

        # Build the new models in a local list so a failure part-way through
        # does not leave the estimator half old, half new.
        svms = []
        for i in range(num_classes):
            model = SVC(C=self.C, kernel=self.kernel, gamma=self.gamma)
            model.fit(X_train, y_encoded.iloc[:, i])
            svms.append(model)

        self.y_encoded = y_encoded
        self.num_classes = num_classes
        self.svms = svms
        self.y_pred = []
        return self

    def predict(self, X_test):
        """Predict a class label for every row of ``X_test``.

        The decision function of each binary classifier is evaluated once on
        the whole input; each sample gets the label of the classifier with the
        highest score (ties go to the first such class).

        Parameters
        ----------
        X_test : array-like
            Samples to classify, passed unchanged to ``SVC.decision_function``.

        Returns
        -------
        pandas.DataFrame
            Single-column frame (column ``0``) holding the predicted labels in
            their original form (e.g. ``'setosa'``). Also stored in ``y_pred``.

        Raises
        ------
        ValueError
            If the estimator has not been fitted.
        """
        if not self.svms:
            raise ValueError(
                "OneAgainstRestSVM is not fitted yet; call fit() before predict()."
            )

        # One column of scores per class: shape (n_samples, num_classes).
        scores = np.column_stack(
            [model.decision_function(X_test) for model in self.svms]
        )
        winners = np.argmax(scores, axis=1)

        # Map the winning column positions back to the original labels
        # (for iris: 0 -> 'setosa', 1 -> 'versicolor', 2 -> 'virginica').
        labels = np.asarray(self.y_encoded.columns)[winners]
        self.y_pred = pd.DataFrame(labels)
        return self.y_pred

### GridSearchCV

#### Linear SVM

In [20]:
# Linear kernel only; C is searched over powers of two from 2**-5 to 2**15.
param_grid = dict(
    kernel=['linear'],
    C=[2**exponent for exponent in range(-5, 16)],
)
svm = OneAgainstRestSVM()

# Stratified K-fold is used so that samples of one label do not pile up in a single fold.
skf = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(
    svm,
    param_grid,
    cv=skf,
    n_jobs=-1,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

In [22]:
# Report the best hyper-parameters and their cross-validated score.
for caption, value in (
    ('best parameter : ', grid_search.best_params_),
    ('best score : ', grid_search.best_score_),
):
    print(caption, value)

best parameter :  {'C': 4, 'kernel': 'linear'}
best score :  0.9666666666666666


In [11]:
# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the whole training set, so best_estimator_ can be used
# for prediction directly; fitting it again is redundant.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Accuracy on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)

accuracy :  0.8333333333333334


- linear SVM을 사용하면 test data에 대해 약 83.3%의 accuracy를 얻는다.

#### Non-Linear SVM

In [17]:
# Non-linear kernels; C and gamma are searched over small ranges of powers of two.
param_grid = dict(
    kernel=['poly', 'rbf', 'sigmoid'],
    C=[2**exponent for exponent in range(-1, 3)],
    gamma=[2**exponent for exponent in range(-2, 4)],
)
svm = OneAgainstRestSVM()

skf = StratifiedKFold(n_splits=10)
grid_search = GridSearchCV(
    svm,
    param_grid,
    cv=skf,
    n_jobs=-1,
    scoring='accuracy',
)
grid_search.fit(X_train, y_train)

- 더 넓은 범위의 C 값과 gamma 값에 대해 grid search를 수행하려 했으나, 메모리 문제로 추정되는 원인 때문에 완료하지 못했다.

In [18]:
# Report the best hyper-parameters and their cross-validated score.
for caption, value in (
    ('best parameter : ', grid_search.best_params_),
    ('best score : ', grid_search.best_score_),
):
    print(caption, value)

best parameter :  {'C': 1, 'gamma': 0.25, 'kernel': 'rbf'}
best score :  0.975


### 최적 모델로 예측

In [14]:
# GridSearchCV (refit=True by default) has already refitted the best
# configuration on the whole training set, so best_estimator_ can be used
# for prediction directly; fitting it again is redundant.
best_model = grid_search.best_estimator_

y_pred = best_model.predict(X_test)

# Accuracy on the held-out test set.
accuracy = accuracy_score(y_test, y_pred)
print('accuracy : ', accuracy)

accuracy :  0.9666666666666667


kernel은 rbf, C=1, gamma=0.25 인 non-linear SVM 모델을 사용하여 test data에 대해 분류를 수행한 결과 약 96.7%의 accuracy를 얻었다.