In [1]:
import pandas as pd
import os 

path = r'D:\elice_python\GAS_5\pytest_machine'

os.chdir(path)

In [2]:
data = pd.read_csv('cancer.csv')
print(data.shape)

(569, 31)


In [3]:
X = data.iloc[:,:-1]
y=data.iloc[:,-1]
print(X.shape, y.shape)

(569, 30) (569,)


In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

### 기존 방법

In [5]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = SVC()
model.fit(X_train_scaled, y_train)
predictions = model.predict(X_test_scaled)

print(predictions)
print(model.score(X_test_scaled,y_test))

[1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1.
 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
0.9790209790209791


### 파이프라인

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

pipe = Pipeline([
            ('scaler',MinMaxScaler()),
            ('model',SVC()),
            ])

pipe.fit(X_train, y_train)

print(pipe.predict(X_test))
print(pipe.score(X_test, y_test))

[1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1.
 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
0.9790209790209791


## Pipeline 결과에 그리드 서치 적용

### 기존 방법

In [7]:
from sklearn.model_selection import GridSearchCV
param_grid = {'C':[0.1,1,3,5,10], 'gamma':[0.1,1,3,5,10]}
grid_search = GridSearchCV(SVC(), param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.score(X_test_scaled, y_test))

{'C': 3, 'gamma': 0.1}
0.9788782489740082
0.958041958041958


### Pipline 결과에 그리드 서치 적용

In [9]:
pipe.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('scaler', MinMaxScaler()), ('model', SVC())])>

In [12]:
from sklearn.model_selection import GridSearchCV

param_grid = {'model__C':[0.1,1,3,5,10], 'model__gamma':[0.1,1,3,5,10]} 

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)
print(grid_search.score(X_test,y_test))

{'model__C': 3, 'model__gamma': 0.1}
0.9765253077975377
Pipeline(steps=[('scaler', MinMaxScaler()), ('model', SVC(C=3, gamma=0.1))])
0.958041958041958


- `{modelname}__C`에서 '_'가 두 개 임에 유의!

### 더욱 간단한 Pipeline

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

pipe = make_pipeline(MinMaxScaler(), SVC())
pipe.fit(X_train, y_train)

print(pipe.predict(X_test))
print(pipe.score(X_test, y_test))

[1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 0. 1. 1. 1. 0. 1.
 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 1. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 0. 0. 1. 0. 0.
 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
0.9790209790209791


In [14]:
pipe.get_params

<bound method Pipeline.get_params of Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('svc', SVC())])>

In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = {'svc__C':[0.1,1,3,5,10], 'svc__gamma':[0.1,1,3,5,10]} 

grid_search = GridSearchCV(pipe, param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print(grid_search.best_score_)
print(grid_search.best_estimator_)
print(grid_search.score(X_test,y_test))

{'svc__C': 3, 'svc__gamma': 0.1}
0.9765253077975377
Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=3, gamma=0.1))])
0.958041958041958
