## Pipeline
- 파이프라인은 여러 변환 단계를 정확한 순서대로 실행할 수 있도록 하는 것

- Pipeline은 연속된 단계를 나타내는 이름/추정기 쌍의 목록을 입력으로 받음
- 마지막 단계에는 변환기와 추정기를 모두 사용할 수 있고 그 외에는 모두 변환기여야 함

- 파이프라인의 fit() 메서드를 호출하면 모든 변환기의 fit_transform() 메서드를 순서대로 호출하면서 한 단계의 출력을 다음 단계의 입력으로 전달합니다. 마지막 단계에서는 fit()메서드만 호출

- **'name' 을 지정해주었을 때 name__해당피쳐옵션 (ex -> clf__random_state=13) 세부 설정 가능**

## Method

- make_pipeline() 을 이용하여 pipeline을 만듬
- pipe.fit를 이용하여 최종 진행 -> ex) fit(x_train, y_train)
- pipe.steps[0] 등을 이용하여 pipeline의 각 과정을 따로따로 확인 가능
- pipe.set_params(clf__max_depth=2) 등을 이용하여 pipeline의 세부 변수 파라미터 설정 가능

- 테스트셋 평가할때는 pipe.score(x_test, y_test)함수

## 예제1

In [6]:
import pandas as pd

# 1. UCI 서버서 데이터 로딩 - 유방암 셋
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)

print(df.shape)
df.head()

# 1번 열은 종속변수 (악성B인지 종양M인지)
# 2번열부터 쭉 뒤로는 수치들

(569, 32)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [9]:
# 2. LabelEncoder
from sklearn.preprocessing import LabelEncoder

X = df.loc[:, 2:].values
y = df.loc[:, 1].values # B는 악성 , M은 종양

le = LabelEncoder() # le.classes_ => B, M 
y = le.fit_transform(y)

from sklearn.model_selection import train_test_split
X_train, X_test ,y_train, y_test = train_test_split(X, y, test_size=0.20, stratify = y)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(455, 30) (114, 30) (455,) (114,)


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

pipe_lr = make_pipeline(StandardScaler(), # StandardScaler().fit_transform(X)
                        PCA(n_components=2),
                        LogisticRegression())

pipe_lr

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('pca',
                 PCA(copy=True, iterated_power='auto', n_components=2,
                     random_state=None, svd_solver='auto', tol=0.0,
                     whiten=False)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='lbfgs', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [19]:
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)

# 테스트 정확도
print('테스트 정확도: %.3f' %pipe_lr.score(X_test, y_test))

테스트 정확도: 0.974


## 예제2

In [36]:
import pandas as pd

red_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
red_wine = pd.read_csv(red_url, sep=';')

white_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv'
white_wine = pd.read_csv(white_url, sep=';')

red_wine['color'] = 1
white_wine['color'] = 0

wine = pd.concat([red_wine, white_wine])
print(wine.shape)
wine.head()

(6497, 13)


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,1


In [38]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler

# StandardScaler + DecisionTree
estimators = [('scaler', StandardScaler()),
             ('clf', DecisionTreeClassifier())]

pipe = Pipeline(estimators)

# 각 과정을 따로따로 확인 가능
print(pipe.steps[0])

# clf 에 대한 세부 변수 파라미터 설정
pipe.set_params(clf__max_depth=2)
pipe.set_params(clf__random_state=13)

('scaler', StandardScaler(copy=True, with_mean=True, with_std=True))


Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('clf',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=2,
                                        max_features=None, max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=1, min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        presort='deprecated', random_state=13,
                                        splitter='best'))],
         verbose=False)

In [47]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=13,
                                                    stratify=y)

pipe.fit(X_train, y_train)
y_pr = pipe.predict(X_test)

# 테스트셋 평가할때는 pipe.score함수
print('테스트 정확도: %.3f' %pipe.score(X_test, y_test))

테스트 정확도: 0.912
