## Pipeline 사용 목적

In [2]:
from sklearn.pipeline import Pipeline, make_pipeline

In [3]:
import seaborn as sns

In [4]:
iris = sns.load_dataset('iris')

In [5]:
from sklearn.preprocessing import StandardScaler

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(iris.iloc[:,:-1],iris.species)

In [49]:
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [9]:
pipe = Pipeline([('ss', StandardScaler()), ('knn', KNeighborsClassifier())])

In [10]:
vars(pipe)

{'steps': [('ss', StandardScaler()), ('knn', KNeighborsClassifier())],
 'memory': None,
 'verbose': False}

In [11]:
from sklearn.model_selection import GridSearchCV

In [12]:
knn = KNeighborsClassifier()

In [13]:
knn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [14]:
grid = GridSearchCV(knn, {'n_neighbors':range(2,10)})

In [15]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(2, 10)})

In [16]:
import pandas as pd

In [17]:
pd.DataFrame(grid.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7
mean_fit_time,0.00299149,0.00239177,0.00219231,0.00139623,0,0,0.0062479,0.00312424
std_fit_time,0.00154465,0.00048995,0.000400423,0.000797784,0,0,0.00765209,0.00624847
mean_score_time,0.00259337,0.00239577,0.00239325,0.00180845,0.00312471,0.00312462,0,0
std_score_time,0.000488112,0.000486871,0.000488422,0.000980177,0.00624943,0.00624924,0,0
param_n_neighbors,2,3,4,5,6,7,8,9
params,{'n_neighbors': 2},{'n_neighbors': 3},{'n_neighbors': 4},{'n_neighbors': 5},{'n_neighbors': 6},{'n_neighbors': 7},{'n_neighbors': 8},{'n_neighbors': 9}
split0_test_score,0.869565,0.956522,0.956522,1,1,1,0.956522,0.956522
split1_test_score,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522
split2_test_score,0.818182,0.863636,0.818182,0.863636,0.818182,0.954545,0.863636,0.863636
split3_test_score,0.954545,0.954545,0.954545,0.954545,1,1,1,1


In [18]:
grid = GridSearchCV(knn, {'n_neighbors':range(2,10), 'p': [2,3,4]})

In [19]:
grid.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(2, 10), 'p': [2, 3, 4]})

In [20]:
pd.DataFrame(grid.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
mean_fit_time,0.00271273,0.00219364,0.00219498,0.00159569,0.00352478,0.00312347,0.00312428,0.00312419,0,0.00312409,...,0,0,0.00312452,0.00624847,0.00312424,0,0,0.00312657,0.00312257,0
std_fit_time,0.00075284,0.000976854,0.000398352,0.000488675,0.00610104,0.00624695,0.00624857,0.00624838,0,0.00624819,...,0,0,0.00624905,0.00765279,0.00624847,0,0,0.00625315,0.00624514,0
mean_score_time,0.00337582,0.00239391,0.00259209,0.00219421,0.000211811,0,0,0.00312428,0.00312448,0,...,0.00313234,0.00312395,0,0,0,0,0.00624843,0,0,0.0031846
std_score_time,0.000500437,0.000488364,0.000487841,0.000399089,0.000423622,0,0,0.00624857,0.00624895,0,...,0.00626469,0.0062479,0,0,0,0,0.00765273,0,0,0.00636921
param_n_neighbors,2,2,2,3,3,3,4,4,4,5,...,6,7,7,7,8,8,8,9,9,9
param_p,2,3,4,2,3,4,2,3,4,2,...,4,2,3,4,2,3,4,2,3,4
params,"{'n_neighbors': 2, 'p': 2}","{'n_neighbors': 2, 'p': 3}","{'n_neighbors': 2, 'p': 4}","{'n_neighbors': 3, 'p': 2}","{'n_neighbors': 3, 'p': 3}","{'n_neighbors': 3, 'p': 4}","{'n_neighbors': 4, 'p': 2}","{'n_neighbors': 4, 'p': 3}","{'n_neighbors': 4, 'p': 4}","{'n_neighbors': 5, 'p': 2}",...,"{'n_neighbors': 6, 'p': 4}","{'n_neighbors': 7, 'p': 2}","{'n_neighbors': 7, 'p': 3}","{'n_neighbors': 7, 'p': 4}","{'n_neighbors': 8, 'p': 2}","{'n_neighbors': 8, 'p': 3}","{'n_neighbors': 8, 'p': 4}","{'n_neighbors': 9, 'p': 2}","{'n_neighbors': 9, 'p': 3}","{'n_neighbors': 9, 'p': 4}"
split0_test_score,0.869565,0.869565,0.869565,0.956522,0.956522,0.956522,0.956522,0.913043,0.913043,1,...,0.956522,1,0.956522,0.956522,0.956522,0.956522,0.913043,0.956522,0.956522,0.913043
split1_test_score,0.956522,0.956522,0.956522,0.956522,0.913043,0.913043,0.956522,0.956522,0.956522,0.956522,...,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522,0.956522
split2_test_score,0.818182,0.818182,0.818182,0.863636,0.863636,0.863636,0.818182,0.818182,0.818182,0.863636,...,0.863636,0.954545,0.954545,0.909091,0.863636,0.863636,0.863636,0.863636,0.909091,0.909091


In [21]:
grid = GridSearchCV(knn, {'n_neibhbors':range(2,10), 'leaf size': [20,30]})

In [22]:
pipe3 = Pipeline([('ss', StandardScaler()),('clf', KNeighborsClassifier())])

In [23]:
from sklearn.linear_model import LogisticRegression

In [24]:
grid2 = GridSearchCV(pipe3, [
    {'clf':[KNeighborsClassifier()], 'clf__n_neighbors':[2,3,4,5]},
    {'clf':[LogisticRegression()], 'clf__C':[1,2,3,4,5]}
])

In [25]:
grid2.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('ss', StandardScaler()),
                                       ('clf', KNeighborsClassifier())]),
             param_grid=[{'clf': [KNeighborsClassifier()],
                          'clf__n_neighbors': [2, 3, 4, 5]},
                         {'clf': [LogisticRegression(C=2)],
                          'clf__C': [1, 2, 3, 4, 5]}])

In [26]:
from sklearn.compose import ColumnTransformer

In [75]:
mpg = sns.load_dataset('mpg')

In [76]:
mpg.drop(columns='name', inplace = True)

In [77]:
mpg

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin
0,18.0,8,307.0,130.0,3504,12.0,70,usa
1,15.0,8,350.0,165.0,3693,11.5,70,usa
2,18.0,8,318.0,150.0,3436,11.0,70,usa
3,16.0,8,304.0,150.0,3433,12.0,70,usa
4,17.0,8,302.0,140.0,3449,10.5,70,usa
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82,usa
394,44.0,4,97.0,52.0,2130,24.6,82,europe
395,32.0,4,135.0,84.0,2295,11.6,82,usa
396,28.0,4,120.0,79.0,2625,18.6,82,usa


In [78]:
X_train, X_test, y_train, y_test = train_test_split(mpg.iloc[:,1:],mpg['mpg'])

In [30]:
from sklearn.impute import SimpleImputer

In [34]:
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder

In [79]:
pipe1 = Pipeline([('imputer', SimpleImputer())])
pipe2 = Pipeline([('la', LabelEncoder())])
pipe3 = Pipeline([('ss', StandardScaler())])

In [80]:
ct = ColumnTransformer([
    ('im', pipe1, 'horsepower'),
    ('l', pipe2, 'origin'),
    ('s', pipe3, ['cylinders','displacement'])
])

In [81]:
od = OrdinalEncoder()

In [82]:
od.fit_transform(mpg[['origin']])

array([[2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [0.],
       [0.],
       [0.],
       [1.],
       [1.],
       [0.],
       [2.],
       [1.],
       [2.],
       [0.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [2.],
       [2.],
       [2.],
       [0.],

In [38]:
le = LabelEncoder()

In [39]:
import numpy as np

In [41]:
xx = np.arange(12).reshape(3,4)

In [42]:
le.fit_transform(mpg[['origin']])

  return f(**kwargs)


array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 0, 0, 0,
       0, 0, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 0, 2, 1, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 0, 0, 0, 2, 1, 1, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2,
       1, 1, 2, 2, 0, 2, 2, 0, 0, 0, 0, 2, 0, 1, 2, 2, 2, 2, 2, 1, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 2, 0, 0, 1, 1, 0, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 0, 1, 2, 0,
       2, 0, 0, 0, 0, 1, 0, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0,
       1, 2, 2, 2, 2, 0, 1, 1, 2, 0, 2, 0, 1, 0, 2, 2, 2, 2, 1, 2, 0, 2,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 2, 2, 1, 0, 1,
       0, 1, 0, 2, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 2, 1, 2, 2, 1, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 0, 1, 2, 2, 0, 2, 0, 2,

In [46]:
le.fit(mpg.origin)

LabelEncoder()

In [43]:
import inspect

In [44]:
print(inspect.getsource(LabelEncoder.fit_transform))

    def fit_transform(self, y):
        """Fit label encoder and return encoded labels

        Parameters
        ----------
        y : array-like of shape [n_samples]
            Target values.

        Returns
        -------
        y : array-like of shape [n_samples]
        """
        y = column_or_1d(y, warn=True)
        self.classes_, y = _encode(y, encode=True)
        return y



In [71]:
class ModifiedLabelEncoder(LabelEncoder):
    def fit_transform(self, y, *arg, **kwarg):
        from sklearn.utils import column_or_1d
        y = column_or_1d(y)
        return super().fit_transform(y).reshape(-1,1)
    def transform(self, y, *arg, **kwarg):
        from sklearn.utils import column_or_1d
        y = column_or_1d(y)
        return super().transform(y).reshape(-1,1)

In [84]:
pipe1 = Pipeline([('imputer', SimpleImputer(strategy = 'median'))])
pipe2 = Pipeline([('la', ModifiedLabelEncoder())])
pipe3 = Pipeline([('ss', StandardScaler())])

In [85]:
ct = ColumnTransformer([
    ('im', pipe1, ['horsepower']),
    ('l', pipe2, ['origin']),
    ('s', pipe3, ['cylinders','displacement'])
])

In [86]:
ct.fit_transform(X_train, y_train)

array([[193.        ,   2.        ,   1.54080172,   1.08335997],
       [170.        ,   2.        ,   1.54080172,   2.01227429],
       [ 88.        ,   2.        ,  -0.84134569,  -0.77446868],
       ...,
       [ 70.        ,   2.        ,  -0.84134569,  -0.84220201],
       [102.        ,   0.        ,  -0.84134569,  -0.60029724],
       [ 90.        ,   0.        ,  -0.84134569,  -0.90993535]])

In [87]:
pipe = Pipeline([('ct',ct), ('knn', KNeighborsRegressor())])

In [88]:
pipe

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('im',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['horsepower']),
                                                 ('l',
                                                  Pipeline(steps=[('la',
                                                                   ModifiedLabelEncoder())]),
                                                  ['origin']),
                                                 ('s',
                                                  Pipeline(steps=[('ss',
                                                                   StandardScaler())]),
                                                  ['cylinders',
                                                   'displacement'])])),
                ('knn', KNeighborsR

In [89]:
X_train, X_test, y_train, y_test = train_test_split(mpg.iloc[:,1:],mpg['mpg'])

In [90]:
grid3 = GridSearchCV(pipe, {'ct__im__imputer__strategy':['median','mean']})

In [91]:
grid3.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('ct',
                                        ColumnTransformer(transformers=[('im',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median'))]),
                                                                         ['horsepower']),
                                                                        ('l',
                                                                         Pipeline(steps=[('la',
                                                                                          ModifiedLabelEncoder())]),
                                                                         ['origin']),
                                                                        ('s',
                                                                         Pipeline(steps=[('ss',
      

In [92]:
from sklearn.ensemble import VotingClassifier

In [93]:
from sklearn.naive_bayes import GaussianNB

In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [100]:
vc = VotingClassifier([('GB', GaussianNB()), ('RF', RandomForestClassifier()), ('KNN', KNeighborsClassifier())])

In [96]:
import seaborn as sns

In [98]:
iris = sns.load_dataset('iris')

In [101]:
vc.fit(iris.iloc[:,:-1], iris.species)

VotingClassifier(estimators=[('GB', GaussianNB()),
                             ('RF', RandomForestClassifier()),
                             ('KNN', KNeighborsClassifier())])

In [102]:
from sklearn.ensemble import BaggingClassifier

In [103]:
bc = BaggingClassifier()

In [104]:
bc.fit(iris.iloc[:,:-1], iris.species)

BaggingClassifier()

In [105]:
from sklearn.ensemble import AdaBoostClassifier

In [106]:
ada = AdaBoostClassifier()

In [107]:
ada.fit(iris.iloc[:,:-1], iris.species)

AdaBoostClassifier()

In [108]:
from sklearn.ensemble import StackingClassifier

In [109]:
sc = StackingClassifier([('GB', GaussianNB()), ('RF',RandomForestClassifier(), ('Knn', KNeighborsClassifier()))])

In [110]:
sc.fit(iris.iloc[:,:-1],iris.species)

StackingClassifier(estimators=[('GB', GaussianNB()),
                               ('RF', RandomForestClassifier(),
                                ('Knn', KNeighborsClassifier()))])