In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# Load the iris dataset and hold out 30% of the samples as a test split.
# random_state pins the shuffle so the split is identical on every run.
iris_df = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris_df.data,
    iris_df.target,
    test_size=0.3,
    random_state=0,
)

# Baseline pipeline: standardize the features, project them onto two
# principal components, then classify with logistic regression.
pipeline_lr = Pipeline(
    steps=[
        ('scalar1', StandardScaler()),
        ('pca1', PCA(n_components=2)),
        ('lr_classifier', LogisticRegression(random_state=0)),
    ]
)

# Fit on the training split only; the test split is used solely for scoring.
model = pipeline_lr.fit(X_train, y_train)

# Last expression of the cell: the score on the held-out split is displayed.
model.score(X_test, y_test)


0.8666666666666667

# Stacking Multiple Pipelines to Find the Model with the Best Accuracy

In [4]:

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
def _scaled_pca_pipeline(index, classifier_name, classifier):
    """Build a StandardScaler -> PCA(2 components) -> classifier pipeline.

    Parameters
    ----------
    index : int
        Suffix appended to the scaler and PCA step names
        ('scalar<index>', 'pca<index>').
    classifier_name : str
        Name of the final pipeline step.
    classifier : estimator
        Unfitted classifier used as the final step.

    Returns
    -------
    Pipeline
        The unfitted three-step pipeline.
    """
    return Pipeline([
        ('scalar{}'.format(index), StandardScaler()),
        ('pca{}'.format(index), PCA(n_components=2)),
        (classifier_name, classifier),
    ])


# Four candidates that share the same preprocessing and differ only in the
# final classifier. The estimators that accept a seed are seeded so the
# reported accuracies are the same on every Restart & Run All.
pipeline_lr = _scaled_pca_pipeline(1, 'lr_classifier', LogisticRegression(random_state=0))
pipeline_dt = _scaled_pca_pipeline(2, 'dt_classifier', DecisionTreeClassifier(random_state=0))
pipeline_svm = _scaled_pca_pipeline(3, 'clf', svm.SVC())
pipeline_knn = _scaled_pca_pipeline(4, 'knn_classifier', KNeighborsClassifier())

pipelines = [pipeline_lr, pipeline_dt, pipeline_svm, pipeline_knn]
# Display names, keyed by each pipeline's position in `pipelines`.
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'Support Vector Machine', 3: 'K Nearest Neighbor'}

# Fit each candidate on the training split and report its score on the test
# split. The loop variables are deliberately not called `pipe` or `model`, so
# they do not overwrite objects of those names created in other cells.
for candidate_idx, candidate in enumerate(pipelines):
    candidate.fit(X_train, y_train)
    print("{} Test Accuracy:{}".format(pipe_dict[candidate_idx], candidate.score(X_test, y_test)))


Logistic Regression Test Accuracy:0.8666666666666667
Decision Tree Test Accuracy:0.9111111111111111
Support Vector Machine Test Accuracy:0.9333333333333333
K Nearest Neighbor Test Accuracy:0.9111111111111111


# Hyperparameter Tuning in Pipeline (very cpu intensive)

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
# Exhaustive grid search over random-forest hyperparameters, evaluated with
# 5-fold cross-validation on the training split.
#
# make_pipeline names each step after the lower-cased class name, which is why
# every grid key carries the "randomforestclassifier" prefix.
# The forest is seeded so that the selected hyperparameters and the final test
# score are reproducible from run to run.
pipe = make_pipeline(RandomForestClassifier(random_state=0))
grid_param = [
    {
        # The step itself is also listed as a (single-valued) grid entry, so it
        # must carry the same seed as the estimator in the pipeline.
        "randomforestclassifier": [RandomForestClassifier(random_state=0)],
        "randomforestclassifier__n_estimators": [10, 100, 1000],
        "randomforestclassifier__max_depth": [5, 8, 15, 25, 30, None],
        "randomforestclassifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
        "randomforestclassifier__max_leaf_nodes": [2, 5, 10],
    }
]

# 3 * 6 * 6 * 3 = 324 candidates x 5 folds = 1620 fits (some with 1000 trees);
# n_jobs=-1 spreads the work over all available cores.
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0, n_jobs=-1)

# NOTE: fit() returns the GridSearchCV object itself, so `best_model` is the
# fitted search, not a bare estimator. The winning pipeline is available as
# best_model.best_estimator_ and its settings as best_model.best_params_.
best_model = gridsearch.fit(X_train, y_train)

# Last expression of the cell: score of the refitted best candidate on the
# held-out test split.
best_model.score(X_test, y_test)

0.9777777777777777