In [24]:
# how to create pipelines and libraries indeed

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

from sklearn.ensemble import RandomForestClassifier
# from sklearn.externals import joblib
from sklearn.decomposition import pca
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [25]:
iris = load_iris()

In [26]:
iris.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [27]:
# now we gonna split the data into train and test

x_train,x_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.30,random_state =0)

# Pipeline Creation 
* 1 data preprocessing with Standard scaler
* 2 reduce dimensions using pca
* 3 Apply classifier

In [29]:
from sklearn.decomposition import PCA

In [30]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])

In [31]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [32]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

now make a list of lists (piplines)


In [34]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [35]:
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""

now we will create DICTIONARY OF PIPELINES AND CLASSIFIER TYPES FOR EASE OF REFERENCE

In [37]:
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

# fit the pipelines

for pipe in pipelines:
    pipe.fit(x_train,y_train)
        
    

In [38]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {} ".format(pipe_dict[i],model.score(x_test,y_test)))

Logistic Regression Test Accuracy: 0.8666666666666667 
Decision Tree Test Accuracy: 0.9111111111111111 
RandomForest Test Accuracy: 0.9111111111111111 


In [39]:
for i,model in enumerate(pipelines):
    if model.score(x_test,y_test)> best_accuracy:
        best_accuracy=model.score(x_test,y_test)
        best_pipeline=model
        best_classifier=1
print("Classifier with best accuracy: {} ".format(pipe_dict[best_classifier]))

Classifier with best accuracy: Decision Tree 


# HYPER USE OF PIPELINE

In [41]:
x_train,x_test,y_train,y_test = train_test_split(iris.data,iris.target,test_size=0.30,random_state =0)

In [42]:
from sklearn.model_selection import GridSearchCV
import numpy as np

In [43]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(x_train,y_train)

In [44]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(x_test,y_test))

Pipeline(steps=[('classifier', LogisticRegression(solver='saga'))])
The mean accuracy of the model is: 0.9555555555555556


In [23]:
################################################ MAKE PIPELINES ###########################################################

In [45]:
from sklearn.pipeline import make_pipeline

In [46]:
# create the pipelines

pipe = make_pipeline((RandomForestClassifier()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10, 100, 1000],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(x_train,y_train)

In [47]:
best_model.score(x_test,y_test)


0.9555555555555556