In [72]:
from sklearn import svm
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

In [4]:
iris_df = load_iris()

In [5]:
iris_df.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [6]:
X_train, X_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size = 0.3, random_state = 0)

Pipelines Creation
1. Data Preprocessing by using Standard Scaler
2. Reduce Dimension using PCA (Prinicpal Component Analysis)
3. Apply  Classifier

In [32]:
pipeline_lr = Pipeline([('scalar1', StandardScaler()),
                       ('pca1', PCA(n_components = 2)),
                       ('lr_classifier', LogisticRegression(random_state = 0))])

In [33]:
pipeline_dt = Pipeline([('scalar2', StandardScaler()),
                       ('pca2', PCA(n_components = 2)),
                       ('dt_classifier', DecisionTreeClassifier())])

In [34]:
pipeline_randomforest = Pipeline([('scalar3', StandardScaler()),
                       ('pca3', PCA(n_components = 2)),
                       ('rf_classifier', RandomForestClassifier())])

In [35]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [56]:
pipelines

[Pipeline(steps=[('scalar1', StandardScaler()), ('pca1', PCA(n_components=2)),
                 ('lr_classifier', LogisticRegression(random_state=0))]),
 Pipeline(steps=[('scalar2', StandardScaler()), ('pca2', PCA(n_components=2)),
                 ('dt_classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('scalar3', StandardScaler()), ('pca3', PCA(n_components=2)),
                 ('rf_classifier', RandomForestClassifier())])]

In [36]:
best_accuracy = 0.0
best_classifier = 0
best_pipline = ""

In [44]:
pipe_dic = {0: "Logistic Regression", 1: "Decision Tree", 2: "RandomForest"}

In [38]:
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [46]:
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dic[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.8666666666666667
Decision Tree Test Accuracy: 0.9111111111111111
RandomForest Test Accuracy: 0.9111111111111111


In [41]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i
print("Classifier with best accuracy: {}".format(pipe_dic[best_classifier]))

Classifier with best accuracy: Decision Tree


# Pipelines Perform Hyperparameter Tuning Using Grid SearchCV

In [42]:
from sklearn.model_selection import GridSearchCV

In [49]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])

# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
    {"classifier": [LogisticRegression()],
    "classifier__penalty": ["l2", "l2"],
    "classifier__C": np.logspace(0, 4, 10)
    },
    {"classifier": [LogisticRegression()],
    "classifier__penalty": ["l2"],
    "classifier__C": np.logspace(0, 4, 10),
    "classifier__solver":["newton-cg", "saga", "sag", "liblinear"]
    },
    {"classifier": [RandomForestClassifier()],
    "classifier__n_estimators": [10, 100, 1000],
    "classifier__max_depth": [5, 8, 15, 25, 30, None],
    "classifier__min_samples_leaf": [1, 2, 5, 10, 15, 100],
    "classifier__max_leaf_nodes": [2, 5, 10]}
]

gridsearch = GridSearchCV(pipe, grid_param, cv = 5, verbose = 0, n_jobs = -1)
best_model = gridsearch.fit(X_train, y_train)



In [50]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

Pipeline(steps=[('classifier', LogisticRegression(solver='saga'))])
The mean accuracy of the model is: 0.9555555555555556


# Make Pipelines in Sklearn

In [51]:
from sklearn.pipeline import make_pipeline

In [54]:
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10, 100, 1000],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,100],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

In [55]:
best_model.score(X_test,y_test)

0.9555555555555556

In [62]:
kernals = ['rdf', 'linear']
c = [1, 10, 20]
avg_scores = {}

for kval in kernals:
    for cval in c:
        cv_scores = cross_val_score(svm.SVC(kernel = kval, C = cval, gamma = "auto"), iris_df.data, iris_df.target, cv = 5)
        avg_scores[kval + "_" + str(cval)] = np.average(cv_scores)
        
avg_scores

5 fits failed out of a total of 5.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Project\Machine Learning\machinelearning\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Project\Machine Learning\machinelearning\lib\site-packages\sklearn\svm\_base.py", line 255, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "D:\Project\Machine Learning\machinelearning\lib\site-packages\sklearn\svm\_base.py", line 315, in _dense_fit
    ) = libsvm.fit(
  File "sklearn\svm\_libsvm.pyx", line 173, in sklearn.svm._libsvm.fit
ValueError: 'r

{'rdf_1': nan,
 'rdf_10': nan,
 'rdf_20': nan,
 'linear_1': 0.9800000000000001,
 'linear_10': 0.9733333333333334,
 'linear_20': 0.9666666666666666}

In [63]:
# classifier
clf = GridSearchCV(svm.SVC(gamma = "auto"),{
    # parameter grid
    "C" : [1, 10, 20],
    "kernel" : ['rbf', 'linear']
}, cv = 5, return_train_score = False)

clf.fit(iris_df.data, iris_df.target)
clf.cv_results_

{'mean_fit_time': array([0.00099216, 0.00060315, 0.00079808, 0.0001996 , 0.00060239,
        0.00060306]),
 'std_fit_time': array([6.92580324e-06, 4.92546112e-04, 3.99041557e-04, 3.99208069e-04,
        4.91902508e-04, 4.92454106e-04]),
 'mean_score_time': array([0.00039973, 0.00039945, 0.00059395, 0.00039859, 0.00058966,
        0.00039887]),
 'std_score_time': array([0.00048957, 0.00048922, 0.00048502, 0.00048817, 0.00048151,
        0.00048852]),
 'param_C': masked_array(data=[1, 1, 10, 10, 20, 20],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_kernel': masked_array(data=['rbf', 'linear', 'rbf', 'linear', 'rbf', 'linear'],
              mask=[False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 1, 'kernel': 'rbf'},
  {'C': 1, 'kernel': 'linear'},
  {'C': 10, 'kernel': 'rbf'},
  {'C': 10, 'kernel': 'linear'},
  {'C': 20, 'kernel': 'rbf'},
  {'C': 20

In [66]:
df = pd.DataFrame(clf.cv_results_)
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000992,7e-06,0.0004,0.00049,1,rbf,"{'C': 1, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
1,0.000603,0.000493,0.000399,0.000489,1,linear,"{'C': 1, 'kernel': 'linear'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
2,0.000798,0.000399,0.000594,0.000485,10,rbf,"{'C': 10, 'kernel': 'rbf'}",0.966667,1.0,0.966667,0.966667,1.0,0.98,0.01633,1
3,0.0002,0.000399,0.000399,0.000488,10,linear,"{'C': 10, 'kernel': 'linear'}",1.0,1.0,0.9,0.966667,1.0,0.973333,0.038873,4
4,0.000602,0.000492,0.00059,0.000482,20,rbf,"{'C': 20, 'kernel': 'rbf'}",0.966667,1.0,0.9,0.966667,1.0,0.966667,0.036515,5
5,0.000603,0.000492,0.000399,0.000489,20,linear,"{'C': 20, 'kernel': 'linear'}",1.0,1.0,0.9,0.933333,1.0,0.966667,0.042164,6


In [67]:
df[["param_C", "param_kernel", "mean_test_score"]]

Unnamed: 0,param_C,param_kernel,mean_test_score
0,1,rbf,0.98
1,1,linear,0.98
2,10,rbf,0.98
3,10,linear,0.973333
4,20,rbf,0.966667
5,20,linear,0.966667


In [68]:
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_pairwise',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 'classes_',
 'cv',
 'cv_results_',
 'decision_function',
 'error_score',
 'estimator',
 'fit',
 'get_params',
 'inverse_transform',
 'multimetric_',
 'n_features_

In [69]:
clf.best_score_

0.9800000000000001

In [71]:
clf.best_params_

{'C': 1, 'kernel': 'rbf'}

In [74]:
rs = RandomizedSearchCV(svm.SVC(gamma = "auto"), {
    "C": [1, 10, 20],
    "kernel": ["rdf", "linear"]
}, cv = 5, return_train_score = False, n_iter = 2)

rs.fit(iris_df.data, iris_df.target)
pd.DataFrame(rs.cv_results_)[["param_C", "param_kernel", "mean_test_score"]]

5 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "D:\Project\Machine Learning\machinelearning\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Project\Machine Learning\machinelearning\lib\site-packages\sklearn\svm\_base.py", line 255, in fit
    fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
  File "D:\Project\Machine Learning\machinelearning\lib\site-packages\sklearn\svm\_base.py", line 315, in _dense_fit
    ) = libsvm.fit(
  File "sklearn\svm\_libsvm.pyx", line 173, in sklearn.svm._libsvm.fit
ValueError: '

Unnamed: 0,param_C,param_kernel,mean_test_score
0,20,linear,0.966667
1,20,rdf,
