# Pipeline part 1

In [1]:
# import the required libs

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import tree

## Iris Dataset

1. Data Transformation (includes data preprocessing, feature engineering, and data scaling)  
  - We will use data scaling (standardization) and PCA  
  
  
2. Model Training (includes model selection and hyperparameter tuning)  
  - We will try Logistic Regression, SVM and Decision Tree  
  
  
3. Inference (includes predicting results on unseen data)

In [2]:
# Load the Iris data set and hold out 20% of it as a test split.
# A fixed random_state keeps the split reproducible across runs.

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=42,
)

In [3]:
# Peek at the first ten training rows followed by their class labels
print(
    X_train[:10],
    y_train[:10],
)

[[4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]
 [6.3 2.5 5.  1.9]
 [6.4 3.2 4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.2 4.1 1.5 0.1]] [0 0 1 0 0 2 1 0 0 0]


## Add data pre-processing and model training to a pipeline

In [4]:
# Build one pipeline per candidate classifier. Every pipeline shares the same
# preprocessing (standardise the features, then project onto 2 principal
# components) and differs only in its final estimator.
# StandardScaler, PCA, Pipeline and LogisticRegression are already imported in
# the first cell, so they are not re-imported here.

def build_pipeline(classifier):
    """Return an unfitted scale -> PCA(2) -> classifier pipeline.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn classifier used as the final 'clf' step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Pipeline with steps named 'scl', 'pca' and 'clf', in that order.
    """
    return Pipeline([
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('clf', classifier),
    ])


# pipeline for Logistic regression
pipe_lr = build_pipeline(LogisticRegression(random_state=42))

# pipeline for SVM
pipe_svm = build_pipeline(svm.SVC(random_state=42))

# pipeline for Decision Tree
pipe_dt = build_pipeline(tree.DecisionTreeClassifier(random_state=42))

In [5]:
# Gather the three pipelines so they can be trained and scored in a single loop
pipelines = [pipe_lr, pipe_svm, pipe_dt]

# Display name of each classifier, keyed by its position in `pipelines`
pipe_dict = dict(enumerate([
    'Logistic Regression',
    'Support Vector Machine',
    'Decision Tree',
]))

## Train the pipeline

In [6]:
# Fit every pipeline (scaler, PCA and classifier together) on the training split
for model_pipeline in pipelines:
    model_pipeline.fit(X_train, y_train)

## Test and compare

In [7]:
# Report the held-out accuracy of each fitted pipeline, labelled via pipe_dict
for idx, fitted_pipeline in enumerate(pipelines):
    accuracy = fitted_pipeline.score(X_test, y_test)
    print('%s pipeline test accuracy: %.3f' % (pipe_dict[idx], accuracy))

Logistic Regression pipeline test accuracy: 0.900
Support Vector Machine pipeline test accuracy: 0.900
Decision Tree pipeline test accuracy: 0.867


## Consume the pipeline

In [19]:
# Take one held-out sample and shape it as a single-row 2-D array, which is
# the (n_samples, n_features) layout passed to predict() in the next cell.
# reshape(1, -1) lets NumPy infer the feature count instead of hard-coding 4,
# and the name is bound once rather than rebound to a different shape.
sample_example = X_test[10].reshape(1, -1)
sample_example

array([[6.5, 3.2, 5.1, 2. ]])

In [18]:
# Classify the sample with the first pipeline in `pipelines`
# (the Logistic Regression one); the cell displays the predicted label.
pipe = pipelines[0]
result = pipe.predict(sample_example)
result

array([2])