## This notebook demonstrates a simple sklearn pipeline and how to use gridserachcv with sklearn pipeline. We have considered to work with iris dataset. The goal here is not to build a powerful classifier but instead to demonstarte how to use sklearn pipeline in it's simplest form. More complex examples will be added.


## Follow this link to check the wandb report https://wandb.ai/virajdatt/my-test-sklearn_pipeline_project?workspace=user-virajdatt 

In [45]:
# import required packages
import wandb

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings('ignore')

In [46]:
wandb.init(project="my-test-sklearn_pipeline_project", entity="virajdatt", name='LogisticRegression' )

VBox(children=(Label(value=' 0.01MB of 0.01MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [47]:
# Download the iris dataset
raw_data = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')


In [48]:
# Quick look at the data
raw_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [49]:
# Check for any missing data
raw_data.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [50]:
# Split data into train and test
X_train, X_test, Y_train, Y_test = train_test_split(raw_data.drop(['species'], axis=1), 
                                                    raw_data['species'],
                                                    test_size=0.3,
                                                    stratify=raw_data['species'],
                                                    random_state=42)

## Pipeline code
1. Create a pipeline code for preprocessing the data, where we scale the data
2. Compose the pipeline with a logistic classifier
3. Finally when you test the classifier

In [51]:

num_trans = Pipeline(steps=[
            ('scale', StandardScaler())
])

pipe = Pipeline(steps=[
       ('preprocess', num_trans),
       ('classifier', LogisticRegression())
])
pipe.fit(X_train, Y_train)

Pipeline(steps=[('preprocess', Pipeline(steps=[('scale', StandardScaler())])),
                ('classifier', LogisticRegression())])

In [52]:
pipe.score(X_test, Y_test)

0.9111111111111111

## Gridsearch CV

In [53]:
parameters = {
            'classifier__penalty': ['l1', 'l2'],
            'classifier__C'      : np.logspace(-3,3,7), #array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])
            'classifier__solver' : ['newton-cg', 'lbfgs', 'liblinear'],
}

In [54]:
clf = GridSearchCV(pipe,
                  param_grid=parameters,
                  scoring='accuracy',
                  cv=10)

In [55]:
clf.fit(X_train, Y_train)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocess',
                                        Pipeline(steps=[('scale',
                                                         StandardScaler())])),
                                       ('classifier', LogisticRegression())]),
             param_grid={'classifier__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'classifier__penalty': ['l1', 'l2'],
                         'classifier__solver': ['newton-cg', 'lbfgs',
                                                'liblinear']},
             scoring='accuracy')

In [56]:
clf.best_params_

{'classifier__C': 1.0,
 'classifier__penalty': 'l2',
 'classifier__solver': 'newton-cg'}

In [57]:
clf.score(X_test, Y_test)

0.9111111111111111

In [58]:
pipe2 = Pipeline(steps=[
       ('preprocess', num_trans),
       ('classifier', LogisticRegression(C=1.0, penalty='l2', solver='newton-cg'))
])

In [59]:
pipe2.fit(X_train, Y_train)

Pipeline(steps=[('preprocess', Pipeline(steps=[('scale', StandardScaler())])),
                ('classifier', LogisticRegression(solver='newton-cg'))])

In [60]:
pipe2.score(X_test, Y_test)

0.9111111111111111

In [61]:

wandb.sklearn.plot_learning_curve(pipe2, X_train, Y_train)

In [62]:
wandb.sklearn.plot_summary_metrics(clf, X_train, Y_train, X_test, Y_test)

In [63]:
y_pred = pipe2.predict(X_test)
y_probas = pipe2.predict_proba(X_test)

In [64]:
# Visualize all classifier plots
try:
    wandb.sklearn.plot_classifier(pipe2, X_train, X_test, Y_train, Y_test, y_pred, y_probas, raw_data['species'],
                                                         model_name='logistic_regression', feature_names=None)
except:
    pass

[34m[1mwandb[0m: 
[34m[1mwandb[0m: Plotting logistic_regression.
[34m[1mwandb[0m: Logged feature importances.
[34m[1mwandb[0m: Logged confusion matrix.
[34m[1mwandb[0m: Logged summary metrics.
[34m[1mwandb[0m: Logged class proportions.


In [65]:
wandb.finish

<function wandb.sdk.wandb_run.finish(exit_code: int = None, quiet: bool = None) -> None>

In [66]:
wandb.init(project="my-test-sklearn_pipeline_project", entity="virajdatt", name='DecisionTreeClassifier' )

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [67]:
pipe3 = Pipeline(steps=[
       ('preprocess', num_trans),
       ('classifier', DecisionTreeClassifier())
])
pipe3.fit(X_train, Y_train)
wandb.sklearn.plot_learning_curve(pipe3, X_train, Y_train)

In [68]:
wandb.finish

<function wandb.sdk.wandb_run.finish(exit_code: int = None, quiet: bool = None) -> None>

## Follow this link to check the wandb report https://wandb.ai/virajdatt/my-test-sklearn_pipeline_project?workspace=user-virajdatt 