In [1]:
# This is an initialization cell that is not part of the presentation.
# This cell should be run while not in the rise mode

import numpy as np

np.random.seed(0) # make notebook deterministic

from sklearn.decomposition import TruncatedSVD # import model that we will use but do not want to emphasize

# Disable warnings during the presentation
import warnings
warnings.filterwarnings("ignore") 


#CSS customization
from IPython.core.display import HTML, display

# set width cell to screen width
display(HTML("<style>.container { width:100% !important; }</style>"))

# default font-size is 10pt
# anyway the code below set font size for code cells
HTML("""<style>
.CodeMirror pre {
    font-size: 11pt;
}
</style>""")

# increase font size of pd.DataFrame
HTML("""<style>
     table.dataframe {
font-size:24px;}
</style>""")

# to increase font size in a markdown cell
#<font size=5px>test</font>


# Pipelines and Gridsearch with scikit-learn

#### May 16 2018

## Florent Martin
## Koen van Woerden



# Goal of the talk:

* <font size=8>**Pipeline**</font>

* <font size=8>**Gridsearch**</font>

* <font size=8>**Scikit-learn**</font>

# What is a data pipeline?

<center><img src="../img/figures/diagram_pipeline.svg" width =400></center>

# What is a data pipeline?

<tr style="height:650px;width:900px;">
<td style="width:600px; height:700px;"> <img src="../img/figures/diagram_pipeline.svg" style="width:600px; height:700px;"/></td>
<td style="width:800px;"> <img src="../img/figures/Trans-Alaska_Pipeline.jpg" style="width:800px;" />  </td>
</tr>

Data | Liquid
--- | ---  
<img src="../img/figures/diagram_pipeline.svg" width=1000> | <img src="../img/figures/Trans-Alaska_Pipeline.jpg" width=100> 

# Pipeline $\Rightarrow$ easily experiment

<center><img src="../img/figures/diagram_two_pipelines.svg" width = 1000></center>

# What is Gridsearch?

<center><img src="../img/figures/diagram_pipeline_hyperparamter.svg" width = 1100></center>

# Gridsearch Example

**Goal**: find the best hyperparameter for logistic regression among  
Regularization type: L1, L2  
$C =0.1,1,10,100$


<center><img src="../img/figures/grid.svg" width = 800></center>

# Pipeline + Gridsearch $\Rightarrow$ scikit-learn to the rescue

<center><img src="../img/figures/diagram_pipeline_two_hyperparamters.svg" width = 1100></center>

1. **Classifying authors**  
   $\bullet$ Dataset  
   $\bullet$ Baseline model  
  
2. **Pipeline**  
  $\bullet$ Build your first pipeline  
  $\bullet$ Add new transformations   
  $\bullet$ Add non-scikit-learn transformations  
  $\bullet$ Keep experimenting  
  
3. **Gridsearch**  
  $\bullet$ Hyperparameters  
  $\bullet$ With scikit-learn transformations  
  $\bullet$ With non-scikit-learn transformations

# Part 1: Classifying authors

<table style='font-size:39px'>
    <tr>
        <td>INPUT</td> <td></td><td>OUTPUT</td>
    </tr>
    <tr>
        <td>sentence</td>                                          <td>$\Rightarrow$</td>  <td>author</td>
    </tr>
    <tr>
        <td>'Even the very lights from the city bewilder him.' </td><td>$\Rightarrow$</td>  <td>Edgar Allan Poe  </td>
    </tr>
    <tr>
        <td>X</td>                                                 <td>$\Rightarrow$</td>  <td>y</td>
    </tr>
</table>


In [2]:
import numpy as np
import pandas as pd

In [3]:
data = pd.read_csv('../data/talk/data.csv')

In [4]:
data.shape

(14684, 2)

In [5]:
data.sample(n=5)

Unnamed: 0,text,author
10115,"""Our first slide into the abyss itself, from t...",Poe
5906,He heard my account of the self dissolution of...,Shelley
5777,Nor has he yet had any difficulty in obtaining...,Lovecraft
8462,"We examined, first, the furniture of each apar...",Poe
1457,"I did this at some little risk, and before clo...",Poe


In [6]:
X, author = data['text'], data['author']

In [7]:
author.value_counts()

Poe          5963
Shelley      4465
Lovecraft    4256
Name: author, dtype: int64

<center><font size=7>Poe</font></center> | <center><font size=7>Shelley</font></center> | <center><font size=7> Lovecraft </font></center>
---|---|---
 <img src="../img/figures/poe.png" width=265>| <img src="../img/figures/shelley.jpg" width=300> | <img src="../img/figures/lovecraft.jpg" width=300>


# scikit-learn basics

* Objects have `fit` method
* Objects have `transform` or `predict` method

In [None]:
model.fit(X, y)
model.transform(X)

In [None]:
model.fit_transform(X, y)

In [None]:
model.fit(X, y)
model.predict(X)

# Turn labels into integers

In [8]:
from sklearn.preprocessing import LabelEncoder   

In [9]:
label_encoder = LabelEncoder()

In [10]:
label_encoder.fit(author);

In [11]:
y = label_encoder.transform(author)

In [12]:
pd.DataFrame({'author': author[:5], 'y': y[:5]})

Unnamed: 0,author,y
0,Shelley,2
1,Lovecraft,0
2,Shelley,2
3,Poe,1
4,Poe,1


## Bag of words: convert strings to vectors (one-hot encoding)
<center><img src="../img/figures/Koen/03-kvw-bow.svg" width=1350></center>

# Baseline model: Bag of Words + Logistic Regression

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

In [14]:
cvec = CountVectorizer()  

In [15]:
cvec.fit(X);

In [16]:
X_cvec = cvec.transform(X)

In [17]:
type(X_cvec)

scipy.sparse.csr.csr_matrix

In [19]:
X.iloc[87]

'"One" said the clock.'

In [21]:
np.sum(X_cvec.toarray()[87,:])

4

In [18]:
X_cvec.shape

(14684, 22476)

# Logistic regression

In [22]:
from sklearn.linear_model import LogisticRegression

In [23]:
logistic_regression = LogisticRegression()

In [24]:
logistic_regression.fit(X_cvec, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

### (Multi-class: One-versus-Rest)

# Predict author of sentence

In [25]:
rand_sentence = X.sample()

In [26]:
print(rand_sentence.iloc[0])

I always attributed my failure at these points to the disordered state of his health.


In [27]:
rand_sentence_vec = cvec.transform(rand_sentence)

In [28]:
logistic_regression.predict_proba(rand_sentence_vec)

array([[0.04726737, 0.85903555, 0.09369708]])

In [29]:
label_encoder.classes_

array(['Lovecraft', 'Poe', 'Shelley'], dtype=object)

In [30]:
idx = rand_sentence.index[0]
author.loc[idx]

'Poe'

# Accuracy

In [31]:
logistic_regression.score(X_cvec, y)

0.9741214927812585

# Generalization: try model on new data

In [32]:
val = pd.read_csv('../data/talk/val.csv')

In [33]:
val.shape

(4895, 2)

In [34]:
X_val, author_val  = val['text'], val['author']

In [35]:
X_val_cvec = cvec.transform(X_val)

In [36]:
y_val = label_encoder.transform(author_val)

In [37]:
logistic_regression.score(X_val_cvec, y_val)

0.8175689479060265

## Repetitive code 
## Solution: pipeline

# Part 2: Pipeline

#  Combine all  transformations in a Pipeline

In [44]:
cvec = CountVectorizer()
logistic_regression = LogisticRegression() 

In [45]:
X_cvec = cvec.fit_transform(X)

In [46]:
X_svd = svd.fit_transform(X_cvec)

In [47]:
logistic_regression.fit(X_svd, y);

In [48]:
logistic_regression.score(X_svd, y)

0.4443612094797058

In [49]:
X_val_cvec = cvec.transform(X_val)

In [50]:
X_val_svd = svd.transform(X_val_cvec)

In [51]:
logistic_regression.score(X_val_svd, y_val)

0.44473953013278855

In [None]:
cvec = CountVectorizer()
svd = TruncatedSVD()
logistic_regression = LogisticRegression()

In [None]:
X_cvec = cvec.fit_transform(X)

In [None]:
X_svd = svd.fit_transform(X_cvec)

In [None]:
logistic_regression.fit(X_svd, y);

In [None]:
logistic_regression.score(X_svd, y)

In [None]:
X_val_cvec = cvec.transform(X_val)

In [None]:
X_val_svd = svd.transform(X_val_cvec)

In [None]:
logistic_regression.score(X_val_svd, y_val)

* <font size=8>Many **intermediate variables** </font>

* <font size=8>Transformations spread out over the notebook</font>

* <font size=8>**Experimenting** is **difficult**</font>

* <font size=8>**Solution**:  create a **Pipeline object**</font>

In [52]:
from sklearn.pipeline import Pipeline

In [61]:
pipeline = Pipeline(steps=  [   ('cvec'  , CountVectorizer()), 
                                ('logreg', LogisticRegression())  ]  )

In [62]:
pipeline.fit(X, y);

In [63]:
pipeline.score(X, y)

0.9741214927812585

In [64]:
pipeline.score(X_val, y_val)

0.8175689479060265

In [65]:
rand_sentence = X_val.sample()

In [66]:
print(rand_sentence.iloc[0])

When, at length, we had concluded our examination, and the intense excitement of the time had, in some measure, subsided, Legrand, who saw that I was dying with impatience for a solution of this most extraordinary riddle, entered into a full detail of all the circumstances connected with it.


In [67]:
pipeline.predict_proba(rand_sentence)

array([[8.63755557e-03, 9.90702703e-01, 6.59741126e-04]])

In [68]:
pipeline.predict(rand_sentence)

array([1])

In [69]:
label_encoder.classes_

array(['Lovecraft', 'Poe', 'Shelley'], dtype=object)

In [70]:
author[rand_sentence.index]

2794    Lovecraft
Name: author, dtype: object

## Under the hood of Pipeline

In [None]:
pipeline = Pipeline(steps=  [ ('first_transformation', first_transformation),
                                ...
                              ('last_transformation', last_transformation)          ] ) 

In [None]:
pipeline.fit(X,y)

## Scikit-learn does

In [None]:
X_first = first_transformation.fit_transform(X)
X_second = second_transformation.fit_transform(X_first)
...
X_last = last_transformation.fit(X_previous_last)

* All step but the last *must* implement a `fit` and `transform` method
* The last step *must* implement a `fit` method, and  a `transform` or `predict` method as well

# Add a non-scikit-learn transformation to the Pipeline  

## Lemmatizer

**lemma** = **dictionary entry**  

**swimming**, **swims**, **swim** $\Rightarrow$ same **lemma** $\Rightarrow$ **swim**  

**Lemmatizer**: word $\mapsto$ lemma

##   nltk = natural language toolkit  (NLP library)

In [71]:
from nltk.stem import WordNetLemmatizer

In [72]:
class Lemmatizer():
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        lem = WordNetLemmatizer()
        lower = X.str.lower()
        tokenized = lower.str.split(' ')
        lemmatized = tokenized.apply(lambda l: " ".join([lem.lemmatize(word) for word in l]))
        return lemmatized

In [73]:
lemmatizer = Lemmatizer()

In [74]:
sentence = pd.Series(data=['Cows and pigs are common animals on farms'])

In [75]:
lemmatizer.transform(sentence).iloc[0]

'cow and pig are common animal on farm'

In [76]:
pipeline = Pipeline(steps=[  ('lem', Lemmatizer()),
                             ('cvec', CountVectorizer()),
                             ('logreg', LogisticRegression())   ])

In [77]:
pipeline.fit(X, y);

In [78]:
pipeline.score(X, y)

0.9720784527376737

In [79]:
pipeline.score(X_val, y_val)

0.8145045965270684

# Adding Gensim word2vec

In [80]:
from gensim.models.word2vec import Word2Vec

In [82]:
class GensimWord2Vec():
    def fit(self, X, y=None):
        self.model = Word2Vec(X)
        return self
        
    def transform(self, X, y=None):
        lower = X.str.lower()
        tokenized = lower.str.split(' ')
        vectors = tokenized.apply(lambda l: [self.model[word] for word in l if word in self.model])
        def average(l):
            if l == []:
                return np.zeros(self.model.vector_size)
            else:
                return np.mean(l, axis=0)
        vectors = vectors.apply(average)
        vectors = vectors.apply(pd.Series)
        return vectors

In [83]:
pipeline = Pipeline(steps=  [   ('word2vec', GensimWord2Vec()),
                                ('logreg', LogisticRegression())    ]  )

In [84]:
pipeline.fit(X, y);

In [85]:
pipeline.score(X, y)

0.4167120675565241

In [86]:
pipeline.score(X_val, y_val)

0.4067415730337079

##  Feature unions: Combine bag of words and word2vec

<center><img src="../img/figures/diagram_feature_union1.svg" width=1100></center>

In [88]:
from sklearn.pipeline import FeatureUnion

In [87]:
lem_cvec = Pipeline(steps = [('lem', Lemmatizer()),
                             ('cvec', CountVectorizer())])

In [89]:
feature_union = FeatureUnion([('lem_cvec', lem_cvec),
                              ('gensimw2v', GensimWord2Vec())])

In [90]:
pipeline = Pipeline( [  ('feature_union', feature_union),
                        ('logreg', LogisticRegression())                          ])

In [91]:
pipeline.fit(X, y);

In [92]:
pipeline.score(X, y)

0.9733042767638246

In [93]:
pipeline.score(X_val, y_val)

0.8177732379979571

# Further experiment: tf-idf

In [94]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [95]:
pipeline = Pipeline(steps=  [  ('lem', Lemmatizer()),
                               ('tfidf', TfidfVectorizer()),
                               ('logreg', LogisticRegression())    ])

In [96]:
pipeline.fit(X, y);

In [97]:
pipeline.score(X, y)

0.8962816671206756

In [98]:
pipeline.score(X_val, y_val)

0.802655771195097

# Further experiment: Naive Bayes classifier

In [99]:
from sklearn.naive_bayes import MultinomialNB

In [100]:
pipeline = Pipeline(steps=  [    ('CountVectorizer', CountVectorizer()),
                                 ('NaiveBayes', MultinomialNB())             ])

In [101]:
pipeline.fit(X, y);

In [102]:
pipeline.score(X, y)

0.9156224461999455

In [103]:
pipeline.score(X_val, y_val)

0.8330949948927477

# Part 3: Gridsearch

## What is a **hyperparameter**?

## Examples
* learning rate 
* regularization coefficient
* number of hidden layers in a neural network
* ...

## Responsibility of the data scientist

<font size=7>change **hyperparameter** $\Rightarrow$ change **model** $\Rightarrow$ change **performance**</font>

# Baseline model

In [104]:
cvec = CountVectorizer()

In [105]:
X_cvec = cvec.fit_transform(X)

In [106]:
logistic_regression = LogisticRegression(C=1)

* `C` is a **hyperparameter**

* How do we know about `C`?

In [107]:
LogisticRegression?

In [114]:
logistic_regression = LogisticRegression(C=1000)
logistic_regression.fit(X_cvec, y);
logistic_regression.score(X_cvec, y)

1.0

In [115]:
X_val_cvec = cvec.transform(X_val)
logistic_regression.score(X_val_cvec, y_val)

0.7691521961184883

# Gridsearch

* Our previous model depends on a **hyperparameter** `C`

* Changing `C` changes the performance $\Rightarrow$  **Try** different `C`

* Keep track of the results! (Who remembers the results we got?)

* We want this to be done automatically

* Gridsearch is what we need

# Gridsearch in scikit-learn

In [116]:
from sklearn.model_selection import GridSearchCV

In [117]:
gridsearch = GridSearchCV(estimator=LogisticRegression(), 
                  param_grid={'C': [0.1, 1, 10, 100, 1000]}, verbose=3)

In [118]:
gridsearch.fit(X_cvec, y) 

Fitting 3 folds for each of 5 candidates, totalling 15 fits
[CV] C=0.1 ...........................................................
[CV] .................. C=0.1, score=0.7757352941176471, total=   0.4s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.4s remaining:    0.0s


[CV] .................. C=0.1, score=0.7662921348314606, total=   0.4s
[CV] C=0.1 ...........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s remaining:    0.0s


[CV] ................... C=0.1, score=0.774167177600654, total=   0.4s
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.8112745098039216, total=   0.8s
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.8032686414708886, total=   0.7s
[CV] C=1 .............................................................
[CV] .................... C=1, score=0.8078888207643572, total=   0.6s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7998366013071896, total=   1.3s
[CV] C=10 ............................................................
[CV] ..................... C=10, score=0.79244126659857, total=   1.2s
[CV] C=10 ............................................................
[CV] ................... C=10, score=0.7968526466380543, total=   1.1s
[CV] C=100 ...........................................................
[CV] .

[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   15.7s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100, 1000]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score='warn', scoring=None, verbose=3)

In [119]:
gridsearch.best_params_  

{'C': 1}

In [120]:
gridsearch.best_score_

0.8074775265595205

In [121]:
gridsearch.cv_results_

{'mean_fit_time': array([0.40538804, 0.68529201, 1.20883743, 1.40308134, 1.50358335]),
 'mean_score_time': array([0.00154662, 0.00148304, 0.0015765 , 0.0012993 , 0.0013814 ]),
 'mean_test_score': array([0.77206483, 0.80747753, 0.79637701, 0.7805094 , 0.76845546]),
 'mean_train_score': array([0.87806463, 0.97950149, 0.99819533, 0.99979571, 1.        ]),
 'param_C': masked_array(data=[0.1, 1, 10, 100, 1000],
              mask=[False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1}, {'C': 1}, {'C': 10}, {'C': 100}, {'C': 1000}],
 'rank_test_score': array([4, 1, 2, 3, 5], dtype=int32),
 'split0_test_score': array([0.77573529, 0.81127451, 0.7998366 , 0.77920752, 0.76981209]),
 'split0_train_score': array([0.87832039, 0.97987331, 0.99856968, 0.99989783, 1.        ]),
 'split1_test_score': array([0.76629213, 0.80326864, 0.79244127, 0.78038815, 0.7671093 ]),
 'split1_train_score': array([0.87874144, 0.97875166, 0.99785473, 0.99979569, 1.

## Cross Validation (CV): no need for separate validation set

<center><img src='../img/figures/K-fold_cross_validation_EN.jpg' width=750>

<font size=3>By Fabian Flöck [<a href="https://creativecommons.org/licenses/by-sa/3.0">CC BY-SA 3.0</a>], <a href="https://commons.wikimedia.org/wiki/File:K-fold_cross_validation_EN.jpg">from Wikimedia Commons</a></font>
</center>


### Varying regularization

In [122]:
gridsearch = GridSearchCV(estimator=LogisticRegression(), 
                  param_grid={'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}, verbose=3)

In [123]:
gridsearch.fit(X_cvec, y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] C=0.1, penalty=l1 ...............................................
[CV] ...... C=0.1, penalty=l1, score=0.6701388888888888, total=   0.2s
[CV] C=0.1, penalty=l1 ...............................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.3s remaining:    0.0s


[CV] ...... C=0.1, penalty=l1, score=0.6649642492339122, total=   0.2s
[CV] C=0.1, penalty=l1 ...............................................
[CV] ...... C=0.1, penalty=l1, score=0.6740241160842019, total=   0.2s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.7757352941176471, total=   0.5s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ...... C=0.1, penalty=l2, score=0.7662921348314606, total=   0.4s
[CV] C=0.1, penalty=l2 ...............................................
[CV] ....... C=0.1, penalty=l2, score=0.774167177600654, total=   0.4s
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.7794117647058824, total=   0.2s
[CV] C=1, penalty=l1 .................................................
[CV] ........ C=1, penalty=l1, score=0.7758937691521961, total=   0.2s
[CV] C=1, penalty=l1 .................................................
[CV] .

[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed:   15.0s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [124]:
gridsearch.best_params_

{'C': 1, 'penalty': 'l2'}

In [125]:
gridsearch.best_score_

0.8074775265595205

In [126]:
gridsearch.cv_results_

{'mean_fit_time': array([0.16640369, 0.39961926, 0.20294325, 0.64799929, 0.47372135,
        1.210121  , 0.58727264, 1.25881402]),
 'mean_score_time': array([0.00152787, 0.00150196, 0.00168991, 0.00143298, 0.00150402,
        0.00163635, 0.00164938, 0.00144323]),
 'mean_test_score': array([0.66970853, 0.77206483, 0.77819395, 0.80747753, 0.77792155,
        0.79637701, 0.76464179, 0.7805094 ]),
 'mean_train_score': array([0.69415719, 0.87806463, 0.92587179, 0.97950149, 0.99894445,
        0.99819533, 1.        , 0.99979571]),
 'param_C': masked_array(data=[0.1, 0.1, 1, 1, 10, 10, 100, 100],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_penalty': masked_array(data=['l1', 'l2', 'l1', 'l2', 'l1', 'l2', 'l1', 'l2'],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'C': 0.1, 'penalty': 'l1'},
  {'C': 0.1, 'pena

* How to optimize hyperparameters of pipelines?

* This works automatically with pipelines of scikit-learn objects

# Gridsearch on pipelines of scikit-learn objects

In [127]:
pipeline = Pipeline(steps=  [   ('CountVectorizer', CountVectorizer()),
                                ('NaiveBayes', MultinomialNB())           ])

In [128]:
pipeline.get_params()

{'CountVectorizer': CountVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'CountVectorizer__analyzer': 'word',
 'CountVectorizer__binary': False,
 'CountVectorizer__decode_error': 'strict',
 'CountVectorizer__dtype': numpy.int64,
 'CountVectorizer__encoding': 'utf-8',
 'CountVectorizer__input': 'content',
 'CountVectorizer__lowercase': True,
 'CountVectorizer__max_df': 1.0,
 'CountVectorizer__max_features': None,
 'CountVectorizer__min_df': 1,
 'CountVectorizer__ngram_range': (1, 1),
 'CountVectorizer__preprocessor': None,
 'CountVectorizer__stop_words': None,
 'CountVectorizer__strip_accents': None,
 'CountVectorizer__token_pattern': '(?u)\\b\\w\\w+\\b',
 'CountV

In [None]:
MultinomialNB?

In [129]:
param_grid = {  'CountVectorizer__binary': [True, False],
                'CountVectorizer__ngram_range': [(1, 1), (1,2)],
                'NaiveBayes__alpha': np.logspace(start=-1, stop=1, num=3)    }   

In [130]:
gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, verbose=5)

In [131]:
gridsearch.fit(X, y)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=0.1 
[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=0.1, score=0.835171568627451, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=0.1 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.6s remaining:    0.0s


[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=0.1, score=0.8312563840653728, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=0.1 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.3s remaining:    0.0s


[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=0.1, score=0.836092376864909, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=1.0 


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.9s remaining:    0.0s


[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=1.0, score=0.8339460784313726, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=1.0 


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.6s remaining:    0.0s


[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=1.0, score=0.8241062308478039, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=1.0 
[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=1.0, score=0.8346617617003883, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=10.0 
[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=10.0, score=0.735498366013072, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=10.0 
[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=10.0, score=0.726046986721144, total=   0.4s
[CV] CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), NaiveBayes__alpha=10.0 
[CV]  CountVectorizer__binary=True, CountVectorizer__ngram_range=(1, 1), 

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed:   42.1s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('CountVectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), p...one, vocabulary=None)), ('NaiveBayes', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'CountVectorizer__binary': [True, False], 'CountVectorizer__ngram_range': [(1, 1), (1, 2)], 'NaiveBayes__alpha': array([ 0.1,  1. , 10. ])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [132]:
gridsearch.best_score_

0.8397575592481613

In [133]:
gridsearch.best_params_

{'CountVectorizer__binary': True,
 'CountVectorizer__ngram_range': (1, 2),
 'NaiveBayes__alpha': 0.1}

In [134]:
gridsearch.cv_results_

{'mean_fit_time': array([0.27160931, 0.28333004, 0.26002137, 0.93489639, 0.92432928,
        0.91905975, 0.25934068, 0.27192235, 0.26511423, 0.95985969,
        0.98999206, 0.95550092]),
 'mean_score_time': array([0.13309948, 0.13329951, 0.11747607, 0.2447079 , 0.25512131,
        0.24656749, 0.11637505, 0.11698055, 0.11752685, 0.27359398,
        0.2369833 , 0.26216181]),
 'mean_test_score': array([0.83417325, 0.83090439, 0.7246663 , 0.83975756, 0.82913375,
        0.70103514, 0.83512667, 0.82899755, 0.70757287, 0.83887224,
        0.82504767, 0.66984473]),
 'mean_train_score': array([0.95215891, 0.92764245, 0.79807968, 0.99785482, 0.9899891 ,
        0.86764519, 0.94834534, 0.92178569, 0.77397189, 0.99775269,
        0.98842278, 0.83369682]),
 'param_CountVectorizer__binary': masked_array(data=[True, True, True, True, True, True, False, False,
                    False, False, False, False],
              mask=[False, False, False, False, False, False, False, False,
                 

In [135]:
pipeline.set_params(**gridsearch.best_params_);

In [136]:
pipeline.fit(X, y);

In [137]:
pipeline.score(X, y)

0.9969354399346227

In [138]:
pipeline.score(X_val, y_val)

0.8473953013278857

# Adding non-sklearn objects

Gridsearch $\Rightarrow$ derive from `BaseEstimator`

`fit_transform` $\Rightarrow$ derive from `TransformerMixin`

In [139]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

In [140]:
class GensimWord2Vec(TransformerMixin, BaseEstimator): # Derive from BaseEstimator!
    def __init__(self, size=100, min_count=5):
        self.size=size
        self.min_count=min_count
    
    def fit(self, X, y=None):
        self.model = Word2Vec(X, size=self.size, min_count=self.min_count)
        return self
        
        
    def transform(self, X, y=None):
        lower = X.str.lower()
        tokenized = lower.str.split(' ')
        vectors = tokenized.apply(lambda l: [self.model[word] for word in l if word in self.model])
        def average(l):
            if l == []:
                return np.zeros(self.model.vector_size)
            else:
                return np.mean(l, axis=0)
        vectors = vectors.apply(average)
        vectors = vectors.apply(pd.Series)
        return vectors

In [141]:
pipeline = Pipeline(steps=  [  ('word2vec', GensimWord2Vec()),
                               ('logreg', LogisticRegression())        ])

In [142]:
pipeline.get_params()

{'logreg': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
           penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
           verbose=0, warm_start=False),
 'logreg__C': 1.0,
 'logreg__class_weight': None,
 'logreg__dual': False,
 'logreg__fit_intercept': True,
 'logreg__intercept_scaling': 1,
 'logreg__max_iter': 100,
 'logreg__multi_class': 'ovr',
 'logreg__n_jobs': 1,
 'logreg__penalty': 'l2',
 'logreg__random_state': None,
 'logreg__solver': 'liblinear',
 'logreg__tol': 0.0001,
 'logreg__verbose': 0,
 'logreg__warm_start': False,
 'memory': None,
 'steps': [('word2vec', GensimWord2Vec(min_count=5, size=100)),
  ('logreg',
   LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
             penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
       

In [143]:
param_grid = {   'word2vec__min_count': [1],
                 'word2vec__size': [10, 50]           } 

In [144]:
gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, verbose=5)

In [145]:
gridsearch.fit(X, y)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] word2vec__min_count=1, word2vec__size=10 ........................
[CV]  word2vec__min_count=1, word2vec__size=10, score=0.4097222222222222, total=   4.8s
[CV] word2vec__min_count=1, word2vec__size=10 ........................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.0s remaining:    0.0s


[CV]  word2vec__min_count=1, word2vec__size=10, score=0.4218590398365679, total=   4.6s
[CV] word2vec__min_count=1, word2vec__size=10 ........................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.7s remaining:    0.0s


[CV]  word2vec__min_count=1, word2vec__size=10, score=0.41630901287553645, total=   4.7s
[CV] word2vec__min_count=1, word2vec__size=50 ........................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   20.6s remaining:    0.0s


[CV]  word2vec__min_count=1, word2vec__size=50, score=0.41013071895424835, total=   5.2s
[CV] word2vec__min_count=1, word2vec__size=50 ........................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   27.9s remaining:    0.0s


[CV]  word2vec__min_count=1, word2vec__size=50, score=0.4216547497446374, total=   5.0s
[CV] word2vec__min_count=1, word2vec__size=50 ........................
[CV]  word2vec__min_count=1, word2vec__size=50, score=0.41651338647046804, total=   4.9s


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:   42.4s finished


GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('word2vec', GensimWord2Vec(min_count=5, size=100)), ('logreg', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'word2vec__min_count': [1], 'word2vec__size': [10, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=5)

In [146]:
gridsearch.best_params_

{'word2vec__min_count': 1, 'word2vec__size': 50}

In [147]:
gridsearch.best_score_

0.4160991555434487

In [148]:
pipeline.set_params(**gridsearch.best_params_);

In [149]:
pipeline.fit(X, y);

In [150]:
pipeline.score(X, y)

0.4167120675565241

In [151]:
pipeline.score(X_val, y_val)

0.4071501532175689

# Conclusion

* <font style="font-size:60px;"> **Pipelines** $\Rightarrow$  **clear code** + **easy experiments** </font>

*  <font style="font-size:60px;"> **Gridsearch** $\Rightarrow$ **tuning** of **hyperparameters** </font>

* <font style="font-size:60px;">**Scikit-learn** $\Rightarrow$ convenient classes for both</font>

# Thank you for your attention