In [None]:
# This is an initialization cell that is not part of the presentation.
# Run this cell while not in RISE (slideshow) mode

import numpy as np

np.random.seed(0) # make notebook deterministic

from sklearn.decomposition import TruncatedSVD # import model that we will use but do not want to emphasize

# Disable warnings during the presentation
import warnings
warnings.filterwarnings("ignore") 


#CSS customization
from IPython.core.display import HTML, display

# Inject the presentation CSS. Every stylesheet is passed to display()
# explicitly: a bare HTML(...) expression is only rendered when it is the last
# expression of the cell, so one sitting in the middle is silently discarded.

# make the notebook container span the full screen width
display(HTML("<style>.container { width:100% !important; }</style>"))

# font size of the code cells (the default font-size is 10pt)
display(HTML("""<style>
.CodeMirror pre {
    font-size: 11pt;
}
</style>"""))

# increase font size of pd.DataFrame
display(HTML("""<style>
     table.dataframe {
font-size:24px;}
</style>"""))

# to increase font size in a markdown cell
#<font size=5px>test</font>


# Pipelines and Gridsearch with scikit-learn

#### May 16 2018

## Florent Martin
## Koen van Woerden



# Goal of the talk:

* <font size=8>**Pipeline**</font>

* <font size=8>**Gridsearch**</font>

* <font size=8>**Scikit-learn**</font>

# What is a data pipeline?

<center><img src="../img/figures/diagram_pipeline.svg" width =400></center>

# What is a data pipeline?

<table>
<tr style="height:650px;width:900px;">
<td style="width:600px; height:700px;"> <img src="../img/figures/diagram_pipeline.svg" style="width:600px; height:700px;"/></td>
<td style="width:800px;"> <img src="../img/figures/Trans-Alaska_Pipeline.jpg" style="width:800px;" />  </td>
</tr>
</table>

Data | Liquid
--- | ---  
<img src="../img/figures/diagram_pipeline.svg" width=1000> | <img src="../img/figures/Trans-Alaska_Pipeline.jpg" width=100> 

# Pipeline $\Rightarrow$ easily experiment

<center><img src="../img/figures/diagram_two_pipelines.svg" width = 1000></center>

# What is Gridsearch?

<center><img src="../img/figures/diagram_pipeline_hyperparamter.svg" width = 1100></center>

# Gridsearch Example

**Goal**: find the best hyperparameter for logistic regression among  
Regularization type: L1, L2  
$C =0.1,1,10,100$


<center><img src="../img/figures/grid.svg" width = 800></center>

# Pipeline + Gridsearch $\Rightarrow$ scikit-learn to the rescue

<center><img src="../img/figures/diagram_pipeline_two_hyperparamters.svg" width = 1100></center>

1. **Classifying authors**  
   $\bullet$ Dataset  
   $\bullet$ Baseline model  
  
2. **Pipeline**  
  $\bullet$ Build your first pipeline  
  $\bullet$ Add new transformations   
  $\bullet$ Add non-scikit-learn transformations  
  $\bullet$ Keep experimenting  
  
3. **Gridsearch**  
  $\bullet$ Hyperparameters  
  $\bullet$ With scikit-learn transformations  
  $\bullet$ With non-scikit-learn transformations

# Part 1: Classifying authors

<table style='font-size:39px'>
    <tr>
        <td>INPUT</td> <td></td><td>OUTPUT</td>
    </tr>
    <tr>
        <td>sentence</td>                                          <td>$\Rightarrow$</td>  <td>author</td>
    </tr>
    <tr>
        <td>'Even the very lights from the city bewilder him.' </td><td>$\Rightarrow$</td>  <td>Edgar Allan Poe  </td>
    </tr>
    <tr>
        <td>X</td>                                                 <td>$\Rightarrow$</td>  <td>y</td>
    </tr>
</table>


In [None]:
import numpy as np
import pandas as pd

In [None]:
data = pd.read_csv('../data/talk/data.csv')

In [None]:
data.shape

In [None]:
data.sample(n=5)

In [None]:
X, author = data['text'], data['author']

In [None]:
author.value_counts()

<center><font size=7>Poe</font></center> | <center><font size=7>Shelley</font></center> | <center><font size=7> Lovecraft </font></center>
---|---|---
 <img src="../img/figures/poe.png" width=265>| <img src="../img/figures/shelley.jpg" width=300> | <img src="../img/figures/lovecraft.jpg" width=300>


# scikit-learn basics

* Objects have `fit` method
* Objects have `transform` or `predict` method

In [None]:
model.fit(X, y)
model.transform(X)

In [None]:
model.fit_transform(X, y)

In [None]:
model.fit(X, y)
model.predict(X)

# Turn labels into integers

In [None]:
from sklearn.preprocessing import LabelEncoder   

In [None]:
label_encoder = LabelEncoder()

In [None]:
label_encoder.fit(author);

In [None]:
y = label_encoder.transform(author)

In [None]:
pd.DataFrame({'author': author[:5], 'y': y[:5]})

## Bag of words: convert strings to vectors (one-hot encoding)
<center><img src="../img/figures/Koen/03-kvw-bow.svg" width=1350></center>

# Baseline model: Bag of Words + Logistic Regression

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cvec = CountVectorizer()

In [None]:
cvec.fit(X);

In [None]:
X_cvec = cvec.transform(X)

In [None]:
type(X_cvec)

In [None]:
X_cvec.shape

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
logistic_regression = LogisticRegression()

In [None]:
logistic_regression.fit(X_cvec, y)

### (Multi-class: One-versus-Rest)

# Predict author of sentence

In [None]:
rand_sentence = X.sample()

In [None]:
print(rand_sentence.iloc[0])

In [None]:
rand_sentence_vec = cvec.transform(rand_sentence)

In [None]:
logistic_regression.predict_proba(rand_sentence_vec)

In [None]:
label_encoder.classes_

In [None]:
idx = rand_sentence.index[0]
author.loc[idx]

# Accuracy

In [None]:
logistic_regression.score(X_cvec, y)

# Generalization: try model on new data

In [None]:
val = pd.read_csv('../data/talk/val.csv')

In [None]:
val.shape

In [None]:
X_val, author_val  = val['text'], val['author']

In [None]:
X_val_cvec = cvec.transform(X_val)

In [None]:
y_val = label_encoder.transform(author_val)

In [None]:
logistic_regression.score(X_val_cvec, y_val)

## Repetitive code 
## Solution: pipeline

# Part 2: Pipeline

#  Combine all  transformations in a Pipeline

In [None]:
cvec = CountVectorizer()
logistic_regression = LogisticRegression() 

In [None]:
X_cvec = cvec.fit_transform(X)

In [None]:
logistic_regression.fit(X_cvec, y);

In [None]:
logistic_regression.score(X_cvec, y)

In [None]:
X_val_cvec = cvec.transform(X_val)

In [None]:
logistic_regression.score(X_val_cvec, y_val)

In [None]:
cvec = CountVectorizer()
svd = TruncatedSVD()
logistic_regression = LogisticRegression()

In [None]:
X_cvec = cvec.fit_transform(X)

In [None]:
X_svd = svd.fit_transform(X_cvec)

In [None]:
logistic_regression.fit(X_svd, y);

In [None]:
logistic_regression.score(X_svd, y)

In [None]:
X_val_cvec = cvec.transform(X_val)

In [None]:
X_val_svd = svd.transform(X_val_cvec)

In [None]:
logistic_regression.score(X_val_svd, y_val)

* <font size=8>Many **intermediate variables** </font>

* <font size=8>Transformations spread out over the notebook</font>

* <font size=8>**Experimenting** is **difficult**</font>

* <font size=8>**Solution**:  create a **Pipeline object**</font>

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
pipeline = Pipeline(steps=  [   ('cvec'  , CountVectorizer()), 
                                ('logreg', LogisticRegression())  ]  )

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

In [None]:
rand_sentence = X_val.sample()

In [None]:
print(rand_sentence.iloc[0])

In [None]:
pipeline.predict_proba(rand_sentence)

In [None]:
pipeline.predict(rand_sentence)

In [None]:
label_encoder.classes_

In [None]:
# rand_sentence was drawn from X_val, so its true label lives in author_val;
# looking its index up in the training labels `author` shows an unrelated row.
author_val.loc[rand_sentence.index]

## Under the hood of Pipeline

In [None]:
pipeline = Pipeline(steps=  [ ('first_transformation', first_transformation),
                                ...
                              ('last_transformation', last_transformation)          ] ) 

In [None]:
pipeline.fit(X,y)

## Scikit-learn does

In [None]:
# Sketch of what pipeline.fit(X, y) does (illustrative pseudo-code):
# every step but the last is fitted and applied to the data, the last step is
# only fitted; y is handed to every step.
X_first = first_transformation.fit_transform(X, y)
X_second = second_transformation.fit_transform(X_first, y)
...
last_transformation.fit(X_previous_last, y)

* All steps but the last *must* implement `fit` and `transform` methods
* The last step *must* implement a `fit` method; to predict or score with the pipeline it also needs the corresponding `predict` (or `transform`) method

# Add a non-scikit-learn transformation to the Pipeline  

## Lemmatizer

**lemma** = **dictionary entry**  

**swimming**, **swims**, **swim** $\Rightarrow$ same **lemma** $\Rightarrow$ **swim**  

**Lemmatizer**: word $\mapsto$ lemma

##   nltk = natural language toolkit  (NLP library)

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
class Lemmatizer():
    """Pipeline step that replaces every word of a sentence by its WordNet lemma."""

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the step itself."""
        return self

    def transform(self, X, y=None):
        """Lower-case each sentence of the Series ``X``, split it on single
        spaces, lemmatize every token and glue the tokens back together with
        single spaces. Returns a Series of the lemmatized sentences."""
        wordnet = WordNetLemmatizer()

        def lemmatize_tokens(tokens):
            # one lemma per token, re-joined into a single string
            return " ".join(wordnet.lemmatize(token) for token in tokens)

        return X.str.lower().str.split(' ').apply(lemmatize_tokens)

In [None]:
lemmatizer = Lemmatizer()

In [None]:
sentence = pd.Series(data=['Cows and pigs are common animals on farms'])

In [None]:
lemmatizer.transform(sentence).iloc[0]

In [None]:
pipeline = Pipeline(steps=[  ('lem', Lemmatizer()),
                             ('cvec', CountVectorizer()),
                             ('logreg', LogisticRegression())   ])

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

# Adding Gensim word2vec

In [None]:
from gensim.models.word2vec import Word2Vec

In [None]:
class GensimWord2Vec():
    """Pipeline step that represents a sentence by the mean of its word2vec vectors.

    ``fit`` trains a gensim ``Word2Vec`` model on the sentences; ``transform``
    maps every sentence to the average of the vectors of its known words.
    """

    @staticmethod
    def _tokenize(X):
        """Lower-case every sentence of the Series ``X`` and split it on single spaces."""
        return X.str.lower().str.split(' ')

    def fit(self, X, y=None):
        """Train the word2vec model on the sentences in the Series ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Word2Vec expects an iterable of token lists. Given the raw strings it
        # iterates over their characters, so the vocabulary would be made of
        # single characters and transform() would find almost no word in it.
        self.model = Word2Vec(self._tokenize(X).tolist())
        return self

    def transform(self, X, y=None):
        """Return a DataFrame (indexed like ``X``) with one averaged vector per sentence.

        Words missing from the trained vocabulary are skipped; a sentence
        without any known word becomes the zero vector.
        """
        # the word vectors live on `model.wv`; indexing the model object itself
        # is deprecated in gensim
        word_vectors = self.model.wv
        n_dims = self.model.vector_size

        def average(tokens):
            known = [word_vectors[token] for token in tokens if token in word_vectors]
            if not known:
                return np.zeros(n_dims)
            return np.mean(known, axis=0)

        # build the frame in one go instead of creating one pd.Series per row
        rows = [average(tokens) for tokens in self._tokenize(X)]
        return pd.DataFrame(rows, index=X.index)

In [None]:
pipeline = Pipeline(steps=  [   ('word2vec', GensimWord2Vec()),
                                ('logreg', LogisticRegression())    ]  )

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

##  Feature unions: Combine bag of words and word2vec

<center><img src="../img/figures/diagram_feature_union1.svg" width=1100></center>

In [None]:
from sklearn.pipeline import FeatureUnion

In [None]:
lem_cvec = Pipeline(steps = [('lem', Lemmatizer()),
                             ('cvec', CountVectorizer())])

In [None]:
feature_union = FeatureUnion([('lem_cvec', lem_cvec),
                              ('gensimw2v', GensimWord2Vec())])

In [None]:
pipeline = Pipeline( [  ('feature_union', feature_union),
                        ('logreg', LogisticRegression())                          ])

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

# Further experiment: tf-idf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
pipeline = Pipeline(steps=  [  ('lem', Lemmatizer()),
                               ('tfidf', TfidfVectorizer()),
                               ('logreg', LogisticRegression())    ])

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

# Further experiment: Naive Bayes classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
pipeline = Pipeline(steps=  [    ('CountVectorizer', CountVectorizer()),
                                 ('NaiveBayes', MultinomialNB())             ])

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

# Part 3: Gridsearch

## What is a **hyperparameter**?

## Examples
* learning rate 
* regularization coefficient
* number of hidden layers in a neural network
* ...

## Responsibility of the data scientist

<font size=7>change **hyperparameter** $\Rightarrow$ change **model** $\Rightarrow$ change **performance**</font>

# Baseline model

In [None]:
cvec = CountVectorizer()

In [None]:
X_cvec = cvec.fit_transform(X)

In [None]:
logistic_regression = LogisticRegression(C=1)

* `C` is a **hyperparameter**

* How do we know about `C`?

In [None]:
LogisticRegression?

In [None]:
logistic_regression = LogisticRegression(C=1)
logistic_regression.fit(X_cvec, y);
logistic_regression.score(X_cvec, y)

In [None]:
X_val_cvec = cvec.transform(X_val)
logistic_regression.score(X_val_cvec, y_val)

# Gridsearch

* Our previous model depends on a **hyperparameter** `C`

* Changing `C` changes the performance $\Rightarrow$  **Try** different `C`

* Keep track of the results! (Who remembers the results we got?)

* We want this to be done automatically

* Gridsearch is what we need

# Gridsearch in scikit-learn

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
gridsearch = GridSearchCV(estimator=LogisticRegression(), 
                  param_grid={'C': [0.1, 1, 10, 100, 1000]}, verbose=3)

In [None]:
gridsearch.fit(X_cvec, y)

In [None]:
gridsearch.best_params_ 

In [None]:
gridsearch.best_score_

In [None]:
gridsearch.cv_results_

## Cross Validation (CV): no need for separate validation set

<center><img src='../img/figures/K-fold_cross_validation_EN.jpg' width=750>

<font size=3>By Fabian Flöck [<a href="https://creativecommons.org/licenses/by-sa/3.0">CC BY-SA 3.0</a>], <a href="https://commons.wikimedia.org/wiki/File:K-fold_cross_validation_EN.jpg">from Wikimedia Commons</a></font>
</center>


### Varying regularization

In [None]:
# 'l1' is part of the grid, so pin a solver that supports both penalties:
# liblinear handles 'l1' and 'l2', whereas lbfgs (the default solver since
# scikit-learn 0.22) rejects 'l1' and every such fit would fail.
gridsearch = GridSearchCV(estimator=LogisticRegression(solver='liblinear'), 
                  param_grid={'C': [0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}, verbose=3)

In [None]:
gridsearch.fit(X_cvec, y)

In [None]:
gridsearch.best_params_

In [None]:
gridsearch.best_score_

In [None]:
gridsearch.cv_results_

* How to optimize hyperparameters of pipelines?

* This works automatically with pipelines of scikit-learn objects

# Gridsearch on pipelines of scikit-learn objects

In [None]:
pipeline = Pipeline(steps=  [   ('CountVectorizer', CountVectorizer()),
                                ('NaiveBayes', MultinomialNB())           ])

In [None]:
pipeline.get_params()

In [None]:
MultinomialNB?

In [None]:
param_grid = {  'CountVectorizer__binary': [True, False],
                'CountVectorizer__ngram_range': [(1, 1), (1,2)],
                'NaiveBayes__alpha': np.logspace(start=-1, stop=1, num=3)    }   

In [None]:
gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, verbose=5)

In [None]:
gridsearch.fit(X, y)

In [None]:
gridsearch.best_score_

In [None]:
gridsearch.best_params_

In [None]:
gridsearch.cv_results_

In [None]:
pipeline.set_params(**gridsearch.best_params_);

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

# Adding non-sklearn objects

Gridsearch $\Rightarrow$ derive from `BaseEstimator`

`fit_transform` $\Rightarrow$ derive from `TransformerMixin`

In [None]:
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

In [None]:
class GensimWord2Vec(TransformerMixin, BaseEstimator): # Derive from BaseEstimator!
    """Pipeline step that represents a sentence by the mean of its word2vec vectors.

    Parameters
    ----------
    size : int, default 100
        Forwarded to ``Word2Vec`` as ``size`` when the model is trained.
    min_count : int, default 5
        Forwarded to ``Word2Vec`` as ``min_count`` when the model is trained.
    """

    def __init__(self, size=100, min_count=5):
        # only store the hyperparameters here; the model is built in fit()
        self.size=size
        self.min_count=min_count

    @staticmethod
    def _tokenize(X):
        """Lower-case every sentence of the Series ``X`` and split it on single spaces."""
        return X.str.lower().str.split(' ')

    def fit(self, X, y=None):
        """Train the word2vec model on the sentences in the Series ``X``.

        ``y`` is accepted for pipeline compatibility and ignored.
        Returns ``self``.
        """
        # Word2Vec expects an iterable of token lists. Given the raw strings it
        # iterates over their characters, so the vocabulary would be made of
        # single characters and transform() would find almost no word in it.
        sentences = self._tokenize(X).tolist()
        # NOTE: gensim >= 4.0 renamed the `size` keyword to `vector_size`
        self.model = Word2Vec(sentences, size=self.size, min_count=self.min_count)
        return self

    def transform(self, X, y=None):
        """Return a DataFrame (indexed like ``X``) with one averaged vector per sentence.

        Words missing from the trained vocabulary are skipped; a sentence
        without any known word becomes the zero vector.
        """
        # the word vectors live on `model.wv`; indexing the model object itself
        # is deprecated in gensim
        word_vectors = self.model.wv
        n_dims = self.model.vector_size

        def average(tokens):
            known = [word_vectors[token] for token in tokens if token in word_vectors]
            if not known:
                return np.zeros(n_dims)
            return np.mean(known, axis=0)

        # build the frame in one go instead of creating one pd.Series per row
        rows = [average(tokens) for tokens in self._tokenize(X)]
        return pd.DataFrame(rows, index=X.index)

In [None]:
pipeline = Pipeline(steps=  [  ('word2vec', GensimWord2Vec()),
                               ('logreg', LogisticRegression())        ])

In [None]:
pipeline.get_params()

In [None]:
param_grid = {   'word2vec__min_count': [1],
                 'word2vec__size': [10, 50]           } 

In [None]:
gridsearch = GridSearchCV(estimator=pipeline, param_grid=param_grid, verbose=5)

In [None]:
gridsearch.fit(X, y)

In [None]:
gridsearch.best_params_

In [None]:
gridsearch.best_score_

In [None]:
pipeline.set_params(**gridsearch.best_params_);

In [None]:
pipeline.fit(X, y);

In [None]:
pipeline.score(X, y)

In [None]:
pipeline.score(X_val, y_val)

# Conclusion

* <font style="font-size:60px;"> **Pipelines** $\Rightarrow$  **clear code** + **easy experiments** </font>

*  <font style="font-size:60px;"> **Gridsearch** $\Rightarrow$ **tuning** of **hyperparameters** </font>

* <font style="font-size:60px;">**Scikit-learn** $\Rightarrow$ convenient classes for both</font>

# Thank you for your attention