 ## Creating basic Pipelines using sklearn

> ### Importing libraries

In [1]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

import numpy as np
import os
import pickle
import numpy as np
import pandas as pd
from scipy.sparse import hstack

import eli5

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
import seaborn as sns
from IPython.display import display_html
from sklearn.base import BaseEstimator, TransformerMixin
import pdb

In [2]:
# Directory containing the input data files, relative to the notebook.
PATH_TO_DATA = '../input/'
# Random seed handed to estimators (e.g. the classifier's random_state).
SEED = 17

> ### Reading Raw Data Files

In [3]:
# Names of the ten site-id columns and the ten timestamp columns of a session.
sites = ['site{}'.format(i) for i in range(1, 11)]
times = ['time{}'.format(i) for i in range(1, 11)]

# Locations of the raw input files.
path_to_train = os.path.join(PATH_TO_DATA, 'train_sessions.csv')
path_to_test = os.path.join(PATH_TO_DATA, 'test_sessions.csv')
path_to_site_dict = os.path.join(PATH_TO_DATA, 'site_dic.pkl')

# Both session files are indexed by session_id; the time columns are parsed as datetimes.
train_df = pd.read_csv(path_to_train, index_col='session_id', parse_dates=times)
test_df = pd.read_csv(path_to_test, index_col='session_id', parse_dates=times)

# NOTE: pickle.load can execute arbitrary code -- only load a file you trust.
with open(path_to_site_dict, 'rb') as f:
    site2id = pickle.load(f)

# Invert the site -> id mapping into an id -> site mapping.
id2site = dict((site_id, site) for site, site_id in site2id.items())
# Site id 0 is treated as "unknown".
id2site[0] = 'unknown'

> ### Custom Debugger to print intermediate values

### Usage
```
                    Pipeline( [
                    ('check',FunctionTransformer(randomfunction, validate=False)),              
                    ("debug1", Debug())] )
```

In [4]:
class Debug(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a preview of the data it receives.

    Insert it between two pipeline steps to inspect the intermediate value;
    the data itself is returned unchanged.

    Parameters
    ----------
    n_rows : int, default=5
        Number of leading rows/items of ``X`` to print. ``X`` must support
        slicing (``X[:n_rows]``).
    """

    def __init__(self, n_rows=5):
        self.n_rows = n_rows

    def transform(self, X):
        """Print the first ``n_rows`` entries of ``X`` and return ``X`` as is."""
        print("Debugger Start")
        # Slice from the start so the very first row is part of the preview.
        print(X[:self.n_rows])
        # what other output you want
        print("End")
        return X

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; returns ``self`` so the step can sit in a Pipeline."""
        return self

> ### Pipeline Specific Functions.

If the transformation does not need any additional parameters, a plain function wrapped in `FunctionTransformer` can be used.

In [5]:
def concatfunction(data):
    """Turn each session's site ids into one space-separated string of site names.

    Reads the module-level ``sites`` column list and ``id2site`` mapping.
    Missing site ids are replaced by 0 before the lookup.

    Returns a list with one string per row of ``data``.
    """
    site_ids = data[sites].fillna(0).astype('int')

    def _row_to_sentence(row):
        names = [id2site[site_id] for site_id in row]
        return ' '.join(names)

    return site_ids.apply(_row_to_sentence, axis=1).tolist()

>  Custom Transformer

To pass additional parameters, a custom transformer class is created.

In [6]:
class add_time_features(BaseEstimator, TransformerMixin):
    """Derive part-of-day indicator features from one datetime column.

    Parameters
    ----------
    column : str, default='time1'
        Name of the datetime column the hour is read from.
    add_hour : bool, default=False
        If True, append an ``hour`` feature holding the hour divided by 24.
    """

    def __init__(self, column='time1', add_hour=False):
        self.column = column
        self.add_hour = add_hour

    def fit(self, data, y=None):
        """Stateless transformer: nothing is learned, ``self`` is returned."""
        return self

    def transform(self, data, y=None):
        """Build the time features for ``data``.

        Returns a DataFrame indexed like ``data`` with the 0/1 columns
        ``morning`` (7-11h), ``day`` (12-18h), ``evening`` (19-23h) and
        ``night`` (0-6h), plus ``hour`` when ``add_hour`` is set.
        """
        # Only the configured column is read, so frames that lack the other
        # time columns are accepted as well. to_datetime is a no-op for a
        # datetime64 column and lets .dt work on object columns of Timestamps.
        hour = pd.to_datetime(data[self.column]).dt.hour

        # Series.between is inclusive on both ends; missing hours compare False.
        part_of_day = [
            ('morning', hour.between(7, 11)),
            ('day', hour.between(12, 18)),
            ('evening', hour.between(19, 23)),
            ('night', hour.between(0, 6)),
        ]
        feature_names = [name for name, _ in part_of_day]
        objects_to_hstack = [
            flag.astype('int').values.reshape(-1, 1) for _, flag in part_of_day
        ]

        if self.add_hour:
            # scale hour dividing by 24
            objects_to_hstack.append(hour.values.reshape(-1, 1) / 24)
            feature_names.append('hour')

        return pd.DataFrame(np.hstack(objects_to_hstack),
                            columns=feature_names, index=data.index)

In [7]:
# Keyword arguments for the TfidfVectorizer that processes the space-joined site names.
vectorizer_params = {
    'ngram_range': (1, 5),
    'max_features': 100000,
    # Documents are whitespace-delimited site names. ``str.split`` behaves
    # exactly like ``lambda s: s.split()`` but, unlike a lambda, can be
    # pickled, so a pipeline using these parameters can be saved to disk.
    'tokenizer': str.split,
}
# TimeSeriesSplit cross-validator configured with 10 splits
# (presumably for validating the model on time-ordered sessions -- confirm against later cells).
time_split = TimeSeriesSplit(n_splits=10)

### Pipeline

In [8]:

# data --+-->concatenate sites in a session-->tf-idf vectorizer--+-->FeatureUnion-->Logistic Regression
#        |                                                       |
#        +--> extracting time features from start date column  --+


1. FeatureUnion: http://michelleful.github.io/code-blog/2015/06/20/pipelines/
2. FunctionTransformer: https://stackoverflow.com/questions/39001956/sklearn-pipeline-how-to-apply-different-transformations-on-different-columns/39009125#39009125        

In [9]:
from sklearn.preprocessing import FunctionTransformer

# Branch 1: join the site names of each session into one string, then TF-IDF vectorize it.
tfidf_branch = Pipeline([
    ('concatenate sites', FunctionTransformer(concatfunction, validate=False)),
    ('vectorizing text', TfidfVectorizer(**vectorizer_params)),
])

# Branch 2: part-of-day indicators derived from the 'time1' column.
time_branch = Pipeline([
    ('time1 features', add_time_features(column='time1', add_hour=False)),
])

# Both branches receive the same input; FeatureUnion concatenates their outputs.
feature_union = FeatureUnion(transformer_list=[
    ('tf-idf features', tfidf_branch),
    ('time features', time_branch),
])

# Full model: combined features followed by a logistic-regression classifier.
pipeline = Pipeline([
    ('union', feature_union),
    ('classifier', LogisticRegression(C=1, random_state=SEED, solver='liblinear')),
])


> ### Fitting Pipeline on training data

In [10]:
# Separate the feature columns from the 'target' label before fitting.
X_train = train_df.drop(columns='target')
y_train = train_df['target']
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('tf-idf features', Pipeline(memory=None,
     steps=[('concatenate sites', FunctionTransformer(accept_sparse=False, check_inverse=True,
          func=<function concatfunction at 0x7f17304f3048>,
          inv_kw_args=None, inverse...alty='l2', random_state=17, solver='liblinear',
          tol=0.0001, verbose=0, warm_start=False))])

> ### Making predictions on test data

In [11]:
# Predict a class label for every row of the test sessions with the fitted pipeline.
pipeline.predict(test_df)

array([0, 0, 0, ..., 0, 0, 0])