In [1]:
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline

In [2]:
# Building blocks for the pipelines: an n-gram counter and a classifier.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC

# Counts word-level n-grams, from single words up to 4-word sequences.
vect = CountVectorizer(analyzer='word', ngram_range=(1, 4))

# Linear support vector classifier with its default settings.
clf = LinearSVC()

In [3]:
# Read the training and test files; missing cells become empty strings.
train, test = (
    pd.read_csv(csv_name).fillna("")
    for csv_name in ("train.csv", "test.csv")
)

In [4]:
# (rows, columns) of the training frame followed by the test frame
tuple(frame.shape for frame in (train, test))

((10158, 6), (22513, 4))

In [5]:
# Peek at the first two training rows.
train.head(n=2)

Unnamed: 0,id,query,product_title,product_description,median_relevance,relevance_variance
0,1,bridal shower decorations,Accent Pillow with Heart Design - Red/Black,Red satin accent pillow embroidered with a hea...,1,0.0
1,2,led christmas lights,Set of 10 Battery Operated Multi LED Train Chr...,Set of 10 Battery Operated Train Christmas Lig...,4,0.0


In [6]:
#Jaccard similarity coefficient score
#The Jaccard index [1], or Jaccard similarity coefficient, is defined as the size of the intersection divided by the size of the union of two sets.

#define regex1 for use in Jaccard function
# Every character that is not an ASCII letter is replaced by a space
# before the text is split into words.
regex1 = re.compile('[^a-zA-Z]')

def Jaccard(row):
    """Jaccard similarity between the word sets of two pieces of text.

    The score is ``len(A & B) / len(A | B)`` where A and B are the sets of
    lower-cased, letters-only words found in the first and second text.

    Parameters
    ----------
    row : iterable
        Its first two items must be strings, e.g. a two-column row passed
        in by ``DataFrame.apply(..., axis=1)``, or a plain list/tuple.

    Returns
    -------
    float
        A value between 0.0 and 1.0; 0.0 when neither text has any word.
    """
    # Unpack by iteration rather than by row[0]/row[1] so that rows with
    # string labels do not rely on positional fallback indexing.
    first_text, second_text = tuple(row)[:2]

    # split() with no argument skips runs of whitespace. split(' ') would
    # emit '' for every extra space the regex leaves behind, and that
    # empty "word" would be counted in the intersection and the union.
    words0 = set(regex1.sub(' ', first_text).lower().split())
    words1 = set(regex1.sub(' ', second_text).lower().split())

    # Avoid dividing by zero when both word sets are empty.
    normalizer = max(float(len(words0 | words1)), 1.0)
    return len(words0 & words1) / normalizer

In [7]:
# Score how much each query overlaps with its product title, on both frames.
for frame in (train, test):
    frame['Jaccard'] = frame[['query', 'product_title']].apply(Jaccard, axis=1)

In [11]:
from sklearn.pipeline import Pipeline, FeatureUnion

# Baseline pipeline: n-gram counts go straight into the classifier.
pipeline = Pipeline(steps=[
    ('vect', vect),  # turn the text into n-gram counts
    ('clf', clf),    # classify from those counts
])

In [13]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18,
# and the old sklearn.cross_validation module was removed in 0.20. Try the
# current location first and fall back for old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

def run_experiment(X, y, pipeline, num_expts=100, random_state=None):
    """Estimate accuracy by averaging over repeated random train/test splits.

    On every round the data is split with ``train_test_split``, the given
    pipeline is fit on the training part, and its predictions on the other
    part are scored with ``accuracy_score``. The pipeline object passed in
    is refit on each round; it is not copied.

    Parameters
    ----------
    X, y : features and labels, in any form ``train_test_split`` accepts.
    pipeline : object with ``fit(X, y)`` returning a model with ``predict(X)``.
    num_expts : int, default 100
        Number of random splits to average over. Must be at least 1.
    random_state : int or None, default None
        When given, round ``i`` splits with seed ``random_state + i`` so the
        whole experiment is repeatable. None keeps the splits unseeded.

    Returns
    -------
    float
        The mean accuracy over all rounds (also printed).

    Raises
    ------
    ValueError
        If ``num_expts`` is smaller than 1.
    """
    if num_expts < 1:
        raise ValueError("num_expts must be at least 1, got %r" % (num_expts,))

    scores = list()
    for i in range(num_expts):
        seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_true = train_test_split(
            X, y, random_state=seed)
        model = pipeline.fit(X_train, y_train)  # train the classifier
        y_pred = model.predict(X_test)          # predict the held-out part
        # accuracy_score expects the true labels first, predictions second
        scores.append(accuracy_score(y_true, y_pred))

    mean_score = sum(scores) / len(scores)
    print(mean_score)
    return mean_score

In [14]:
# The general shape of a custom data transformer is as follows:

from sklearn.base import TransformerMixin, BaseEstimator

class DataTransformer(BaseEstimator, TransformerMixin):
    """Skeleton showing the general shape of a custom data transformer."""

    def __init__(self, vars):
        # Keep whatever variables the `transform` step will need later.
        self.vars = vars

    def fit(self, *_):
        """Return ``self`` without doing anything.

        Most of the time `fit` has no work to do. The exceptions are custom
        classifiers, and transformers where the way test data is transformed
        depends on how the training data was transformed (scalers and the
        n-gram transformer are examples of the latter).
        """
        return self

    def transform(self, data):
        """Return the transformed version of ``data``.

        This is the crucial method. ``data`` is whatever is passed into the
        transformer as a whole, such as a Pandas dataframe or a numpy array.
        """
        transformed = mydatatransform(data, self.vars)
        return transformed

In [15]:
# Write the extractor

class TextExtractor(BaseEstimator, TransformerMixin):
    """Pipeline step that hands one column of a data frame on as strings.

    Adapted from code by @zacstewart
    https://github.com/zacstewart/kaggle_seeclickfix/blob/master/estimator.py
    Also see Zac Stewart's excellent blogpost on pipelines:
    http://zacstewart.com/2014/08/05/pipelines-of-featureunions-of-pipelines.html
    """

    def __init__(self, column_name):
        self.column_name = column_name

    def fit(self, *_):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, df):
        """Return ``df[self.column_name]`` as a numpy array of type str."""
        column = df[self.column_name]
        as_array = np.asarray(column)
        return as_array.astype(str)

In [29]:
# Training inputs, training target, and the same input columns for the test set.
X = train.loc[:, ['query', 'Jaccard']]
y = train.loc[:, 'median_relevance']
X_test = test.loc[:, ['query', 'Jaccard']]

In [23]:
pipeline = Pipeline(steps=[
    ('query_extractor', TextExtractor('query')),  # pull the 'query' column out of the frame
    ('vect', vect),                               # turn the query text into n-gram counts
    ('clf', clf),                                 # classify from those counts
])

In [24]:
# Average accuracy of the current pipeline over repeated random splits.
run_experiment(X=X, y=y, pipeline=pipeline)

0.634811023622


In [25]:
class Apply(BaseEstimator, TransformerMixin):
    """Applies a function f element-wise to the numpy array.

    Parameters
    ----------
    fn : callable
        Called once per element of the input; its results make up the
        transformed output.
    """

    def __init__(self, fn):
        # Store the argument exactly as given. scikit-learn's get_params()
        # and clone() expect __init__ to leave constructor parameters
        # unmodified, so the np.vectorize wrapper is built in transform().
        self.fn = fn

    def fit(self, *_):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, data):
        """Return ``fn`` applied to every element, as a single column.

        The input is flattened into shape (number of elements, 1) because
        otherwise sklearn interprets a 1-d array as a single sample.
        """
        # asarray lets list-like input through as well as numpy arrays.
        column = np.asarray(data).reshape(-1, 1)
        return np.vectorize(self.fn)(column)

In [26]:
# we already imported FeatureUnion earlier, so here goes

pipeline = Pipeline(steps=[
    ('query_extractor', TextExtractor('query')),  # pull the 'query' column out of the frame
    ('text_features', FeatureUnion(transformer_list=[
        ('vect', vect),                                  # n-gram counts of the query
        ('num_words', Apply(lambda s: len(s.split()))),  # number of whitespace-separated words
    ])),
    ('clf', clf),  # classify from the combined features
])

In [27]:
# Same experiment again, now with the pipeline that uses the feature union.
run_experiment(X=X, y=y, pipeline=pipeline)

0.635492125984


In [28]:
# Fit the pipeline on the full training set.
pipeline.fit(X=X, y=y)

Pipeline(steps=[('query_extractor', TextExtractor(column_name='query')), ('text_features', FeatureUnion(n_jobs=1,
       transformer_list=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=T...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [30]:
# Predict a label for every row of the test inputs.
y_pred = pipeline.predict(X=X_test)

In [31]:
# Display the predictions returned by pipeline.predict.
y_pred

array([4, 2, 3, ..., 1, 4, 4], dtype=int64)

In [32]:
# Pair every test id with its prediction and write the result without the index.
submission = pd.DataFrame(
    {"id": test["id"], "prediction": y_pred},
    columns=["id", "prediction"],
)
submission.to_csv(path_or_buf="pipeline_submission.csv", index=False)