In [87]:
import pandas as pd
import numpy as np

import nltk
from bs4 import BeautifulSoup
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from ml_metrics import quadratic_weighted_kappa

import time
import warnings

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw CSV, then replace every missing value with an empty string.
data = pd.read_csv("../../raw_train_test.csv")
data = data.fillna("")

In [53]:
def preprocess(text):
    """Reduce a raw (possibly HTML) string to space-separated alphanumeric words.

    Steps, in order:
      1. Strip HTML markup, keeping only the text content.
      2. Drop everything up to and including the last "Item specifics"
         marker, if one is present.
      3. Replace every character outside [a-zA-Z0-9] with a space.
      4. Collapse whitespace runs into single spaces and trim both ends.

    Args:
        text: Raw string; may contain HTML.

    Returns:
        The cleaned string (possibly empty).
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed, so output can differ between machines
    # and a warning is emitted on every call.
    plain_text = BeautifulSoup(text, "html.parser").get_text()
    # DOTALL lets "." match newlines, giving the same match as "(.|\n)*"
    # without the per-character alternation.
    body_text = re.sub(r"^.*Item specifics", "", plain_text, flags=re.DOTALL)
    alnum_only = re.sub(r"[^a-zA-Z0-9]", " ", body_text)
    return " ".join(alnum_only.split())

# Built once and shared by every tokenize() call; the vectorizer invokes
# tokenize() once per document, so constructing these per call is wasted work.
_STEMMER = nltk.PorterStemmer()
_LEMMATIZER = nltk.WordNetLemmatizer()


def tokenize(text):
    """Split text into normalised tokens.

    Each token produced by nltk.word_tokenize is lower-cased, Porter-stemmed,
    and then passed through the WordNet lemmatizer, in that order.

    Args:
        text: String to tokenize.

    Returns:
        List of normalised token strings, in their original order.
    """
    return [
        _LEMMATIZER.lemmatize(_STEMMER.stem(token.lower()))
        for token in nltk.word_tokenize(text)
    ]

In [52]:
# Overwrite each free-text column with its preprocess()-cleaned version.
for col in ('query', 'product_title', 'product_description'):
    cleaned_column = data[col].apply(preprocess)
    data[col] = cleaned_column

In [64]:
# Model input text: "<query> <product_title>".
# Concatenate whole columns instead of calling a Python lambda once per row;
# astype(str) mirrors what "%s" formatting does to each value.
data['text'] = (
    data['query'].astype(str) + " " + data['product_title'].astype(str)
)

In [70]:
# TF-IDF features over the combined query/title text.
# NOTE: the vectorizer is fitted on every row of `data`, not only the rows
# flagged as 'train', so vocabulary and IDF weights come from all rows.
tfidf = TfidfVectorizer(tokenizer = tokenize, stop_words = 'english')
X = tfidf.fit_transform(data['text'])

# The training set is taken as the first `train_test_split_idx` rows.
# Check that assumption explicitly rather than silently slicing wrong rows.
is_train = (data['dataset'] == 'train').values
train_test_split_idx = int(is_train.sum())
assert is_train[:train_test_split_idx].all(), (
    "rows with dataset == 'train' must form a contiguous block at the top of `data`"
)

X_train = X[:train_test_split_idx,:]
# Slice labels by position (not by index label) so they stay aligned with
# the rows of X regardless of what index `data` carries.
y_train = data['median_relevance'].values[:train_test_split_idx].astype(np.int32)

In [100]:
# Model: 100-component truncated SVD -> feature standardisation ->
# logistic regression with built-in cross-validated regularisation.
pipeline = Pipeline([
    # Seeded so the randomised SVD solver (and therefore every downstream
    # score) is reproducible from run to run.
    ('SVD',TruncatedSVD(n_components=100, random_state=42)),
    ('normalizer',StandardScaler()),
    # NOTE(review): this step is named 'naive_bayes' but holds a logistic
    # regression. The name is kept so any `naive_bayes__*` parameter
    # references elsewhere keep working.
    ('naive_bayes',LogisticRegressionCV(class_weight='auto',max_iter=200))
])

In [101]:
# 5-fold cross-validation of `pipeline` on the training split, scored with
# quadratic weighted kappa, using all available cores.
cross_val_start = time.time()
# NOTE(review): the __main__ guard is presumably there for process-based
# parallelism (n_jobs=-1) on platforms that re-import the main module.
if __name__ == "__main__":
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa,
        greater_is_better = True
    )

    scores = cross_val_score(
        estimator = pipeline,
        X = X_train,
        y = y_train,
        scoring = kappa_scorer,
        cv = 5,
        n_jobs = -1
    )

    # Report inside the guard: `scores` only exists when the block above ran,
    # so printing outside it would raise NameError whenever the guard is false.
    print("Average cross validation score: %0.4f" % scores.mean())
    print("All Cross Validation Scores: %s" % scores)
    print("Training time: %0.1f seconds" % (time.time() - cross_val_start))

Average cross validation score: 0.2449
All Cross Validation Scores: [ 0.24081318  0.25488596  0.22682284  0.2497383   0.25219347]
Training time: 11.7 seconds
