In [1]:
import pandas as pd
import numpy as np

import nltk
from bs4 import BeautifulSoup
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from ml_metrics import quadratic_weighted_kappa

import time

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the combined train/test CSV, then swap every missing value (in any
# column) for an empty string.
data = pd.read_csv("../raw_train_test.csv")
data = data.fillna('')

In [3]:
def preprocess(text):
    """Strip markup and non-letter characters from a raw text field.

    Parameters
    ----------
    text : str
        Raw field value, possibly containing HTML markup.

    Returns
    -------
    str
        The text content with every character outside ``a-zA-Z`` replaced by
        a space, and runs of whitespace collapsed to single spaces (leading
        and trailing whitespace removed).
    """
    # Name the parser explicitly: without it Beautiful Soup picks whichever
    # parser happens to be installed, so the extracted text can differ from
    # one environment to another.
    without_html_tags = BeautifulSoup(text, "html.parser").get_text()
    without_symbols = re.sub(r"[^a-zA-Z]", " ", without_html_tags)
    # split() + join collapses repeated spaces and trims both ends.
    return " ".join(without_symbols.split())

In [4]:
# Run preprocess() over every value of each free-text column, overwriting the
# column with the cleaned strings.
for col in ('query', 'product_title', 'product_description'):
    data[col] = data[col].apply(preprocess)

  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)
  '"%s" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client to get the document behind the URL, and feed that document to Beautiful Soup.' % markup)


In [5]:
def prefix(text, feature, stopwords):
    """Lower-case ``text``, drop stopwords and tag each remaining word.

    Every surviving word is prepended with a one-letter marker that depends on
    the column it came from, so identical words from different columns become
    distinct tokens.

    Parameters
    ----------
    text : str
        Whitespace-separated words.
    feature : str
        One of ``'query'``, ``'product_title'`` or ``'product_description'``;
        selects the marker (``'x'``, ``'y'`` or ``'z'`` respectively).
    stopwords : iterable of str
        Lower-case words to discard. Any iterable is accepted; it is turned
        into a frozenset unless it is already a set.

    Returns
    -------
    str
        The tagged words joined by single spaces (empty string if nothing
        survives).

    Raises
    ------
    KeyError
        If ``feature`` is not one of the three known column names.
    """
    markers = {'query': 'x', 'product_title': 'y', 'product_description': 'z'}
    # Named `marker` rather than `prefix` so the function's own name is not
    # shadowed inside its body.
    marker = markers[feature]
    # Hash-based membership instead of a linear scan of a list for every word.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = frozenset(stopwords)
    # Lower-case each word exactly once.
    lowered = (word.lower() for word in text.split())
    return " ".join(marker + word for word in lowered if word not in stopwords)

In [6]:
# Load the English stopword list a single time instead of once per row, and
# keep it as a frozenset so each membership test inside prefix() is a hash
# lookup.
english_stopwords = frozenset(nltk.corpus.stopwords.words('english'))

for col in ['query','product_title','product_description']:
    # `feature=col` binds the current column name at definition time.
    data[col] = data[col].apply(
        lambda x, feature=col: prefix(x, feature, english_stopwords)
    )

In [7]:
# Build the combined document for each row: query, title and description
# separated by single spaces. The three columns hold str values at this point
# (prefix() returns a joined string), so vectorized concatenation gives the
# same result as formatting each row with "%s %s %s" without the per-row
# Python call that apply(axis=1) incurs.
data['text'] = (
    data['query'] + " " + data['product_title'] + " " + data['product_description']
)

In [8]:
# Separate the rows tagged 'train' from the rows tagged 'test' using the
# 'dataset' column.
train = data[data['dataset'] == 'train']
test = data[data['dataset'] == 'test']

In [None]:
cross_val_start = time.time()
if __name__ == "__main__":

    # Single explicit seed for the stochastic parts of the run. The previous
    # `np.random.randint(1)` draws from [0, 1) and therefore always yielded 0,
    # so spelling the value out keeps the fold definition unchanged.
    RANDOM_SEED = 0

    # Built once and reused by tokenize() instead of being re-created for
    # every document.
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()

    def tokenize(text):
        """Tokenize ``text`` and normalise each token.

        Each token is lower-cased, stemmed with the Porter stemmer and the
        stem is then passed through the WordNet lemmatizer.

        Returns a list of normalised token strings.
        """
        return [
            lemmatizer.lemmatize(stemmer.stem(token.lower()))
            for token in nltk.word_tokenize(text)
        ]

    # TF-IDF -> truncated SVD (200 components) -> standardisation -> SVC.
    prediction_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(tokenizer=tokenize, min_df=5, max_df=500)),
        # Seeded so repeated runs produce the same decomposition and scores.
        ('SVD', TruncatedSVD(n_components=200, random_state=RANDOM_SEED)),
        ('normalizer', StandardScaler()),
        ('estimator', SVC(C=10.0))
    ])

    # Wrap quadratic weighted kappa as a scorer (higher is better).
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa,
        greater_is_better=True
    )

    # Uses the legacy sklearn.cross_validation API imported at the top of the
    # notebook: StratifiedKFold takes the labels and the number of folds (5).
    folds = StratifiedKFold(
        train['median_relevance'], 5, random_state=RANDOM_SEED
    )

    scores = cross_val_score(
        estimator=prediction_pipeline,
        X=train['text'],
        y=train['median_relevance'],
        scoring=kappa_scorer,
        cv=folds,
        n_jobs=-1
    )

    # Reported inside the guard because `scores` only exists when the guarded
    # block has run; outside it these lines would raise NameError.
    print("Average cross validation score: %0.4f" % scores.mean())
    print("All Cross Validation Scores: %s" % scores)
    print("Training time: %0.1f seconds" % (time.time() - cross_val_start))