In [63]:
import pandas as pd
import numpy as np

import nltk
from bs4 import BeautifulSoup
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import SVC
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from ml_metrics import quadratic_weighted_kappa

import time

import matplotlib.pyplot as plt
%matplotlib inline

#Import Data

In [83]:
data = pd.read_csv("../../raw_train_test.csv").fillna('')

In [84]:
def preprocess(text):
    without_html_tags = BeautifulSoup(text).get_text()
    #without_headers = re.sub(r"^(.|\n)*Item specifics","",without_html_tags)
    without_symbols = re.sub(r"[^a-zA-Z]"," ",without_html_tags)
    without_extra_spaces = " ".join(without_symbols.split())
    return without_extra_spaces

In [85]:
for col in ['query','product_title','product_description']:
    data[col] = data[col].apply(lambda x: preprocess(x))

##Method #0: TF-IDF On Combined, Unlabeled Query and Title Only.

In [86]:
def method_0_tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(stemmer.stem(t.lower())) for t in tokens]

In [87]:
data['text0'] = data.apply(
    lambda x: "%s %s" % (
        x['query'], x['product_title']),
    axis = 1
)

In [88]:
tfidf = TfidfVectorizer(
    tokenizer = method_0_tokenize,
    min_df=5, 
    max_df=500, 
    max_features=None, 
    strip_accents='unicode', 
    analyzer='word', 
    token_pattern=r'\w{1,}', 
    ngram_range=(1, 2), 
    use_idf=True, 
    smooth_idf=True, 
    sublinear_tf=True, 
    stop_words = 'english'
)
train_test_split_idx = data[data.dataset == 'train'].shape[0]
X = tfidf.fit_transform(data['text0'])
X_train0 = X[:train_test_split_idx,:] 
y_train0 = data.loc[:train_test_split_idx-1,'median_relevance'].values.astype(int)

In [89]:
method_0_pipeline = Pipeline([
    ('SVD',TruncatedSVD(n_components=200)),
    ('normalizer',StandardScaler()),
    ('estimator',SVC(C = 10.0))
])

In [90]:
cross_val_start = time.time()
if __name__ == "__main__":
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    scores = cross_val_score(
        estimator = method_0_pipeline,
        X = X_train0,
        y = y_train0,
        scoring = kappa_scorer,
        cv = StratifiedKFold(y_train0,5,random_state=np.random.randint(1)),
        n_jobs = -1
    )
print("Average cross validation score: %0.4f" % scores.mean())
print("All Cross Validation Scores: %s" % scores)
print("Training time: %0.1f seconds" % (time.time() - cross_val_start))

Average cross validation score: 0.5246
All Cross Validation Scores: [ 0.50961913  0.54300078  0.50769423  0.54725975  0.51536205]
Training time: 87.2 seconds


##Method #1: TF-IDF On Combined, Un-Labeled Text Features. Corpus is train + test

In [91]:
def method_1_tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(stemmer.stem(t.lower())) for t in tokens]

In [92]:
data['text1'] = data.apply(
    lambda x: "%s %s %s" % (
        x['query'], x['product_title'], x['product_description']),
    axis = 1
)

In [93]:
tfidf = TfidfVectorizer(
    tokenizer = method_1_tokenize,
    min_df=5, 
    max_df=500, 
    max_features=None, 
    strip_accents='unicode', 
    analyzer='word', 
    token_pattern=r'\w{1,}', 
    ngram_range=(1, 2), 
    use_idf=True, 
    smooth_idf=True, 
    sublinear_tf=True, 
    stop_words = 'english'
)
train_test_split_idx = data[data.dataset == 'train'].shape[0]
X = tfidf.fit_transform(data['text1'])
X_train1 = X[:train_test_split_idx,:] 
y_train1 = data.loc[:train_test_split_idx-1,'median_relevance'].values.astype(int)

In [94]:
method_1_pipeline = Pipeline([
    ('SVD',TruncatedSVD(n_components=200)),
    ('normalizer',StandardScaler()),
    ('estimator',SVC(C = 10.0))
])

In [95]:
cross_val_start = time.time()
if __name__ == "__main__":
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    scores = cross_val_score(
        estimator = method_1_pipeline,
        X = X_train1,
        y = y_train1,
        scoring = kappa_scorer,
        cv = StratifiedKFold(y_train1,5,random_state=np.random.randint(1)),
        n_jobs = -1
    )
print("Average cross validation score: %0.4f" % scores.mean())
print("All Cross Validation Scores: %s" % scores)
print("Training time: %0.1f seconds" % (time.time() - cross_val_start))

Average cross validation score: 0.4922
All Cross Validation Scores: [ 0.50641951  0.48525595  0.4765771   0.51390849  0.47906106]
Training time: 100.0 seconds


##Method #2: TF-IDF On Combined, Labeled Text Features.

Here we'll preprend the query and product title words with "q" and "p", respectively, and see if the results are different.

In [96]:
def method_2_tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(stemmer.stem(t.lower())) for t in tokens]

def prefix(text,feature,stopwords):
    prefixes = {'query':'x','product_title':'y','product_description':'z'}
    prefix = prefixes[feature]
    
    words = text.split()
    return " ".join(
        [prefix + word.lower() for word in words if word.lower() not in stopwords]
    )

In [97]:
for col in ['query','product_title','product_description']:
    data[col+"2"] = data[col].apply(
        lambda x: prefix(x,col,nltk.corpus.stopwords.words('english'))
    )

In [98]:
data['text2'] = data.apply(
    lambda x: "%s %s %s" % (
        x['query2'], x['product_title2'], x['product_description2']),
    axis = 1
)

In [99]:
tfidf = TfidfVectorizer(
    tokenizer = method_2_tokenize,
    min_df=5, 
    max_df=500, 
    max_features=None, 
    strip_accents='unicode', 
    analyzer='word', 
    token_pattern=r'\w{1,}', 
    ngram_range=(1, 2), 
    use_idf=True, 
    smooth_idf=True, 
    sublinear_tf=True, 
    stop_words = 'english'
)
train_test_split_idx = data[data.dataset == 'train'].shape[0]
X = tfidf.fit_transform(data['text2'])
X_train2 = X[:train_test_split_idx,:] 
y_train2 = data.loc[:train_test_split_idx-1,'median_relevance'].values.astype(int)

In [100]:
method_2_pipeline = Pipeline([
    ('SVD',TruncatedSVD(n_components=200)),
    ('normalizer',StandardScaler()),
    ('estimator',SVC(C = 10.0))
])

In [101]:
cross_val_start = time.time()
if __name__ == "__main__":
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    scores = cross_val_score(
        estimator = method_2_pipeline,
        X = X_train2,
        y = y_train2,
        scoring = kappa_scorer,
        cv = StratifiedKFold(y_train2,5,random_state=np.random.randint(1)),
        n_jobs = -1
    )
print("Average cross validation score: %0.4f" % scores.mean())
print("All Cross Validation Scores: %s" % scores)
print("Training time: %0.1f seconds" % (time.time() - cross_val_start))

Average cross validation score: 0.5537
All Cross Validation Scores: [ 0.5562447   0.55960865  0.54209732  0.54778217  0.56289438]
Training time: 101.1 seconds


##Method 3: TF-IDF On Combined, Labeled Text Features, using train as TFIDF corpus

In [103]:
def method_3_tokenize(text):
    tokens = nltk.word_tokenize(text)
    stemmer = nltk.PorterStemmer()
    lemmatizer = nltk.WordNetLemmatizer()
    return [lemmatizer.lemmatize(stemmer.stem(t.lower())) for t in tokens]

def prefix(text,feature,stopwords):
    prefixes = {'query':'x','product_title':'y','product_description':'z'}
    prefix = prefixes[feature]
    
    words = text.split()
    return " ".join(
        [prefix + word.lower() for word in words if word.lower() not in stopwords]
    )

In [104]:
for col in ['query','product_title','product_description']:
    data[col+"3"] = data[col].apply(
        lambda x: prefix(x,col,nltk.corpus.stopwords.words('english'))
    )

In [105]:
data['text3'] = data.apply(
    lambda x: "%s %s %s" % (
        x['query3'], x['product_title3'], x['product_description3']),
    axis = 1
)

In [111]:
tfidf = TfidfVectorizer(
    tokenizer = method_3_tokenize,
    min_df=5, 
    max_df=500, 
    max_features=None, 
    strip_accents='unicode', 
    analyzer='word', 
    token_pattern=r'\w{1,}', 
    ngram_range=(1, 2), 
    use_idf=True, 
    smooth_idf=True, 
    sublinear_tf=True, 
    stop_words = 'english'
)
train,test = data[data.dataset == 'train'], data[data.dataset == 'test']
X_train3,y_train3 = tfidf.fit_transform(train['text3']), train['median_relevance']
X_test3 = tfidf.fit_transform(test['text3'])

In [107]:
method_3_pipeline = Pipeline([
    ('SVD',TruncatedSVD(n_components=200)),
    ('normalizer',StandardScaler()),
    ('estimator',SVC(C = 10.0))
])

In [108]:
cross_val_start = time.time()
if __name__ == "__main__":
    kappa_scorer = metrics.make_scorer(
        quadratic_weighted_kappa, 
        greater_is_better = True
    )
    
    scores = cross_val_score(
        estimator = method_3_pipeline,
        X = X,
        y = y,
        scoring = kappa_scorer,
        cv = StratifiedKFold(y_train2,5,random_state=np.random.randint(1)),
        n_jobs = -1
    )
print("Average cross validation score: %0.4f" % scores.mean())
print("All Cross Validation Scores: %s" % scores)
print("Training time: %0.1f seconds" % (time.time() - cross_val_start))

Average cross validation score: 0.5820
All Cross Validation Scores: [ 0.56336399  0.58327726  0.58598208  0.58796243  0.58960182]
Training time: 93.6 seconds


In [112]:
prediction_start = time.time()
model = method_3_pipeline.fit(X_train,y_train)
results = pd.DataFrame({
'id': test['id'].values,
'prediction': method_3_pipeline.predict(X_test)
})
print("Prediction time: %0.1f seconds" % (time.time() - prediction_start))

ValueError: dimension mismatch

In [113]:
X_test.

(22513, 50395)