In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.stem.porter import *
import re
from sklearn.feature_extraction import text

In [5]:
training_data = pd.read_csv('train.csv', index_col = [0])

In [2]:
test_data = pd.read_csv('test.csv', index_col = [0])

In [6]:
training_data.product_description = training_data.product_description.fillna('')
test_data.product_description = test_data.product_description.fillna('')

Preprocessing

In [3]:
# stop word removal
stemmer = PorterStemmer()
sw=[]
ML_STOP_WORDS = ['http','www','img','border','color','style','padding','table','font','inch','width','height']
ML_STOP_WORDS += list(text.ENGLISH_STOP_WORDS)
for stw in ML_STOP_WORDS:
    sw.append("z"+str(stw))
ML_STOP_WORDS += sw
for i in range(len(ML_STOP_WORDS)):
    ML_STOP_WORDS[i]=stemmer.stem(ML_STOP_WORDS[i])

In [4]:
# declarations
def ML_TEXT_CLEAN(f2,f3):
    if len(f2)<3:
        f2="feature2null"
    if len(f3)<3:
        f3="feature3null"
    tx = BeautifulSoup(f3)
    tx1 = [x.extract() for x in tx.findAll('script')]
    tx = tx.get_text(" ").strip()
    #s = (" ").join(["z"+ str(z) for z in f2.split(" ")]) + " " + tx
    s = (" ").join([str(z) for z in f2.split(" ")]) + " " + tx
    s = re.sub("[^a-zA-Z0-9]"," ", s)
    s = re.sub("[0-9]{1,3}px"," ", s)
    s = re.sub(" [0-9]{1,6} |000"," ", s)
    s = (" ").join([stemmer.stem(z) for z in s.split(" ") if len(z)>2])
    s = s.lower()
    return s

In [7]:
# Cleaning training data
s_data = []
for i in range(len(training_data.id)):
    s = ML_TEXT_CLEAN(training_data.product_title[i], training_data.product_description[i])
    s_data.append((training_data["query"][i], s, str(training_data["median_relevance"][i])))

In [8]:
# Cleaning test data
t_data = []
for i in range(len(test_data.id)):
    s = ML_TEXT_CLEAN(test_data.product_title[i], test_data.product_description[i])
    t_data.append((test_data["query"][i], s, test_data.id[i]))

In [9]:
training_df = pd.DataFrame(s_data)
test_df = pd.DataFrame(t_data)

In [10]:
# tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as SVD

tf_idf             = TfidfVectorizer(lowercase=True, stop_words= ML_STOP_WORDS, analyzer='word', ngram_range=(1,2), strip_accents='unicode')
svd                = SVD(n_components=200)
X_query_doc_tf_idf = svd.fit_transform(tf_idf.fit_transform(training_df[0], training_df[1]))
X_query_tf_idf     = svd.transform(tf_idf.transform(training_df[0]))
X_doc_tf_idf       = svd.transform(tf_idf.transform(training_df[1]))

In [11]:
# query-doc tf-idf dot product
N = len(training_df)
dot     = np.reshape([X_query_tf_idf[i].dot(X_doc_tf_idf[i]) for i in range(len(X_query_tf_idf))], (N,1))

# Query Perfect Match
#N = len(training_df)
#match = np.reshape([1 if re.search(training_df[0].iloc[i].lower(), training_df[1].iloc[i].lower()) else 0 for i in range(N)], (N,1))

print("Extra features done!")

Extra features done!


In [12]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_query_doc_tf_idf = ss.fit_transform(X_query_doc_tf_idf)

ss = StandardScaler()
dot = ss.fit_transform(dot)

X = np.hstack((X_query_doc_tf_idf,dot))



---



In [13]:
# Training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X[:len(training_df)], training_df[2], test_size=0.15,
                                                      stratify=training_df[2])

from sklearn.metrics import cohen_kappa_score as kappa

Kappa = lambda y_pred, y_true: kappa(np.clip(np.round(y_pred), 1, 4), y_true, weights="quadratic")

**MLP**

In [14]:
from sklearn.neural_network import MLPRegressor

In [15]:
nn = MLPRegressor(hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='tanh')
nn.fit(X_train, y_train)

MLPRegressor(activation='tanh',
             hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100, 8),
             max_iter=500, random_state=1)

In [23]:
nn_pred_val = nn.predict(X_val)

In [25]:
y_val = pd.to_numeric(y_val)

In [26]:
print("NN Kappa: {0}".format(Kappa(nn_pred_val, y_val)))

NN Kappa: 0.51896055341184


Test sample data:

In [16]:
# Vectorize test data
#Y_query_doc_tf_idf = svd.fit_transform(tf_idf.fit_transform(test_df[0], test_df[1]))
#Y_query_tf_idf     = svd.transform(tf_idf.transform(test_df[0]))
#Y_doc_tf_idf       = svd.transform(tf_idf.transform(test_df[1]))

In [17]:
# query-doc tf-idf dot product
#N = len(test_df)
#dot = np.reshape([Y_query_tf_idf[i].dot(Y_doc_tf_idf[i]) for i in range(len(Y_query_tf_idf))], (N,1))

In [18]:
'''
ss = StandardScaler()
X_query_doc_tf_idf = ss.fit_transform(Y_query_doc_tf_idf)

ss = StandardScaler()
dot = ss.fit_transform(dot)

Y = np.hstack((Y_query_doc_tf_idf,dot))
'''

In [27]:
#nn_pred_test = nn.predict(Y)

In [28]:
#y_test = pd.to_numeric(test_data.median_relevance)

**ExtraTrees**

In [30]:
from sklearn.ensemble import ExtraTreesRegressor as ET

In [31]:
et = ET(n_estimators=250, min_samples_split=10)
et.fit(X_train, y_train)

ExtraTreesRegressor(min_samples_split=10, n_estimators=250)

In [38]:
et_pred_val = et.predict(X_val)
print("ET Kappa: {0}".format(Kappa(et_pred_val, y_val)))

ET Kappa: 0.5305658593170102


**ElasticNet**

In [40]:
from sklearn.linear_model import ElasticNet

In [53]:
# Blending
ensemble = ElasticNet(alpha=0.01, l1_ratio=0.5, fit_intercept=False)
ensemble.fit(np.vstack((et_pred_val, nn_pred_val)).T, y_val)

ElasticNet(alpha=0.01, fit_intercept=False)

In [54]:
ensemble_pred = ensemble.predict(np.vstack((et_pred_val, nn_pred_val)).T)
print("Ensemble Kappa (in-sample): {0}".format(Kappa(ensemble_pred, y_val)))

Ensemble Kappa (in-sample): 0.5234087887388139
