In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.stem.porter import *
import re
from sklearn.feature_extraction import text

# for visualisation
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt

# hide warnings
import warnings 
warnings.filterwarnings("ignore")

In [4]:
training_data = pd.read_csv('Data/train.csv', index_col = [0])

In [5]:
test_data = pd.read_csv('Data/test.csv', index_col = [0])

In [6]:
training_data.product_description = training_data.product_description.fillna('')
test_data.product_description = test_data.product_description.fillna('')

Preprocessing

In [7]:
# stop word removal
stemmer = PorterStemmer()
sw=[]
ML_STOP_WORDS = ['http','www','img','border','color','style','padding','table','font','inch','width','height']
ML_STOP_WORDS += list(text.ENGLISH_STOP_WORDS)
for stw in ML_STOP_WORDS:
    sw.append("z"+str(stw))
ML_STOP_WORDS += sw
for i in range(len(ML_STOP_WORDS)):
    ML_STOP_WORDS[i]=stemmer.stem(ML_STOP_WORDS[i])

In [8]:
# declarations
def ML_TEXT_CLEAN(f2,f3):
    if len(f2)<3:
        f2="feature2null"
    if len(f3)<3:
        f3="feature3null"
    tx = BeautifulSoup(f3)
    tx1 = [x.extract() for x in tx.findAll('script')]
    tx = tx.get_text(" ").strip()
    #s = (" ").join(["z"+ str(z) for z in f2.split(" ")]) + " " + tx
    s = (" ").join([str(z) for z in f2.split(" ")]) + " " + tx
    s = re.sub("[^a-zA-Z0-9]"," ", s)
    s = re.sub("[0-9]{1,3}px"," ", s)
    s = re.sub(" [0-9]{1,6} |000"," ", s)
    s = (" ").join([stemmer.stem(z) for z in s.split(" ") if len(z)>2])
    s = s.lower()
    return s

In [9]:
# Cleaning training data
s_data = []
for i in range(len(training_data.id)):
    s = ML_TEXT_CLEAN(training_data.product_title[i], training_data.product_description[i])
    s_data.append((training_data["query"][i], s, str(training_data["median_relevance"][i])))

In [10]:
# Cleaning test data
t_data = []
for i in range(len(test_data.id)):
    s = ML_TEXT_CLEAN(test_data.product_title[i], test_data.product_description[i])
    t_data.append((test_data["query"][i], s, str(test_data["median_relevance"][i])))

In [11]:
training_df = pd.DataFrame(s_data)
test_df = pd.DataFrame(t_data)

In [12]:
# tf-idf Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD as SVD

tf_idf             = TfidfVectorizer(lowercase=True, stop_words= ML_STOP_WORDS, analyzer='word', ngram_range=(1,2), strip_accents='unicode')
svd                = SVD(n_components=200)
X_query_doc_tf_idf = svd.fit_transform(tf_idf.fit_transform(training_df[0], training_df[1]))
X_query_tf_idf     = svd.transform(tf_idf.transform(training_df[0]))
X_doc_tf_idf       = svd.transform(tf_idf.transform(training_df[1]))

In [12]:
# query-doc tf-idf dot product
N = len(training_df)
dot     = np.reshape([X_query_tf_idf[i].dot(X_doc_tf_idf[i]) for i in range(len(X_query_tf_idf))], (N,1))

# Query Perfect Match
#N = len(training_df)
#match = np.reshape([1 if re.search(training_df[0].iloc[i].lower(), training_df[1].iloc[i].lower()) else 0 for i in range(N)], (N,1))

print("Extra features done!")

Extra features done!


In [13]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_query_doc_tf_idf = ss.fit_transform(X_query_doc_tf_idf)

ss = StandardScaler()
dot = ss.fit_transform(dot)

X = np.hstack((X_query_doc_tf_idf,dot))



---



**MLP**

In [14]:
# Training and validation sets
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X[:len(training_df)], training_df[2], test_size=0.15,
                                                      stratify=training_df[2])

from sklearn.metrics import cohen_kappa_score as kappa

Kappa = lambda y_pred, y_true: kappa(np.clip(np.round(y_pred), 1, 4), y_true, weights="quadratic")

In [15]:
from sklearn.neural_network import MLPRegressor

In [16]:
# Neural Net
#nn = MLPRegressor(hidden_layer_sizes=(100, ), random_state=1, max_iter=500)
#nn = MLPRegressor(hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='relu')
#nn = MLPRegressor(hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='identity')
#nn = MLPRegressor(hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='tanh')
nn = MLPRegressor(hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='tanh',
                  learning_rate='adaptive', learning_rate_init=0.0001, early_stopping=True)
nn.fit(X_train, y_train)

MLPRegressor(activation='tanh', early_stopping=True,
             hidden_layer_sizes=(100, 100, 100, 100, 100, 100, 100, 100, 8),
             learning_rate='adaptive', learning_rate_init=0.0001, max_iter=500,
             random_state=1)

For validation data:

In [17]:
nn_pred = nn.predict(X_val)

In [18]:
y_val = pd.to_numeric(y_val)

In [19]:
print("NN Kappa: {0}".format(Kappa(nn_pred, y_val)))

NN Kappa: 0.5078249498190097


With:


1.   hidden_layer_sizes=(100, ), random_state=1, max_iter=500
2.   hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='identity'
3.   hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='relu'
4.   hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='tanh'
5.   hidden_layer_sizes=(100,100,100,100,100,100,100,100,8), random_state=1, max_iter=500, activation='tanh',learning_rate='adaptive', learning_rate_init=0.0001, early_stopping=True



1.   we get validation accuracy around **0.50293**
and test data acccuracy around **0.00000**
2.   we get validation accuracy around **0.48371**
and test data acccuracy around **0.14346**
3.   we get validation accuracy around **0.39754**
and test data acccuracy around **0.12073**
4.   we get validation accuracy around **0.48605**
and test data acccuracy around **0.20501**
5.   we get validation accuracy around **0.50782**
and test data acccuracy around **0.14874**

For test data:

In [20]:
# Vectorize test data

Y_query_doc_tf_idf = svd.fit_transform(tf_idf.fit_transform(test_df[0], test_df[1]))
Y_query_tf_idf     = svd.transform(tf_idf.transform(test_df[0]))
Y_doc_tf_idf       = svd.transform(tf_idf.transform(test_df[1]))

In [21]:
# query-doc tf-idf dot product
N = len(test_df)
dot     = np.reshape([Y_query_tf_idf[i].dot(Y_doc_tf_idf[i]) for i in range(len(Y_query_tf_idf))], (N,1))

In [22]:
ss = StandardScaler()
X_query_doc_tf_idf = ss.fit_transform(Y_query_doc_tf_idf)

ss = StandardScaler()
dot = ss.fit_transform(dot)

Y = np.hstack((Y_query_doc_tf_idf,dot))

In [23]:
nn_pred_test = nn.predict(Y)

In [24]:
y_test = pd.to_numeric(test_data.median_relevance)

In [25]:
print("NN Kappa: {0}".format(Kappa(nn_pred_test, y_test)))

NN Kappa: 0.14874783433169336
