In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.stem.porter import *
import re

In [None]:
# Load the training split; the first CSV column is used as the row index.
training_data = pd.read_csv('train.csv', index_col = [0])

In [None]:
# Preview the first rows of the training frame.
training_data.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance
0,0,zippo hand warmer,"Zippo Black Polish Hand Warmer, with Filling C...",,4
1,1,wall mirrors,Cooper Classics Benedetta Wall Mirrors - 12 di...,The small round shape and shiny nickel finish ...,4
2,2,rachel ray cookware,Rachael Ray Porcelain II Green 10-piece Cookwa...,This versatile 10-piece cookware set from Rach...,4
3,3,flea and tick control for dogs,Hartz UltraGuard Plus Flea and Tick Home Fogge...,details\nThe Hartz UltraGuard Plus Home Fogger...,3
4,4,batman,DC Comics Arkham Asylum Batman Series The Joke...,<ul>\n\t\t<li>\n\t\t\tEnglish \n\t\t\t\t</li>\...,2


In [None]:
#training_data.loc[training_data['query']=="zippo hand warmer"]

In [None]:
# Load the held-out split; the first CSV column is used as the row index.
test_data = pd.read_csv('test.csv', index_col = [0])

In [None]:
# Preview the first rows of the test frame.
test_data.head()

Unnamed: 0,id,query,product_title,product_description,median_relevance
0,0,pittsburgh pirates,Fanmats Machine-Made Pittsburgh Pirates Black ...,Show your team pride and add style to your tai...,4
1,1,dish towels,Oasis Cotton Flour Sack Towel (Set of 3),These Flour Sack Towelsoffers the ultimate in ...,3
2,2,portable hard drive,Toshiba Canvio Slim 1 TB External Hard Drive,Toshiba Canvio® Slim II for Mac® Portable Exte...,3
3,3,playstation vita system,Legends Of Chima: Laval's Journey (PlayStation...,Get ready for an epic adventure as Laval races...,2
4,4,galaxy note 3,3M Natural View Screen Protection Film for Sam...,"Helps keep your device screen in pristine, lik...",2


In [None]:
# Count missing values per column in the training frame.
training_data.isna().sum()

id                        0
query                     0
product_title             0
product_description    1842
median_relevance          0
dtype: int64

In [None]:
# Count missing values per column in the test frame.
test_data.isna().sum()

id                       0
query                    0
product_title            0
product_description    602
median_relevance         0
dtype: int64

In [None]:
# Replace missing product descriptions with empty strings in both splits,
# so later text handling always receives a str rather than NaN.
training_data["product_description"] = training_data["product_description"].fillna('')
test_data["product_description"] = test_data["product_description"].fillna('')

In [None]:
# Column dtypes and non-null counts for the training frame.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7618 entries, 0 to 7617
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   7618 non-null   int64 
 1   query                7618 non-null   object
 2   product_title        7618 non-null   object
 3   product_description  7618 non-null   object
 4   median_relevance     7618 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 357.1+ KB


In [None]:
# Column dtypes and non-null counts for the test frame.
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2540 entries, 0 to 2539
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   id                   2540 non-null   int64 
 1   query                2540 non-null   object
 2   product_title        2540 non-null   object
 3   product_description  2540 non-null   object
 4   median_relevance     2540 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 119.1+ KB


Preprocessing

In [None]:
import spacy
nlp = spacy.load('en_core_web_sm')
# Register leftover HTML/CSS vocabulary as additional stop words.
# The previous `-=` *removed* these words from the stop-word set, which is the
# opposite of how the same words are treated in ML_STOP_WORDS.
# NOTE(review): the set is updated in place so the loaded pipeline sees the
# additions — confirm `token.is_stop` picks them up on your spaCy version.
nlp.Defaults.stop_words |= {'http','www','img','border','color','style','padding','table','font','inch','width','height'}

In [None]:
# stop word removal
def preprocess(text):
    """Return the lemmas of `text`, dropping stop words and non-alphabetic lemmas.

    `text` is run through the module-level spaCy pipeline `nlp`. A token
    contributes its lemma to the result only when it is not flagged as a
    stop word and the lemma consists solely of alphabetic characters.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop:
            continue
        if tok.lemma_.isalpha():
            kept_lemmas.append(tok.lemma_)
    return kept_lemmas

In [None]:
from sklearn.feature_extraction import text
# Build the stop-word list handed to the TF-IDF vectorizer.
stemmer = PorterStemmer()
# Markup/CSS vocabulary to ignore, plus scikit-learn's built-in English list.
ML_STOP_WORDS = ['http','www','img','border','color','style','padding','table','font','inch','width','height']
ML_STOP_WORDS += list(text.ENGLISH_STOP_WORDS)
# Stem every stop word with the same stemmer ML_TEXT_CLEAN applies to tokens.
# The earlier version first appended a full copy of the list to itself, so
# every word was stemmed and stored twice; the set removes those duplicates
# (and stems that collide), and sorting gives a deterministic order because
# ENGLISH_STOP_WORDS is an unordered frozenset.
ML_STOP_WORDS = sorted({stemmer.stem(str(word)) for word in ML_STOP_WORDS})

In [None]:
# declarations
def ML_TEXT_CLEAN(f2,f3):
    """Merge a product title and its description into one cleaned, stemmed string.

    Parameters
    ----------
    f2 : str
        Product title. Non-string values (e.g. NaN/None) and strings shorter
        than 3 characters are replaced by the placeholder ``"feature2null"``.
    f3 : str
        Product description, possibly containing HTML. Non-string values and
        strings shorter than 3 characters are replaced by ``"feature3null"``.

    Returns
    -------
    str
        Lower-cased, space-separated Porter stems of the title followed by
        the description text, keeping only tokens longer than 2 characters.
    """
    # Missing values arrive as NaN/None; treat them like an empty string so
    # they fall through to the placeholder instead of crashing on len().
    if not isinstance(f2, str):
        f2 = ""
    if not isinstance(f3, str):
        f3 = ""
    if len(f2)<3:
        f2="feature2null"
    if len(f3)<3:
        f3="feature3null"
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so output could vary
    # between environments.
    soup = BeautifulSoup(f3, "html.parser")
    # Drop <script> elements so their contents do not end up in the text.
    for script_tag in soup.find_all('script'):
        script_tag.extract()
    description_text = soup.get_text(" ").strip()
    s = f2 + " " + description_text
    # Replace every non-alphanumeric character with a space.
    s = re.sub("[^a-zA-Z0-9]"," ", s)
    # Remove pixel sizes such as "12px".
    s = re.sub("[0-9]{1,3}px"," ", s)
    # Remove space-delimited numbers of up to 6 digits and any "000" run.
    s = re.sub(" [0-9]{1,6} |000"," ", s)
    s = (" ").join(stemmer.stem(z) for z in s.split(" ") if len(z)>2)
    return s.lower()

In [None]:
# Cleaning training data
# Each record is (query, cleaned title+description text, relevance label as str).
# Column values are iterated positionally; the earlier `series[i]` form is a
# label lookup and only works while the index is exactly 0..n-1.
s_data = [
    (query, ML_TEXT_CLEAN(title, description), str(relevance))
    for query, title, description, relevance in zip(
        training_data["query"],
        training_data["product_title"],
        training_data["product_description"],
        training_data["median_relevance"],
    )
]

In [None]:
# Cleaning test data
# Each record is (query, cleaned title+description text, row id).
# Column values are iterated positionally; the earlier `series[i]` form is a
# label lookup and only works while the index is exactly 0..n-1.
t_data = [
    (query, ML_TEXT_CLEAN(title, description), row_id)
    for query, title, description, row_id in zip(
        test_data["query"],
        test_data["product_title"],
        test_data["product_description"],
        test_data["id"],
    )
]

In [None]:
# Wrap the cleaned records in DataFrames. Columns keep their default integer
# labels: 0 = query, 1 = cleaned text, 2 = label (training) / id (test).
training_df, test_df = (pd.DataFrame(records) for records in (s_data, t_data))



---



Naive Bayes

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Multinomial Naive Bayes over TF-IDF weighted word n-grams (1 to 6 words).
clf = MultinomialNB(alpha=0.01)
# min_df=1 keeps every term seen in the training text, exactly as the former
# min_df=0 did, but recent scikit-learn releases validate this parameter and
# reject an integer 0.
v = TfidfVectorizer(
    use_idf=True,
    min_df=1,
    ngram_range=(1,6),
    lowercase=True,
    sublinear_tf=True,
    stop_words=ML_STOP_WORDS,
)
# Column 1 holds the cleaned text and column 2 the relevance labels (as str).
clf.fit(v.fit_transform(training_df[1]), training_df[2])

MultinomialNB(alpha=0.01)

In [None]:
# Vectorise the cleaned test text (column 1) with the fitted vectorizer and predict labels.
t_labels_nb = clf.predict(v.transform(test_df[1]))

In [None]:
# Put the predicted labels in a one-column frame named like the target column.
# (A bare `t_labels_nb` expression used to precede this line; it was not the
# cell's last expression, so it displayed nothing and has been removed.)
t_labels_nb_df = pd.DataFrame(t_labels_nb, columns=['median_relevance'])

In [None]:
# Convert the predicted labels to a numeric Series named 'median_relevance'.
# Built from `t_labels_nb` rather than from `t_labels_nb_df` itself, so the
# cell can be re-run: the earlier form rebound the name from a DataFrame to a
# Series and then failed with a KeyError on the second execution.
t_labels_nb_df = pd.to_numeric(pd.Series(t_labels_nb, name='median_relevance'))

In [None]:
# Per-class precision/recall/F1 of the predictions against test_data's median_relevance column.
print(classification_report(test_data['median_relevance'], t_labels_nb_df))

              precision    recall  f1-score   support

           1       0.57      0.24      0.34       186
           2       0.33      0.24      0.28       337
           3       0.31      0.20      0.24       451
           4       0.70      0.86      0.77      1566

    accuracy                           0.62      2540
   macro avg       0.48      0.39      0.41      2540
weighted avg       0.57      0.62      0.58      2540





---

