In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from nltk.stem.porter import *
import re
from sklearn.feature_extraction import text

In [None]:
# Load the training split; the first CSV column is used as the row index.
training_data = pd.read_csv('train.csv', index_col = [0])

In [None]:
# Load the test split; the first CSV column is used as the row index.
test_data = pd.read_csv('test.csv', index_col = [0])

In [None]:
# Missing product descriptions become empty strings in both splits, so the
# text-cleaning step always receives a string for the description.
training_data["product_description"] = training_data["product_description"].fillna('')
test_data["product_description"] = test_data["product_description"].fillna('')

Preprocessing

In [None]:
# stop word removal
# The stop-word list is made of HTML/CSS residue terms plus scikit-learn's
# English stop words. Every entry is reduced to its Porter stem because
# ML_TEXT_CLEAN stems the documents, so the stop words must be in stemmed form
# to match the tokens they are meant to remove.
stemmer = PorterStemmer()
ML_STOP_WORDS = ['http','www','img','border','color','style','padding','table','font','inch','width','height']
ML_STOP_WORDS += list(text.ENGLISH_STOP_WORDS)
# Plain-str copy of the unstemmed words (kept for any later cell that reads it).
sw = [str(stw) for stw in ML_STOP_WORDS]
# Stem each word exactly once and drop duplicates. ENGLISH_STOP_WORDS is a
# frozenset with no stable iteration order, so sort for a deterministic list.
ML_STOP_WORDS = sorted({stemmer.stem(word) for word in sw})

In [None]:
# declarations
def ML_TEXT_CLEAN(f2,f3):
    """Build one cleaned, stemmed text string from a title and an HTML description.

    Parameters
    ----------
    f2 : str
        Title text. Every space-separated word is prefixed with ``"z"`` so
        title tokens stay distinct from description tokens. A value that is
        not a string, or is shorter than three characters, is replaced by the
        placeholder ``"feature2null"``.
    f3 : str
        Description, possibly containing HTML. ``<script>`` elements are
        removed and the remaining markup is reduced to its text. A value that
        is not a string, or is shorter than three characters, is replaced by
        the placeholder ``"feature3null"``.

    Returns
    -------
    str
        The space-joined stems (from the module-level ``stemmer``) of all
        tokens longer than two characters, lower-cased.
    """
    # Non-string inputs (e.g. NaN from a missing CSV cell) have no len();
    # treat them like the "too short" case instead of raising.
    if not isinstance(f2, str) or len(f2) < 3:
        f2 = "feature2null"
    if not isinstance(f3, str) or len(f3) < 3:
        f3 = "feature3null"

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, warns, and may produce different text per machine.
    soup = BeautifulSoup(f3, "html.parser")
    for script_tag in soup.find_all('script'):
        script_tag.extract()
    description_text = soup.get_text(" ").strip()

    s = " ".join("z" + word for word in f2.split(" ")) + " " + description_text
    # Keep only ASCII letters and digits; everything else becomes a space.
    s = re.sub("[^a-zA-Z0-9]"," ", s)
    # Drop CSS pixel sizes such as "12px".
    s = re.sub("[0-9]{1,3}px"," ", s)
    # Drop space-delimited numbers of up to six digits and any "000" run.
    s = re.sub(" [0-9]{1,6} |000"," ", s)
    # Splitting on single spaces yields empty strings for repeated spaces;
    # the length filter discards those along with one- and two-letter tokens.
    s = " ".join(stemmer.stem(token) for token in s.split(" ") if len(token) > 2)
    return s.lower()

In [None]:
# Cleaning training data
# Each row becomes (query, cleaned title+description text, relevance as str).
# The columns are walked in row order with zip rather than looked up with
# series[i]: series[i] is a label lookup, so a positional counter only works
# when the index read from the CSV happens to be exactly 0..n-1.
s_data = [
    (query, ML_TEXT_CLEAN(title, description), str(relevance))
    for query, title, description, relevance in zip(
        training_data["query"],
        training_data["product_title"],
        training_data["product_description"],
        training_data["median_relevance"],
    )
]

In [None]:
# Cleaning test data
# Each row becomes (query, cleaned title+description text, id).
# The columns are walked in row order with zip rather than looked up with
# series[i]: series[i] is a label lookup, so a positional counter only works
# when the index read from the CSV happens to be exactly 0..n-1.
t_data = [
    (query, ML_TEXT_CLEAN(title, description), row_id)
    for query, title, description, row_id in zip(
        test_data["query"],
        test_data["product_title"],
        test_data["product_description"],
        test_data["id"],
    )
]

In [None]:
# Turn the cleaned tuples into frames. No column names are given, so the
# columns are labelled 0, 1, 2 in tuple order.
training_df, test_df = pd.DataFrame(s_data), pd.DataFrame(t_data)



---



SVM

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Text-classification pipeline:
#   v   - TF-IDF over word 1- to 6-grams, with the stemmed stop words removed
#   svd - projection of the sparse TF-IDF matrix onto 100 components
#   scl - standardisation of the 100 components
#   svm - SVC with C=10
# TruncatedSVD uses a randomized solver by default, so random_state is fixed
# to make the fitted model and its predictions reproducible across runs.
clf = Pipeline([
    ('v', TfidfVectorizer(max_features=None, strip_accents='unicode', analyzer='word',
                          token_pattern=r'\w{1,}', ngram_range=(1, 6),
                          use_idf=True, smooth_idf=True, sublinear_tf=True,
                          stop_words=ML_STOP_WORDS)),
    ('svd', TruncatedSVD(n_components=100, random_state=42)),
    ('scl', StandardScaler()),
    ('svm', SVC(C=10)),
])
# Column 1 holds the cleaned text, column 2 the relevance label (as a string).
clf.fit(training_df[1], training_df[2])

In [None]:
# Predict a label for every cleaned test document (column 1 of test_df).
t_labels_svm = clf.predict(test_df[1])

In [None]:
# Wrap the predicted labels in a single-column frame named 'median_relevance'.
t_labels_svm_df = pd.DataFrame (t_labels_svm, columns = ['median_relevance'])

In [None]:
# The classifier was fit on string labels, so its predictions are strings;
# convert them to numbers for comparison with the numeric ground truth.
# The Series is built from t_labels_svm rather than from the previous value of
# t_labels_svm_df so that re-running this cell gives the same result instead
# of failing on a name that has already been rebound to a Series.
t_labels_svm_df = pd.to_numeric(pd.Series(t_labels_svm, name='median_relevance'))

In [None]:
# Per-class precision/recall/F1 of the SVM predictions against the true
# 'median_relevance' values in the test split.
print(classification_report(test_data['median_relevance'], t_labels_svm_df))

              precision    recall  f1-score   support

           1       0.50      0.05      0.09       186
           2       0.38      0.11      0.17       337
           3       0.41      0.09      0.14       451
           4       0.64      0.95      0.76      1566

    accuracy                           0.62      2540
   macro avg       0.48      0.30      0.29      2540
weighted avg       0.55      0.62      0.53      2540





---

