In [1]:
import pandas as pd

search_df=pd.read_csv('SearchReltrain.csv')

In [2]:
search_test=pd.read_csv('SearchReltest.csv')

In [3]:
search_df['query'].value_counts()

wireless mouse             113
rachel ray cookware         91
memory foam pillow          90
bike lock                   84
16 gb memory card           64
                          ... 
longboard pads              13
silicone toe separators     12
polo bear sweater           10
8 ounce mason jars          10
dollhouse bathtub            8
Name: query, Length: 261, dtype: int64

In [4]:
#Checking target labels -
search_df['median_relevance'].value_counts()

4    6171
3    1737
2    1476
1     774
Name: median_relevance, dtype: int64

In [5]:
pip install textdistance

Collecting textdistance
  Using cached textdistance-4.2.2-py3-none-any.whl (28 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.2.2
Note: you may need to restart the kernel to use updated packages.


In [6]:
#Removing numbers from and punctuations from the title field
import re
def clean_data(text):
    text=re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?','NUMBER', text)
    text=re.sub(r'\W+',' ',text,flags=re.M)
    return text

In [7]:
search_title=search_df['product_title'].apply(clean_data)

In [8]:
y_train=search_df['median_relevance']
search_df=search_df.drop(['median_relevance','relevance_variance'],axis=1)

In [9]:
train = search_df.drop('id', axis=1)
test = search_test.drop('id', axis=1)

In [10]:
traindata = list(train.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1))
testdata = list(test.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1)) 

In [11]:
traindata[0:5]

['bridal shower decorations Accent Pillow with Heart Design - Red/Black',
 'led christmas lights Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire',
 'projector ViewSonic Pro8200 DLP Multimedia Projector',
 'wine rack Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle',
 'light bulb Wintergreen Lighting Christmas LED Light Bulb (Pack of 25)']

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_cnv=TfidfVectorizer(stop_words='english',min_df=3,analyzer='word',ngram_range=(1,5),token_pattern=r'\w{1,}')

In [15]:
#Dimensionality Reduction as an Initialization step
from sklearn.decomposition import TruncatedSVD
svd=TruncatedSVD()

In [16]:
#Scaling the training data
from sklearn.preprocessing import StandardScaler
std=StandardScaler()

In [17]:
#SVM Model
from sklearn.svm import SVC
svm_model=SVC()

In [18]:
#creating a scorer(evaluator) function
def build_confusion_matrix(rater_a,rater_b,min_rating=None,max_rating=None):
    assert(len(rater_a)==len(rater_b))
    if min_rating == None:
        min_rating=min(rater_a + rater_b)
    if max_rating == None:
        max_rating=max(rater_a + rater_b)
    num_ratings=int(max_rating - min_rating +1)
    print(min_rating)
    conf_mat=[[0 for i in range(num_ratings)] for j in range(num_ratings)]
    for a,b in zip(rater_a,rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

In [19]:
#Count of each type of rating user makes
def histogram(ratings,min_rating=None,max_rating=None):
    if min_rating == None:
        min_rating=min(ratings)
    if max_rating == None:
        max_rating=max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for i in range(num_ratings)]
    for i in hist_ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings

In [20]:
#The kappa statistic is frequently used to test interrater reliability. The importance of rater reliability lies in the fact that it represents the extent to which 
#the data collected in the study are correct representations of the variables measured. Measurement of the extent to which data collectors (raters) assign the same 
#score to the same variable is called interrater reliability. While there have been a variety of methods to measure interrater reliability, traditionally it was 
#measured as percent agreement, calculated as the number of agreement scores divided by the total number of scores.
#For deatils on Kappa - link - https://www.kaggle.com/aroraaman/quadratic-kappa-metric-explained-in-5-simple-steps
def quadratic_weighted_kappa(y, y_pred):
    rater_a=y
    rater_b=y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)                     #they were df object so converted them into array
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a)==len(rater_b))
    if min_rating == None:
        min_rating=min(min(rater_a),min(rater_b))
    if max_rating == None:
        max_rating=max(max(rater_a),max(rater_b))
    conf_mat=build_confusion_matrix(rater_a,rater_b,min_rating,max_rating)          #Maps the predicted and actual o/p values
    num_ratings=len(conf_mat)
    num_scored_items=float(len(rater_a))
    
    hist_rater_a = histogram(rater_a, min_rating, max_rating)                       #Create vectors depicting which catagory contains how many no of instances for 
    hist_rater_b = histogram(rater_b, min_rating, max_rating)                       #both predicted and actual
    
    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]   #expected matrix - outer product of the above derived vectors and normalized using num_scored_items
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)       #Assigning a weight to be applied on each element of confusion matrix
            numerator += d * conf_mat[i][j] / num_scored_items    #weight*confusion matrix along with normalization
            denominator += d * expected_count / num_scored_items  #weight*expected matrix elemenr along with normalization

    return (1.0 - numerator / denominator)                        #kappa coefficient formula

In [21]:
#Setting validation params for GridSearchCV
param_grid={'svd__n_components':[200,400],
            'svm__C':[10,12]}

In [22]:
#Setting model scorer
from sklearn import metrics
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)

In [23]:
import textdistance

In [62]:
#Finding out the cosine distance of query and product-title
import numpy as np
dist=np.zeros([len(search_df)])
idx=0
for i,j in zip(search_df['query'],search_df['product_title']):
    dist[idx]=(textdistance.cosine(i,j))
    idx+=1
dist

array([0.54899485, 0.50069396, 0.4114756 , ..., 0.52868053, 0.3441236 ,
       0.56077215])

In [63]:
dist.shape

(10158,)

In [64]:
features=pd.DataFrame(traindata)
features=features.rename(columns={0:'text'})

In [65]:
dist_cnv=np.reshape(len(dist),1)
features['distance']=pd.DataFrame(dist)

In [99]:
test_dist=np.zeros((len(testdata),1))
idx=0
for i,j in zip(search_test['query'],search_test['product_title']):
    test_dist[idx]=textdistance.cosine(i,j)
    idx+=1
test_dist

array([[0.62861856],
       [0.49102862],
       [0.71205164],
       ...,
       [0.36514837],
       [0.62401997],
       [0.40546538]])

In [100]:
feature_test=pd.DataFrame(testdata)
feature_test=feature_test.rename(columns={0:'text'})
feature_test.head()

Unnamed: 0,text
0,electric griddle Star-Max 48 in Electric Griddle
1,phillips coffee maker Philips SENSEO HD7810 WH...
2,san francisco 49ers 2013 San Francisco 49ers C...
3,aveeno shampoo AVEENO 10.5FLOZ NRSH SHIN...
4,flea and tick control for dogs Merial Frontlin...


In [101]:
test_dist.shape

(22513, 1)

In [102]:
feature_test['distance']=pd.DataFrame(test_dist)

In [103]:
feature_test

Unnamed: 0,text,distance
0,electric griddle Star-Max 48 in Electric Griddle,0.628619
1,phillips coffee maker Philips SENSEO HD7810 WH...,0.491029
2,san francisco 49ers 2013 San Francisco 49ers C...,0.712052
3,aveeno shampoo AVEENO 10.5FLOZ NRSH SHIN...,0.045175
4,flea and tick control for dogs Merial Frontlin...,0.578829
...,...,...
22508,seagate 2tb hard disk Seagate Backup Plus STDU...,0.370970
22509,"storage ottoman Chevron Storage Ottoman, Gray ...",0.578829
22510,gym gloves Lion Martial Arts Large Red Grappli...,0.365148
22511,wreck it ralph Wreck-It Ralph (Original Score),0.624020


In [104]:
from sklearn.compose import ColumnTransformer

feature_conv=ColumnTransformer([
    ('tf_idf',tfidf_cnv,'text')],
    remainder='passthrough'
)

In [105]:
X_train_new=feature_conv.fit_transform(features)

In [106]:
#Creating a pipeline to initialize, scale and Train the model
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

clf=Pipeline([
    ('svd',svd),
    ('std',std),
    ('svm',svm_model)
])

In [107]:
clf_model2=Pipeline([
    ('svd',svd),
    ('std',std),
    ('svm',svm_model)
])

In [108]:
grid_search_clf=GridSearchCV(estimator=clf_model2,param_grid=param_grid,scoring=kappa_scorer,verbose=10, n_jobs=-1, refit=True, cv=2)

In [112]:
grid_search_clf.fit(X_train_new,y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('svd', TruncatedSVD()),
                                       ('std', StandardScaler()),
                                       ('svm', SVC())]),
             n_jobs=-1,
             param_grid={'svd__n_components': [200, 400], 'svm__C': [10, 12]},
             scoring=make_scorer(quadratic_weighted_kappa), verbose=10)

In [110]:
X_test_new=feature_conv.transform(feature_test)

In [113]:
grid_search_clf.best_estimator_

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('std', StandardScaler()), ('svm', SVC(C=10))])

In [114]:
model_2=grid_search_clf.best_estimator_

In [115]:
model_2.fit(X_train_new,y_train)

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('std', StandardScaler()), ('svm', SVC(C=10))])

In [117]:
from sklearn.metrics import accuracy_score
y_pred_model2=model_2.predict(X_train_new)
accuracy_score(y_train, y_pred_model2)

0.7869659381768065

In [119]:
y_test_model2=model_2.predict(X_test_new)

In [120]:
np.unique(y_test_model2,return_counts=True)

(array([1, 2, 3, 4]), array([ 1054,  2280,  1712, 17467]))