In [153]:
import pandas as pd

search_df=pd.read_csv('SearchReltrain.csv')

In [154]:
search_test=pd.read_csv('SearchReltest.csv')

In [158]:
search_df['query'].value_counts()

wireless mouse             113
rachel ray cookware         91
memory foam pillow          90
bike lock                   84
16 gb memory card           64
                          ... 
longboard pads              13
silicone toe separators     12
8 ounce mason jars          10
polo bear sweater           10
dollhouse bathtub            8
Name: query, Length: 261, dtype: int64

In [159]:
#Checking target labels -
search_df['median_relevance'].value_counts()

4    6171
3    1737
2    1476
1     774
Name: median_relevance, dtype: int64

In [160]:
pip install textdistance

Collecting textdistance
  Using cached textdistance-4.2.2-py3-none-any.whl (28 kB)
Installing collected packages: textdistance
Successfully installed textdistance-4.2.2
Note: you may need to restart the kernel to use updated packages.


In [161]:
#Removing numbers from and punctuations from the title field
import re
def clean_data(text):
    text=re.sub(r'\d+(?:\.\d*(?:[eE]\d+))?','NUMBER', text)
    text=re.sub(r'\W+',' ',text,flags=re.M)
    return text

In [162]:
search_title=search_df['product_title'].apply(clean_data)

In [163]:
y_train=search_df['median_relevance']
search_df=search_df.drop(['median_relevance','relevance_variance'],axis=1)

In [164]:
train = search_df.drop('id', axis=1)
test = search_test.drop('id', axis=1)

In [165]:
traindata = list(train.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1))
testdata = list(test.apply(lambda x:'%s %s' % (x['query'],x['product_title']),axis=1)) 

In [166]:
traindata[0:5]

['bridal shower decorations Accent Pillow with Heart Design - Red/Black',
 'led christmas lights Set of 10 Battery Operated Multi LED Train Christmas Lights - Clear Wire',
 'projector ViewSonic Pro8200 DLP Multimedia Projector',
 'wine rack Concept Housewares WR-44526 Solid-Wood Ceiling/Wall-Mount Wine Rack, Charcoal Grey, 6 Bottle',
 'light bulb Wintergreen Lighting Christmas LED Light Bulb (Pack of 25)']

In [167]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_cnv=TfidfVectorizer(stop_words='english',min_df=3,analyzer='word',ngram_range=(1,5),token_pattern=r'\w{1,}')

In [168]:
#Dimensionality Reduction as an Initialization step
from sklearn.decomposition import TruncatedSVD
svd=TruncatedSVD()

In [169]:
#Scaling the training data
from sklearn.preprocessing import StandardScaler
std=StandardScaler()

In [170]:
#SVM Model
from sklearn.svm import SVC
svm_model=SVC()

In [171]:
#creating a scorer(evaluator) function
def build_confusion_matrix(rater_a,rater_b,min_rating=None,max_rating=None):
    assert(len(rater_a)==len(rater_b))
    if min_rating == None:
        min_rating=min(rater_a + rater_b)
    if max_rating == None:
        max_rating=max(rater_a + rater_b)
    num_ratings=int(max_rating - min_rating +1)
    print(min_rating)
    conf_mat=[[0 for i in range(num_ratings)] for j in range(num_ratings)]
    for a,b in zip(rater_a,rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

In [172]:
#Count of each type of rating user makes
def histogram(ratings,min_rating=None,max_rating=None):
    if min_rating == None:
        min_rating=min(ratings)
    if max_rating == None:
        max_rating=max(ratings)
    num_ratings = int(max_rating - min_rating + 1)
    hist_ratings = [0 for i in range(num_ratings)]
    for i in hist_ratings:
        hist_ratings[r - min_rating] += 1
    return hist_ratings

In [173]:
#The kappa statistic is frequently used to test interrater reliability. The importance of rater reliability lies in the fact that it represents the extent to which 
#the data collected in the study are correct representations of the variables measured. Measurement of the extent to which data collectors (raters) assign the same 
#score to the same variable is called interrater reliability. While there have been a variety of methods to measure interrater reliability, traditionally it was 
#measured as percent agreement, calculated as the number of agreement scores divided by the total number of scores.
#For deatils on Kappa - link - https://www.kaggle.com/aroraaman/quadratic-kappa-metric-explained-in-5-simple-steps
def quadratic_weighted_kappa(y, y_pred):
    rater_a=y
    rater_b=y_pred
    min_rating=None
    max_rating=None
    rater_a = np.array(rater_a, dtype=int)                     #they were df object so converted them into array
    rater_b = np.array(rater_b, dtype=int)
    assert(len(rater_a)==len(rater_b))
    if min_rating == None:
        min_rating=min(min(rater_a),min(rater_b))
    if max_rating == None:
        max_rating=max(max(rater_a),max(rater_b))
    conf_mat=build_confusion_matrix(rater_a,rater_b,min_rating,max_rating)          #Maps the predicted and actual o/p values
    num_ratings=len(conf_mat)
    num_scored_items=float(len(rater_a))
    
    hist_rater_a = histogram(rater_a, min_rating, max_rating)                       #Create vectors depicting which catagory contains how many no of instances for 
    hist_rater_b = histogram(rater_b, min_rating, max_rating)                       #both predicted and actual
    
    numerator = 0.0
    denominator = 0.0

    for i in range(num_ratings):
        for j in range(num_ratings):
            expected_count = (hist_rater_a[i] * hist_rater_b[j]   #expected matrix - outer product of the above derived vectors and normalized using num_scored_items
                              / num_scored_items)
            d = pow(i - j, 2.0) / pow(num_ratings - 1, 2.0)       #Assigning a weight to be applied on each element of confusion matrix
            numerator += d * conf_mat[i][j] / num_scored_items    #weight*confusion matrix along with normalization
            denominator += d * expected_count / num_scored_items  #weight*expected matrix elemenr along with normalization

    return (1.0 - numerator / denominator)                        #kappa coefficient formula

In [174]:
#Setting validation params for GridSearchCV
param_grid={'svd__n_components':[200,400],
            'svm__C':[10,12]}

In [175]:
#Setting model scorer
from sklearn import metrics
kappa_scorer = metrics.make_scorer(quadratic_weighted_kappa, greater_is_better = True)

In [176]:
import textdistance

In [177]:
#Finding out the cosine distance of query and product-title
import numpy as np
dist=np.zeros([len(search_df)])
idx=0
for i,j in zip(search_df['query'],search_df['product_title']):
    dist[idx]=(textdistance.cosine(i,j))
    idx+=1
dist

array([0.54899485, 0.50069396, 0.4114756 , ..., 0.52868053, 0.3441236 ,
       0.56077215])

In [178]:
dist.shape

(10158,)

In [179]:
features=pd.DataFrame(traindata)
features=features.rename(columns={0:'text'})

In [180]:
dist_cnv=np.reshape(len(dist),1)
features['distance']=pd.DataFrame(dist)

In [181]:
test_dist=np.zeros((len(testdata),1))
idx=0
for i,j in zip(search_test['query'],search_test['product_title']):
    test_dist[idx]=textdistance.cosine(i,j)
    idx+=1
test_dist

array([[0.62861856],
       [0.49102862],
       [0.71205164],
       ...,
       [0.36514837],
       [0.62401997],
       [0.40546538]])

In [182]:
feature_test=pd.DataFrame(testdata)
feature_test=feature_test.rename(columns={0:'text'})
feature_test.head()

Unnamed: 0,text
0,electric griddle Star-Max 48 in Electric Griddle
1,phillips coffee maker Philips SENSEO HD7810 WH...
2,san francisco 49ers 2013 San Francisco 49ers C...
3,aveeno shampoo AVEENO 10.5FLOZ NRSH SHIN...
4,flea and tick control for dogs Merial Frontlin...


In [183]:
test_dist.shape

(22513, 1)

In [184]:
feature_test['distance']=pd.DataFrame(test_dist)

In [185]:
feature_test

Unnamed: 0,text,distance
0,electric griddle Star-Max 48 in Electric Griddle,0.628619
1,phillips coffee maker Philips SENSEO HD7810 WH...,0.491029
2,san francisco 49ers 2013 San Francisco 49ers C...,0.712052
3,aveeno shampoo AVEENO 10.5FLOZ NRSH SHIN...,0.045175
4,flea and tick control for dogs Merial Frontlin...,0.578829
...,...,...
22508,seagate 2tb hard disk Seagate Backup Plus STDU...,0.370970
22509,"storage ottoman Chevron Storage Ottoman, Gray ...",0.578829
22510,gym gloves Lion Martial Arts Large Red Grappli...,0.365148
22511,wreck it ralph Wreck-It Ralph (Original Score),0.624020


In [186]:
from sklearn.compose import ColumnTransformer

feature_conv=ColumnTransformer([
    ('tf_idf',tfidf_cnv,'text')],
    remainder='passthrough'
)

In [187]:
X_train_new=feature_conv.fit_transform(features)

In [188]:
#Creating a pipeline to initialize, scale and Train the model
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [189]:
clf_model2=Pipeline([
    ('svd',svd),
    ('std',std),
    ('svm',svm_model)
])

In [35]:
grid_search_clf=GridSearchCV(estimator=clf_model2,param_grid=param_grid,scoring=kappa_scorer,verbose=10, n_jobs=-1, refit=True, cv=2)

In [36]:
grid_search_clf.fit(X_train_new,y_train)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


GridSearchCV(cv=2,
             estimator=Pipeline(steps=[('svd', TruncatedSVD()),
                                       ('std', StandardScaler()),
                                       ('svm', SVC())]),
             n_jobs=-1,
             param_grid={'svd__n_components': [200, 400], 'svm__C': [10, 12]},
             scoring=make_scorer(quadratic_weighted_kappa), verbose=10)

In [156]:
X_test_new=feature_conv.transform(feature_test)

In [38]:
grid_search_clf.best_estimator_

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('std', StandardScaler()), ('svm', SVC(C=10))])

In [39]:
model_2=grid_search_clf.best_estimator_

In [40]:
model_2.fit(X_train_new,y_train)

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('std', StandardScaler()), ('svm', SVC(C=10))])

In [41]:
from sklearn.metrics import accuracy_score
y_pred_model2=model_2.predict(X_train_new)
accuracy_score(y_train, y_pred_model2)

0.7843079346328017

In [42]:
y_test_model2=model_2.predict(X_test_new)

In [43]:
np.unique(y_test_model2,return_counts=True)

(array([1, 2, 3, 4]), array([ 1050,  2269,  1728, 17466]))

In [44]:
#Copying the test output label
test_label=pd.read_csv('SearchRelevanceSubmission.csv')
test_label['prediction'].value_counts()

4    13680
3     3849
2     3270
1     1714
Name: prediction, dtype: int64

In [45]:
accuracy_score(test_label['prediction'],y_test_model2)

0.7254031004308622

In [46]:
#RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
forest=RandomForestClassifier(random_state=42)

In [47]:
clf_2=Pipeline([
    ('svd',svd),
#    ('std',std),
    ('model',forest)
])

In [48]:
clf_2.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'svd', 'model', 'svd__algorithm', 'svd__n_components', 'svd__n_iter', 'svd__random_state', 'svd__tol', 'model__bootstrap', 'model__ccp_alpha', 'model__class_weight', 'model__criterion', 'model__max_depth', 'model__max_features', 'model__max_leaf_nodes', 'model__max_samples', 'model__min_impurity_decrease', 'model__min_impurity_split', 'model__min_samples_leaf', 'model__min_samples_split', 'model__min_weight_fraction_leaf', 'model__n_estimators', 'model__n_jobs', 'model__oob_score', 'model__random_state', 'model__verbose', 'model__warm_start'])

In [109]:
param_grid={'svd__n_components':[200,400],
            'model__n_estimators':[10,12,15],
            'model__max_depth':[20,30]
}

In [110]:
grid_search_clf=GridSearchCV(estimator=clf_2,param_grid=param_grid,scoring=kappa_scorer,verbose=10, n_jobs=-1, refit=True, cv=4)

In [111]:
grid_search_clf.fit(X_train_new,y_train)

Fitting 4 folds for each of 12 candidates, totalling 48 fits


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('svd', TruncatedSVD()),
                                       ('model',
                                        RandomForestClassifier(random_state=42))]),
             n_jobs=-1,
             param_grid={'model__max_depth': [20, 30],
                         'model__n_estimators': [10, 12, 15],
                         'svd__n_components': [200, 400]},
             scoring=make_scorer(quadratic_weighted_kappa), verbose=10)

In [112]:
grid_search_clf.best_estimator_

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('model',
                 RandomForestClassifier(max_depth=20, n_estimators=10,
                                        random_state=42))])

In [113]:
forest_model=grid_search_clf.best_estimator_

In [114]:
forest_model.fit(X_train_new,y_train)

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('model',
                 RandomForestClassifier(max_depth=20, n_estimators=10,
                                        random_state=42))])

In [115]:
y_pred_forest=forest_model.predict(X_train_new)
accuracy_score(y_train,y_pred_forest)

0.9392596967907069

In [116]:
y_pred_test=forest_model.predict(X_test_new)

In [117]:
accuracy_score(test_label['prediction'],y_pred_test)

0.6732110336250167

[CV 3/4; 6/12] START model__max_depth=10, model__n_estimators=15, svd__n_components=400
1
[CV 3/4; 6/12] END model__max_depth=10, model__n_estimators=15, svd__n_components=400; total time=   6.9s
[CV 1/4; 7/12] START model__max_depth=20, model__n_estimators=10, svd__n_components=200
1
[CV 1/4; 7/12] END model__max_depth=20, model__n_estimators=10, svd__n_components=200; total time=   3.5s
[CV 3/4; 7/12] START model__max_depth=20, model__n_estimators=10, svd__n_components=200
1
[CV 3/4; 7/12] END model__max_depth=20, model__n_estimators=10, svd__n_components=200; total time=   3.3s
[CV 1/4; 8/12] START model__max_depth=20, model__n_estimators=10, svd__n_components=400
1
[CV 1/4; 8/12] END model__max_depth=20, model__n_estimators=10, svd__n_components=400; total time=   7.1s
[CV 4/4; 8/12] START model__max_depth=20, model__n_estimators=10, svd__n_components=400
1
[CV 4/4; 8/12] END model__max_depth=20, model__n_estimators=10, svd__n_components=400; total time=   7.0s
[CV 2/4; 9/12] START

In [69]:
np.unique(y_pred_forest,return_counts=True)

(array([1, 2, 3, 4]), array([ 701, 1344, 1380, 6733]))

In [76]:
#Logistic Model building
from sklearn.linear_model import LogisticRegression
log_clf=LogisticRegression()

In [77]:
#Creating Pipeline for Logistic Regression
clf_log=Pipeline([
    ('svd',svd),
    ('std',std),
    ('model',log_clf)
    ])

In [78]:
clf_log.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'svd', 'std', 'model', 'svd__algorithm', 'svd__n_components', 'svd__n_iter', 'svd__random_state', 'svd__tol', 'std__copy', 'std__with_mean', 'std__with_std', 'model__C', 'model__class_weight', 'model__dual', 'model__fit_intercept', 'model__intercept_scaling', 'model__l1_ratio', 'model__max_iter', 'model__multi_class', 'model__n_jobs', 'model__penalty', 'model__random_state', 'model__solver', 'model__tol', 'model__verbose', 'model__warm_start'])

In [79]:
param_grid={'svd__n_components':[200,400],
            'model__C':[10,12]
}

In [80]:
grid_search_clf=GridSearchCV(estimator=clf_log,param_grid=param_grid,scoring=kappa_scorer,verbose=10, n_jobs=-1, refit=True, cv=4)

In [82]:
grid_search_clf.fit(X_train_new,y_train)

Fitting 4 folds for each of 4 candidates, totalling 16 fits


GridSearchCV(cv=4,
             estimator=Pipeline(steps=[('svd', TruncatedSVD()),
                                       ('std', StandardScaler()),
                                       ('model', LogisticRegression())]),
             n_jobs=-1,
             param_grid={'model__C': [10, 12], 'svd__n_components': [200, 400]},
             scoring=make_scorer(quadratic_weighted_kappa), verbose=10)

In [83]:
grid_search_clf.best_estimator_

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('std', StandardScaler()),
                ('model', LogisticRegression(C=10))])

In [84]:
model_log=grid_search_clf.best_estimator_

In [85]:
model_log.fit(X_train_new,y_train)

Pipeline(steps=[('svd', TruncatedSVD(n_components=200)),
                ('std', StandardScaler()),
                ('model', LogisticRegression(C=10))])

In [86]:
y_log_train=model_log.predict(X_train_new)
accuracy_score(y_train,y_log_train)

0.6311281748375664

In [88]:
y_log_test=model_log.predict(X_test_new)
accuracy_score(test_label['prediction'],y_log_test)

0.6471372096122241

In [214]:
#Stacking models SVC and RandomForest using LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

estimator=[
#    ('svc',make_pipeline(TruncatedSVD(n_components=200, random_state=42),StandardScaler(),SVC(random_state=42))),
#    ('forest',make_pipeline(TruncatedSVD(n_components=200, random_state=42),RandomForestClassifier(random_state=42,n_estimators=10,max_depth=20)))
    ('SVC',model_2),
    ('forest',forest_model)
]

In [224]:
stack_clf=StackingClassifier(estimators=estimator,final_estimator=LogisticRegression())

In [225]:
stack_clf.fit(X_train_new,y_train)

StackingClassifier(estimators=[('SVC',
                                Pipeline(steps=[('svd',
                                                 TruncatedSVD(n_components=200)),
                                                ('std', StandardScaler()),
                                                ('svm', SVC(C=10))])),
                               ('forest',
                                Pipeline(steps=[('svd',
                                                 TruncatedSVD(n_components=200)),
                                                ('model',
                                                 RandomForestClassifier(max_depth=20,
                                                                        n_estimators=10,
                                                                        random_state=42))]))],
                   final_estimator=LogisticRegression())

In [226]:
y_final_pred=stack_clf.predict(X_train_new)

In [227]:
accuracy_score(y_train,y_final_pred)

0.8041937389249852

In [228]:
y_final_test=stack_clf.predict(X_test_new)

In [229]:
accuracy_score(test_label['prediction'],y_final_test)

0.7059476746768534

In [232]:
subm=pd.DataFrame(search_test['id'])
subm['prediction']=y_final_test
subm

Unnamed: 0,id,prediction
0,3,4
1,6,3
2,9,4
3,11,4
4,12,4
...,...,...
22508,32665,4
22509,32667,4
22510,32669,1
22511,32670,4


In [234]:
subm['prediction'].value_counts()

4    18690
2     2141
1     1155
3      527
Name: prediction, dtype: int64