In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the zipped competition CSVs and preview the first training rows.
DATA_DIR = '/kaggle/input/crowdflower-search-relevance'
train = pd.read_csv(f'{DATA_DIR}/train.csv.zip')
test = pd.read_csv(f'{DATA_DIR}/test.csv.zip')
train.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
test.info()

In [None]:
# Summary statistics for the training frame.
train.describe()

In [None]:
# How many queries have each length, measured in whitespace-separated tokens.
query_token_counts = train['query'].apply(lambda text: len(text.split()))
query_token_counts.value_counts()

In [None]:
# How many product titles have each length, measured in whitespace-separated tokens.
title_token_counts = train['product_title'].apply(lambda text: len(text.split()))
title_token_counts.value_counts()

In [None]:
# Hold out the last 20% of rows (in file order) as the dev set.
split = int(len(train) * 0.8)
train_0 = train.iloc[:split]
dev = train.iloc[split:]

In [None]:
# Two filtered copies of the training split, thresholded on relevance_variance.
variance = train_0['relevance_variance']
clean_train_1 = train_0.loc[variance < 1].copy()
clean_train_2 = train_0.loc[variance < 0.50].copy()
dev.describe()

In [None]:
# Summary statistics for the rows with relevance_variance < 1.
clean_train_1.describe()

In [None]:
# Summary statistics for the rows with relevance_variance < 0.50.
clean_train_2.describe()

In [None]:
## Product description is skipped: it is too lengthy and has missing values.
## (The discarded variant appended ' ' + str(row['product_description']) to each text.)
train = train_0  # alternative training set: clean_train_1


def _query_plus_title(row):
    """Join a row's query and product title with a single space."""
    return row['query'] + ' ' + row['product_title']


train_input = train.apply(_query_plus_title, axis=1)
dev_input = dev.apply(_query_plus_title, axis=1)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer 
# Word 1- to 5-gram TF-IDF features, vocabulary capped at 200k terms.
# The vectorizer is fitted on the training texts only and reused for dev.
tfidf = TfidfVectorizer(
    ngram_range=(1, 5),
    stop_words='english',
    strip_accents='unicode',
    max_features=200_000,
)
train_x = tfidf.fit_transform(train_input)
dev_x = tfidf.transform(dev_input)
train_x

In [None]:
# Regression targets: median_relevance rescaled as (rating - 1) / 3.
train_y = [(rating - 1) / 3 for rating in train.median_relevance.to_list()]
dev_y = [(rating - 1) / 3 for rating in dev.median_relevance.to_list()]
np.mean(train_y), np.max(train_y), np.min(train_y)

In [None]:
from sklearn.metrics import mean_squared_error, cohen_kappa_score, make_scorer
def reg_scorer(true, pred, weights='quadratic'):
    """Kappa agreement between true and predicted ratings given on the scaled [0, 1] range.

    Targets in this notebook are scaled as ``(rating - 1) / 3``. This function
    undoes that scaling (``value * 3 + 1``, rounded to the nearest integer)
    and compares the two resulting rating sequences.

    Args:
        true: iterable of scaled ground-truth values.
        pred: iterable of raw regression outputs; each one is clipped to
            [0, 1] first so the recovered rating stays inside the label range.
        weights: forwarded to ``cohen_kappa_score``. Defaults to
            ``'quadratic'`` because the ratings are ordinal and the
            CrowdFlower search-relevance competition is scored with quadratic
            weighted kappa; unweighted kappa penalises an off-by-one rating as
            much as an off-by-three one. Pass ``None`` for plain kappa.

    Returns:
        The kappa score computed by ``cohen_kappa_score``.
    """
    def to_rating(value):
        # Invert the (rating - 1) / 3 scaling back to an integer rating.
        return int(round(value * 3 + 1))

    pred_ratings = [to_rating(min(1, max(0, value))) for value in pred]
    true_ratings = [to_rating(value) for value in true]
    return cohen_kappa_score(true_ratings, pred_ratings, weights=weights)

In [None]:
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
# Earlier baselines, kept for reference:
#   clf = LinearRegression().fit(train_x, train_y)
#   clf = SGDRegressor(verbose=1, n_iter_no_change=20).fit(train_x, train_y)

# Grid over the SVR penalty C and kernel, scored with reg_scorer.
# Untried options: kernels 'linear' and 'poly', and an 'epsilon' grid of [0.1, 0.2].
param_grid = {
    'C': [1, 2, 5, 10],
    'kernel': ('rbf', 'sigmoid'),
}
svr = SVR()
scorer = make_scorer(reg_scorer, greater_is_better=True)
clf = GridSearchCV(
    estimator=svr,
    param_grid=param_grid,
    scoring=scorer,
    n_jobs=16,
    verbose=True,
)
clf.fit(train_x, train_y)
clf.best_estimator_, clf.best_params_, clf.best_score_

In [None]:
## Best score so far: 0.26

# Evaluate the best grid-search model on the held-out dev set (MSE, kappa).
best_svr = clf.best_estimator_
preds = best_svr.predict(dev_x)
dev_mse = mean_squared_error(dev_y, preds)
dev_kappa = reg_scorer(dev_y, preds)
dev_mse, dev_kappa

In [None]:
# Featurize the test texts exactly like the training texts (query + ' ' + title).
test_input = test.apply(lambda row: row['query'] + ' ' + row['product_title'], axis=1)
test_x = tfidf.transform(test_input)

# Clip the raw regression outputs to [0, 1], then map back to integer ratings.
raw_scores = clf.best_estimator_.predict(test_x)
pred = [int(round(min(1, max(0, score)) * 3 + 1)) for score in raw_scores]

out = pd.DataFrame({"id": test.id.to_list(), "prediction": pred})
out.to_csv('submission.csv', index=False)

### Using a deep-learning-based model for classification

In [None]:
train = train_0

# RoBERTa lays out a sequence pair as "<s> A </s></s> B </s>". The tokenizer
# loaded in the next cell adds the outer <s> ... </s> itself, so query and title
# are joined with "</s></s>". The previous separator '<\s><s>' contained a
# backslash, so it was not the </s> special token (and '\s' is an invalid
# string escape that newer Python versions warn about).
PAIR_SEPARATOR = '</s></s>'


def _query_title_pair(row):
    """Join a row's query and product title with the RoBERTa pair separator."""
    return row['query'] + PAIR_SEPARATOR + row['product_title']


train_texts = train.apply(_query_title_pair, axis=1).to_list()
val_texts = dev.apply(_query_title_pair, axis=1).to_list()
test_texts = test.apply(_query_title_pair, axis=1).to_list()

# Labels are the raw median_relevance ratings; no labels are built for the test texts.
train_labels, val_labels = train.median_relevance.to_list(), dev.median_relevance.to_list()

In [None]:
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')


def _encode(texts):
    """Tokenize a list of texts with truncation and padding enabled."""
    return tokenizer.batch_encode_plus(texts, truncation=True, padding=True)


# Each split is encoded (and therefore padded) independently of the others.
train_encodings = _encode(train_texts)
val_encodings = _encode(val_texts)
test_encodings = _encode(test_texts)

In [None]:
import torch

class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: mapping from feature name (e.g. ``input_ids``) to a
            per-example sequence; only ``.items()`` is required of it.
        labels: optional per-example labels. When omitted (e.g. for an
            unlabeled test split) items carry no ``'labels'`` entry and the
            length is taken from the encodings instead.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors (plus ``'labels'`` when available)."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples: the label count, or the length of the first feature column."""
        if self.labels is not None:
            return len(self.labels)
        for _, column in self.encodings.items():
            return len(column)
        return 0  # no labels and no feature columns at all

# Wrap the tokenized train and validation splits together with their labels.
train_dataset, val_dataset = (
    Dataset(train_encodings, train_labels),
    Dataset(val_encodings, val_labels),
)
# No test dataset is built here: `test_labels` is not defined in this notebook.
#test_dataset = Dataset(test_encodings, test_labels)

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# NOTE(review): no evaluation strategy is configured, so eval_dataset may never
# be used during training; depending on the transformers version,
# load_best_model_at_end may also require matching evaluation/save strategies —
# confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',         # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=50,
    load_best_model_at_end=True,
    overwrite_output_dir = True,
    no_cuda=True                     # force CPU training
)

# The labels are the raw median_relevance ratings (not shifted to start at 0),
# while the classification head defaults to 2 outputs, so larger label values
# would be out of range for the loss. Size the head to cover the largest label;
# any class index below the smallest rating simply stays unused.
num_labels = int(max(max(train_labels), max(val_labels))) + 1

model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=num_labels)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

In [None]:
# Load the competition's sample submission to inspect the expected output format.
sample_submission_path = '/kaggle/input/crowdflower-search-relevance/sampleSubmission.csv.zip'
sub = pd.read_csv(sample_submission_path)
sub


In [None]:
# Display the submission frame built from the SVR predictions.
out