In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sample-submission/sample_submission-v0.2.csv
/kaggle/input/rankingmodelsentencetransformers/config.json
/kaggle/input/rankingmodelsentencetransformers/training_args.bin
/kaggle/input/rankingmodelsentencetransformers/tokenizer.json
/kaggle/input/rankingmodelsentencetransformers/tokenizer_config.json
/kaggle/input/rankingmodelsentencetransformers/pytorch_model.bin
/kaggle/input/rankingmodelsentencetransformers/special_tokens_map.json
/kaggle/input/rankingmodelsentencetransformers/vocab.txt
/kaggle/input/product-catalogue/data/processed/public/task_1_query-product_ranking/product_catalogue-v0.2.csv
/kaggle/input/test-ranking-public/data/processed/public/task_1_query-product_ranking/test_public-v0.2.csv


In [2]:
!pip install datasets==2.1.0

Collecting datasets==2.1.0
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m325.4/325.4 KB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.1.0 responses-0.18.0 xxhash-3.0.0
[0m

In [3]:
# Let pandas render up to 100 rows before truncating displayed frames.
pd.options.display.max_rows = 100

In [4]:
from datasets import ClassLabel, Value, DatasetDict, Dataset

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Kaggle input directory holding the model config, weights and tokenizer files.
checkpoint = '../input/rankingmodelsentencetransformers'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The classification head has 50 labels; later cells use the label position as
# an index into the 50 products packed into each input.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=50)

In [6]:
from transformers import pipeline

In [7]:
# Load the public test set of (query, product) pairs that must be ranked.
test_df = pd.read_csv('../input/test-ranking-public/data/processed/public/task_1_query-product_ranking/test_public-v0.2.csv')

In [8]:
# Preview the first rows of the test pairs.
test_df.head()

Unnamed: 0,query_id,query,query_locale,product_id
0,33777,!qscreen fence without holes,us,B000HMCA7W
1,33777,!qscreen fence without holes,us,B0019Y6TTC
2,33777,!qscreen fence without holes,us,B001OJXVKW
3,33777,!qscreen fence without holes,us,B0030E1WLE
4,33777,!qscreen fence without holes,us,B003VNKLIY


In [9]:
# Nested lookup: query_id -> query_locale -> set of product_ids listed for that
# query in the test file (all keys and ids stored as strings).
query_prod = {}
for qid, locale, product in zip(test_df['query_id'], test_df['query_locale'], test_df['product_id']):
    locales_for_query = query_prod.setdefault(str(qid), {})
    locales_for_query.setdefault(str(locale), set()).add(str(product))

In [10]:
# Load the product catalogue (title, description, brand, ... per product and locale).
prod_cat = pd.read_csv('../input/product-catalogue/data/processed/public/task_1_query-product_ranking/product_catalogue-v0.2.csv')

In [11]:
# Left-join catalogue attributes onto every test row, matching on product id and
# on locale (query_locale in the test set, product_locale in the catalogue).
query_prod_test = test_df.merge(
    prod_cat,
    how='left',
    left_on=['product_id', 'query_locale'],
    right_on=['product_id', 'product_locale'],
    suffixes=('_test', '_catlog'),
)

In [12]:
def get_query_input(row):
    """Build the query side of the model input.

    Args:
        row: mapping-like object supporting ``.get`` (a dict or a DataFrame row).

    Returns:
        The ``query`` and ``query_locale`` values, each converted with ``str``,
        joined by one space and stripped of surrounding whitespace. Missing
        keys contribute an empty string.
    """
    fields = (row.get('query', ''), row.get('query_locale', ''))
    return ' '.join(str(value) for value in fields).strip()


# Inline sanity checks: without and with a locale.
d = {'query': 'hi hello',}
assert get_query_input(d) == 'hi hello'
d = {'query': 'hi hello', 'query_locale': 'en'}
assert get_query_input(d) == 'hi hello en'

In [13]:
# Replace missing values with empty strings so the text builders never see NaN,
# then derive the query text for every row.
query_prod_test = query_prod_test.fillna('')
query_prod_test['query_input'] = query_prod_test.apply(get_query_input, axis=1)

In [14]:
def get_product_input(row):
    """Build the product side of the model input.

    Args:
        row: mapping-like object supporting ``.get`` (a dict or a DataFrame row).

    Returns:
        ``product_title``, ``product_locale`` and ``product_description`` (in
        that order), each converted with ``str``, joined by single spaces and
        stripped of surrounding whitespace. Missing keys contribute an empty
        string.
    """
    product_columns = ('product_title', 'product_locale', 'product_description')
    pieces = [str(row.get(column, '')) for column in product_columns]
    return ' '.join(pieces).strip()


# Inline sanity check.
d = {'product_title': 'hi hello', 'product_locale': 'en', 'product_description': 'nike'}
assert get_product_input(d) == 'hi hello en nike'

In [15]:
# Derive the product text for every row.
query_prod_test['product_input'] = query_prod_test.apply(get_product_input, axis=1)

In [16]:
# Numeric weight per relevance label.
# NOTE(review): no later cell visible in this notebook reads this mapping.
scores = {
    'exact': 1.0,
    'substitute': 0.1,
    'complement': 0.01,
    'irrelevant': 0.0,
}

In [17]:
# Inspect the merged frame, including the derived query_input / product_input columns.
query_prod_test.head()

Unnamed: 0,query_id,query,query_locale,product_id,product_title,product_description,product_bullet_point,product_brand,product_color_name,product_locale,query_input,product_input
0,33777,!qscreen fence without holes,us,B000HMCA7W,"Tenax 60041989 Multi-Purpose Net, 3' x 50', Black",,"Mesh .70""X.98""\nVirtually invisible\nWill not ...",Tenax,Black,us,!qscreen fence without holes us,"Tenax 60041989 Multi-Purpose Net, 3' x 50', Bl..."
1,33777,!qscreen fence without holes,us,B0019Y6TTC,Garden Creations JB4710 Extendable Instant Fence,,EXPANDABLE FLEXI-SCREEN stretch your fence to ...,"Creation's Garden Natural Products, Inc",Brown,us,!qscreen fence without holes us,Garden Creations JB4710 Extendable Instant Fen...
2,33777,!qscreen fence without holes,us,B001OJXVKW,Windscreen4less Heavy Duty Privacy Screen Fenc...,,"Product measures exactly 50' long x 5'8"" tall,...",Windscreen4less,Black,us,!qscreen fence without holes us,Windscreen4less Heavy Duty Privacy Screen Fenc...
3,33777,!qscreen fence without holes,us,B0030E1WLE,"Richell Freestanding Pet Gate, Large, Origami ...",,Freestanding gate with side panels for safely ...,Richell,Origami White,us,!qscreen fence without holes us,"Richell Freestanding Pet Gate, Large, Origami ..."
4,33777,!qscreen fence without holes,us,B003VNKLIY,Regalo 192-Inch Super Wide Adjustable Baby Gat...,,SUPERWIDE: Fits opens up to 192 inches wide an...,Regalo,White,us,!qscreen fence without holes us,Regalo 192-Inch Super Wide Adjustable Baby Gat...


In [18]:

# Collect frames that together hold exactly 50 rows per (query_id, query_locale)
# group: large groups are cut down, small groups are padded with duplicates.
test = []
for _group_key, group_rows in query_prod_test.groupby(['query_id', 'query_locale']):
    group_size = group_rows.shape[0]
    if group_size >= 50:
        # Keep a seeded random subset of 50 rows.
        test.append(group_rows.sample(n=50, random_state=17))
        continue
    # All original rows (shuffled with a fixed seed) ...
    test.append(group_rows.sample(n=group_size, random_state=17))
    # ... plus enough rows drawn with replacement from the same group to reach 50.
    oversampled = group_rows.sample(frac=(50 // group_size) + 1.0, replace=True, random_state=17)
    test.append(oversampled.sample(n=50 - group_size, random_state=17))

In [19]:
# Stack the per-group frames into one frame with a fresh 0..n-1 index.
final_test_df = pd.concat(test, ignore_index=True)

In [20]:
# Every (query_id, query_locale) group must now hold exactly 50 rows; the
# offending group key is reported on failure.
for group_key, group_rows in final_test_df.groupby(['query_id', 'query_locale']):
    assert group_rows.shape[0] == 50, group_key

In [21]:
# Peek at the first two rows of the 50-rows-per-group frame.
final_test_df.head(2)

Unnamed: 0,query_id,query,query_locale,product_id,product_title,product_description,product_bullet_point,product_brand,product_color_name,product_locale,query_input,product_input
0,33777,!qscreen fence without holes,us,B07MFP4PPQ,Sunnyglade 6 feet x 50 feet Privacy Screen Fen...,,Privacy Solution: Most economical way for priv...,Sunnyglade,Green,us,!qscreen fence without holes us,Sunnyglade 6 feet x 50 feet Privacy Screen Fen...
1,33777,!qscreen fence without holes,us,B01ESSA9VO,Flux Phenom Magnetic Screen Door - Retractable...,The Flux Phenom magnetic screen door is made f...,🔨Installs in an Instant: Our magnetic door scr...,Flux Phenom,Black,us,!qscreen fence without holes us,Flux Phenom Magnetic Screen Door - Retractable...


In [22]:
# Convert to: query_input[SEP] prod_input1 [SEP] prod_input2 ... [SEP] prod_inputn [SEP]
def ranking_format(df):
    """Pack all products of a query into a single model input string.

    Rows are grouped by (query_input, query_locale, query_id). Including
    query_id in the key keeps two different queries that share the same text
    and locale in separate output rows instead of merging their products under
    the first query_id. When every (query_input, query_locale) pair maps to a
    single query_id the result is the same as grouping by text and locale only.

    Args:
        df: DataFrame with the columns query_input, query_locale, query_id,
            product_input and product_id.

    Returns:
        DataFrame with the columns input, prod_ids, query_id and query_locale,
        one row per group in sorted group-key order. ``prod_ids`` holds the
        product ids in the same order as the products appear in ``input``.
    """
    train = {'input': [], 'prod_ids': [], 'query_id': [], 'query_locale': []}
    group_columns = ['query_input', 'query_locale', 'query_id']
    for (query, query_locale, _query_id), _df in df.groupby(group_columns):
        # The first separator is deliberately written without a leading space,
        # exactly as before, so the generated strings stay unchanged.
        # NOTE(review): presumably this matches the format used at training time - confirm.
        product_part = ''.join(text + ' [SEP] ' for text in _df.product_input.to_list())
        input_str = query + '[SEP] ' + product_part
        train['input'].append(input_str.strip())
        train['prod_ids'].append(_df.product_id.to_list())
        train['query_id'].append(_df.query_id.to_list()[0])
        train['query_locale'].append(query_locale)
    # Kept as from_dict + transpose so the resulting column dtypes do not change.
    return pd.DataFrame.from_dict(train, orient='index').transpose()

In [23]:
# One row per query: packed input text plus the matching list of product ids.
compr_test_df = final_test_df.pipe(ranking_format)

In [24]:
# Preview the per-query rows produced by ranking_format.
compr_test_df.head()

Unnamed: 0,input,prod_ids,query_id,query_locale
0,!qscreen fence without holes us[SEP] Sunnyglad...,"[B07MFP4PPQ, B01ESSA9VO, B018JYCBWI, B07D7TBSG...",33777,us
1,#1 rated resveratrol supplement without tea le...,"[B08J1ZDJTY, B07B1247Q6, B01N3LZY3O, B08N2XFKX...",33778,us
2,#10 envelopes without security tint us[SEP] Qu...,"[B08P2FWYKG, B074M9XL6R, B06VVLD2GL, B08TRNHQW...",33779,us
3,#10 standard no tint no window not self seal u...,"[B0797ZRR13, B0186D2JGK, B00W3OBU3C, B07C28VWV...",33780,us
4,$100 things that are not electronics us[SEP] 2...,"[B08L37T43T, B083S3ZXDF, B07XG6Y847, B007XVYPS...",33781,us


In [25]:
# (number of packed query rows, number of columns)
compr_test_df.shape

(7246, 4)

In [26]:
import ast
# ranking_format() stores real Python lists in prod_ids, so only string-encoded
# lists need parsing; everything else is copied as a list. This avoids the
# str() -> literal_eval round-trip, which fails for any element whose repr is
# not a Python literal.
compr_test_df['prod_ids'] = compr_test_df['prod_ids'].apply(
    lambda ids: ast.literal_eval(ids) if isinstance(ids, str) else list(ids)
)

In [27]:
# Column dtypes and non-null counts of the packed frame.
compr_test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7246 entries, 0 to 7245
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   input         7246 non-null   object
 1   prod_ids      7246 non-null   object
 2   query_id      7246 non-null   object
 3   query_locale  7246 non-null   object
dtypes: object(4)
memory usage: 226.6+ KB


In [28]:
# Preview again after the prod_ids column was reassigned.
compr_test_df.head()

Unnamed: 0,input,prod_ids,query_id,query_locale
0,!qscreen fence without holes us[SEP] Sunnyglad...,"[B07MFP4PPQ, B01ESSA9VO, B018JYCBWI, B07D7TBSG...",33777,us
1,#1 rated resveratrol supplement without tea le...,"[B08J1ZDJTY, B07B1247Q6, B01N3LZY3O, B08N2XFKX...",33778,us
2,#10 envelopes without security tint us[SEP] Qu...,"[B08P2FWYKG, B074M9XL6R, B06VVLD2GL, B08TRNHQW...",33779,us
3,#10 standard no tint no window not self seal u...,"[B0797ZRR13, B0186D2JGK, B00W3OBU3C, B07C28VWV...",33780,us
4,$100 things that are not electronics us[SEP] 2...,"[B08L37T43T, B083S3ZXDF, B07XG6Y847, B007XVYPS...",33781,us


In [29]:
# Each packed row must carry exactly 50 product ids; the failing row label is
# reported otherwise.
for row_label, candidate_ids in compr_test_df['prod_ids'].items():
    assert len(candidate_ids) == 50, row_label

In [30]:
from transformers import pipeline
import torch
# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print(device)
pipe = pipeline(task="text-classification", model=model, tokenizer=tokenizer, device=device)

0


In [31]:
from transformers import pipeline
from torch.utils.data import Dataset
from tqdm.auto import tqdm

# Work on a copy with a clean 0..n-1 index so positions line up with the predictions list.
sample = compr_test_df.reset_index(drop=True)

class MyDataset(Dataset):
    """Dataset that serves the ``input`` text of each row of a DataFrame.

    Args:
        df: DataFrame containing an ``input`` column; rows are addressed by
            position, not by index label.
    """

    def __init__(self, df):
        # Zero-argument super() so the parent initialiser actually runs;
        # ``super(Dataset)`` would only create an unbound super object.
        super().__init__()
        self.df = df

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, i):
        """Return the ``input`` value of the row at position ``i``."""
        return self.df.iloc[i, self.df.columns.get_loc("input")]

dataset = MyDataset(sample)
preds = []

# Stream the dataset through the pipeline in batches and keep every output in
# dataset order. return_all_scores=True is kept as is: the next cell reads each
# output as a sequence of dicts with a 'score' key and uses the position in
# that sequence as the product slot.
batch_size = 8
print("-" * 30)
print(f"Streaming batch_size={batch_size}")
scored_stream = pipe(dataset, batch_size=batch_size, truncation=True, return_all_scores=True)
preds.extend(tqdm(scored_stream, total=len(dataset)))

------------------------------
Streaming batch_size=8


  0%|          | 0/7246 [00:00<?, ?it/s]

In [32]:
# For every query, pair each output position with its score and order the
# pairs from highest to lowest score (ties keep their original order).
len_preds = 50
final_preds = []
for pred in preds:
    indexed_scores = [(position, entry['score']) for position, entry in enumerate(pred)]
    indexed_scores.sort(key=lambda pair: pair[1], reverse=True)
    assert len(indexed_scores) == len_preds
    final_preds.append(indexed_scores)

In [33]:
# Number of (query, product) rows in the test file, for comparison with the submission.
test_df.shape

(167799, 4)

In [34]:
# Fresh copy of the packed frame with a 0..n-1 index; a single reset is enough,
# resetting an already reset index changes nothing.
sample = compr_test_df.reset_index(drop=True)

In [35]:
# Preview the frame that the submission loop iterates over.
sample.head()

Unnamed: 0,input,prod_ids,query_id,query_locale
0,!qscreen fence without holes us[SEP] Sunnyglad...,"[B07MFP4PPQ, B01ESSA9VO, B018JYCBWI, B07D7TBSG...",33777,us
1,#1 rated resveratrol supplement without tea le...,"[B08J1ZDJTY, B07B1247Q6, B01N3LZY3O, B08N2XFKX...",33778,us
2,#10 envelopes without security tint us[SEP] Qu...,"[B08P2FWYKG, B074M9XL6R, B06VVLD2GL, B08TRNHQW...",33779,us
3,#10 standard no tint no window not self seal u...,"[B0797ZRR13, B0186D2JGK, B00W3OBU3C, B07C28VWV...",33780,us
4,$100 things that are not electronics us[SEP] 2...,"[B08L37T43T, B083S3ZXDF, B07XG6Y847, B007XVYPS...",33781,us


In [36]:
# Assemble the submission: for each query, emit its products in descending
# score order, followed by the products of that query that were not emitted
# through the ranking (e.g. rows dropped when a group was cut to 50).
output = {'product_id': [], 'query_id': []}
for index, row in sample.iterrows():
    query_key = str(row['query_id'])
    candidate_ids = row['prod_ids']
    ranked = final_preds[index]
    # Checked once per query: every ranked position must map to a candidate id.
    assert len(candidate_ids) == len(ranked), index
    # Work on a copy so query_prod is not modified and this cell can be re-run
    # with the same result.
    remaining = set(query_prod[query_key][str(row['query_locale'])])
    for position, _score in ranked:
        product = candidate_ids[position]
        # Skips padding duplicates (already emitted) and ids not listed for this query.
        if product in remaining:
            output['query_id'].append(query_key)
            output['product_id'].append(product)
            remaining.remove(product)
    # Unranked leftovers go last; sorted() gives a reproducible order, whereas
    # iterating the set directly depends on string hashing and can differ
    # between kernel sessions.
    for product in sorted(remaining):
        output['query_id'].append(query_key)
        output['product_id'].append(product)

In [37]:
# Spot-check: number of product ids stored on the row at position 4239.
len(sample.iloc[4239]['prod_ids'])

50

In [38]:
# Both lists in `output` are appended to in lockstep, so they have equal length
# and can be turned into columns directly; this avoids building a
# 2 x n_rows wide frame only to transpose it.
output_df = pd.DataFrame(output)

In [39]:
# Sum, over all packed rows, of the number of distinct product ids in each row.
total = sum(len(set(candidate_ids)) for candidate_ids in sample['prod_ids'])

In [40]:
# Preview the ranked (product_id, query_id) submission rows.
output_df.head()

Unnamed: 0,product_id,query_id
0,B078RHZTVK,33777
1,B07R3TNQDM,33777
2,B01N4AMZ8Y,33777
3,B072M34RQC,33777
4,B0079GH6J6,33777


In [41]:
# Write the submission to the working directory without the DataFrame index column.
output_df.to_csv('./output.csv', index=False)

In [42]:
# Row count of the submission; it should equal the row count of test_df shown in the next cell.
output_df.shape

(167799, 2)

In [43]:
# Row count of the test file, to compare against output_df.shape above.
test_df.shape

(167799, 4)