In [41]:
import sys
sys.path.append('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/')
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import pandas as pd 
import numpy as np 
import datasets
from datasets import Dataset as TFDataset 
import pickle
from bm25.rank_bm25 import BM25Okapi
import random
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache, partial
from tqdm import tqdm, trange
from collections import Counter, defaultdict
from transformers import PreTrainedTokenizer, AutoTokenizer
import multiprocessing

In [42]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [43]:
def tokenize_function(examples, corpus_col_name, tokenizer, max_length):
    if corpus_col_name in examples:
        return tokenizer(examples[corpus_col_name], 
            add_special_tokens=False, # don't add special tokens when preprocess
            truncation=True, 
            max_length=max_length,
            return_attention_mask=False,
            return_token_type_ids=False)

In [44]:
def construct_query_list_from_sessions(sessions_df:pd.DataFrame, product_map:dict, max_seq_len:int, product_corpus:list):
    query_list = []
    for sess in tqdm(sessions_df.itertuples(), total=sessions_df.shape[0]):
        sess_locale = sess.locale
        prev_items = eval(sess.prev_items.replace(' ', ','))[-max_seq_len : ]
        prev_items = [product_map.get(sess_locale+'_'+item, 0) for item in prev_items]
        sess_query = sum([product_corpus[item] for item in prev_items], [])
        query_list.append(sess_query)
    return query_list

# Merge valid BM25 score

In [5]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_no_hist_feature.parquet'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'

In [6]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)


In [7]:
merged_candidates_feature = read_merged_candidates_feature()
product_data = read_product_data()
valid_sessions = read_valid_sessions()

In [45]:
TOKENIZER_NAME = 'xlm-roberta-base'
TITLE_MAX_LENGTH = 200

In [9]:
tokenizer = AutoTokenizer.from_pretrained(
        TOKENIZER_NAME,
        use_fast=False,
)
title_corpus = product_data[['title']]
title_corpus = pd.concat([pd.DataFrame({'title' : ['']}), title_corpus]).reset_index(drop=True) # add padding product
title_corpus['title'] = title_corpus['title'].fillna('')
title_corpus = TFDataset.from_pandas(title_corpus, preserve_index=False)
title_corpus = title_corpus.map(partial(tokenize_function, corpus_col_name='title', tokenizer=tokenizer, max_length=TITLE_MAX_LENGTH), 
                                num_proc=8, remove_columns=['title'], batched=True)
title_corpus_list = title_corpus['input_ids']

 #0:  10%|▉         | 19/194 [00:03<00:32,  5.45ba/s]
 #0:  10%|█         | 20/194 [00:04<00:33,  5.18ba/s]
 #0:  11%|█         | 21/194 [00:04<00:32,  5.31ba/s]
[A
 #0:  11%|█▏        | 22/194 [00:04<00:40,  4.30ba/s]
 #0:  12%|█▏        | 23/194 [00:04<00:38,  4.44ba/s]
 #0:  12%|█▏        | 24/194 [00:05<00:35,  4.82ba/s]
 #0:  13%|█▎        | 25/194 [00:05<00:35,  4.78ba/s]
 #0:  13%|█▎        | 26/194 [00:05<00:33,  5.05ba/s]
 #0:  14%|█▍        | 27/194 [00:05<00:31,  5.28ba/s]
[A
 #0:  14%|█▍        | 28/194 [00:06<00:39,  4.26ba/s]

[A[A
 #0:  15%|█▍        | 29/194 [00:06<00:44,  3.68ba/s]

[A[A
[A

 #0:  15%|█▌        | 30/194 [00:06<00:41,  3.96ba/s]

[A[A
 #0:  16%|█▌        | 31/194 [00:06<00:36,  4.42ba/s]

[A[A
 #0:  16%|█▋        | 32/194 [00:06<00:33,  4.83ba/s]

 #0:  17%|█▋        | 33/194 [00:07<00:32,  4.92ba/s]
[A

[A[A
[A

 #0:  18%|█▊        | 34/194 [00:07<00:32,  4.90ba/s]

[A[A
 #0:  18%|█▊        | 35/194 [00:07<00:32,  4.85ba/s]

[A[A

[

In [10]:
with open('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/bm25/cache/title_bm25.pkl', 'rb') as f:
    title_BM25 = pickle.load(f)

In [11]:
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]
merged_candidates

Unnamed: 0,sess_id,sess_locale,product
0,0,UK,B000OPPVCS
1,0,UK,B000V599Y2
2,0,UK,B0018HH444
3,0,UK,B0079JI4DU
4,0,UK,B0079JI4EY
...,...,...,...
84407334,361580,DE,B0BB7XV97M
84407335,361580,DE,B0BB7YSRBX
84407336,361580,DE,B0BB7ZMGY8
84407337,361580,DE,B0BD4CP7N3


In [12]:
product_index = product_data[['id', 'locale']]
product_index['product_index'] = product_index.index + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_index['product_index'] = product_index.index + 1


In [13]:
merged_candidates = merged_candidates.merge(product_index, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'id'])
merged_candidates['product_index'] = merged_candidates['product_index'].fillna(0)
merged_candidates['product_index'] = merged_candidates['product_index'].astype('int64')
merged_candidates.drop(columns=['id', 'locale'], inplace=True)
assert len(merged_candidates) == len(merged_candidates_feature)
merged_candidates_grouped = merged_candidates.groupby(by='sess_id')['product_index'].apply(list)

In [14]:
locale_product_map = {}
for row in tqdm(product_data.itertuples(), total=product_data.shape[0]):
    locale_product_map[row.locale+'_'+row.id] = row.Index + 1

100%|██████████| 1551057/1551057 [00:05<00:00, 287842.71it/s]


In [15]:
# construct query list 
valid_query_list = construct_query_list_from_sessions(valid_sessions, locale_product_map, 5, product_corpus=title_corpus_list)

100%|██████████| 361581/361581 [00:14<00:00, 25160.92it/s]


In [16]:
def get_sess_scores(sess):
    sess_id = sess['sess_id']
    scores = title_BM25.get_batch_scores(valid_query_list[sess_id], merged_candidates_grouped[sess_id])
    return {'sess_bm25_scores' : scores}

In [17]:
# about 10 mins
datasets.set_progress_bar_enabled(False)
valid_query_dataset = TFDataset.from_dict({'sess_id' : list(range(len(valid_query_list)))})
valid_query_dataset = valid_query_dataset.map(get_sess_scores, num_proc=10, batched=False)
datasets.set_progress_bar_enabled(True)

In [19]:
valid_scores_list = valid_query_dataset['sess_bm25_scores']

In [22]:
merged_bm25_scores = []
for scores_set in tqdm(valid_scores_list):
    for s in scores_set:
        merged_bm25_scores.append(s)
assert len(merged_bm25_scores) == len(merged_candidates)
assert len(merged_bm25_scores) == len(merged_candidates_feature)

100%|██████████| 361581/361581 [00:09<00:00, 36352.42it/s]


In [23]:
merged_candidates_feature['title_BM25_scores'] = merged_bm25_scores

In [38]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path)

In [34]:
valid_sessions.iloc[300001], valid_sessions.iloc[300001]['prev_items']

(prev_items    ['B07ZZ5JH12' 'B09KTRFTJJ']
 next_item                      B091BFSKKM
 locale                                 UK
 Name: 300001, dtype: object,
 "['B07ZZ5JH12' 'B09KTRFTJJ']")

In [35]:
product_data.iloc[locale_product_map['UK_B09KTRFTJJ'] - 1]['id'], \
product_data.iloc[locale_product_map['UK_B09KTRFTJJ'] - 1]['title'], \
product_data.iloc[locale_product_map['UK_B07ZZ5JH12'] - 1]['id'], \
product_data.iloc[locale_product_map['UK_B07ZZ5JH12'] - 1]['title']

('B09KTRFTJJ',
 'Bird Feeders for Outside, Bird feeder, Wild Bird seed for Outside Feeders, Squirrel Proof Birds Feeder, Garden Decoration Black',
 'B07ZZ5JH12',
 'Oakdale Wild Bird Feeder Pre-Filled with Premium Seeds, Large Hanging Metal Frame with Dual Perches, Refillable Lawn and Garden Outdoor Use, Enjoy Birdwatching or Birding')

In [37]:
product_data.iloc[locale_product_map['UK_B09PR5X9LY'] - 1]['id'], \
product_data.iloc[locale_product_map['UK_B09PR5X9LY'] - 1]['title'], \
product_data.iloc[locale_product_map['UK_B093GF9T5N'] - 1]['id'], \
product_data.iloc[locale_product_map['UK_B093GF9T5N'] - 1]['title']

('B09PR5X9LY',
 'Bird Feeders Hanging - Wild Bird Seed Feeder Garden Metal Bird Feeders for Garden Squirrel Proof Unusual Bird Feeders Frog',
 'B093GF9T5N',
 'Bird Feeders Hanging - Wild Bird Seed Feeder Garden Metal Bird Feeders for Garden Squirrel Proof Unusual Bird Feeders Sunflower')

In [36]:
merged_candidates_feature.query('sess_id==300001').sort_values(by=['title_BM25_scores'], ascending=False)[['sess_id', 'sess_locale', 'product', 'title_BM25_scores', 'roberta_normalized_scores']][:30]

Unnamed: 0,sess_id,sess_locale,product,title_BM25_scores,roberta_normalized_scores
70037487,300001,UK,B09PR5X9LY,247.387451,0.026942
70037452,300001,UK,B093GF9T5N,247.387451,0.027762
70037400,300001,UK,B089CTHNZK,245.563441,0.0101
70037437,300001,UK,B08RYPDTW6,236.703964,0.009866
70037448,300001,UK,B091JTV535,230.513335,0.013937
70037444,300001,UK,B08ZJJFT9S,227.260079,0.008184
70037518,300001,UK,B0BDZPDSDQ,221.808994,0.008671
70037472,300001,UK,B09CLFTRN6,221.808994,0.008671
70037406,300001,UK,B08D3FQTF3,218.61069,0.007244
70037469,300001,UK,B099DV98NN,218.417748,0.001979


In [18]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,product_index
0,0,UK,B000OPPVCS,1375599
1,0,UK,B000V599Y2,1324417
2,0,UK,B0018HH444,1413111
3,0,UK,B0079JI4DU,0
4,0,UK,B0079JI4EY,0
...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,446969
84407335,361580,DE,B0BB7YSRBX,275922
84407336,361580,DE,B0BB7ZMGY8,429872
84407337,361580,DE,B0BD4CP7N3,276547


In [17]:
merged_candidates_grouped

sess_id
0         [1375599, 1324417, 1413111, 0, 0, 970646, 1132...
1         [826127, 673569, 751275, 889131, 654649, 77556...
2         [1149066, 1253359, 1343812, 1310769, 960407, 9...
3         [1186226, 1165726, 1126038, 1410888, 1153463, ...
4         [766710, 592913, 695302, 904349, 882975, 76287...
                                ...                        
361576    [1134842, 1094904, 1212843, 1260859, 1094344, ...
361577    [843495, 679363, 887052, 666868, 521356, 67026...
361578    [111083, 428215, 21233, 56473, 116992, 87684, ...
361579    [140800, 457181, 477726, 329896, 479625, 33046...
361580    [476029, 0, 287329, 372558, 232874, 8876, 8945...
Name: product_index, Length: 361581, dtype: object

In [21]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,product_index
0,0,UK,B000OPPVCS,1375599
1,0,UK,B000V599Y2,1324417
2,0,UK,B0018HH444,1413111
3,0,UK,B0079JI4DU,0
4,0,UK,B0079JI4EY,0
...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,446969
84407335,361580,DE,B0BB7YSRBX,275922
84407336,361580,DE,B0BB7ZMGY8,429872
84407337,361580,DE,B0BD4CP7N3,276547


In [40]:
merged_candidates_feature['title_BM25_scores']

0           298.915375
1           111.069756
2             0.000000
3             0.000000
4             0.000000
               ...    
84407334    118.126396
84407335    124.881615
84407336    124.881615
84407337    192.540955
84407338      0.000000
Name: title_BM25_scores, Length: 84407339, dtype: float32

In [20]:
valid_query_dataset

Dataset({
    features: ['sess_id', 'sess_bm25_scores'],
    num_rows: 361581
})

# Merge test BM25 score

In [5]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test_no_hist_feature.parquet'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'

In [6]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)


In [7]:
merged_candidates_feature_test = read_merged_candidates_feature_test()
product_data = read_product_data()
test_sessions = read_test_sessions()

In [8]:
TOKENIZER_NAME = 'xlm-roberta-base'
TITLE_MAX_LENGTH = 200

In [9]:
tokenizer = AutoTokenizer.from_pretrained(
        TOKENIZER_NAME,
        use_fast=False,
)
title_corpus = product_data[['title']]
title_corpus = pd.concat([pd.DataFrame({'title' : ['']}), title_corpus]).reset_index(drop=True) # add padding product
title_corpus['title'] = title_corpus['title'].fillna('')
title_corpus = TFDataset.from_pandas(title_corpus, preserve_index=False)
title_corpus = title_corpus.map(partial(tokenize_function, corpus_col_name='title', tokenizer=tokenizer, max_length=TITLE_MAX_LENGTH), 
                                num_proc=8, remove_columns=['title'], batched=True)
title_corpus_list = title_corpus['input_ids']

 #0:  16%|█▌        | 31/194 [00:06<00:35,  4.65ba/s]
 #0:  16%|█▋        | 32/194 [00:07<00:48,  3.37ba/s]
 #0:  17%|█▋        | 33/194 [00:07<00:41,  3.85ba/s]
 #0:  18%|█▊        | 34/194 [00:07<00:36,  4.33ba/s]
 #0:  18%|█▊        | 35/194 [00:07<00:33,  4.72ba/s]
 #0:  19%|█▊        | 36/194 [00:07<00:31,  4.99ba/s]
 #0:  19%|█▉        | 37/194 [00:07<00:30,  5.20ba/s]
 #0:  20%|██        | 39/194 [00:08<00:29,  5.28ba/s]
 #0:  21%|██        | 40/194 [00:08<00:34,  4.49ba/s]
 #0:  21%|██        | 41/194 [00:08<00:31,  4.79ba/s]
 #0:  22%|██▏       | 42/194 [00:08<00:29,  5.07ba/s]
 #0:  22%|██▏       | 43/194 [00:09<00:28,  5.24ba/s]

[A[A
 #0:  23%|██▎       | 44/194 [00:09<00:28,  5.35ba/s]
[A

 #0:  23%|██▎       | 45/194 [00:09<00:27,  5.51ba/s]

[A[A
 #0:  24%|██▎       | 46/194 [00:09<00:26,  5.58ba/s]

[A[A
[A

 #0:  24%|██▍       | 47/194 [00:10<00:26,  5.62ba/s]

 #0:  25%|██▍       | 48/194 [00:10<00:35,  4.08ba/s]
[A

 #0:  25%|██▌       | 49/194 [00:10<00:32,

In [18]:
with open('/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/bm25/cache/title_bm25.pkl', 'rb') as f:
    title_BM25 = pickle.load(f)

In [10]:
merged_candidates = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]
merged_candidates

Unnamed: 0,sess_id,sess_locale,product
0,0,DE,4088833651
1,0,DE,B000H6W2GW
2,0,DE,B000JG2RAG
3,0,DE,B000RYSOUW
4,0,DE,B000UGZVQM
...,...,...,...
69428426,316970,UK,B0BJCTH4NH
69428427,316970,UK,B0BJTQQWLG
69428428,316970,UK,B0BJV3RL4H
69428429,316970,UK,B0BK7SPC84


In [11]:
product_index = product_data[['id', 'locale']]
product_index['product_index'] = product_index.index + 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  product_index['product_index'] = product_index.index + 1


In [12]:
merged_candidates = merged_candidates.merge(product_index, how='left', left_on=['sess_locale', 'product'], right_on=['locale', 'id'])
merged_candidates['product_index'] = merged_candidates['product_index'].fillna(0)
merged_candidates['product_index'] = merged_candidates['product_index'].astype('int64')
merged_candidates.drop(columns=['id', 'locale'], inplace=True)
assert len(merged_candidates) == len(merged_candidates_feature_test)
merged_candidates_grouped = merged_candidates.groupby(by='sess_id')['product_index'].apply(list)

In [13]:
locale_product_map = {}
for row in tqdm(product_data.itertuples(), total=product_data.shape[0]):
    locale_product_map[row.locale+'_'+row.id] = row.Index + 1

100%|██████████| 1551057/1551057 [00:05<00:00, 269858.44it/s]


In [14]:
# construct query list 
test_query_list = construct_query_list_from_sessions(test_sessions, locale_product_map, 5, product_corpus=title_corpus_list)

100%|██████████| 316971/316971 [00:09<00:00, 32692.67it/s]


In [27]:
def get_sess_scores(sess):
    sess_id = sess['sess_id']
    scores = title_BM25.get_batch_scores(test_query_list[sess_id], merged_candidates_grouped[sess_id])
    return {'sess_bm25_scores' : scores}

In [45]:
datasets.set_progress_bar_enabled(False)
test_query_dataset = TFDataset.from_dict({'sess_id' : list(range(len(test_query_list)))})
test_query_dataset = test_query_dataset.map(get_sess_scores, num_proc=10, batched=False)
datasets.set_progress_bar_enabled(True)

In [51]:
test_scores_list = test_query_dataset['sess_bm25_scores']

In [66]:
merged_bm25_scores = []
for scores_set in tqdm(test_scores_list):
    for s in scores_set:
        merged_bm25_scores.append(s)
assert len(merged_bm25_scores) == len(merged_candidates)
assert len(merged_bm25_scores) == len(merged_candidates_feature_test)

100%|██████████| 316971/316971 [00:09<00:00, 34849.68it/s]


In [68]:
merged_candidates_feature_test['title_BM25_scores'] = merged_bm25_scores

In [95]:
cast_dtype(merged_candidates_feature_test)
merged_candidates_feature_test.to_parquet(merged_candidates_feature_test_path)

In [81]:
test_sessions.iloc[200000], test_sessions.iloc[200000]['prev_items']

(prev_items    ['B09NBQKRPC' 'B09NBQKRPC' 'B0BHQQQK2D']
 locale                                              JP
 Name: 200000, dtype: object,
 "['B09NBQKRPC' 'B09NBQKRPC' 'B0BHQQQK2D']")

In [93]:
product_data.iloc[locale_product_map['JP_B0BHQQQK2D'] - 1]['id'], \
product_data.iloc[locale_product_map['JP_B0BHQQQK2D'] - 1]['title'], \
product_data.iloc[locale_product_map['JP_B09NBQKRPC'] - 1]['id'], \
product_data.iloc[locale_product_map['JP_B09NBQKRPC'] - 1]['title']

('B0BHQQQK2D',
 'ライトニングケーブル USB-C 短い 0.5M【MFi認証/2022進化モデル】iPhone 充電ケーブル タイプC NINGKPOW lightning ケーブル type-c ライトニングケーブル タイプc 断線防止 超高耐久 PD対応 iPhone 13/12/11/XS Max/XR/X/SE/8 Plus 各種対応-グレー',
 'B09NBQKRPC',
 'RAVIAD USB C ライトニングケーブル 【0.5M/MFi 認証】 iPhone 充電ケーブル 急速充電 データ転送 高耐久 タイプC ライトニングケーブル PD対応 iPhone 13/13 Pro/13 Pro Max/12/12 Pro/12 Pro Max/12 mini/11 Pro Max/SE/XS/XR/X/8/8 Plus各種対応 Type C Lightningケーブル')

In [86]:
product_data.iloc[locale_product_map['JP_B09V2MMFX4'] - 1]['title'], product_data.iloc[locale_product_map['JP_B08QVNCF14'] - 1]['title']

('RAVIAD USB C ライトニングケーブル 【1M/MFi 認証】 iPhone 充電ケーブル 急速充電 データ転送 高耐久 タイプC ライトニングケーブル PD対応 iPhone 13/13 Pro/13 Pro Max/12/12 Pro/12 Pro Max/12 mini/11 Pro Max/SE/XS/XR/X/8/8 Plus各種対応 Type C Lightningケーブル',
 'RAVIAD USB C ライトニングケーブル 【2M/MFi 認証】 iPhone 充電ケーブル 急速充電 データ転送 高耐久 タイプC ライトニングケーブル PD対応 iPhone 13/13 Pro/13 Pro Max/12/12 Pro/12 Pro Max/12 mini/11 Pro Max/SE/XS/XR/X/8/8 Plus各種対応 Type C Lightningケーブル')

In [84]:
merged_candidates_feature_test.query('sess_id==200000').sort_values(by=['title_BM25_scores'], ascending=False)[['sess_id', 'sess_locale', 'product', 'title_BM25_scores']]

Unnamed: 0,sess_id,sess_locale,product,title_BM25_scores
43632311,200000,JP,B09V2MMFX4,820.670510
43632253,200000,JP,B08QVNCF14,820.670510
43632252,200000,JP,B08QVJ2BDF,820.670510
43632310,200000,JP,B09TVFSQ7F,796.628235
43632246,200000,JP,B08PS31FCM,788.036359
...,...,...,...,...
43632308,200000,JP,B09TPDVRT9,0.000000
43632171,200000,JP,B07FF436N5,0.000000
43632307,200000,JP,B09TMF6742,0.000000
43632224,200000,JP,B083NMTVS5,0.000000


In [94]:
merged_candidates_feature_test

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,sasrec_normalized_scores_2,gru4rec_scores,gru4rec_normalized_scores,product_freq,sess_avg_price,product_price,...,gru4rec_normalized_scores_2,co_graph_counts_0,normalized_co_graph_counts_0,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,roberta_scores,roberta_normalized_scores,title_BM25_scores
0,0,DE,4088833651,0.000000,2.975813e-09,0.000000,1.580065e-09,828,25.195269,36.761604,...,1.326730e-09,0,0.0,0.0,0.0,0,0.0,0.000000,0.000000,0.000000
1,0,DE,B000H6W2GW,0.000000,2.975813e-09,0.000000,1.580065e-09,875,25.195269,36.761604,...,1.326730e-09,0,0.0,0.0,0.0,0,0.0,0.000000,0.000000,0.000000
2,0,DE,B000JG2RAG,7.665308,6.347557e-06,8.104032,5.226502e-06,24,25.195269,23.190001,...,1.152972e-04,0,0.0,0.0,0.0,0,0.0,267.192719,0.004943,287.809594
3,0,DE,B000RYSOUW,-2.951060,1.555882e-10,-2.857798,9.068785e-11,5,25.195269,6.900000,...,1.461790e-10,0,0.0,0.0,0.0,0,0.0,267.322815,0.005629,321.394645
4,0,DE,B000UGZVQM,3.977920,1.589257e-07,4.688567,1.717488e-07,4,25.195269,21.990000,...,6.919625e-06,0,0.0,0.0,0.0,0,0.0,267.242462,0.005195,285.328702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69428426,316970,UK,B0BJCTH4NH,11.327528,1.041200e-04,10.629994,3.818184e-04,74,16.950001,5.800000,...,2.638649e-04,0,0.0,0.0,0.0,0,0.0,270.043762,0.014921,449.867410
69428427,316970,UK,B0BJTQQWLG,5.604142,3.403292e-07,6.052083,3.923694e-06,6,16.950001,9.880000,...,1.243056e-05,0,0.0,0.0,0.0,0,0.0,269.350769,0.007462,431.585801
69428428,316970,UK,B0BJV3RL4H,9.146974,1.176336e-05,7.667603,1.973815e-05,7,16.950001,22.097065,...,6.248733e-05,0,0.0,0.0,0.0,0,0.0,269.313751,0.007191,419.572671
69428429,316970,UK,B0BK7SPC84,-10.383047,3.879279e-14,-6.356799,1.601719e-11,0,16.950001,5.960000,...,2.368389e-12,0,0.0,0.0,0.0,0,0.0,270.200653,0.017456,420.993555
