# Necessary Common Functions

Those functions should be ran before each part.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [2]:
def get_sessions(df: pd.DataFrame, test=False, list_item=False) -> list:
    
    all_item = []
    if 'next_item' in df and not test:
        if list_item:
            for i in trange(len(df)):
                all_item.append(np.concatenate([np.array(df.loc[i, 'prev_items']), np.array(df.loc[i, 'next_item'])], axis=0))
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items'][:-1]+f" '{df.loc[i, 'next_item']}']").replace(" ", ",")))
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items']).replace(" ", ",")))
    return all_item

In [3]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            
            if max_dis == None:
                e = len(sess)
            else:
                e = min(i + max_dis + 1, len(sess))

            for j in range(i+1, e):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res

In [4]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [5]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [6]:
def get_session_last_item(session_df):
    last_items = []
    num_sessions = len(session_df)
    for i in tqdm(range(num_sessions)):
        sess = session_df.iloc[i]
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        last_item = product_list[-1].strip("'\n")

        last_items.append(last_item)
    return last_items 

In [7]:
def get_co_graph_counts(session_last_items, merged_candidates_df, co_graph_dict):
    co_graph_count_list = []
    for idx, row in tqdm(merged_candidates_df.iterrows(), total=merged_candidates_df.shape[0]):
        sess_id = row['sess_id']
        product = row['product']
        last_item = session_last_items[sess_id]
        co_graph_count = co_graph_dict[last_item][product]
        co_graph_count_list.append(co_graph_count)
    return co_graph_count_list

In [8]:
def flatten_co_graph_dict(co_graph_dict):
    product_list = []
    neighbor_list = []
    counts_list = []
    for product in tqdm(co_graph_dict.keys(), total=len(co_graph_dict)):
        for neigh in co_graph_dict[product].keys():
            product_list.append(product)
            neighbor_list.append(neigh)
            counts_list.append(co_graph_dict[product][neigh])
    return pd.DataFrame({'product_' : product_list, 'neighbor' : neighbor_list, 'counts' : counts_list})

In [9]:
def normalize_co_graph_counts(merged_candidates_counts):
    # normalize co graph counts 
    # merged_candidates_counts_g = cudf.from_pandas(merged_candidates_counts)
    sessions_count_sum = merged_candidates_counts[['sess_id', 'counts']].groupby('sess_id').sum()
    sessions_count_sum.sort_index(inplace=True)

    # sessions_count_sum = sessions_count_sum.to_pandas()

    candidates_count_sum = sessions_count_sum.loc[merged_candidates_counts['sess_id']].reset_index(drop=True)
    merged_candidates_counts['counts_sum'] = candidates_count_sum['counts']
    merged_candidates_counts['normalized_counts'] = merged_candidates_counts['counts'] / merged_candidates_counts['counts_sum']

    # del merged_candidates_counts_g
    # del sessions_count_sum_g
    

# Merge valid co-graph counts 

In [10]:
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates_phase2/merged_candidates_150_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions_phase2.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1_phase2.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [11]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    return pd.read_parquet(merged_candidates_feature_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [12]:
merged_candidates_feature = read_merged_candidates_feature()

In [13]:
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]

In [14]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [15]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, test=True, list_item=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3966659/3966659 [02:44<00:00, 24108.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261816/261816 [00:08<00:00, 30671.89it/s]


In [16]:
valid_session_last_items = get_session_last_item(valid_sess_data)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261816/261816 [00:11<00:00, 22503.70it/s]


In [17]:
valid_session_last_items = np.array(valid_session_last_items)
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


## bidirection

In [18]:
# valid is included in train 
co_occurence_dict_bi = get_co_occurence_dict(train_sess_item, bidirection=True, weighted=False)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3966659/3966659 [01:39<00:00, 40037.31it/s]


In [19]:
merged_candidates_feature = read_merged_candidates_feature()
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]
valid_session_last_items = get_session_last_item(valid_sess_data)
valid_session_last_items = np.array(valid_session_last_items)
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261816/261816 [00:11<00:00, 22296.58it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [20]:
co_occurence_dict_bi_df = flatten_co_graph_dict(co_occurence_dict_bi)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1338342/1338342 [00:26<00:00, 50758.70it/s]


In [21]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_occurence_dict_bi_df_g = cudf.from_pandas(co_occurence_dict_bi_df)

In [22]:
# merged_candidates_bi_g = merged_candidates_g.merge(co_occurence_dict_bi_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
# merged_candidates_bi_g.drop(columns=['product_', 'neighbor'], inplace=True)
# merged_candidates_bi_g = merged_candidates_bi_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_bi_g.reset_index(drop=True, inplace=True)
# merged_candidates_bi_g['counts'] = merged_candidates_bi_g['counts'].fillna(0)
# assert len(merged_candidates_bi_g['counts']) == len(merged_candidates)
# merged_candidates_bi = merged_candidates_bi_g.to_pandas()

In [23]:
merged_candidates_bi = merged_candidates.merge(co_occurence_dict_bi_df, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_bi.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_bi = merged_candidates_bi.sort_values(by=['sess_id', 'product'])
merged_candidates_bi.reset_index(drop=True, inplace=True)
merged_candidates_bi['counts'] = merged_candidates_bi['counts'].fillna(0)
assert len(merged_candidates_bi['counts']) == len(merged_candidates)
# merged_candidates_bi = merged_candidates_bi_g.to_pandas()

In [24]:
# del merged_candidates_g
# del co_occurence_dict_bi_df_g
# del merged_candidates_bi_g

In [25]:
normalize_co_graph_counts(merged_candidates_bi)

In [26]:
merged_candidates_feature['co_graph_counts_0'] = merged_candidates_bi['counts']
merged_candidates_feature['normalized_co_graph_counts_0'] = merged_candidates_bi['normalized_counts']

In [27]:
merged_candidates_feature.query('sess_id==20010').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_0', 'normalized_co_graph_counts_0']][:25]

Unnamed: 0,product,sasrec_scores_2,co_graph_counts_0,normalized_co_graph_counts_0
5968515,B08737HD3S,23.872364,0.0,
5968602,B09CDPZF8C,23.619312,0.0,
5968527,B08DRBQ7P2,23.481756,0.0,
5968581,B097D4NTBN,23.15797,0.0,
5968595,B09BN9P5Y2,22.849251,0.0,
5968431,B01BZOKLOY,22.255867,0.0,
5968682,B0BHDJNL3K,22.105152,0.0,
5968583,B0988TCBT6,21.59745,0.0,
5968482,B07W3T98LC,21.548342,0.0,
5968523,B08BNL9HXF,21.398357,0.0,


In [28]:
cast_dtype(merged_candidates_feature, ['co_graph_counts_0', 'normalized_co_graph_counts_0'])
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [29]:
merged_candidates_feature.columns

Index(['sess_id', 'sess_locale', 'product', 'target', 'sess_avg_price',
       'product_price', 'sasrec_scores_3', 'normalized_sasrec_scores_3',
       'sasrec_scores_2', 'normalized_sasrec_scores_2', 'seqmlp_scores',
       'normalized_seqmlp_scores', 'narm_scores', 'normalized_narm_scores',
       'gru4rec_scores_2', 'normalized_gru4rec_scores_2', 'title_BM25_scores',
       'desc_BM25_scores', 'normalized_all_items_co_graph_count_0',
       'all_items_co_graph_count_0', 'normalized_all_items_co_graph_count_1',
       'all_items_co_graph_count_1', 'normalized_all_items_co_graph_count_2',
       'all_items_co_graph_count_2', 'co_graph_counts_0',
       'normalized_co_graph_counts_0', 'co_graph_counts_1',
       'normalized_co_graph_counts_1', 'co_graph_counts_2',
       'normalized_co_graph_counts_2', 'cos_text_bert_scores',
       'text_bert_scores', 'normalized_text_bert_scores', 'roberta_scores',
       'normalized_roberta_scores', 'product_freq'],
      dtype='object')

## uni and weight

In [30]:
co_occurence_dict_uni_weight = get_co_occurence_dict(train_sess_item, bidirection=False, weighted=True)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3966659/3966659 [01:16<00:00, 51701.19it/s]


In [31]:
co_graph_uni_weight_df = flatten_co_graph_dict(co_occurence_dict_uni_weight)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1338342/1338342 [00:19<00:00, 67054.00it/s]


In [32]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_graph_uni_weight_df_g = cudf.from_pandas(co_graph_uni_weight_df)

In [33]:
merged_candidates_uni_weight = merged_candidates.merge(co_graph_uni_weight_df, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_uni_weight.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_uni_weight = merged_candidates_uni_weight.sort_values(by=['sess_id', 'product'])
merged_candidates_uni_weight.reset_index(drop=True, inplace=True)
merged_candidates_uni_weight['counts'] = merged_candidates_uni_weight['counts'].fillna(0)
assert len(merged_candidates_uni_weight['counts']) == len(merged_candidates)
# merged_candidates_uni_weight = merged_candidates_uni_weight_g.to_pandas()

In [34]:
# del merged_candidates_g
# del co_graph_uni_weight_df_g
# del merged_candidates_uni_weight_g

In [35]:
normalize_co_graph_counts(merged_candidates_uni_weight)

In [36]:
merged_candidates_feature['co_graph_counts_1'] = merged_candidates_uni_weight['counts']
merged_candidates_feature['normalized_co_graph_counts_1'] = merged_candidates_uni_weight['normalized_counts']

In [37]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [38]:
merged_candidates_uni_weight

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_sum,normalized_counts
0,0,DE,355165591X,B09Y94D1D3,0.000000,148.912915,0.000000
1,0,DE,3833237058,B09Y94D1D3,0.090909,148.912915,0.000610
2,0,DE,B00CIXSI6U,B09Y94D1D3,0.000000,148.912915,0.000000
3,0,DE,B00NVDOWUW,B09Y94D1D3,0.000000,148.912915,0.000000
4,0,DE,B00NVDP3ZU,B09Y94D1D3,0.000000,148.912915,0.000000
...,...,...,...,...,...,...,...
78842194,261815,UK,B0BCX524Y6,B0080ZSQKS,0.000000,364.690102,0.000000
78842195,261815,UK,B0BCX6QB4L,B0080ZSQKS,4.750000,364.690102,0.013025
78842196,261815,UK,B0BFPJYXQL,B0080ZSQKS,0.000000,364.690102,0.000000
78842197,261815,UK,B0BH3X67S3,B0080ZSQKS,0.000000,364.690102,0.000000


In [39]:
merged_candidates_uni_weight.query("sess_id==0")['counts'].sum()

148.91291506481136

In [40]:
merged_candidates_uni_weight.query("sess_id==0")['normalized_counts'].max()

0.5435423526243408

In [41]:
co_graph_uni_weight_df

Unnamed: 0,product_,neighbor,counts
0,3949568239,B09CLBRV16,1.000000
1,3949568239,B0B7237CF5,1.000000
2,3949568239,B09FJRKC4S,0.333333
3,3949568239,B0B723DDTK,0.250000
4,3949568239,B0B726DZT4,0.200000
...,...,...,...
29794358,B00KII42DK,B07L1VJLLC,1.000000
29794359,B08TMPJ5JT,B07MVBKVDB,1.000000
29794360,B08TMPJ5JT,B00BG4R7F0,0.500000
29794361,B08TMPJ5JT,B08TRBGSGP,0.333333


In [42]:
merged_candidates_feature.query('sess_id==20000').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_1', 'normalized_co_graph_counts_1']][:25]

Unnamed: 0,product,sasrec_scores_2,co_graph_counts_1,normalized_co_graph_counts_1
5965883,B09ZKF77CC,13.667938,0.25,0.003841
5965840,B09G2YWKP3,13.521121,0.5,0.007681
5965875,B09Z2XHMS3,13.468659,1.533333,0.023556
5965830,B09D415MK8,13.464596,0.0,0.0
5965831,B09D443ZZ7,13.088786,2.0,0.030725
5965876,B09Z2XVBJQ,12.996532,0.75,0.011522
5965880,B09ZK5K3JQ,12.818775,1.5,0.023044
5965820,B09489RCTT,12.725793,0.0,0.0
5965747,B07PBN67YJ,12.459669,0.0,0.0
5965827,B099ZR7QL5,12.331591,1.236111,0.01899


## uni and dis=1

In [43]:
co_occurence_dict_uni_dis1 = get_co_occurence_dict(train_sess_item, bidirection=False, weighted=False, max_dis=1)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3966659/3966659 [00:51<00:00, 77206.81it/s]


In [44]:
merged_candidates_feature = read_merged_candidates_feature()
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']]
valid_session_last_items = get_session_last_item(valid_sess_data)
valid_session_last_items = np.array(valid_session_last_items)
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 261816/261816 [00:12<00:00, 21741.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [45]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,DE,355165591X,B09Y94D1D3
1,0,DE,3833237058,B09Y94D1D3
2,0,DE,B00CIXSI6U,B09Y94D1D3
3,0,DE,B00NVDOWUW,B09Y94D1D3
4,0,DE,B00NVDP3ZU,B09Y94D1D3
...,...,...,...,...
78842194,261815,UK,B0BCX524Y6,B0080ZSQKS
78842195,261815,UK,B0BCX6QB4L,B0080ZSQKS
78842196,261815,UK,B0BFPJYXQL,B0080ZSQKS
78842197,261815,UK,B0BH3X67S3,B0080ZSQKS


In [46]:
co_occurence_dict_uni_dis1_df = flatten_co_graph_dict(co_occurence_dict_uni_dis1)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1338342/1338342 [00:06<00:00, 193688.69it/s]


In [47]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_occurence_dict_uni_dis1_df_g = cudf.from_pandas(co_occurence_dict_uni_dis1_df)

In [48]:
# merged_candidates_uni_dis1_g = merged_candidates_g.merge(co_occurence_dict_uni_dis1_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
# merged_candidates_uni_dis1_g.drop(columns=['product_', 'neighbor'], inplace=True)
# merged_candidates_uni_dis1_g = merged_candidates_uni_dis1_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_uni_dis1_g.reset_index(drop=True, inplace=True)
# merged_candidates_uni_dis1_g['counts'] = merged_candidates_uni_dis1_g['counts'].fillna(0)
# assert len(merged_candidates_uni_dis1_g['counts']) == len(merged_candidates)
# merged_candidates_uni_dis1 = merged_candidates_uni_dis1_g.to_pandas()

In [49]:
merged_candidates_uni_dis1 = merged_candidates.merge(co_occurence_dict_uni_dis1_df, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
merged_candidates_uni_dis1.drop(columns=['product_', 'neighbor'], inplace=True)
merged_candidates_uni_dis1 = merged_candidates_uni_dis1.sort_values(by=['sess_id', 'product'])
merged_candidates_uni_dis1.reset_index(drop=True, inplace=True)
merged_candidates_uni_dis1['counts'] = merged_candidates_uni_dis1['counts'].fillna(0)
assert len(merged_candidates_uni_dis1['counts']) == len(merged_candidates)
# merged_candidates_uni_dis1 = merged_candidates_uni_dis1_g.to_pandas()

In [50]:
# del merged_candidates_g
# del co_occurence_dict_uni_dis1_df_g
# del merged_candidates_uni_dis1_g

In [51]:
normalize_co_graph_counts(merged_candidates_uni_dis1)

In [52]:
merged_candidates_feature['co_graph_counts_2'] = merged_candidates_uni_dis1['counts']
merged_candidates_feature['normalized_co_graph_counts_2'] = merged_candidates_uni_dis1['normalized_counts']

In [53]:
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [54]:
merged_candidates_uni_dis1.query("sess_id==99")

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_sum,normalized_counts
29127,99,DE,1782326952,B07RVHG3WD,0.0,40.0,0.0
29128,99,DE,1789416205,B07RVHG3WD,0.0,40.0,0.0
29129,99,DE,3401416049,B07RVHG3WD,0.0,40.0,0.0
29130,99,DE,3438040700,B07RVHG3WD,0.0,40.0,0.0
29131,99,DE,3466310954,B07RVHG3WD,0.0,40.0,0.0
...,...,...,...,...,...,...,...
29532,99,DE,B0BD4T1SGD,B07RVHG3WD,0.0,40.0,0.0
29533,99,DE,B0BFJ4S4LB,B07RVHG3WD,0.0,40.0,0.0
29534,99,DE,B0BJC4SQ9M,B07RVHG3WD,0.0,40.0,0.0
29535,99,DE,B0BJH5CSCJ,B07RVHG3WD,0.0,40.0,0.0


In [55]:
merged_candidates_uni_dis1.query("sess_id==99")['counts'].sum()

40.0

In [56]:
merged_candidates_uni_dis1.query("sess_id==99")['counts'].max(), merged_candidates_uni_dis1.query("sess_id==99")['normalized_counts'].max()

(3.0, 0.075)

In [57]:
merged_candidates_uni_dis1.query('sess_id==200')['counts'].sum()

21.0

In [58]:
merged_candidates_uni_dis1.query('sess_id==200')['counts'].max(), merged_candidates_uni_dis1.query('sess_id==200')['normalized_counts'].max()

(9.0, 0.42857142857142855)

In [59]:
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,target,sess_avg_price,product_price,sasrec_scores_3,normalized_sasrec_scores_3,sasrec_scores_2,normalized_sasrec_scores_2,...,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,cos_text_bert_scores,text_bert_scores,normalized_text_bert_scores,roberta_scores,normalized_roberta_scores,product_freq
0,0,DE,355165591X,0.0,43.256542,8.990000,2.230508,7.658405e-09,0.512931,1.377575e-09,...,0.000000,0.000000,0.0,0.000000,0.903757,378.286041,1.296655e-08,276.525787,7.975509e-07,48.0
1,0,DE,3833237058,0.0,43.256542,22.000000,9.605231,1.221631e-05,9.325538,9.255110e-06,...,0.090909,0.000610,0.0,0.000000,0.921604,387.624756,1.474268e-04,284.460052,2.226209e-03,80.0
2,0,DE,B00CIXSI6U,0.0,43.256542,6.470000,0.714114,1.681035e-09,-0.115904,7.345399e-10,...,0.000000,0.000000,0.0,0.000000,0.901061,374.802551,3.980740e-10,278.039612,3.624132e-06,7.0
3,0,DE,B00NVDOWUW,0.0,43.256542,11.990000,8.750996,5.199363e-06,8.507557,4.084482e-06,...,0.000000,0.000000,0.0,0.000000,0.927298,385.701782,2.154962e-05,285.239197,4.852260e-03,155.0
4,0,DE,B00NVDP3ZU,0.0,43.256542,22.990000,8.056712,2.596729e-06,5.898870,3.007453e-07,...,0.000000,0.000000,0.0,0.000000,0.930655,385.398499,1.591202e-05,284.763611,3.015780e-03,462.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78842194,261815,UK,B0BCX524Y6,0.0,9.383333,16.990000,6.813615,1.076201e-03,7.203015,4.597607e-04,...,0.000000,0.000000,0.0,0.000000,0.972680,438.956238,1.636532e-04,281.902344,1.476179e-04,7.0
78842195,261815,UK,B0BCX6QB4L,0.0,9.383333,10.990000,9.030836,9.881445e-03,10.123234,8.526421e-03,...,4.750000,0.013025,2.0,0.009091,0.972680,438.956238,1.636532e-04,281.902344,1.476179e-04,51.0
78842196,261815,UK,B0BFPJYXQL,0.0,9.383333,10.560000,0.796892,2.623396e-06,1.711608,1.895152e-06,...,0.000000,0.000000,0.0,0.000000,0.953467,430.164368,2.486932e-08,283.306732,6.012531e-04,7.0
78842197,261815,UK,B0BH3X67S3,0.0,9.383333,6.830000,4.250781,8.296004e-05,6.447586,2.159998e-04,...,0.000000,0.000000,0.0,0.000000,0.961829,434.029083,1.186011e-06,273.954742,5.218429e-08,37.0


In [60]:
merged_candidates_feature.query('sess_id==20010').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_2', 'normalized_co_graph_counts_2']][:25]

Unnamed: 0,product,sasrec_scores_2,co_graph_counts_2,normalized_co_graph_counts_2
5968515,B08737HD3S,23.872364,0.0,
5968602,B09CDPZF8C,23.619312,0.0,
5968527,B08DRBQ7P2,23.481756,0.0,
5968581,B097D4NTBN,23.15797,0.0,
5968595,B09BN9P5Y2,22.849251,0.0,
5968431,B01BZOKLOY,22.255867,0.0,
5968682,B0BHDJNL3K,22.105152,0.0,
5968583,B0988TCBT6,21.59745,0.0,
5968482,B07W3T98LC,21.548342,0.0,
5968523,B08BNL9HXF,21.398357,0.0,
