# Necessary Common Functions

These functions should be run before each part.

In [2]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [3]:
def get_sessions(df: pd.DataFrame, test=False, list_item=False) -> list:
    """Build one item sequence per session row of ``df``.

    Args:
        df: Session table with a ``prev_items`` column and, optionally, a
            ``next_item`` column.
        test: When true, ``next_item`` is ignored even if the column exists,
            so only ``prev_items`` contributes to each sequence.
        list_item: When true, ``prev_items`` cells are used as already
            materialised sequences.  Otherwise each cell is parsed as the
            string form of a list of single-quoted item ids, e.g.
            ``"['A' 'B']"`` (ids are assumed to be quoted strings).

    Returns:
        One sequence per row, in row order.  When ``next_item`` is used it
        becomes the last element of the sequence.  With ``list_item=True``
        and no target to append, the ``prev_items`` column itself is
        returned unchanged.
    """
    # Local import: ``re`` is not among the notebook's top-level imports.
    import re

    # Ids are read from between the quotes instead of ``eval``-ing the cell:
    # no code from the data file is executed, and line breaks or repeated
    # blanks between ids cannot produce a syntax error.
    find_items = re.compile(r"'([^']*)'").findall
    append_target = 'next_item' in df and not test

    if list_item:
        if not append_target:
            return df['prev_items']
        # Iterating the columns (rather than ``df.loc[i, ...]``) does not
        # require a default 0..n-1 index.  ``np.atleast_1d`` turns a scalar
        # ``next_item`` into a length-1 array; a 0-d array would make
        # ``np.concatenate`` raise.
        return [
            np.concatenate([np.asarray(prev_items), np.atleast_1d(next_item)], axis=0)
            for prev_items, next_item in tqdm(zip(df['prev_items'], df['next_item']), total=len(df))
        ]

    if append_target:
        return [
            find_items(prev_items) + [str(next_item)]
            for prev_items, next_item in tqdm(zip(df['prev_items'], df['next_item']), total=len(df))
        ]
    return [find_items(prev_items) for prev_items in tqdm(df['prev_items'], total=len(df))]

In [4]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    """Count how often items occur together within the same session.

    For every position ``i`` in a session, each later position ``j`` (at
    most ``max_dis`` steps away when ``max_dis`` is given) contributes to
    ``res[sess[i]][sess[j]]``.

    Args:
        sessions: Iterable of item sequences.
        bidirection: When true, the same contribution is also added to
            ``res[sess[j]][sess[i]]``.
        weighted: When true, a pair contributes ``1 / (j - i)`` instead of 1.
        max_dis: Largest allowed ``j - i``; ``None`` means no limit.

    Returns:
        A dict mapping each item to a ``Counter`` of neighbor -> count.
        Every item that appears in any session has an entry, possibly empty.
    """
    res = {}
    for sess in tqdm(sessions):
        sess_len = len(sess)
        for i, item in enumerate(sess):
            # Register the item even if it ends up with no neighbors.
            item_counts = res.get(item)
            if item_counts is None:
                item_counts = res[item] = Counter()

            end = sess_len if max_dis is None else min(i + max_dis + 1, sess_len)

            for j in range(i + 1, end):
                neighbor = sess[j]
                weight = 1 / (j - i) if weighted else 1
                item_counts[neighbor] += weight
                if bidirection:
                    neighbor_counts = res.get(neighbor)
                    if neighbor_counts is None:
                        neighbor_counts = res[neighbor] = Counter()
                    neighbor_counts[item] += weight
    return res

In [5]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    """Return a copy of the graph whose neighbor maps are ordered by count.

    Each inner mapping becomes a plain ``dict`` listing neighbors from the
    highest count to the lowest; neighbors with equal counts keep their
    original relative order.  The input is not modified.
    """
    def descending_count(pair):
        return -pair[1]

    return {
        item: dict(sorted(neighbors.items(), key=descending_count))
        for item, neighbors in co_occurence_dict.items()
    }

In [6]:
def cast_dtype(df : pd.DataFrame, columns=None):
    """Downcast numeric columns of ``df`` to 32-bit types, in place.

    The kind of a column is inferred from its first value:

    * float  -> column cast to ``float32``
    * int    -> column cast to ``int32`` (skipped when a value does not fit)
    * list   -> every cell converted to a ``float32`` / ``int32`` NumPy
      array, depending on the first element of the first list

    Columns of any other kind are left untouched.

    Args:
        df: Frame to modify.
        columns: Column labels to process; all columns when ``None``.

    Returns:
        ``None``; ``df`` is modified in place.
    """
    if columns is None:
        columns = df.columns
    if len(df) == 0:
        # No first row to infer types from.
        return

    def is_float(value):
        return isinstance(value, (float, np.floating))

    def is_int(value):
        # ``bool`` is a subclass of ``int`` but is not an integer column.
        return isinstance(value, (int, np.integer)) and not isinstance(value, bool)

    int32_range = np.iinfo(np.int32)
    for k in columns:
        first = df[k].iloc[0]
        if is_float(first):
            df[k] = df[k].astype('float32')
        elif is_int(first):
            col = df[k]
            # ``astype`` would wrap values outside the int32 range around
            # instead of failing, so such columns keep their dtype.
            if col.min() >= int32_range.min and col.max() <= int32_range.max:
                df[k] = col.astype('int32')
        elif isinstance(first, list):
            if not first:
                # An empty first list gives no element type to go by.
                continue
            head = first[0]
            if is_float(head):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif is_int(head):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [7]:
def get_session_last_item(session_df):
    """Return the last item id of every session in ``session_df``.

    Each ``prev_items`` cell is expected to be a string such as
    ``"['A' 'B' 'C']"``: the surrounding brackets are stripped, the text is
    split on single spaces, and quotes / newlines are stripped from the last
    token.

    Args:
        session_df: Frame with a string-valued ``prev_items`` column.

    Returns:
        A list with one item id per row, in row order.
    """
    last_items = []
    # Iterate the column directly; ``session_df.iloc[i]`` would build a
    # full row Series for every session.
    for sess_prev_items in tqdm(session_df['prev_items'], total=len(session_df)):
        # Only the final space-separated token is needed.
        last_token = sess_prev_items.strip('[]').rsplit(' ', 1)[-1]
        last_items.append(last_token.strip("'\n"))
    return last_items

In [8]:
def get_co_graph_counts(session_last_items, merged_candidates_df, co_graph_dict):
    """Look up the co-occurrence count between each candidate and its session's last item.

    Args:
        session_last_items: Sequence indexed by ``sess_id`` giving the last
            item of that session.
        merged_candidates_df: Frame with ``sess_id`` and ``product`` columns,
            one row per candidate.
        co_graph_dict: Mapping item -> mapping neighbor -> count.

    Returns:
        A list with one count per row of ``merged_candidates_df``, in row
        order.  A pair that is absent from ``co_graph_dict`` counts as 0,
        whether the inner mappings are ``Counter`` objects or plain dicts.
    """
    no_neighbors = {}
    co_graph_count_list = []
    # Reading the two needed columns avoids building a Series per row, as
    # ``iterrows`` does.
    pairs = zip(merged_candidates_df['sess_id'], merged_candidates_df['product'])
    for sess_id, product in tqdm(pairs, total=merged_candidates_df.shape[0]):
        last_item = session_last_items[sess_id]
        neighbors = co_graph_dict.get(last_item, no_neighbors)
        co_graph_count_list.append(neighbors.get(product, 0))
    return co_graph_count_list

In [9]:
def flatten_co_graph_dict(co_graph_dict):
    """Turn the nested item -> neighbor -> count mapping into a long table.

    Returns:
        A DataFrame with one row per (item, neighbor) entry and the columns
        ``product_``, ``neighbor`` and ``counts``, in the iteration order of
        the mappings.
    """
    sources, targets, weights = [], [], []
    for product, neighbors in tqdm(co_graph_dict.items(), total=len(co_graph_dict)):
        # One output row per neighbor of this product.
        sources.extend([product] * len(neighbors))
        targets.extend(neighbors.keys())
        weights.extend(neighbors.values())
    return pd.DataFrame({'product_' : sources, 'neighbor' : targets, 'counts' : weights})

In [10]:
def normalize_co_graph_counts(merged_candidates_counts):
    """Add per-session normalised counts to ``merged_candidates_counts`` in place.

    Two columns are written:

    * ``counts_sum``: the sum of ``counts`` over all rows that share the
      row's ``sess_id``;
    * ``normalized_counts``: ``counts / counts_sum``.  Sessions whose counts
      sum to zero yield NaN (0 / 0), as no fill value is applied.

    Args:
        merged_candidates_counts: Frame with ``sess_id`` and ``counts``
            columns.

    Returns:
        ``None``; the frame is modified in place.
    """
    # ``transform`` returns the group sum aligned to the frame's own index,
    # so the result is correct for any index, not only a default 0..n-1 one.
    counts_sum = merged_candidates_counts.groupby('sess_id')['counts'].transform('sum')
    merged_candidates_counts['counts_sum'] = counts_sum
    merged_candidates_counts['normalized_counts'] = merged_candidates_counts['counts'] / counts_sum
    

# Merge valid co-graph counts 

In [11]:
# Absolute locations of the inputs used below: the candidate feature table
# (parquet, also written back to by later cells), the train / valid / test
# session CSVs and the product CSV. The paths are specific to the machine
# this notebook was run on.
merged_candidates_feature_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_2_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [12]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    """Load the candidate feature table from ``merged_candidates_feature_path``.

    The result is memoized: later calls return the same DataFrame object,
    including any in-place changes callers have made to it, rather than
    re-reading the file.
    """
    return pd.read_parquet(merged_candidates_feature_path)

@lru_cache(maxsize=1)
def read_product_data():
    """Load the product CSV at ``product_data_path``.

    The result is memoized, so later calls return the same DataFrame object.
    """
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    """Load the training sessions CSV at ``train_sessions_path``.

    The result is memoized, so later calls return the same DataFrame object.
    """
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    """Load the validation sessions CSV at ``valid_sessions_path``.

    The result is memoized, so later calls return the same DataFrame object.
    """
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    """Load the test sessions CSV at ``test_sessions_path``.

    The result is memoized, so later calls return the same DataFrame object.
    """
    return pd.read_csv(test_sessions_path)

In [13]:
# Load the candidate feature table (memoized by the reader).
merged_candidates_feature = read_merged_candidates_feature()

In [14]:
# Work on an explicit copy of the key columns: a `last_item` column is added
# to this frame later, and assigning into a slice of
# merged_candidates_feature raises pandas' SettingWithCopyWarning.
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']].copy()

In [15]:
# Load the session tables and the product table through the cached readers.
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [16]:
# Parse the string-encoded sessions into item lists.
# For the training data `next_item` (when the column is present) is appended
# to each session; for the validation data `test=True` keeps `prev_items` only.
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, test=True, list_item=False)

100%|██████████| 3557898/3557898 [03:41<00:00, 16044.28it/s]
100%|██████████| 361581/361581 [00:21<00:00, 17138.52it/s]


In [17]:
# Last item of every validation session, in the row order of valid_sess_data.
valid_session_last_items = get_session_last_item(valid_sess_data)

100%|██████████| 361581/361581 [00:12<00:00, 27824.86it/s]


In [18]:
# As an array, the last items can be gathered for all candidate rows at once
# by indexing with the `sess_id` column.
valid_session_last_items = np.array(valid_session_last_items)
# `assign` returns a new frame, so nothing is written into a slice of
# merged_candidates_feature (the cause of the SettingWithCopyWarning).
merged_candidates = merged_candidates.assign(last_item=valid_session_last_items[merged_candidates['sess_id']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


## Bidirectional

In [18]:
# Co-occurrence graph over the train + validation sessions: every pair of
# items in a session is counted in both directions, unweighted and with no
# distance limit.
co_occurence_dict_bi = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=True, weighted=False)

100%|██████████| 3919479/3919479 [02:24<00:00, 27146.95it/s]


In [19]:
merged_candidates_feature = read_merged_candidates_feature()
# Explicit copy: `last_item` is added below, and assigning into a slice of
# merged_candidates_feature raises pandas' SettingWithCopyWarning.
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']].copy()
valid_session_last_items = get_session_last_item(valid_sess_data)
valid_session_last_items = np.array(valid_session_last_items)
# Gather each candidate row's session last item by indexing with `sess_id`.
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

100%|██████████| 361581/361581 [00:18<00:00, 19573.18it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [20]:
# Flatten the nested counts into a (product_, neighbor, counts) table that can
# be merged with the candidates.
co_occurence_dict_bi_df = flatten_co_graph_dict(co_occurence_dict_bi)

100%|██████████| 1401599/1401599 [00:37<00:00, 37872.54it/s] 


In [None]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_occurence_dict_bi_df_g = cudf.from_pandas(co_occurence_dict_bi_df)

In [None]:
# merged_candidates_bi_g = merged_candidates_g.merge(co_occurence_dict_bi_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
# merged_candidates_bi_g.drop(columns=['product_', 'neighbor'], inplace=True)
# merged_candidates_bi_g = merged_candidates_bi_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_bi_g.reset_index(drop=True, inplace=True)
# merged_candidates_bi_g['counts'] = merged_candidates_bi_g['counts'].fillna(0)
# assert len(merged_candidates_bi_g['counts']) == len(merged_candidates)
# merged_candidates_bi = merged_candidates_bi_g.to_pandas()

In [21]:
# Attach to every candidate the count stored for (last_item -> product);
# candidates without a matching graph entry get a count of 0.
merged_candidates_bi = (
    merged_candidates
    .merge(
        co_occurence_dict_bi_df,
        how='left',
        left_on=['last_item', 'product'],
        right_on=['product_', 'neighbor'],
    )
    .drop(columns=['product_', 'neighbor'])
    .sort_values(by=['sess_id', 'product'])
    .reset_index(drop=True)
)
merged_candidates_bi['counts'] = merged_candidates_bi['counts'].fillna(0)
# The left merge must not have added or dropped candidate rows.
assert len(merged_candidates_bi) == len(merged_candidates)
# merged_candidates_bi = merged_candidates_bi_g.to_pandas()

In [None]:
# del merged_candidates_g
# del co_occurence_dict_bi_df_g
# del merged_candidates_bi_g

In [22]:
# Adds `counts_sum` and `normalized_counts` to merged_candidates_bi in place.
normalize_co_graph_counts(merged_candidates_bi)

In [23]:
# Column assignment aligns on the index; merged_candidates_bi is sorted by
# (sess_id, product) and indexed 0..n-1.
# NOTE(review): this presumably relies on merged_candidates_feature already
# being in that same row order with a default index — confirm.
merged_candidates_feature['co_graph_counts_0'] = merged_candidates_bi['counts']
merged_candidates_feature['normalized_co_graph_counts_0'] = merged_candidates_bi['normalized_counts']

In [53]:
# Spot check: the first 25 candidates of session 20010, ordered by descending
# `sasrec_scores_2`, next to the new bidirectional count features.
merged_candidates_feature.query('sess_id==20010').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_0', 'normalized_co_graph_counts_0']][:25]

Unnamed: 0,product,sasrec_scores_2,co_graph_counts_0,normalized_co_graph_counts_0
4287227,B09C2Y41GZ,14.97872,65.0,0.326633
4287139,B072C4GT6P,13.678986,12.0,0.060302
4287144,B076593J1C,13.579645,6.0,0.030151
4287253,B09SFRPQ3Q,13.052922,13.0,0.065327
4287140,B073Q3CPYH,11.974123,8.0,0.040201
4287274,B0B66SSJW4,11.504192,3.0,0.015075
4287194,B08BWL2CY4,11.193488,10.0,0.050251
4287205,B08VFMLDDY,11.116684,3.0,0.015075
4287264,B09ZB7TWLM,10.825944,1.0,0.005025
4287155,B07CMP39R1,10.765284,1.0,0.005025


In [27]:
# Downcast the two new columns, then write the table back to the same parquet
# file it was loaded from (the file is overwritten).
cast_dtype(merged_candidates_feature, ['co_graph_counts_0', 'normalized_co_graph_counts_0'])
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [50]:
merged_candidates_feature.columns

Index(['sess_id', 'sess_locale', 'product', 'target', 'sess_avg_price',
       'product_price', 'product_freq', 'sasrec_scores_2',
       'normalized_sasrec_scores_2', 'sasrec_scores_3',
       'normalized_sasrec_scores_3', 'seqmlp_scores',
       'normalized_seqmlp_scores', 'narm_scores', 'normalized_narm_scores',
       'gru4rec_scores', 'normalized_gru4rec_scores', 'gru4rec_scores_2',
       'normalized_gru4rec_scores_2', 'normalized_all_items_co_graph_count_0',
       'all_items_co_graph_count_0', 'normalized_all_items_co_graph_count_1',
       'all_items_co_graph_count_1', 'normalized_all_items_co_graph_count_2',
       'all_items_co_graph_count_2', 'sasrec_feat_scores',
       'normalized_sasrec_feat_scores', 'title_BM25_scores',
       'desc_BM25_scores', 'roberta_scores', 'normalized_roberta_scores',
       'co_graph_counts_0', 'normalized_co_graph_counts_0',
       'co_graph_counts_1', 'normalized_co_graph_counts_1'],
      dtype='object')

## Unidirectional and weighted

In [29]:
# Directed variant: only items later in the session count as neighbors, and
# each pair contributes 1 / (distance between the two positions).
co_occurence_dict_uni_weight = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=True)

100%|██████████| 3919479/3919479 [01:51<00:00, 35200.63it/s]


In [30]:
# Flatten the weighted graph into a (product_, neighbor, counts) table.
co_graph_uni_weight_df = flatten_co_graph_dict(co_occurence_dict_uni_weight)

100%|██████████| 1401599/1401599 [00:27<00:00, 50797.52it/s] 


In [22]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_graph_uni_weight_df_g = cudf.from_pandas(co_graph_uni_weight_df)

In [31]:
# Attach to every candidate the weighted count stored for
# (last_item -> product); candidates without a graph entry get 0.
merged_candidates_uni_weight = (
    merged_candidates
    .merge(
        co_graph_uni_weight_df,
        how='left',
        left_on=['last_item', 'product'],
        right_on=['product_', 'neighbor'],
    )
    .drop(columns=['product_', 'neighbor'])
    .sort_values(by=['sess_id', 'product'])
    .reset_index(drop=True)
)
merged_candidates_uni_weight['counts'] = merged_candidates_uni_weight['counts'].fillna(0)
# The left merge must not have added or dropped candidate rows.
assert len(merged_candidates_uni_weight) == len(merged_candidates)
# merged_candidates_uni_weight = merged_candidates_uni_weight_g.to_pandas()

In [25]:
# del merged_candidates_g
# del co_graph_uni_weight_df_g
# del merged_candidates_uni_weight_g

In [32]:
# Adds `counts_sum` and `normalized_counts` to merged_candidates_uni_weight in place.
normalize_co_graph_counts(merged_candidates_uni_weight)

In [None]:
# Column assignment aligns on the index; merged_candidates_uni_weight is
# sorted by (sess_id, product) and indexed 0..n-1.
# NOTE(review): presumably merged_candidates_feature is already in that same
# row order with a default index — confirm.
merged_candidates_feature['co_graph_counts_1'] = merged_candidates_uni_weight['counts']
merged_candidates_feature['normalized_co_graph_counts_1'] = merged_candidates_uni_weight['normalized_counts']

In [None]:
# Downcast every column that cast_dtype recognises, then overwrite the parquet
# file the table was loaded from.
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [None]:
merged_candidates_uni_weight

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_cum,normalized_counts
0,0,UK,B000OPPVCS,B077XGDMD2,1,457,0.002188
1,0,UK,B000V599Y2,B077XGDMD2,0,457,0.000000
2,0,UK,B0018HH444,B077XGDMD2,1,457,0.002188
3,0,UK,B0079JI4DU,B077XGDMD2,1,457,0.002188
4,0,UK,B0079JI4EY,B077XGDMD2,1,457,0.002188
...,...,...,...,...,...,...,...
84407334,361580,DE,B0BB7XV97M,B08427PFR5,0,128,0.000000
84407335,361580,DE,B0BB7YSRBX,B08427PFR5,0,128,0.000000
84407336,361580,DE,B0BB7ZMGY8,B08427PFR5,0,128,0.000000
84407337,361580,DE,B0BD4CP7N3,B08427PFR5,0,128,0.000000


In [None]:
# Spot check: total count over session 0's candidates, i.e. the denominator
# used for that session's normalised counts.
merged_candidates_uni_weight.query("sess_id==0")['counts'].sum()

457

In [None]:
# Spot check: largest normalised count among session 0's candidates.
merged_candidates_uni_weight.query("sess_id==0")['normalized_counts'].max()

0.08971553610503283

In [23]:
co_graph_uni_weight_df

Unnamed: 0,product_,neighbor,counts
0,B005ZJTUXE,B005ZJTUXE,7
1,B005ZJTUXE,B00P8VIBBG,11
2,B005ZJTUXE,B07TVSL9TW,7
3,B005ZJTUXE,B014J7P4KU,4
4,B005ZJTUXE,B07HJCRPTB,9
...,...,...,...
29994681,B09KXXFJS7,B09KXXGQX8,1
29994682,B09KXVZ7YQ,B09KXVDK5F,1
29994683,B09KXVZ7YQ,B09KXXGQX8,1
29994684,B09KXVDK5F,B09KXXGQX8,1


In [51]:
# Spot check: the first 25 candidates of session 20000, ordered by descending
# `sasrec_scores_2`, next to the weighted directed count features.
merged_candidates_feature.query('sess_id==20000').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_1', 'normalized_co_graph_counts_1']][:25]

Unnamed: 0,product,sasrec_scores_2,co_graph_counts_1,normalized_co_graph_counts_1
4284904,B07D98ZNJX,25.67992,5.0,0.5
4284905,B07D997JWG,24.778963,2.5,0.25
4284854,B007SQ3HGS,19.812937,1.0,0.1
4284918,B07QJB4MFD,18.794708,0.833333,0.083333
4284942,B081N8XPQH,17.467945,0.0,0.0
4285001,B09Y5GC5R7,17.269825,0.333333,0.033333
4284966,B08SVPH33Z,16.826963,0.0,0.0
4284939,B081N7R51C,16.567038,0.0,0.0
4284927,B07V4PBMWT,15.528075,0.0,0.0
4284931,B07X9Q39H6,14.468607,0.333333,0.033333


## Unidirectional, max_dis=1

In [19]:
# Directed, unweighted variant restricted to max_dis=1: an item's only counted
# neighbor is the item immediately after it in the session.
co_occurence_dict_uni_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item, bidirection=False, weighted=False, max_dis=1)

100%|██████████| 3919479/3919479 [01:06<00:00, 59104.04it/s] 


In [20]:
merged_candidates_feature = read_merged_candidates_feature()
# Explicit copy: `last_item` is added below, and assigning into a slice of
# merged_candidates_feature raises pandas' SettingWithCopyWarning.
merged_candidates = merged_candidates_feature[['sess_id', 'sess_locale', 'product']].copy()
valid_session_last_items = get_session_last_item(valid_sess_data)
valid_session_last_items = np.array(valid_session_last_items)
# Gather each candidate row's session last item by indexing with `sess_id`.
merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]

100%|██████████| 361581/361581 [00:15<00:00, 23362.07it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates['last_item'] = valid_session_last_items[merged_candidates['sess_id']]


In [21]:
merged_candidates

Unnamed: 0,sess_id,sess_locale,product,last_item
0,0,UK,B000V599Y2,B077XGDMD2
1,0,UK,B007VZUA7U,B077XGDMD2
2,0,UK,B009EUAEQC,B077XGDMD2
3,0,UK,B00AH02IWG,B077XGDMD2
4,0,UK,B00I0UKKD4,B077XGDMD2
...,...,...,...,...
77570148,361580,DE,B0BB7XV97M,B08427PFR5
77570149,361580,DE,B0BB7YSRBX,B08427PFR5
77570150,361580,DE,B0BB7ZMGY8,B08427PFR5
77570151,361580,DE,B0BD4CP7N3,B08427PFR5


In [22]:
# Flatten the distance-1 graph into a (product_, neighbor, counts) table.
co_occurence_dict_uni_dis1_df = flatten_co_graph_dict(co_occurence_dict_uni_dis1)

100%|██████████| 1401599/1401599 [00:23<00:00, 59629.06it/s] 


In [23]:
# merged_candidates_g = cudf.from_pandas(merged_candidates)
# co_occurence_dict_uni_dis1_df_g = cudf.from_pandas(co_occurence_dict_uni_dis1_df)

In [24]:
# merged_candidates_uni_dis1_g = merged_candidates_g.merge(co_occurence_dict_uni_dis1_df_g, how='left', left_on=['last_item', 'product'], right_on=['product_', 'neighbor'])
# merged_candidates_uni_dis1_g.drop(columns=['product_', 'neighbor'], inplace=True)
# merged_candidates_uni_dis1_g = merged_candidates_uni_dis1_g.sort_values(by=['sess_id', 'product'])
# merged_candidates_uni_dis1_g.reset_index(drop=True, inplace=True)
# merged_candidates_uni_dis1_g['counts'] = merged_candidates_uni_dis1_g['counts'].fillna(0)
# assert len(merged_candidates_uni_dis1_g['counts']) == len(merged_candidates)
# merged_candidates_uni_dis1 = merged_candidates_uni_dis1_g.to_pandas()

In [25]:
# Attach to every candidate the distance-1 count stored for
# (last_item -> product); candidates without a graph entry get 0.
merged_candidates_uni_dis1 = (
    merged_candidates
    .merge(
        co_occurence_dict_uni_dis1_df,
        how='left',
        left_on=['last_item', 'product'],
        right_on=['product_', 'neighbor'],
    )
    .drop(columns=['product_', 'neighbor'])
    .sort_values(by=['sess_id', 'product'])
    .reset_index(drop=True)
)
merged_candidates_uni_dis1['counts'] = merged_candidates_uni_dis1['counts'].fillna(0)
# The left merge must not have added or dropped candidate rows.
assert len(merged_candidates_uni_dis1) == len(merged_candidates)
# merged_candidates_uni_dis1 = merged_candidates_uni_dis1_g.to_pandas()

In [26]:
# del merged_candidates_g
# del co_occurence_dict_uni_dis1_df_g
# del merged_candidates_uni_dis1_g

In [27]:
# Adds `counts_sum` and `normalized_counts` to merged_candidates_uni_dis1 in place.
normalize_co_graph_counts(merged_candidates_uni_dis1)

In [28]:
# Column assignment aligns on the index; merged_candidates_uni_dis1 is sorted
# by (sess_id, product) and indexed 0..n-1.
# NOTE(review): presumably merged_candidates_feature is already in that same
# row order with a default index — confirm.
merged_candidates_feature['co_graph_counts_2'] = merged_candidates_uni_dis1['counts']
merged_candidates_feature['normalized_co_graph_counts_2'] = merged_candidates_uni_dis1['normalized_counts']

In [29]:
# Downcast every column that cast_dtype recognises, then overwrite the parquet
# file the table was loaded from.
cast_dtype(merged_candidates_feature)
merged_candidates_feature.to_parquet(merged_candidates_feature_path, engine='pyarrow')

In [30]:
# Spot check: all candidate rows of session 99 with their distance-1 counts.
merged_candidates_uni_dis1.query("sess_id==99")

Unnamed: 0,sess_id,sess_locale,product,last_item,counts,counts_sum,normalized_counts
21214,99,UK,B002F4SM04,B0BC6DL1FW,0.0,9.0,0.0
21215,99,UK,B002RDVIGS,B0BC6DL1FW,0.0,9.0,0.0
21216,99,UK,B004DE7JQW,B0BC6DL1FW,0.0,9.0,0.0
21217,99,UK,B00656235S,B0BC6DL1FW,0.0,9.0,0.0
21218,99,UK,B0074JDF52,B0BC6DL1FW,0.0,9.0,0.0
...,...,...,...,...,...,...,...
21408,99,UK,B0BF14VCQW,B0BC6DL1FW,0.0,9.0,0.0
21409,99,UK,B0BF15FRZT,B0BC6DL1FW,0.0,9.0,0.0
21410,99,UK,B0BF15JRSV,B0BC6DL1FW,0.0,9.0,0.0
21411,99,UK,B0BF62M27B,B0BC6DL1FW,0.0,9.0,0.0


In [31]:
# Spot check: total count over session 99's candidates; this is the value
# stored in that session's `counts_sum`.
merged_candidates_uni_dis1.query("sess_id==99")['counts'].sum()

9.0

In [32]:
# Spot check: largest raw and largest normalised count within session 99.
merged_candidates_uni_dis1.query("sess_id==99")['counts'].max(), merged_candidates_uni_dis1.query("sess_id==99")['normalized_counts'].max()

(1.0, 0.1111111111111111)

In [33]:
# Spot check: total count over session 200's candidates.
merged_candidates_uni_dis1.query('sess_id==200')['counts'].sum()

19.0

In [34]:
# Spot check: largest raw and largest normalised count within session 200.
merged_candidates_uni_dis1.query('sess_id==200')['counts'].max(), merged_candidates_uni_dis1.query('sess_id==200')['normalized_counts'].max()

(1.0, 0.05263157894736842)

In [35]:
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,target,sess_avg_price,product_price,product_freq,sasrec_scores_2,normalized_sasrec_scores_2,sasrec_scores_3,...,title_BM25_scores,desc_BM25_scores,roberta_scores,normalized_roberta_scores,co_graph_counts_0,normalized_co_graph_counts_0,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2
0,0,UK,B000V599Y2,0.0,7.388571,5.200000,37.0,13.152878,7.433639e-04,10.677187,...,111.069756,64.592583,259.157867,1.341519e-06,3.0,0.004992,0.000000,0.000000,0.0,0.000000
1,0,UK,B007VZUA7U,0.0,7.388571,7.000000,36.0,9.393598,1.732076e-05,8.838863,...,130.196732,200.158524,257.981598,4.137609e-07,3.0,0.004992,0.000000,0.000000,0.0,0.000000
2,0,UK,B009EUAEQC,0.0,7.388571,7.490000,4.0,11.754339,1.835794e-04,10.670128,...,99.471718,30.517475,255.483337,3.402269e-08,6.0,0.009983,1.033333,0.004797,0.0,0.000000
3,0,UK,B00AH02IWG,0.0,7.388571,8.500000,3.0,12.194766,2.851667e-04,11.166204,...,2.061926,216.653198,255.024780,2.150898e-08,4.0,0.006656,1.250000,0.005803,1.0,0.007937
4,0,UK,B00I0UKKD4,0.0,7.388571,17.049999,118.0,11.835367,1.990737e-04,11.681271,...,611.139954,323.099518,267.615601,6.320386e-03,3.0,0.004992,1.833333,0.008512,1.0,0.007937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77570148,361580,DE,B0BB7XV97M,0.0,32.424000,47.990002,56.0,9.117821,6.076918e-05,9.635838,...,118.126396,0.000000,263.574158,1.367507e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000
77570149,361580,DE,B0BB7YSRBX,0.0,32.424000,43.990002,58.0,9.163816,6.362959e-05,9.159988,...,124.881615,0.000000,263.523743,1.300273e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000
77570150,361580,DE,B0BB7ZMGY8,0.0,32.424000,41.990002,452.0,11.256460,5.158017e-04,10.119755,...,124.881615,0.000000,263.567017,1.357776e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000
77570151,361580,DE,B0BD4CP7N3,0.0,32.424000,24.990000,1.0,-3.778687,1.523355e-10,-1.612869,...,192.540955,36.028561,265.401611,8.503204e-03,0.0,0.000000,0.000000,0.000000,0.0,0.000000


In [37]:
# Spot check: the first 25 candidates of session 20010, ordered by descending
# `sasrec_scores_2`, next to the distance-1 count features.
merged_candidates_feature.query('sess_id==20010').sort_values(by=['sasrec_scores_2'], ascending=False)[['product', 'sasrec_scores_2', 'co_graph_counts_2', 'normalized_co_graph_counts_2']][:25]

Unnamed: 0,product,sasrec_scores_2,co_graph_counts_2,normalized_co_graph_counts_2
4287227,B09C2Y41GZ,14.97872,8.0,0.275862
4287139,B072C4GT6P,13.678986,3.0,0.103448
4287144,B076593J1C,13.579645,3.0,0.103448
4287253,B09SFRPQ3Q,13.052922,3.0,0.103448
4287140,B073Q3CPYH,11.974123,1.0,0.034483
4287274,B0B66SSJW4,11.504192,1.0,0.034483
4287194,B08BWL2CY4,11.193488,1.0,0.034483
4287205,B08VFMLDDY,11.116684,1.0,0.034483
4287264,B09ZB7TWLM,10.825944,0.0,0.0
4287155,B07CMP39R1,10.765284,0.0,0.0
