# Necessary Common Functions

Those functions should be ran before each part.

In [1]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict
import numba 
from numba import jit
import datasets
from datasets import Dataset as TFDataset
import multiprocessing

In [2]:
def get_sessions(df: pd.DataFrame, test=False, list_item=False) -> list:
    
    all_item = []
    if 'next_item' in df and not test:
        if list_item:
            for i in trange(len(df)):
                all_item.append(np.concatenate([np.array(df.loc[i, 'prev_items']), np.array(df.loc[i, 'next_item'])], axis=0))
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items'][:-1]+f" '{df.loc[i, 'next_item']}']").replace(" ", ",")))
    else:
        if list_item:
            all_item = df['prev_items']
        else:
            for i in trange(len(df)):
                all_item.append(eval((df.loc[i, 'prev_items']).replace(" ", ",")))
    return all_item

In [3]:
def get_co_occurence_dict(sessions: list, bidirection: bool=True, weighted: bool=False, max_dis=None) -> dict:
    res = {}
    for sess in tqdm(sessions):
        for i, id in enumerate(sess):
            if id not in res:
                res[id] = Counter()
            
            if max_dis == None:
                e = len(sess)
            else:
                e = min(i + max_dis + 1, len(sess))

            for j in range(i+1, e):
                if not weighted:
                    res[id][sess[j]] += 1
                else:
                    res[id][sess[j]] += 1 / (j-i)
                if bidirection:
                    if sess[j] not in res:
                        res[sess[j]] = Counter()
                    if not weighted:
                        res[sess[j]][id] += 1
                    else:
                        res[sess[j]][id] += 1 / (j-i)
    return res

In [4]:
def sort_co_occurence_dict(co_occurence_dict: dict) -> dict:
    res = {}
    for k,v in co_occurence_dict.items():
        res[k] = dict(sorted(v.items(), key=lambda item: -item[1]))
    return res

In [5]:
def cast_dtype(df : pd.DataFrame, columns=None):
    if columns is None:
        columns = df.columns
    for k in columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [6]:
def get_session_last_item(session_df):
    last_items = []
    num_sessions = len(session_df)
    for i in tqdm(range(num_sessions)):
        sess = session_df.iloc[i]
        sess_prev_items = sess['prev_items']
        
        product_list = sess_prev_items.strip('[]').split(' ')
        last_item = product_list[-1].strip("'\n")

        last_items.append(last_item)
    return last_items 

In [7]:
def get_co_graph_counts(session_last_items, merged_candidates_df, co_graph_dict):
    co_graph_count_list = []
    for idx, row in tqdm(merged_candidates_df.iterrows(), total=merged_candidates_df.shape[0]):
        sess_id = row['sess_id']
        product = row['product']
        last_item = session_last_items[sess_id]
        co_graph_count = co_graph_dict[last_item][product]
        co_graph_count_list.append(co_graph_count)
    return co_graph_count_list

In [8]:
def flatten_co_graph_dict(co_graph_dict):
    product_list = []
    neighbor_list = []
    counts_list = []
    for product in tqdm(co_graph_dict.keys(), total=len(co_graph_dict)):
        for neigh in co_graph_dict[product].keys():
            product_list.append(product)
            neighbor_list.append(neigh)
            counts_list.append(co_graph_dict[product][neigh])
    return pd.DataFrame({'product_' : product_list, 'neighbor' : neighbor_list, 'counts' : counts_list})

In [9]:
def normalize_co_graph_counts(merged_candidates_counts):
    # normalize co graph counts 
    # merged_candidates_counts_g = cudf.from_pandas(merged_candidates_counts)
    sessions_count_sum = merged_candidates_counts[['sess_id', 'counts']].groupby('sess_id').sum()
    sessions_count_sum.sort_index(inplace=True)

    sessions_count_sum = sessions_count_sum.to_pandas()

    candidates_count_sum = sessions_count_sum.loc[merged_candidates_counts['sess_id']].reset_index(drop=True)
    merged_candidates_counts['counts_cum'] = candidates_count_sum['counts']
    merged_candidates_counts['normalized_counts'] = merged_candidates_counts['counts'] / merged_candidates_counts['counts_cum']

    # del merged_candidates_counts_g
    # del sessions_count_sum_g
    

# Merge test co-graph counts 

In [10]:
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates_phase2/merged_candidates_150_test_feature.parquet'
train_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_train_sessions_phase2.csv'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1_phase2.csv'
product_data_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/products_train.csv'

In [11]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path)

@lru_cache(maxsize=1)
def read_product_data():
    return pd.read_csv(product_data_path)

@lru_cache(maxsize=1)
def read_train_data():
    return pd.read_csv(train_sessions_path)

@lru_cache(maxsize=1)
def read_valid_data():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_test_data():
    return pd.read_csv(test_sessions_path)

In [12]:
merged_candidates_feature_test = read_merged_candidates_feature_test()

In [13]:
merged_candidates_test = merged_candidates_feature_test[['sess_id', 'sess_locale', 'product']]

In [14]:
train_sess_data = read_train_data()
valid_sess_data = read_valid_data()
test_sess_data = read_test_data()
product = read_product_data()

In [15]:
train_sess_item = get_sessions(train_sess_data, list_item=False)
valid_sess_item = get_sessions(valid_sess_data, list_item=False)
test_sess_item = get_sessions(test_sess_data, test=True, list_item=False)

100%|██████████| 3010900/3010900 [02:21<00:00, 21317.65it/s]
100%|██████████| 261816/261816 [00:13<00:00, 18983.33it/s]
100%|██████████| 316972/316972 [00:09<00:00, 33182.20it/s]


In [16]:
merged_candidates_feature_test[['all_items_co_graph_count_0', 'all_items_co_graph_count_1', 'all_items_co_graph_count_2']]

Unnamed: 0,all_items_co_graph_count_0,all_items_co_graph_count_1,all_items_co_graph_count_2
0,4,0.557576,0
1,6,1.083766,0
2,4,0.577778,0
3,3,0.507576,0
4,0,0.000000,0
...,...,...,...
96556030,0,0.000000,0
96556031,0,0.000000,0
96556032,1,0.333333,0
96556033,0,0.000000,0


# bidirection

In [17]:
# bidirection 
# valid and test are included in train
co_occurence_dict_bi = get_co_occurence_dict(train_sess_item, bidirection=True, weighted=False)

100%|██████████| 3010900/3010900 [01:30<00:00, 33209.80it/s]


In [18]:
# only one arg, can't use another arg
def get_bi_test_session_co_graph_candidates(sess_id):
    sess = test_sess_item[sess_id]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_bi and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_bi[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    return cand_counter

In [19]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [20]:
# about 1 mins
sess_ids = list(range(len(test_sess_item)))
pool = multiprocessing.Pool(10)
bi_test_sessions_counter = pool.map(get_bi_test_session_co_graph_candidates, sess_ids)

In [21]:
test_co_graph_candidates, len(bi_test_sessions_counter)

(Dataset({
     features: ['sess_id'],
     num_rows: 316972
 }),
 316972)

In [22]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(bi_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_0'] = all_items_co_graph_count_list

100%|██████████| 96556035/96556035 [02:51<00:00, 563221.76it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['all_items_co_graph_count_0'] = all_items_co_graph_count_list


In [23]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_0'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0'] / merged_candidates_test['normalized_all_items_co_graph_count_0']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_0'] = count_sum_array[merged_candidates_test['sess_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0'] / merged_candidates_test['normalized_all_items_co_graph_count_0']


In [24]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_0'] = merged_candidates_test['normalized_all_items_co_graph_count_0']
merged_candidates_feature_test['all_items_co_graph_count_0'] = merged_candidates_test['all_items_co_graph_count_0']

In [34]:
merged_candidates_feature_test[['sess_id', 'all_items_co_graph_count_2']].query('sess_id==0')

Unnamed: 0,sess_id,all_items_co_graph_count_2
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
295,0,0
296,0,0
297,0,15
298,0,0


## uni and weighted

In [35]:
# weight 
# co_occurence_dict_bi_weight = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=True, weighted=True)
co_occurence_dict_uni_weight = get_co_occurence_dict(train_sess_item, bidirection=False, weighted=True)

100%|██████████| 3010900/3010900 [01:20<00:00, 37612.14it/s]


In [36]:
# only one arg, can't use another arg
def get_uni_wei_test_session_co_graph_candidates(sess_id):
    sess = test_sess_item[sess_id]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_weight and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_weight[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    return cand_counter

In [43]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [44]:
# about 1 mins
sess_ids = list(range(len(test_sess_item)))
pool = multiprocessing.Pool(10)
uni_wei_test_sessions_counter = pool.map(get_uni_wei_test_session_co_graph_candidates, sess_ids)

In [45]:
test_co_graph_candidates, len(uni_wei_test_sessions_counter)

(Dataset({
     features: ['sess_id'],
     num_rows: 316972
 }),
 316972)

In [46]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(uni_wei_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_1'] = all_items_co_graph_count_list

100%|██████████| 96556035/96556035 [03:32<00:00, 454758.78it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['all_items_co_graph_count_1'] = all_items_co_graph_count_list


In [47]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_1'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_1'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1'] / merged_candidates_test['normalized_all_items_co_graph_count_1']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_1'] = count_sum_array[merged_candidates_test['sess_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1'] / merged_candidates_test['normalized_all_items_co_graph_count_1']


In [57]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_1'] = merged_candidates_test['normalized_all_items_co_graph_count_1']
merged_candidates_feature_test['all_items_co_graph_count_1'] = merged_candidates_test['all_items_co_graph_count_1']

In [55]:
merged_candidates_feature_test[['sess_id', 'all_items_co_graph_count_1']].query('sess_id==250000')

Unnamed: 0,sess_id,all_items_co_graph_count_1
76095555,250000,0.835723
76095556,250000,0.000000
76095557,250000,0.211111
76095558,250000,0.394935
76095559,250000,1.418290
...,...,...
76095864,250000,0.000000
76095865,250000,0.000000
76095866,250000,0.809524
76095867,250000,0.000000


## uni and dis=1

In [58]:
# max dis = 1 
# co_occurence_dict_bi_dis1 = get_co_occurence_dict(train_sess_item + valid_sess_item + test_sess_item, bidirection=True, weighted=False, max_dis=1)
co_occurence_dict_uni_dis1 = get_co_occurence_dict(train_sess_item, bidirection=False, weighted=False, max_dis=1)

100%|██████████| 3010900/3010900 [01:00<00:00, 50161.60it/s]


In [59]:
# only one arg, can't use another arg
def get_uni_dis1_test_session_co_graph_candidates(sess_id):
    sess = test_sess_item[sess_id]
    prev_items = set()
    cand_counter = Counter()
    for item in sess:
        if item in co_occurence_dict_uni_dis1 and item not in prev_items:
            cand_counter = cand_counter + co_occurence_dict_uni_dis1[item]
            prev_items.add(item) # one time for every item
    for item in sess:
        if item in cand_counter:
            cand_counter.pop(item) # remove history items 
    
    return cand_counter

In [60]:
test_co_graph_candidates = TFDataset.from_dict({'sess_id' : list(range(test_sess_data.shape[0]))})

In [61]:
# about 1 mins
sess_ids = list(range(len(test_sess_item)))
pool = multiprocessing.Pool(10)
uni_dis1_test_sessions_counter = pool.map(get_uni_dis1_test_session_co_graph_candidates, sess_ids)

In [62]:
test_co_graph_candidates, len(uni_dis1_test_sessions_counter)

(Dataset({
     features: ['sess_id'],
     num_rows: 316972
 }),
 316972)

In [63]:
all_items_co_graph_count_list = []
for row in tqdm(merged_candidates_test.itertuples(), total=merged_candidates_test.shape[0]):
    all_items_co_graph_count_list.append(uni_dis1_test_sessions_counter[row.sess_id][row.product])
assert len(all_items_co_graph_count_list) == merged_candidates_test.shape[0]
merged_candidates_test['all_items_co_graph_count_2'] = all_items_co_graph_count_list

100%|██████████| 96556035/96556035 [03:50<00:00, 419329.75it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['all_items_co_graph_count_2'] = all_items_co_graph_count_list


In [65]:
count_sum_array = merged_candidates_test.groupby(by='sess_id')['all_items_co_graph_count_2'].sum().to_numpy()
assert len(count_sum_array[merged_candidates_test['sess_id']]) == merged_candidates_test.shape[0]
merged_candidates_test['normalized_all_items_co_graph_count_2'] = count_sum_array[merged_candidates_test['sess_id']]
merged_candidates_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2'] / merged_candidates_test['normalized_all_items_co_graph_count_2']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_2'] = count_sum_array[merged_candidates_test['sess_id']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_candidates_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2'] / merged_candidates_test['normalized_all_items_co_graph_count_2']


In [68]:
merged_candidates_test[['sess_id', 'all_items_co_graph_count_2']].query('sess_id==15000')

Unnamed: 0,sess_id,all_items_co_graph_count_2
4495722,15000,2
4495723,15000,1
4495724,15000,2
4495725,15000,0
4495726,15000,1
...,...,...
4496069,15000,0
4496070,15000,0
4496071,15000,0
4496072,15000,0


In [69]:
merged_candidates_feature_test[['sess_id', 'all_items_co_graph_count_2']].query('sess_id==15000')

Unnamed: 0,sess_id,all_items_co_graph_count_2
4495722,15000,2
4495723,15000,1
4495724,15000,2
4495725,15000,0
4495726,15000,1
...,...,...
4496069,15000,0
4496070,15000,0
4496071,15000,0
4496072,15000,0


In [70]:
merged_candidates_feature_test['normalized_all_items_co_graph_count_2'] = merged_candidates_test['normalized_all_items_co_graph_count_2']
merged_candidates_feature_test['all_items_co_graph_count_2'] = merged_candidates_test['all_items_co_graph_count_2']

In [71]:
cast_dtype(merged_candidates_feature_test, 
    ['all_items_co_graph_count_0', 'normalized_all_items_co_graph_count_0', 'all_items_co_graph_count_1', 'normalized_all_items_co_graph_count_1', 'all_items_co_graph_count_2', 'normalized_all_items_co_graph_count_2'])
merged_candidates_feature_test.to_parquet(merged_candidates_feature_test_path, engine='pyarrow')

In [72]:
merged_candidates_feature_test

Unnamed: 0,sess_id,sess_locale,product,sasrec_scores_2,normalized_sasrec_scores_2,sasrec_scores_3,normalized_sasrec_scores_3,sess_avg_price,product_price,seqmlp_scores,...,co_graph_counts_1,normalized_co_graph_counts_1,co_graph_counts_2,normalized_co_graph_counts_2,cos_text_bert_scores,text_bert_scores,normalized_text_bert_scores,roberta_scores,normalized_roberta_scores,product_freq
0,0,DE,B000Q87D0Q,0.000000,3.282997e-10,0.000000,6.689660e-10,67.527199,36.761604,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,11.0
1,0,DE,B000QB30DW,0.501346,5.420036e-10,-0.588501,3.713825e-10,67.527199,9.990000,7.260942,...,0.600000,0.001276,0.0,0.0,0.924509,382.443390,1.724279e-04,278.861938,1.579214e-06,98.0
2,0,DE,B004BIG55Q,6.917523,3.315223e-07,5.737720,2.076175e-07,67.527199,8.990000,2.454817,...,0.311111,0.000662,0.0,0.0,0.906834,376.119781,3.092420e-07,280.436859,7.628168e-06,859.0
3,0,DE,B0053FTNQY,-0.100895,2.967921e-10,1.507319,3.020121e-09,67.527199,36.761604,3.837643,...,0.090909,0.000193,0.0,0.0,0.885923,366.794250,2.755989e-11,279.552673,3.150818e-06,22.0
4,0,DE,B007QWII1S,3.768980,1.422714e-08,4.594047,6.615662e-08,67.527199,54.950001,4.923371,...,0.000000,0.000000,0.0,0.0,0.904845,377.558044,1.302938e-06,286.498260,3.272302e-03,33.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96556030,316971,UK,B0B82N3CQQ,-1.076433,6.007382e-08,-0.457645,1.105378e-07,19.459999,13.990000,6.433315,...,0.000000,0.000000,0.0,0.0,0.979710,421.320526,6.821542e-04,286.819031,3.196098e-03,0.0
96556031,316971,UK,B0BB9NW3F3,0.000000,1.762683e-07,0.000000,1.746882e-07,19.459999,22.097065,0.000000,...,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.000000e+00,0.000000,0.000000e+00,5.0
96556032,316971,UK,B0BDMVKTQ3,-1.079334,5.989980e-08,-1.901198,2.609658e-08,19.459999,41.990002,-1.094359,...,0.000000,0.000000,0.0,0.0,0.952480,410.857330,1.948851e-08,272.765411,2.518899e-09,33.0
96556033,316971,UK,B0BHW1D5VP,6.722834,1.465088e-04,6.111193,7.876277e-05,19.459999,26.990000,8.700006,...,0.000000,0.000000,0.0,0.0,0.973597,418.673431,4.833641e-05,285.864410,1.230364e-03,10.0


In [73]:
merged_candidates_feature_test.query('sess_id==250000').sort_values(by=['sasrec_scores_2'], ascending=False)[['sasrec_scores_2', 'all_items_co_graph_count_0', 'all_items_co_graph_count_1', 'all_items_co_graph_count_2']]

Unnamed: 0,sasrec_scores_2,all_items_co_graph_count_0,all_items_co_graph_count_1,all_items_co_graph_count_2
76095575,25.595394,234,81.465820,48
76095687,23.410030,170,31.882179,21
76095699,23.054081,73,13.618290,9
76095737,20.244171,45,7.538095,1
76095797,20.104771,15,7.583333,4
...,...,...,...,...
76095583,-1.819422,1,0.045455,0
76095785,-2.388196,0,0.000000,0
76095867,-4.275507,0,0.000000,0
76095854,-5.016989,0,0.000000,0
