In [1]:
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [2]:
merged_candidates_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_2.parquet'
valid_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/data_for_recstudio/task1_data/task13_4_task1_valid_sessions.csv'
sasrec_valid_candidates_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/SASRec_Next/seperate_locale/SASRec_Next_04_26_15_26_valid_100_with_score.parquet'
roberta_valid_candidates_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/roberta/roberta_valid_100_with_score.parquet'
co_graph_valid_candidates_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/co_graph/co_graph_valid_100_with_normalized_score_2.parquet'

In [3]:
@lru_cache(maxsize=1)
def read_merged_candidates():
    return pd.read_parquet(merged_candidates_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_valid_sessions():
    return pd.read_csv(valid_sessions_path)

@lru_cache(maxsize=1)
def read_sasrec_valid_candidates():
    return pd.read_parquet(sasrec_valid_candidates_path, engine='pyarrow')

lru_cache(maxsize=1)
def read_roberta_valid_candidates():
    return pd.read_parquet(roberta_valid_candidates_path, engine='pyarrow')

lru_cache(maxsize=1)
def read_co_graph_valid_candidates():
    return pd.read_parquet(co_graph_valid_candidates_path, engine='pyarrow')


In [4]:
def cast_dtype(df : pd.DataFrame):
    for k in df.columns:
        dt = type(df[k].iloc[0])
        if 'float' in str(dt):
            df[k] = df[k].astype('float32')
        elif 'int' in str(dt):
            df[k] = df[k].astype('int32')
        elif dt == list:
            dt_ = type(df.iloc[0][k][0])
            if 'float' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = df[k].apply(lambda x : np.array(x, dtype=np.int32))

In [5]:
def normalize_score(candidates_df):
    normalized_score = []
    for i in tqdm(range(len(candidates_df))):
        scores = candidates_df.iloc[i]['scores'] - candidates_df.iloc[i]['scores'].max()
        scores = np.array(scores)
        scores = np.exp(scores) / np.exp(scores).sum()
        normalized_score.append(scores)
    candidates_df['normalized_scores'] = normalized_score

In [6]:
def flatten_candidates(candidates_df):
    column_list = list(candidates_df.keys())
    values_dict = {key : [] for key in column_list}
    num_candidates = len(candidates_df.iloc[0]['candidates'])
    for i in tqdm(range(candidates_df.shape[0])):
        sess = candidates_df.iloc[i]
        for key in column_list:
            cur_value = sess[key]
            if type(cur_value) == np.ndarray or type(cur_value) == list:
                for x in cur_value:
                    values_dict[key].append(x)
            else:
                for _ in range(num_candidates):
                    values_dict[key].append(cur_value)
    
    return pd.DataFrame(values_dict)

# Merge valid score features

In [7]:
merged_candidates = read_merged_candidates()
valid_sessions = read_valid_sessions()

## merge sasrec scores 

In [8]:
sasrec_valid_candidates = read_sasrec_valid_candidates()

In [9]:
normalize_score(sasrec_valid_candidates)

100%|██████████| 361581/361581 [00:36<00:00, 9800.54it/s] 


In [10]:
flattened_sasrec_valid_candidates = flatten_candidates(sasrec_valid_candidates.drop(columns=['scores']))

100%|██████████| 361581/361581 [00:33<00:00, 10673.23it/s]


In [11]:
flattened_sasrec_valid_candidates.rename(columns={'normalized_scores' : 'sasrec_normalized_scores', 'candidates' : 'product'}, inplace=True)
flattened_sasrec_valid_candidates['sess_id'] = flattened_sasrec_valid_candidates['sess_id'].astype('int32')

In [12]:
merged_candidates_g = cudf.from_pandas(merged_candidates)
flattened_sasrec_valid_candidates_g = cudf.from_pandas(flattened_sasrec_valid_candidates)

In [13]:
merged_candidates_sasrec_g = merged_candidates_g.merge(flattened_sasrec_valid_candidates_g[['sess_id', 'product', 'sasrec_normalized_scores']], how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
merged_candidates_sasrec_g = merged_candidates_sasrec_g.sort_values(by=['sess_id']).reset_index(drop=True)
merged_candidates_sasrec_g['sasrec_normalized_scores'] = merged_candidates_sasrec_g['sasrec_normalized_scores'].fillna(0.0)
cast_dtype(merged_candidates_sasrec_g)

In [14]:
merged_candidates_sasrec = merged_candidates_sasrec_g.to_pandas()
merged_candidates_sasrec.to_parquet('./candidates/merged_candidates_feature_2.parquet', engine='pyarrow')

In [15]:
# release GPU
del merged_candidates_sasrec_g
del merged_candidates_g
del flattened_sasrec_valid_candidates_g

## merge roberta scores 

In [16]:
merged_candidates_score = pd.read_parquet('./candidates/merged_candidates_feature_2.parquet', engine='pyarrow')
merged_candidates_score

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_normalized_scores
0,0,UK,B098QH6122,0.0,0.000063
1,0,UK,B076PN1SKG,0.0,0.022821
2,0,UK,B09WF4MM8F,0.0,0.000248
3,0,UK,B08G86V98M,0.0,0.005668
4,0,UK,B077VWJYHT,0.0,0.000166
...,...,...,...,...,...
85365224,361580,DE,B07TV22X9M,0.0,0.000000
85365225,361580,DE,B003BZT45I,0.0,0.000000
85365226,361580,DE,B07DPW7V27,0.0,0.000000
85365227,361580,DE,B09RKGYBR2,0.0,0.000208


In [17]:
roberta_valid_candidates = read_roberta_valid_candidates()

In [18]:
normalize_score(roberta_valid_candidates)

100%|██████████| 361581/361581 [01:11<00:00, 5073.99it/s]


In [19]:
flattened_roberta_valid_candidates = flatten_candidates(roberta_valid_candidates.drop(columns=['scores']))

100%|██████████| 361581/361581 [00:37<00:00, 9732.05it/s] 


In [20]:
flattened_roberta_valid_candidates.rename(columns={'normalized_scores' : 'roberta_normalized_scores', 'candidates' : 'product'}, inplace=True)

In [21]:
merged_candidates_score_g = cudf.from_pandas(merged_candidates_score)
flattened_roberta_valid_candidates_g = cudf.from_pandas(flattened_roberta_valid_candidates)

In [22]:
merged_candidates_roberta_g = merged_candidates_score_g.merge(flattened_roberta_valid_candidates_g[['sess_id', 'product', 'roberta_normalized_scores']], how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
merged_candidates_roberta_g = merged_candidates_roberta_g.sort_values(by=['sess_id']).reset_index(drop=True)
merged_candidates_roberta_g['roberta_normalized_scores'] = merged_candidates_roberta_g['roberta_normalized_scores'].fillna(0.0)
cast_dtype(merged_candidates_roberta_g)

In [23]:
merged_candidates_roberta = merged_candidates_roberta_g.to_pandas()
merged_candidates_roberta.to_parquet('./candidates/merged_candidates_feature_2.parquet', engine='pyarrow')

In [24]:
# release GPU
del merged_candidates_score_g
del merged_candidates_roberta_g
del flattened_roberta_valid_candidates_g

## merge co-graph scores 

In [25]:
merged_candidates_score = pd.read_parquet('./candidates/merged_candidates_feature_2.parquet', engine='pyarrow')
merged_candidates_score

Unnamed: 0,sess_id,sess_locale,product,target,sasrec_normalized_scores,roberta_normalized_scores
0,0,UK,B098QH6122,0.0,0.000063,0.011196
1,0,UK,B08G86V98M,0.0,0.005668,0.013206
2,0,UK,B077VWJYHT,0.0,0.000166,0.008670
3,0,UK,B09XDVQFWD,0.0,0.000168,0.011001
4,0,UK,B097Y5FZBM,0.0,0.000000,0.007589
...,...,...,...,...,...,...
85441841,361580,DE,B09GLTD98C,0.0,0.000617,0.000000
85441842,361580,DE,B07BG8D61S,0.0,0.000096,0.000000
85441843,361580,DE,B0BB7ZMGY8,0.0,0.000055,0.000000
85441844,361580,DE,B0798Z1T76,0.0,0.000000,0.012702


In [26]:
co_graph_valid_candidates = read_co_graph_valid_candidates()
co_graph_valid_candidates

Unnamed: 0,candidates,counts,normalized_counts,state,counts_0,normalized_counts_0,counts_1,normalized_counts_1,counts_2,normalized_counts_2,sess_id
0,"[B077XGDMD2, B06XGDZVZR, B06XG1LZ6Z, B06XGD9VL...","[178.33333, 117.833336, 114.833336, 103.5, 53....","[0.1224957, 0.08093875, 0.078878075, 0.0710933...",Pad,"[66, 41, 41, 37, 19, 20, 15, 15, 13, 10, 8, 9,...","[0.11340206, 0.07044674, 0.07044674, 0.0635738...","[28.333334, 27.833334, 23.833334, 21.5, 10.5, ...","[0.1147873, 0.11276165, 0.09655638, 0.08710331...","[84, 49, 50, 45, 24, 17, 12, 12, 11, 9, 11, 8,...","[0.13397129, 0.07814992, 0.079744816, 0.071770...",0
1,"[B09LCPT9DQ, B09WM9W6WQ, B092D5HM5S, B09MRYK5C...","[1579.8334, 300.83334, 145.16667, 133.83333, 9...","[0.29414138, 0.056010675, 0.027027866, 0.02491...",Full,"[497, 100, 44, 49, 29, 41, 25, 20, 28, 18, 19,...","[0.22357175, 0.044984255, 0.019793073, 0.02204...","[431.83334, 42.833332, 19.166666, 36.833332, 1...","[0.45939717, 0.045567375, 0.020390071, 0.03918...","[651, 158, 82, 48, 51, 32, 32, 42, 25, 35, 32,...","[0.29483697, 0.07155797, 0.03713768, 0.0217391...",1
2,"[B00L529BAC, B01EV58VX2, B07VYSSRL7, B003TJATC...","[5.5, 4.8333335, 4.0, 3.0, 3.0, 3.0, 3.0, 3.0,...","[0.12267658, 0.10780669, 0.08921933, 0.0669145...",Pad,"[2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, ...","[0.18181819, 0.18181819, 0.09090909, 0.0909090...","[1.5, 0.8333333, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,...","[0.16981132, 0.094339624, 0.11320755, 0.113207...","[2, 2, 2, 1, 1, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, ...","[0.08, 0.08, 0.08, 0.04, 0.04, 0.04, 0.04, 0.0...",2
3,"[1839941960, 1788009975, 024157563X, 024156343...","[26.5, 11.0, 5.3333335, 5.0, 4.6666665, 3.5, 3...","[0.24018127, 0.09969789, 0.04833837, 0.0453172...",Pad,"[8, 3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0.22857143, 0.08571429, 0.057142857, 0.057142...","[7.5, 2.0, 1.3333334, 1.0, 0.6666667, 0.5, 0.5...","[0.38793105, 0.10344828, 0.06896552, 0.0517241...","[11, 6, 2, 2, 2, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1,...","[0.19642857, 0.10714286, 0.035714287, 0.035714...",3
4,"[B0B6PF619D, B09BJF6N8K, B0B6P77ZRN, B0B6P6DKN...","[33.666668, 12.0, 10.0, 8.333333, 8.166667, 6....","[0.28291318, 0.10084034, 0.084033616, 0.070028...",Pad,"[13, 4, 5, 4, 3, 3, 3, 4, 4, 2, 2, 0, 1, 1, 0,...","[0.26530612, 0.08163265, 0.10204082, 0.0816326...","[7.6666665, 4.0, 1.0, 1.3333334, 1.1666666, 1....","[0.40350878, 0.21052632, 0.05263158, 0.0701754...","[13, 4, 4, 3, 4, 2, 2, 0, 0, 1, 1, 3, 1, 1, 2,...","[0.25490198, 0.078431375, 0.078431375, 0.05882...",4
...,...,...,...,...,...,...,...,...,...,...,...
361576,"[B08HH6L4PB, B08F5D8T22, B0050IG9DE, B01A955L8...","[8.0, 4.0, 3.5, 3.0, 2.5, 2.3333333, 2.0, 1.0,...","[0.24742268, 0.12371134, 0.10824742, 0.0927835...",Pad,"[1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, ...","[0.125, 0.125, 0.125, 0.125, 0.125, 0.125, 0.0...","[1.0, 1.0, 0.5, 1.0, 0.5, 0.33333334, 0.0, 0.0...","[0.23076923, 0.23076923, 0.115384616, 0.230769...","[6, 2, 2, 1, 1, 1, 2, 0, 0, 1, 1, 1, 1, 1, 0, ...","[0.3, 0.1, 0.1, 0.05, 0.05, 0.05, 0.1, 0.0, 0....",361576
361577,"[B09BBX1T4S, B09D76FT9D, B09B9V4PXC, B09BCM5NL...","[2889.5, 1050.8334, 933.0, 748.3333, 354.0, 34...","[0.20720936, 0.07535646, 0.066906504, 0.053663...",Full,"[742, 412, 300, 287, 147, 129, 107, 92, 83, 82...","[0.12008416, 0.06667746, 0.048551545, 0.046447...","[524.5, 122.833336, 195.0, 189.33333, 74.0, 76...","[0.25061718, 0.058692362, 0.09317512, 0.090467...","[1623, 516, 438, 272, 133, 143, 97, 94, 77, 71...","[0.286092, 0.090957165, 0.077207826, 0.0479464...",361577
361578,"[B0BC38GHB4, B07KLCY8NF, B09SXQW8MS, B00MXZEMB...","[10.0, 7.5, 6.0, 5.3333335, 4.0, 3.5, 3.5, 3.0...","[0.15503876, 0.11627907, 0.093023255, 0.082687...",Pad,"[2, 4, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...","[0.08, 0.16, 0.08, 0.08, 0.08, 0.08, 0.04, 0.0...","[2.0, 1.5, 1.0, 1.3333334, 1.0, 0.5, 0.5, 1.0,...","[0.21052632, 0.15789473, 0.10526316, 0.1403508...","[6, 2, 3, 2, 1, 1, 2, 1, 2, 1, 1, 1, 2, 0, 0, ...","[0.2, 0.06666667, 0.1, 0.06666667, 0.033333335...",361578
361579,"[B08RQDVX71, B08RQR2NPB, B08H8TLK4F, B07PY86YP...","[276.66666, 164.0, 162.66667, 89.833336, 75.33...","[0.18181819, 0.10777656, 0.10690033, 0.0590361...",Pad,"[64, 47, 47, 29, 21, 26, 29, 17, 17, 17, 12, 1...","[0.10977702, 0.080617495, 0.080617495, 0.04974...","[38.666668, 29.0, 24.666666, 20.833334, 11.333...","[0.16407356, 0.12305516, 0.10466761, 0.0884017...","[174, 88, 91, 40, 43, 28, 20, 16, 13, 15, 9, 9...","[0.24751067, 0.12517782, 0.12944524, 0.0568990...",361579


In [27]:
flattened_co_graph_valid_candidates = flatten_candidates(co_graph_valid_candidates.drop(columns=['counts', 'state', 'counts_0', 'counts_1', 'counts_2']))

100%|██████████| 361581/361581 [00:54<00:00, 6652.55it/s]


In [28]:
col_rename = {'normalized_counts' : 'co_graph_normalized_counts', 
              'normalized_counts_0' : 'co_graph_normalized_counts_0',
              'normalized_counts_1' : 'co_graph_normalized_counts_1', 
              'normalized_counts_2' : 'co_graph_normalized_counts_2', 
              'candidates' : 'product'}
flattened_co_graph_valid_candidates.rename(columns=col_rename, inplace=True)

In [29]:
merged_candidates_score_g = cudf.from_pandas(merged_candidates_score)
flattened_co_graph_valid_candidates_g = cudf.from_pandas(flattened_co_graph_valid_candidates)

In [30]:
merged_candidates_co_graph_g = merged_candidates_score_g.merge(flattened_co_graph_valid_candidates_g, 
    how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
merged_candidates_co_graph_g = merged_candidates_co_graph_g.sort_values(by=['sess_id']).reset_index(drop=True)
merged_candidates_co_graph_g['co_graph_normalized_counts'] = merged_candidates_co_graph_g['co_graph_normalized_counts'].fillna(0.0)
merged_candidates_co_graph_g['co_graph_normalized_counts_0'] = merged_candidates_co_graph_g['co_graph_normalized_counts_0'].fillna(0.0)
merged_candidates_co_graph_g['co_graph_normalized_counts_1'] = merged_candidates_co_graph_g['co_graph_normalized_counts_1'].fillna(0.0)
merged_candidates_co_graph_g['co_graph_normalized_counts_2'] = merged_candidates_co_graph_g['co_graph_normalized_counts_2'].fillna(0.0)
cast_dtype(merged_candidates_co_graph_g)

In [31]:
merged_candidates_co_graph = merged_candidates_co_graph_g.to_pandas()
merged_candidates_co_graph.to_parquet('./candidates/merged_candidates_feature_2.parquet', engine='pyarrow')

In [32]:
# release GPU
del merged_candidates_score_g
del merged_candidates_co_graph_g
del flattened_co_graph_valid_candidates_g

# Merge test score features

In [7]:
merged_candidates_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_test.parquet'
merged_candidates_feature_test_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/XGBoost/candidates/merged_candidates_feature_test.parquet'
test_sessions_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/raw_data/sessions_test_task1.csv'
sasrec_test_candidates_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/SASRec_Next/seperate_locale/SASRec_Next_04_27_20_07_test_100_with_score.parquet'
roberta_test_candidates_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/roberta/roberta_test_100_with_score.parquet'
co_graph_test_candidates_path = '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23/candidates/co_graph/co_graph_test_100_with_normalized_score.parquet'

In [8]:
@lru_cache(maxsize=1)
def read_merged_candidates_test():
    return pd.read_parquet(merged_candidates_test_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_merged_candidates_feature_test():
    return pd.read_parquet(merged_candidates_feature_test_path, engine='pyarrow')

@lru_cache(maxsize=1)
def read_test_sessions():
    return pd.read_csv(test_sessions_path)

@lru_cache(maxsize=1)
def read_sasrec_test_candidates():
    return pd.read_parquet(sasrec_test_candidates_path, engine='pyarrow')

lru_cache(maxsize=1)
def read_roberta_test_candidates():
    return pd.read_parquet(roberta_test_candidates_path, engine='pyarrow')

lru_cache(maxsize=1)
def read_co_graph_test_candidates():
    return pd.read_parquet(co_graph_test_candidates_path, engine='pyarrow')

## Merge SASRec score

In [None]:
merged_candidates_test = read_merged_candidates_test()
test_sessions = read_test_sessions()

In [21]:
sasrec_test_candidates = read_sasrec_test_candidates()
sasrec_test_candidates['sess_id'] = np.arange(sasrec_test_candidates.shape[0], dtype=np.int32) # add sess_id

In [14]:
normalize_score(sasrec_test_candidates)

100%|██████████| 316971/316971 [00:45<00:00, 6899.08it/s] 


In [23]:
flattened_sasrec_test_candidates = flatten_candidates(sasrec_test_candidates.drop(columns=['scores']))

100%|██████████| 316971/316971 [00:35<00:00, 8997.91it/s]


In [26]:
flattened_sasrec_test_candidates.rename(columns={'normalized_scores' : 'sasrec_normalized_scores', 'candidates' : 'product'}, inplace=True)
flattened_sasrec_test_candidates['sess_id'] = flattened_sasrec_test_candidates['sess_id'].astype('int32')

In [30]:
merged_candidates_test_g = cudf.from_pandas(merged_candidates_test)
flattened_sasrec_test_candidates_g = cudf.from_pandas(flattened_sasrec_test_candidates)

In [31]:
merged_candidates_sasrec_g = merged_candidates_test_g.merge(flattened_sasrec_test_candidates_g[['sess_id', 'product', 'sasrec_normalized_scores']], how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
merged_candidates_sasrec_g = merged_candidates_sasrec_g.sort_values(by=['sess_id']).reset_index(drop=True)
merged_candidates_sasrec_g['sasrec_normalized_scores'] = merged_candidates_sasrec_g['sasrec_normalized_scores'].fillna(0.0)

In [33]:
cast_dtype(merged_candidates_sasrec_g)

In [34]:
merged_candidates_sasrec = merged_candidates_sasrec_g.to_pandas()
merged_candidates_sasrec.to_parquet('./candidates/merged_candidates_feature_test.parquet', engine='pyarrow')

In [36]:
del merged_candidates_sasrec_g
del merged_candidates_test_g
del flattened_sasrec_test_candidates_g

## Merge RoBerta score

In [17]:
merged_candidates_test = read_merged_candidates_feature_test()
test_sessions = read_test_sessions()

In [10]:
roberta_test_candidates = read_roberta_test_candidates()
roberta_test_candidates['sess_id'] = np.arange(roberta_test_candidates.shape[0], dtype=np.int32) # add sess_id

In [12]:
normalize_score(roberta_test_candidates)

100%|██████████| 316971/316971 [00:59<00:00, 5370.37it/s]


In [19]:
flattened_roberta_test_candidates = flatten_candidates(roberta_test_candidates.drop(columns=['scores']))
flattened_roberta_test_candidates.rename(columns={'normalized_scores' : 'roberta_normalized_scores', 'candidates' : 'product'}, inplace=True)
flattened_roberta_test_candidates['sess_id'] = flattened_roberta_test_candidates['sess_id'].astype('int32')

100%|██████████| 316971/316971 [00:32<00:00, 9804.01it/s] 


In [21]:
merged_candidates_test_g = cudf.from_pandas(merged_candidates_test)
flattened_roberta_test_candidates_g = cudf.from_pandas(flattened_roberta_test_candidates)

In [22]:
merged_candidates_roberta_g = merged_candidates_test_g.merge(flattened_roberta_test_candidates_g[['sess_id', 'product', 'roberta_normalized_scores']], how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
merged_candidates_roberta_g = merged_candidates_roberta_g.sort_values(by=['sess_id']).reset_index(drop=True)
merged_candidates_roberta_g['roberta_normalized_scores'] = merged_candidates_roberta_g['roberta_normalized_scores'].fillna(0.0)
cast_dtype(merged_candidates_roberta_g)

In [24]:
merged_candidates_roberta = merged_candidates_roberta_g.to_pandas()
merged_candidates_roberta.to_parquet('./candidates/merged_candidates_feature_test.parquet', engine='pyarrow')

In [25]:
del merged_candidates_roberta_g
del merged_candidates_test_g
del flattened_roberta_test_candidates_g

## Merge co-graph scores

In [9]:
merged_candidates_test = read_merged_candidates_feature_test()
test_sessions = read_test_sessions()
co_graph_test_candidates = read_co_graph_test_candidates()

In [11]:
flattened_co_graph_test_candidates = flatten_candidates(co_graph_test_candidates.drop(columns=['counts', 'state', 'counts_0', 'counts_1', 'counts_2']))
col_rename = {'normalized_counts' : 'co_graph_normalized_counts', 
              'normalized_counts_0' : 'co_graph_normalized_counts_0',
              'normalized_counts_1' : 'co_graph_normalized_counts_1', 
              'normalized_counts_2' : 'co_graph_normalized_counts_2', 
              'candidates' : 'product'}
flattened_co_graph_test_candidates.rename(columns=col_rename, inplace=True)

100%|██████████| 316971/316971 [00:47<00:00, 6639.83it/s]


In [14]:
merged_candidates_test_g = cudf.from_pandas(merged_candidates_test)
flattened_co_graph_test_candidates_g = cudf.from_pandas(flattened_co_graph_test_candidates)

In [15]:
merged_candidates_co_graph_g = merged_candidates_test_g.merge(flattened_co_graph_test_candidates_g, 
    how='left', left_on=['sess_id', 'product'], right_on=['sess_id', 'product'])
merged_candidates_co_graph_g = merged_candidates_co_graph_g.sort_values(by=['sess_id']).reset_index(drop=True)
merged_candidates_co_graph_g['co_graph_normalized_counts'] = merged_candidates_co_graph_g['co_graph_normalized_counts'].fillna(0.0)
merged_candidates_co_graph_g['co_graph_normalized_counts_0'] = merged_candidates_co_graph_g['co_graph_normalized_counts_0'].fillna(0.0)
merged_candidates_co_graph_g['co_graph_normalized_counts_1'] = merged_candidates_co_graph_g['co_graph_normalized_counts_1'].fillna(0.0)
merged_candidates_co_graph_g['co_graph_normalized_counts_2'] = merged_candidates_co_graph_g['co_graph_normalized_counts_2'].fillna(0.0)
cast_dtype(merged_candidates_co_graph_g)

In [17]:
merged_candidates_co_graph = merged_candidates_co_graph_g.to_pandas()
merged_candidates_co_graph.to_parquet('./candidates/merged_candidates_feature_test.parquet', engine='pyarrow')

In [18]:
# release GPU
del merged_candidates_test_g
del merged_candidates_co_graph_g
del flattened_co_graph_test_candidates_g