In [3]:
import ast
import os
import random
import numpy as np
import pandas as pd
import cudf, itertools
import scipy.sparse as ssp
from functools import lru_cache
from tqdm import tqdm, trange
from collections import Counter, defaultdict

In [4]:
def cast_dtype(df : pd.DataFrame, columns=None):
    """Downcast the selected columns of ``df`` to 32-bit types, in place.

    The kind of each column is decided from the Python/numpy type of its
    first value:

    * float scalar -> column cast to ``float32``
    * int scalar   -> column cast to ``int32`` (no range check is performed)
    * ``list``     -> every cell converted to a ``float32`` / ``int32``
      numpy array, depending on the element type of the first non-empty list
    * anything else is left untouched

    Parameters
    ----------
    df : pd.DataFrame
        Frame to modify. Columns are reassigned on this object.
    columns : iterable of column labels, optional
        Columns to process; defaults to every column of ``df``.

    Returns
    -------
    None
    """
    if columns is None:
        columns = df.columns
    # An empty frame has no first value to inspect; bail out instead of
    # letting ``iloc[0]`` raise IndexError.
    if len(df) == 0:
        return
    for k in columns:
        col = df[k]
        dt = type(col.iloc[0])
        if 'float' in str(dt):
            df[k] = col.astype('float32')
        elif 'int' in str(dt):
            df[k] = col.astype('int32')
        elif dt == list:
            # The first row may hold an empty list, so look for the first
            # list that actually has an element to read the element type from.
            sample = next(
                (v for v in col if isinstance(v, list) and len(v) > 0),
                None,
            )
            if sample is None:
                # Only empty lists: element type unknown, leave column as is.
                continue
            dt_ = type(sample[0])
            if 'float' in str(dt_):
                df[k] = col.apply(lambda x : np.array(x, dtype=np.float32))
            elif 'int' in str(dt_):
                df[k] = col.apply(lambda x : np.array(x, dtype=np.int32))

# Merge valid-session lengths into the candidate features

In [1]:
# Root of the project workspace. Defaults to the original absolute location;
# set the KDDCUP_WORKSPACE_ROOT environment variable to run the notebook on
# another machine without editing the paths below.
workspace_root = os.environ.get(
    'KDDCUP_WORKSPACE_ROOT',
    '/root/autodl-tmp/xiaolong/WorkSpace/Amazon-KDDCUP-23',
)

# Parquet file holding the merged candidate feature table (read and rewritten by this notebook).
merged_candidates_feature_path = f'{workspace_root}/XGBoost/candidates_phase2/merged_candidates_150_feature.parquet'
# CSV with the validation sessions.
valid_sessions_path = f'{workspace_root}/data_for_recstudio/task1_data/task13_4_task1_valid_sessions_phase2.csv'
# CSV with the test sessions.
test_sessions_path = f'{workspace_root}/raw_data/sessions_test_task1_phase2.csv'

In [5]:
@lru_cache(maxsize=1)
def read_merged_candidates_feature():
    """Load the merged candidate feature table from its parquet file.

    The result is memoized by ``lru_cache``: repeated calls return the same
    DataFrame object, so in-place edits made by one caller are seen by the next.
    """
    feature_frame = pd.read_parquet(
        merged_candidates_feature_path,
        engine='pyarrow',
    )
    return feature_frame

@lru_cache(maxsize=1)
def read_valid_sessions():
    """Load the validation sessions CSV as a DataFrame.

    The result is memoized by ``lru_cache``, so the file is parsed once and
    every later call returns the same DataFrame object.
    """
    sessions_frame = pd.read_csv(valid_sessions_path)
    return sessions_frame

@lru_cache(maxsize=1)
def read_test_sessions():
    """Load the test sessions CSV as a DataFrame.

    The result is memoized by ``lru_cache``, so the file is parsed once and
    every later call returns the same DataFrame object.
    """
    sessions_frame = pd.read_csv(test_sessions_path)
    return sessions_frame

In [6]:
# Load the candidate feature table and the validation sessions.
# Both loaders are wrapped in lru_cache, so re-running this cell does not re-read the files.
merged_candidates_feature = read_merged_candidates_feature()
valid_sessions = read_valid_sessions()

In [7]:
# Number of previous items in each validation session, in row order.
# `prev_items` is a string; presumably it is the printed form of an array whose
# elements are separated by spaces, so spaces are turned into commas to obtain a
# Python list literal before parsing.
# ast.literal_eval only accepts literals, so text read from the CSV can never be
# executed as code the way it could with eval().
length_list = np.array([
    len(ast.literal_eval(prev_items.replace(' ', ',')))
    for prev_items in tqdm(valid_sessions['prev_items'], total=valid_sessions.shape[0])
])

100%|██████████| 261816/261816 [00:06<00:00, 42067.54it/s]


In [12]:
# Look up each candidate row's session length by using its sess_id as a
# positional index into length_list (which is ordered like valid_sessions).
merged_candidates_feature['sess_len'] = length_list[
    merged_candidates_feature['sess_id'].to_numpy()
]

In [13]:
# Display the newly added column as a quick sanity check.
merged_candidates_feature['sess_len']

0           14
1           14
2           14
3           14
4           14
            ..
78842194     3
78842195     3
78842196     3
78842197     3
78842198     3
Name: sess_len, Length: 78842199, dtype: int64

In [15]:
# Downcast the new column before persisting it.
cast_dtype(merged_candidates_feature, ['sess_len'])

# The output path is the same file the table was loaded from. Write to a
# sibling temporary file first and swap it in with os.replace, so that an
# interrupted write cannot leave the only copy of the feature table truncated.
tmp_feature_path = merged_candidates_feature_path + '.tmp'
merged_candidates_feature.to_parquet(tmp_feature_path, engine='pyarrow')
os.replace(tmp_feature_path, merged_candidates_feature_path)

In [16]:
# Display the feature table to confirm that the sess_len column is present.
merged_candidates_feature

Unnamed: 0,sess_id,sess_locale,product,target,sess_avg_price,product_price,sasrec_scores_3,normalized_sasrec_scores_3,sasrec_scores_2,normalized_sasrec_scores_2,...,lyx_gru4rec_u2i_score,normalized_lyx_u2i_mbart_mean_score,normalized_lyx_lknn_i2i_score,normalized_lyx_lknn_u2i_score,normalized_lyx_gru4rec_i2i_score,normalized_lyx_gru4rec_u2i_score,normalized_lyx_i2i_base_l1_score,normalized_lyx_i2i_base_l2_score,normalized_lyx_i2i_base_l3_score,sess_len
0,0,DE,355165591X,0.0,43.256542,8.990000,2.230508,7.658405e-09,0.512931,1.377575e-09,...,4.174113,1.271579e-07,1.452903e-09,2.570659e-07,3.081760e-07,7.944862e-07,0.000020,0.000008,1.307907e-05,14
1,0,DE,3833237058,0.0,43.256542,22.000000,9.605231,1.221631e-05,9.325538,9.255110e-06,...,10.260387,1.072909e-05,5.865666e-06,1.772278e-04,7.366180e-04,3.493991e-04,0.000472,0.000457,6.697727e-04,14
2,0,DE,B00CIXSI6U,0.0,43.256542,6.470000,0.714114,1.681035e-09,-0.115904,7.345399e-10,...,1.541509,2.201507e-07,1.060042e-10,6.944130e-08,6.303538e-07,5.711642e-08,0.000013,0.000013,2.086422e-05,14
3,0,DE,B00NVDOWUW,0.0,43.256542,11.990000,8.750996,5.199363e-06,8.507557,4.084482e-06,...,9.524183,7.748233e-05,1.658694e-05,2.719947e-04,5.676211e-05,1.673371e-04,0.000433,0.000464,4.553778e-04,14
4,0,DE,B00NVDP3ZU,0.0,43.256542,22.990000,8.056712,2.596729e-06,5.898870,3.007453e-07,...,7.059176,1.677892e-04,1.670950e-05,7.079612e-05,8.111991e-06,1.422504e-05,0.000213,0.000241,2.567911e-04,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78842194,261815,UK,B0BCX524Y6,0.0,9.383333,16.990000,6.813615,1.076201e-03,7.203015,4.597607e-04,...,9.596521,5.671981e-04,1.311533e-03,1.496002e-03,6.490497e-04,6.915201e-04,0.000063,0.000063,3.704567e-05,3
78842195,261815,UK,B0BCX6QB4L,0.0,9.383333,10.990000,9.030836,9.881445e-03,10.123234,8.526421e-03,...,11.790966,6.904074e-04,9.204712e-03,1.781063e-02,9.381528e-03,6.206401e-03,0.000066,0.000066,3.587974e-05,3
78842196,261815,UK,B0BFPJYXQL,0.0,9.383333,10.560000,0.796892,2.623396e-06,1.711608,1.895152e-06,...,7.981951,2.760592e-05,2.869346e-06,1.963629e-05,9.005370e-06,1.375960e-04,0.000226,0.000226,1.359693e-04,3
78842197,261815,UK,B0BH3X67S3,0.0,9.383333,6.830000,4.250781,8.296004e-05,6.447586,2.159998e-04,...,7.494740,1.596323e-05,6.421880e-05,9.672265e-05,9.891591e-05,8.453039e-05,0.000040,0.000040,2.020255e-05,3
