In [None]:
!pip install polars

In [1]:
import polars as pl
pl.Config.set_tbl_rows(30)
import numpy as np
import gc

In [2]:
def join_candidate_and_feature(phase):
    """
    Join the candidate DataFrames (created in otto_candidate.ipynb) with the
    feature DataFrames (created in otto_feature.ipynb) and write the result.

    For each event type in ('click', 'cart', 'order') this reads
    ``candidate_{t}_{phase}.parquet``, left-joins the user, item and user-item
    feature tables, fills nulls in the count-like columns with 0, downcasts a
    fixed set of columns, and writes ``{phase}_{t}.parquet``.

    Parameters
    ----------
    phase : str
        Used to build the input and output file names. When it equals
        'train', a ``gt`` column is additionally joined from the
        local-validation label file: 1 where (session, aid) appears in the
        labels for the current event type, 0 otherwise.

    Returns
    -------
    None
        Results are written to parquet files as a side effect.
    """

    user_feature = pl.read_parquet(f'user_feature_{phase}.parquet')
    item_feature = pl.read_parquet(f'item_feature_{phase}.parquet')
    user_item_feature = pl.read_parquet(f'user_item_feature_{phase}.parquet')

    # The label table does not depend on the event type, so load and reshape
    # it once instead of once per loop iteration.
    # `with_columns` is used throughout: `with_column` was deprecated and later
    # removed from polars, while `with_columns` works on old and new releases.
    train_label = None
    if phase == 'train':
        train_label = (
            pl.read_parquet('input/otto_train_and_test_data_for_local_validation/test_labels.parquet')
            .explode('ground_truth')
            .rename({'ground_truth': 'aid'})
            .with_columns([pl.lit(1).alias('gt')])
            # Cast the join keys to Int32 before joining with the candidates.
            .with_columns([pl.col(['session', 'aid']).cast(pl.Int32)])
        )

    for t in ['click', 'cart', 'order']:

        df = pl.read_parquet(f'candidate_{t}_{phase}.parquet')

        if train_label is not None:
            # Label rows use the plural type name ('clicks', 'carts', 'orders').
            type_label = (
                train_label
                .filter(pl.col('type') == f'{t}s')
                .select(['session', 'aid', 'gt'])
            )
            df = df.join(type_label, on=['session', 'aid'], how='left')
            # Candidates missing from the labels are negatives.
            df = df.with_columns([pl.col('gt').fill_null(0)])

        df = (
            df.join(user_feature, on='session', how='left')
            .join(item_feature, on='aid', how='left')
            .join(user_item_feature, on=['session', 'aid'], how='left')
        )

        # Columns whose nulls (no match in a left join) are replaced by 0.
        fill_cols = ['co-visit', 'num_recommended_by_w2vec', 'num_recommended_by_bpr', 'user_click',
                     'user_cart', 'user_order', 'entire_click_count', 'entire_cart_count', 'entire_order_count',
                     'test_click_count', 'test_cart_count', 'test_order_count', 'num_clicked', 'num_carted',
                     'num_ordered', 'wgt', 'entire_action_count', 'test_action_count']
        df = df.with_columns([pl.col(fill_cols).fill_null(0)])

        # Downcast to narrower integer types before writing.
        col_int32 = ['session', 'aid', 'entire_click_count']
        col_uint16 = ['user_click', 'entire_cart_count', 'entire_order_count', 'test_click_count',
                      'test_cart_count', 'test_order_count', 'action_num']
        col_uint8 = [
            'user_cart', 'user_order',
            'num_clicked', 'num_carted', 'num_ordered'
        ]

        for cast_cols, d_type in zip([col_int32, col_uint16, col_uint8], [pl.Int32, pl.UInt16, pl.UInt8]):
            df = df.with_columns([pl.col(cast_cols).cast(d_type)])

        df.write_parquet(f'{phase}_{t}.parquet')

        # Release the joined frame before the next event type is processed.
        del df
        gc.collect()

In [None]:
# Writes train_click.parquet, train_cart.parquet and train_order.parquet;
# for the 'train' phase each file also carries the `gt` label column.
join_candidate_and_feature('train')

In [None]:
# Writes test_click.parquet, test_cart.parquet and test_order.parquet;
# no label column is joined for this phase.
join_candidate_and_feature('test')