In [None]:
!pip install polars

In [None]:
import gc

import matplotlib.pyplot as plt
import numpy as np
import polars as pl

pl.Config.set_tbl_rows(30)
%matplotlib inline

In [None]:
# Read the train / test event logs of the local-validation split.
# The generator is consumed left to right, so train is read before test.
df_train, df_test = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        'input/otto_train_and_test_data_for_local_validation/train.parquet',
        'input/otto_train_and_test_data_for_local_validation/test.parquet',
    )
)

In [None]:
def add_log_recency_score(df, exp_min):
    """
    Append a ``log_recency_score`` column to ``df``.

    Within each ``session`` the rows are numbered 0..n-1 in row order and that
    position is mapped linearly onto an exponent that runs from ``exp_min``
    (first row of the session) to 1 (last row). The score is
    ``2**exponent - 1``, so the last row of a multi-row session scores 1.
    Sessions with a single row also score 1 (see the NaN handling below).

    Parameters
    ----------
    df : polars.DataFrame
        Event frame; must contain a ``session`` column.
    exp_min : float
        Exponent assigned to the first row of each session.

    Returns
    -------
    polars.DataFrame
        ``df`` with one extra column ``log_recency_score``; the two helper
        columns used for the computation are dropped again.
    """
    df = df.with_columns([
        # n-1 .. 0 within each session (0 = last row of the session).
        pl.col('session').cumcount().reverse().over('session').alias('action_num_reverse_chrono'),
        # Number of rows in the session, repeated on every row.
        pl.col('session').count().over('session').alias('session_length'),
    ])

    session_length = df['session_length']
    # 0 .. n-1 within each session (0 = first row of the session).
    position = session_length - df['action_num_reverse_chrono'] - 1
    # For one-row sessions the divisor `session_length - 1` is 0, so the
    # interpolation is not a finite number there; the NaN score is replaced below.
    linear_interpolation = exp_min + ((1 - exp_min) / (session_length - 1)) * position
    score = pl.Series(2**linear_interpolation - 1).alias('log_recency_score').cast(pl.Float32)

    return (
        df.with_columns([score])
          # Replace NaN only in the new column. A frame-wide fill_nan(1) would
          # also overwrite NaNs in unrelated float columns of the caller's frame.
          .with_columns([pl.col('log_recency_score').fill_nan(1)])
          .drop(['action_num_reverse_chrono', 'session_length'])
    )

In [None]:
def compute_action_count(df, period):
    """
    Count events per item (``aid``), split by event ``type``.

    Parameters
    ----------
    df : polars.DataFrame
        Event frame with ``aid``, ``type`` and (for ``period == 'test'``) ``ts``.
    period : str
        Label used to build the output column names. The value ``'test'``
        additionally enables the time-weighted counts.

    Returns
    -------
    polars.DataFrame
        One row per ``aid`` with
        ``{period}_click_count``, ``{period}_cart_count``, ``{period}_order_count``
        (pivot columns '0', '1', '2'; missing combinations filled with 0) and
        ``{period}_action_count`` (all events of the item).
        For ``period == 'test'`` also ``time_weighted_click_count_test``,
        ``time_weighted_cart_count_test`` and ``time_weighted_order_count_test``:
        sums of a per-event weight that grows linearly from 1 at the earliest
        ``ts`` in ``df`` to 4 at the latest. These three are not null-filled.

    Notes
    -----
    The renames assume the pivot produced all of the columns '0', '1' and '2',
    i.e. that every event type occurs in ``df``.
    """
    count_columns = [f'{period}_click_count', f'{period}_cart_count', f'{period}_order_count']

    action_count = (
        df.groupby(['aid', 'type']).count()
          .pivot(values='count', index='aid', columns='type')
          .rename({'0': count_columns[0], '1': count_columns[1], '2': count_columns[2]})
    )
    # An item that never received some event type gets 0 rather than null.
    action_count = action_count.with_column(pl.col(count_columns).fill_null(0))
    action_count = (
        action_count
        .join(df.groupby('aid').count().rename({'count': f'{period}_action_count'}), on='aid', how='left')
    )

    if period == 'test':
        # min/max read the range directly; no need to sort the whole frame twice.
        ts_start = df['ts'].min()
        ts_end = df['ts'].max()

        if ts_end != ts_start:
            time_wgt = 1 + 3 * (pl.col('ts') - ts_start) / (ts_end - ts_start)
        else:
            # All events share one timestamp: the linear ramp would be 0/0,
            # so use the base weight for every event instead.
            time_wgt = pl.lit(1.0)
        df = df.with_column(time_wgt.cast(pl.Float32).alias('time_wgt'))

        time_weighted_count = (
            df.groupby(['aid', 'type']).agg(pl.col('time_wgt').sum())
              .pivot(values='time_wgt', index='aid', columns='type')
              .rename({'0': f'time_weighted_click_count_{period}',
                       '1': f'time_weighted_cart_count_{period}',
                       '2': f'time_weighted_order_count_{period}'})
        )

        action_count = action_count.join(time_weighted_count, on='aid', how='left')

    return action_count

In [None]:
def get_item_feature(df_test, df_train):
    """
    Compute item (aid) level features.

    The train and test events are concatenated (train first) and used for the
    'entire' action counts and for the user statistics; the 'test' action
    counts are computed from ``df_test`` alone.

    Returns one row per ``aid`` present in the concatenated frame, with the
    columns of ``compute_action_count(..., 'entire')``, then those of
    ``compute_action_count(..., 'test')``, then:
    - ``user_variety``: distinct sessions / number of events of the item (Float32).
    - ``num_unique_user``: distinct sessions that touched the item.
    Columns coming from the left joins are null for items without a match.
    """
    events_all = pl.concat([df_train, df_test])

    features = compute_action_count(events_all, 'entire')
    test_counts = compute_action_count(df_test, 'test')

    # Both per-item session statistics come out of a single aggregation pass.
    session_stats = events_all.groupby('aid', maintain_order=True).agg([
        (pl.col('session').n_unique() / pl.col('session').count())
        .alias('user_variety')
        .cast(pl.Float32),
        pl.col('session').n_unique().alias('num_unique_user'),
    ])

    # Left joins keep exactly the items of the 'entire' counts.
    for extra in (test_counts, session_stats):
        features = features.join(extra, on='aid', how='left')
    return features

In [None]:
def get_user_feature(phase, df):
    """
    Compute user (session) level features.

    ``phase`` is accepted but not read by the body.

    Returns one row per ``session`` (in order of first appearance in ``df``) with:
    - ``item_variety``: distinct aids / number of events (Float32).
    - ``ts_session_length``: last ``ts`` minus first ``ts`` of the session rows.
    - ``session_length``: number of events.
    - ``session_length_unique``: number of distinct aids.
    - ``mean_ts_real_session_length`` / ``mean_real_session_length``: mean
      duration and mean event count of the session's "real sessions", where a
      new real session starts whenever the gap to the previous event exceeds 7200.
    - ``user_action_interval``: mean gap between consecutive events (Float32).
    - ``user_click`` / ``user_cart`` / ``user_order``: event counts per type
      (pivot columns '0', '1', '2').

    NOTE(review): the first/last and diff based features presumably rely on the
    rows of each session being in chronological order, and 7200 presumably
    means two hours with ``ts`` in seconds - confirm against the data.
    """
    def by_session(frame):
        # Group per session, preserving first-appearance order.
        return frame.groupby('session', maintain_order=True)

    variety = by_session(df).agg(
        (pl.col('aid').n_unique() / pl.col('aid').count()).alias('item_variety').cast(pl.Float32)
    )
    ts_span = by_session(df).agg([
        (pl.col('ts').last() - pl.col('ts').first()).alias('ts_session_length')
    ])
    n_events = by_session(df).count().rename({'count': 'session_length'})
    n_distinct_items = (
        by_session(df.unique(subset=['session', 'aid'])).count()
        .rename({'count': 'session_length_unique'})
    )

    # Flag rows that open a new "real session" (gap to the previous row > 7200),
    # then number the real sessions inside each session with a running sum.
    gap_flagged = (
        df.with_column(
            (pl.col('ts').diff().over('session').fill_null(0) > 7200).alias('real_session_boundary')
        )
        .with_column(
            pl.col('real_session_boundary').cumsum().over('session').alias('real_session')
        )
    )
    per_real_session = (
        gap_flagged.groupby(['session', 'real_session'], maintain_order=True).agg([
            (pl.col('ts').last() - pl.col('ts').first()).alias('ts_real_session_length'),
            pl.col('aid').count().alias('real_session_length'),
        ])
    )
    real_session_means = by_session(per_real_session).agg([
        pl.col('ts_real_session_length').mean().alias('mean_ts_real_session_length').cast(pl.Float32),
        pl.col('real_session_length').mean().alias('mean_real_session_length').cast(pl.Float32),
    ])

    mean_gap = by_session(df).agg(
        pl.col('ts').diff().mean().alias('user_action_interval').cast(pl.Float32)
    )

    type_counts = (
        df.groupby(['session', 'type'], maintain_order=True).count()
        .pivot(values='count', index='session', columns='type')
        .rename({'0': 'user_click', '1': 'user_cart', '2': 'user_order'})
    )

    # Attach every block to the item_variety frame; the join order fixes the column order.
    user_feature = variety
    for block in (ts_span, n_events, n_distinct_items, real_session_means, mean_gap, type_counts):
        user_feature = user_feature.join(block, on='session', how='left')

    return user_feature

In [None]:
def get_user_item_feature(df):
    """
    Compute features describing the interaction between a user (session) and an item (aid).

    Returns one row per distinct (session, aid) pair with:
    - ``num_clicked`` / ``num_carted`` / ``num_ordered``: event counts per type
      (pivot columns '0', '1', '2').
    - ``click_interval``: mean gap between the sorted ``ts`` of the pair's type-0 events (Float32).
    - ``action_interval``: the same over all events of the pair (Float32).
    - ``viewing_time``: for every event of the pair, the gap to the next event
      of the same session, summed over the pair.
    - ``log_recency_score_click`` / ``_cart`` / ``_order``: per-type sum of the
      score added by ``add_log_recency_score(df, 0.1)``.
    - ``action_num``: running index of the pair within its session, following
      the row order of the de-duplicated (keep='last') frame.

    NOTE(review): ``viewing_time`` and ``add_log_recency_score`` presumably
    expect each session's rows in chronological order - confirm against the data.
    """
    num_action = (
        df.groupby(['session', 'aid', 'type'], maintain_order=True).count()
        .pivot(values='count', index=['session', 'aid'], columns='type')
        .rename({'0': 'num_clicked', '1': 'num_carted', '2': 'num_ordered'})
    )
    click_interval = (
        df.filter(pl.col('type') == 0).groupby(['session', 'aid'], maintain_order=True)
        .agg(pl.col('ts').sort().diff().mean().cast(pl.Float32).alias('click_interval'))
    )
    action_interval = (
        df.groupby(['session', 'aid'], maintain_order=True)
        .agg([
            pl.col('ts').sort().diff().mean().cast(pl.Float32).alias('action_interval')
        ])
    )

    # The diff is windowed per session only, so no "real session" columns are
    # needed here; they used to be computed but were never read.
    # diff() and shift(-1) are both evaluated inside the window, so each row
    # gets the gap to the next row of its own session (null for the session's
    # last row) even if the rows of a session are not stored contiguously.
    viewing_time = (
        df.with_column(
            pl.col('ts').diff().shift(-1).over('session').alias('ts_diff')
        ).groupby(['session', 'aid'], maintain_order=True)
         .agg(pl.col('ts_diff').sum().alias('viewing_time'))
    )

    scored = add_log_recency_score(df, 0.1)

    log_recency_score = (
        scored.groupby(['session', 'aid', 'type']).agg(pl.col('log_recency_score').sum())
              .pivot(values='log_recency_score', index=['session', 'aid'], columns='type')
              .rename({'0': 'log_recency_score_click', '1': 'log_recency_score_cart', '2': 'log_recency_score_order'})
    )

    # One row per (session, aid); every feature block is left-joined onto this frame.
    pairs = scored[['session', 'aid']].unique(subset=['session', 'aid'], keep='last')
    user_item_feature = (
        pairs.join(num_action, on=['session', 'aid'], how='left')
        .join(click_interval, on=['session', 'aid'], how='left')
        .join(action_interval, on=['session', 'aid'], how='left')
        .join(viewing_time, on=['session', 'aid'], how='left')
        .join(log_recency_score, on=['session', 'aid'], how='left')
    )

    return user_item_feature.with_column(pl.col('session').cumcount().over('session').alias('action_num'))

In [None]:
# Feature tables for the "train" phase, built from the frames currently bound
# to df_train / df_test. User and user-item features use df_test only.
item_feature_train = get_item_feature(df_test=df_test, df_train=df_train)
user_feature_train = get_user_feature(phase='train', df=df_test)
user_item_feature_train = get_user_item_feature(df=df_test)

In [None]:
# Persist the train-phase feature tables into the working directory.
item_feature_train.write_parquet(file='item_feature_train.parquet')
user_feature_train.write_parquet(file='user_feature_train.parquet')
user_item_feature_train.write_parquet(file='user_item_feature_train.parquet')

In [None]:
# Drop the frames loaded so far and collect garbage before the next pair of
# frames is read into the same names. `gc` is imported in the import cell at
# the top of the notebook.
del df_train, df_test
gc.collect()

In [None]:
# Read the full train / test event logs.
# The generator is consumed left to right, so train is read before test.
df_train, df_test = (
    pl.read_parquet(parquet_path)
    for parquet_path in (
        'input/Otto_Full_Optimized_Memory_Footprint/train.parquet',
        'input/Otto_Full_Optimized_Memory_Footprint/test.parquet',
    )
)

In [None]:
# 1659909600 is 2022-08-07 22:00:00 UTC as a Unix timestamp in seconds
# (assumes `ts` is stored in epoch seconds - TODO confirm).
ts_2022_0807_22 = 1659909600

# Feature tables for the "test" phase. For the item features only train events
# at or after the cutoff are combined with the test events.
item_feature_test = get_item_feature(
    df_test=df_test,
    df_train=df_train.filter(pl.col('ts') >= ts_2022_0807_22),
)
user_feature_test = get_user_feature(phase='test', df=df_test)
user_item_feature_test = get_user_item_feature(df=df_test)

In [None]:
# Persist the test-phase feature tables into the working directory.
item_feature_test.write_parquet(file='item_feature_test.parquet')
user_feature_test.write_parquet(file='user_feature_test.parquet')
user_item_feature_test.write_parquet(file='user_item_feature_test.parquet')