In [1]:
import cudf
from tqdm import tqdm
from pathlib import Path

In [2]:
# Inputs are read from ../preprocess/data (relative to the working directory);
# everything this notebook writes goes under ./data, created here if missing.
data_path = Path.cwd().parent.joinpath('preprocess/data')
temp_path = Path.cwd().joinpath('data')
temp_path.mkdir(exist_ok=True)

In [3]:
# Number of passes over the training files; each pass handles one aid range.
PARTS = 2
# Width of the aid range handled per pass (a float).
# NOTE(review): 1.86e6 is presumably an upper bound on the aid values - confirm.
SIZE = 1_860_000 / PARTS
# Number of top-weighted candidates kept for every aid.
N_CANDIDS = 20
# Weight assigned to each event type when it is mapped to the `wgt` column.
wgt_map = dict(clicks=1, carts=6, orders=3)

## LB

In [4]:
# Input and output directories of the LB run; the output one is created if missing.
lb_in = data_path.joinpath('lb')
lb_out = temp_path.joinpath('lb')
lb_out.mkdir(exist_ok=True)

In [5]:
%%time

# Build the LB co-visitation table: for every aid, the N_CANDIDS other aids with
# the largest accumulated pair weight.  The aid space is processed in PARTS
# consecutive ranges of width SIZE, one full pass over the train files per range.
train_path = lb_in / 'train_parquet'

# The file list is identical for every part, so glob and sort it only once.
train_files = sorted(train_path.glob('*.parquet'))
if not train_files:
    # Without this guard `matrix` would stay None and the cell would only fail
    # later with an opaque AttributeError on `matrix.reset_index()`.
    raise FileNotFoundError(f'No parquet files found in {train_path}')

covisit_matrix = []

for part in range(PARTS):
    print(f'- Part {part + 1}/{PARTS}')
    # Half-open aid range [aid_lo, aid_hi) handled in this pass.
    aid_lo, aid_hi = part * SIZE, (part + 1) * SIZE
    matrix = None
    for train_file in tqdm(train_files):
        data = cudf.read_parquet(train_file.as_posix())

        # Turn the event type into its weight from wgt_map.
        data['type'] = data['type'].map(wgt_map)
        data = data.rename(columns={'type': 'wgt'})

        # Compact dtypes.
        # NOTE(review): the division assumes `ts` is in milliseconds, so the
        # thresholds below are in seconds - confirm against the preprocess step.
        data['wgt'] = data['wgt'].astype('float32')
        data['ts'] = (data['ts'] / 1000).astype('int32')
        data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

        # Keep only the 30 events with the largest ts in every session.
        data = data.sort_values(
            ['session', 'ts'], ascending=[True, False], ignore_index=True
        )
        data['n'] = data.groupby('session').cumcount()
        data = data.loc[data.n < 30].drop('n', axis=1)

        # Self-join on session to enumerate (aid_x, aid_y) event pairs; `wgt`
        # comes from the right-hand (aid_y) event.  Restricting the left side to
        # this part's aid range *before* the merge produces the same pairs as
        # filtering `aid_x` afterwards, but keeps the joined frame smaller.
        anchors = data.drop('wgt', axis=1)
        anchors = anchors.loc[(anchors['aid'] >= aid_lo) & (anchors['aid'] < aid_hi)]
        data = anchors.merge(data, on='session', how='left')
        del anchors

        # Keep pairs of distinct aids whose events are less than 60 * 60 apart.
        gap = (data['ts_x'] - data['ts_y']).abs()
        data = data.loc[(gap < 60 * 60) & (data['aid_x'] != data['aid_y'])]

        # Count every (aid_x, aid_y) pair at most once per session.
        # NOTE(review): cuDF does not guarantee the row order of a merge result
        # by default, so which duplicate `keep='first'` retains (and hence its
        # wgt / time gap) may not be stable between runs - confirm this is fine.
        data = data.drop_duplicates(
            subset=['session', 'aid_x', 'aid_y'], keep='first'
        ).reset_index(drop=True)

        # Time decay: the weight is halved for every 60 * 60 of gap between the
        # two events.
        data['wgt'] *= (1 / 2) ** ((data.ts_x - data.ts_y).abs() / 60 / 60)
        data['wgt'] = data['wgt'].astype('float32')

        # Sum the weights of this file per (aid_x, aid_y) pair.
        data = data.drop(['session', 'ts_x', 'ts_y'], axis=1)
        data = data.groupby(['aid_x', 'aid_y']).sum()

        # Accumulate over files; pairs missing on either side count as 0.
        if matrix is None:
            matrix = data
        else:
            matrix = matrix.add(data, fill_value=0)  # this is the bottleneck operation

        del data

    # Rank the candidates of every aid by descending weight and keep the best
    # N_CANDIDS of them.
    matrix = matrix.reset_index().rename(columns={'aid_x': 'aid', 'aid_y': 'candidate'})
    matrix = matrix.sort_values(
        ['aid', 'wgt'], ascending=[True, False], ignore_index=True
    )

    matrix['rank'] = matrix.groupby('aid').candidate.cumcount()
    matrix = matrix[matrix['rank'] < N_CANDIDS].reset_index(drop=True)
    covisit_matrix.append(matrix)

# One frame holding all parts.
covisit_matrix = cudf.concat(covisit_matrix, ignore_index=True)

- Part 1/2


100%|██████████| 129/129 [04:37<00:00,  2.15s/it]


- Part 2/2


100%|██████████| 129/129 [04:37<00:00,  2.15s/it]


CPU times: user 3min 45s, sys: 4min 43s, total: 8min 29s
Wall time: 9min 15s


In [6]:
# Write the LB co-visitation table to disk, then drop the name so the frame is
# no longer referenced before the next section reuses it.
covisit_matrix.to_parquet(lb_out.joinpath('covisit.parquet').as_posix())
del covisit_matrix

## CV

In [7]:
# Input and output directories of the CV run; the output one is created if missing.
cv_in = data_path.joinpath('cv')
cv_out = temp_path.joinpath('cv')
cv_out.mkdir(exist_ok=True)

In [8]:
%%time

# Build the CV co-visitation table: for every aid, the N_CANDIDS other aids with
# the largest accumulated pair weight.  The aid space is processed in PARTS
# consecutive ranges of width SIZE, one full pass over the train files per range.
train_path = cv_in / 'train_parquet'

# The file list is identical for every part, so glob and sort it only once.
train_files = sorted(train_path.glob('*.parquet'))
if not train_files:
    # Without this guard `matrix` would stay None and the cell would only fail
    # later with an opaque AttributeError on `matrix.reset_index()`.
    raise FileNotFoundError(f'No parquet files found in {train_path}')

covisit_matrix = []

for part in range(PARTS):
    print(f'- Part {part + 1}/{PARTS}')
    # Half-open aid range [aid_lo, aid_hi) handled in this pass.
    aid_lo, aid_hi = part * SIZE, (part + 1) * SIZE
    matrix = None
    for train_file in tqdm(train_files):
        data = cudf.read_parquet(train_file.as_posix())

        # Turn the event type into its weight from wgt_map.
        data['type'] = data['type'].map(wgt_map)
        data = data.rename(columns={'type': 'wgt'})

        # Compact dtypes.
        # NOTE(review): the division assumes `ts` is in milliseconds, so the
        # thresholds below are in seconds - confirm against the preprocess step.
        data['wgt'] = data['wgt'].astype('float32')
        data['ts'] = (data['ts'] / 1000).astype('int32')
        data[['session', 'aid']] = data[['session', 'aid']].astype('int32')

        # Keep only the 30 events with the largest ts in every session.
        data = data.sort_values(
            ['session', 'ts'], ascending=[True, False], ignore_index=True
        )
        data['n'] = data.groupby('session').cumcount()
        data = data.loc[data.n < 30].drop('n', axis=1)

        # Self-join on session to enumerate (aid_x, aid_y) event pairs; `wgt`
        # comes from the right-hand (aid_y) event.  Restricting the left side to
        # this part's aid range *before* the merge produces the same pairs as
        # filtering `aid_x` afterwards, but keeps the joined frame smaller.
        anchors = data.drop('wgt', axis=1)
        anchors = anchors.loc[(anchors['aid'] >= aid_lo) & (anchors['aid'] < aid_hi)]
        data = anchors.merge(data, on='session', how='left')
        del anchors

        # Keep pairs of distinct aids whose events are less than 60 * 60 apart.
        gap = (data['ts_x'] - data['ts_y']).abs()
        data = data.loc[(gap < 60 * 60) & (data['aid_x'] != data['aid_y'])]

        # Count every (aid_x, aid_y) pair at most once per session.
        # NOTE(review): cuDF does not guarantee the row order of a merge result
        # by default, so which duplicate `keep='first'` retains (and hence its
        # wgt / time gap) may not be stable between runs - confirm this is fine.
        data = data.drop_duplicates(
            subset=['session', 'aid_x', 'aid_y'], keep='first'
        ).reset_index(drop=True)

        # Time decay: the weight is halved for every 60 * 60 of gap between the
        # two events.
        data['wgt'] *= (1 / 2) ** ((data.ts_x - data.ts_y).abs() / 60 / 60)
        data['wgt'] = data['wgt'].astype('float32')

        # Sum the weights of this file per (aid_x, aid_y) pair.
        data = data.drop(['session', 'ts_x', 'ts_y'], axis=1)
        data = data.groupby(['aid_x', 'aid_y']).sum()

        # Accumulate over files; pairs missing on either side count as 0.
        if matrix is None:
            matrix = data
        else:
            matrix = matrix.add(data, fill_value=0)  # this is the bottleneck operation

        del data

    # Rank the candidates of every aid by descending weight and keep the best
    # N_CANDIDS of them.
    matrix = matrix.reset_index().rename(columns={'aid_x': 'aid', 'aid_y': 'candidate'})
    matrix = matrix.sort_values(
        ['aid', 'wgt'], ascending=[True, False], ignore_index=True
    )

    matrix['rank'] = matrix.groupby('aid').candidate.cumcount()
    matrix = matrix[matrix['rank'] < N_CANDIDS].reset_index(drop=True)
    covisit_matrix.append(matrix)

# One frame holding all parts.
covisit_matrix = cudf.concat(covisit_matrix, ignore_index=True)

- Part 1/2


100%|██████████| 106/106 [03:18<00:00,  1.87s/it]


- Part 2/2


100%|██████████| 106/106 [03:18<00:00,  1.87s/it]


CPU times: user 2min 42s, sys: 3min 41s, total: 6min 23s
Wall time: 6min 37s


In [9]:
# Write the CV co-visitation table to disk, then drop the name so the frame is
# no longer referenced.
covisit_matrix.to_parquet(cv_out.joinpath('covisit.parquet').as_posix())
del covisit_matrix