In [None]:
cd ../src

In [None]:
# Enable the autoreload extension; mode 2 re-imports modules before each cell
# is executed, so edits to the local source files are picked up automatically.
%load_ext autoreload
%autoreload 2

In [3]:
import os

# Select the GPU *before* any CUDA-backed library (cudf, numba) is imported:
# CUDA_VISIBLE_DEVICES is read when CUDA is initialised in the process, which
# importing cudf may already trigger. Setting it afterwards can be ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import gc
import sys
import cudf
import json
import glob
import numba
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

# Silence FutureWarning messages for the whole notebook.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Start the pandarallel worker pool (32 workers, no progress bars).
pandarallel.initialize(nb_workers=32, progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
from params import *

from data.covisitation import compute_covisitation_matrix
from data.candidates import load_parquets, create_candidates, explode

from utils.metrics import get_coverage
from utils.chris import suggest_clicks, suggest_buys, read_file_to_cache

### Load candidates

In [5]:
# MODE selects which candidate / session files the next cell loads ("val" or train).
MODE = "val"
# SUFFIX is appended to the name of the feature file written at the end of the notebook.
SUFFIX = "v1"

In [6]:
# Pick the candidate file and the session parquet files that match MODE.
if MODE == "val":
    CANDIDATE_FILE = '../output/candidates_val_593.parquet'
#     PARQUET_FILES = "../output/val_parquet/*"
    PARQUET_FILES = "../input/chris/test_parquet/*"
elif MODE == "train":
    CANDIDATE_FILE = '../output/candidates_train_593.parquet'
    PARQUET_FILES = "../output/train_parquet/*"
else:
    # A mistyped MODE used to fall through silently to the train files and
    # produce a feature file labelled with the wrong mode.
    raise ValueError(f'Unknown MODE "{MODE}", expected "val" or "train"')

# One row per (session, candidates) pair, kept in a fixed order so that feature
# arrays computed later can be attached positionally.
pairs = cudf.read_parquet(CANDIDATE_FILE)
pairs = pairs.sort_values(['session', 'candidates'])

### Covisitation features
TODO:
- time-weighted aggregation, aggregation over the last n
- merge rank in matrix

In [7]:
def compute_coocurence_features(pairs, matrix_file=""):
    """
    Aggregates covisitation weights between each candidate and the items of its session.

    For every (session, candidates) pair, the weight `w` stored in the matrix for
    (aid, candidates) is looked up for each `aid` of the session's list. Lookups
    that are absent from the matrix count as 0. The weights are then aggregated
    with mean, sum and max. Sessions are processed in buckets defined by
    `session // 100000` to bound the size of the exploded frame.

    Args:
        pairs (DataFrame): Frame with columns `session`, `candidates` and a list
            column `aid`. It is not modified.
        matrix_file (str): Parquet file holding exactly three columns, which are
            read as (aid, candidates, w).

    Returns:
        cudf DataFrame: Columns `session`, `candidates`, `mean`, `sum`, `max`,
            one row per distinct (session, candidates), sorted by these two keys.
    """
    # Work on a column subset so that the caller's frame does not gain a `group` column.
    pairs = pairs[['session', 'candidates', 'aid']]
    pairs['group'] = pairs['session'] // 100000

    mat = cudf.read_parquet(matrix_file)
    mat.columns = ['aid', 'candidates', 'w']

    fts = []
    for _, df in pairs.groupby('group'):
        # One row per (session, candidate, item of the session).
        df = df[['session', 'candidates', 'aid']].explode('aid').reset_index(drop=True)

        # Pairs missing from the matrix get a weight of 0.
        df = df.merge(mat, how="left", on=["aid", "candidates"]).fillna(0)
        df = df[['candidates', 'session', 'w']].groupby(['session', 'candidates']).agg(["mean", "sum", "max"])
        # Flatten the ('w', <stat>) column index down to the statistic names.
        df.columns = df.columns.get_level_values(1)

        df['mean'] = df['mean'].astype("float32")
        fts.append(df.reset_index())

    fts = cudf.concat(fts, ignore_index=True)
    fts = fts.sort_values(['session', 'candidates']).reset_index(drop=True)

    return fts

In [8]:
# Covisitation matrices are read from MATRIX_FOLDER/<name>.pqt; each name is
# also used as the prefix of the feature columns derived from that matrix.
MATRIX_FOLDER = "../output/matrices/"
MATRIX_NAMES = ["matrix_123_temporal_20", "matrix_123_type_20", "matrix_12__15"]

In [9]:
sessions = load_parquets(PARQUET_FILES)
sessions = cudf.from_pandas(sessions)

# Only the per-session list of aids is used below, so aggregate that column
# alone instead of also building lists for the other event columns.
sessions = sessions[['session', 'aid']].sort_values(['session', "aid"]).groupby('session').agg(list).reset_index()
pairs = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
# Features are attached positionally, so fix the row order and the index first.
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

for name in MATRIX_NAMES:
    print(f' -> Features from {name}')

    fts = compute_coocurence_features(
        pairs[['session', 'candidates', 'aid']],
        os.path.join(MATRIX_FOLDER, name + ".pqt")
    )

    # fts holds one row per distinct (session, candidates), sorted like pairs.
    # A different length means the rows cannot be matched by position.
    if len(fts) != len(pairs):
        raise ValueError(
            f"{name}: got {len(fts)} feature rows for {len(pairs)} candidate pairs"
        )

    for stat in ("mean", "sum", "max"):
        pairs[f'{name}_{stat}'] = fts[stat].values

    # Release the GPU memory held by the temporary feature frame.
    del fts
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

# The per-session aid lists were only needed to compute the features.
pairs = pairs.drop('aid', axis=1)

del sessions
numba.cuda.current_context().deallocations.clear()
gc.collect()

 -> Features from matrix_123_temporal_20
 -> Features from matrix_123_type_20
 -> Features from matrix_12__15


0

### Popularity
TODO:
- Popularity of items in session
- Popularity over different periods (day / month)
- Time-weighted popularity

In [10]:
sessions = cudf.from_pandas(load_parquets(PARQUET_FILES))

# One popularity column per event class: the number of events of that class
# recorded for the candidate item, saturated to the uint16 range.
for i, c in enumerate(CLASSES):
    col = f'{c}_popularity'

    popularity = cudf.DataFrame(
        sessions.loc[sessions["type"] == i, "aid"].value_counts()
    ).reset_index()
    popularity.columns = ['candidates', col]
    popularity[col] = np.clip(popularity[col], 0, 2 ** 16 - 1).astype("uint16")

    # Candidates that never occur with this event type end up with 0.
    pairs = pairs.merge(popularity, how="left", on="candidates").fillna(0)

# Free the GPU memory used by the raw events and the last count table.
del sessions, popularity
numba.cuda.current_context().deallocations.clear()
gc.collect()

109

### Session features
- Count views/clicks/carts/orders of session
- Count views/clicks/carts/orders of each candidate

In [11]:
def count_actions(pairs, sessions):
    """
    Counts how many times each candidate appears in the item list of its session.

    NOTE: a later cell defines `count_actions` again (chunked by session group);
    when the notebook is run top to bottom, that later definition is the one used.

    Args:
        pairs (DataFrame): Frame with columns `session` and `candidates`.
            It is not modified.
        sessions (DataFrame): Frame with a `session` column and a list column `aid`.

    Returns:
        array of uint8: One count per distinct (session, candidates), ordered by
            these two keys and saturated at 255.
    """
    # The merge returns a new frame, so the caller's `pairs` is left untouched
    # (the previous version added an unused `group` column to it).
    merged = pairs.merge(sessions[["session", "aid"]], how="left", on="session")

    # One row per (session, candidate, item); flag the items equal to the candidate.
    dfp = merged[['session', 'candidates', 'aid']].explode('aid')
    dfp['aid'] = (dfp['aid'] == dfp['candidates']).astype(np.uint16)

    n_actions = dfp.groupby(
        ["session", "candidates"]
    ).sum().reset_index().sort_values(['session', 'candidates'])['aid'].values

    return np.clip(n_actions, 0, 255).astype(np.uint8)

In [12]:
def count_actions(pairs, sessions):
    """
    Counts the occurrences of each candidate in the item list of its session.

    The work is split into buckets of sessions (`session // 100000`) so that the
    exploded frame stays small.

    Args:
        pairs (DataFrame): Frame with columns `session` and `candidates`.
        sessions (DataFrame): Frame with a `session` column and a list column `aid`.

    Returns:
        array of uint8: One count per distinct (session, candidates), ordered by
            these two keys and saturated at 255.
    """
    merged = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
    merged['group'] = merged['session'] // 100000

    chunks = []
    for _, chunk in merged.groupby('group'):
        # One row per (session, candidate, item); 1 where the item is the candidate.
        exploded = chunk[['session', 'candidates', 'aid']].explode('aid')
        exploded['aid'] = (exploded['aid'] == exploded['candidates']).astype(np.uint16)

        chunks.append(exploded.groupby(["session", "candidates"]).sum().reset_index())

    counts = cudf.concat(chunks, ignore_index=True)
    counts = counts.sort_values(['session', 'candidates'])

    return np.clip(counts['aid'].values, 0, 255).astype(np.uint8)

In [13]:
# count_actions returns its values ordered by (session, candidates) and they are
# assigned to `pairs` by position. `pairs` was last produced by cudf merges,
# which do not guarantee row order, so restore the order (and a clean index)
# before attaching anything; otherwise counts can land on the wrong rows.
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

# Read the raw events once on the host; every iteration takes a fresh GPU copy
# because the aid column is overwritten in place below.
sessions_host = load_parquets(PARQUET_FILES)

for i, c in enumerate(CLASSES + ["*"]):
    print(f'-> Candidate {c if c != "*" else "views"} in session')

    sessions = cudf.from_pandas(sessions_host)

    if c != "*":
        # Hide events of the other types behind a placeholder aid of -1.
        sessions.loc[sessions["type"] != i, "aid"] = -1

    # count_actions only reads the per-session aid lists.
    sessions = sessions[['session', 'aid']].groupby('session').agg(list).reset_index()

    pairs[f'candidate_{c}_before'] = count_actions(pairs, sessions)

    del sessions
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

del sessions_host

-> Candidate clicks in session
-> Candidate carts in session
-> Candidate orders in session
-> Candidate views in session


In [14]:
sessions = cudf.from_pandas(load_parquets(PARQUET_FILES))

# Number of events per session, all types together (`ts` is just the counted column).
n_views = (
    sessions[['session', 'ts']]
    .groupby('session')
    .count()
    .reset_index()
    .rename(columns={"ts": "n_views"})
)

# The same count restricted to one event type at a time (type ids 0, 1, 2).
n_clicks, n_carts, n_orders = [
    sessions[sessions['type'] == type_id][['session', 'ts']]
    .groupby('session')
    .count()
    .reset_index()
    .rename(columns={"ts": col})
    for type_id, col in enumerate(["n_clicks", "n_carts", "n_orders"])
]

# Sessions without any event of a given type get 0 for that count.
sessions_fts = n_views
for typed_counts in (n_clicks, n_carts, n_orders):
    sessions_fts = sessions_fts.merge(typed_counts, how="left", on="session").fillna(0)

# Saturate every count column (all but the leading `session`) at 255 so it fits in a uint8.
for c in sessions_fts.columns[1:]:
    sessions_fts[c] = np.clip(sessions_fts[c], 0, 255).astype(np.uint8)

In [15]:
# Attach the per-session counts to every candidate row, then put the rows back
# in (session, candidates) order.
pairs = (
    pairs
    .merge(sessions_fts, on="session", how="left")
    .sort_values(['session', 'candidates'])
)

In [16]:
# Build the destination once so that the log message always names the file
# that was actually written.
output_path = f"../output/fts_{MODE}_{SUFFIX}.parquet"

# Bring the feature table back to host memory and store it as parquet.
pairs.to_pandas().to_parquet(output_path, index=False)
print(f"Saved to {output_path}")

Saved to ../output/fts_val_v1.parquet


Done