**About :** Computes Features.

**TODO**:
- Non-leaky matrices
- Non-leaky target encoding

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import gc
import re
import sys
import cudf
import json
import glob
import numba
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

# Expose only the GPU with index 1 to CUDA.
# NOTE(review): this assignment runs after `import cudf` above. CUDA_VISIBLE_DEVICES
# normally has to be set before CUDA is initialised - confirm it actually takes effect.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Silence FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Set up pandarallel with 32 workers and no progress bar.
pandarallel.initialize(nb_workers=32, progress_bar=False)

In [None]:
from params import *

from data.covisitation import compute_covisitation_matrix
from data.candidates import load_parquets, create_candidates, explode

from utils.metrics import get_coverage
from utils.chris import suggest_clicks, suggest_buys, read_file_to_cache

### Load candidates

In [None]:
def load_sessions(regex):
    """
    Load every parquet file matching a glob pattern into a single cudf DataFrame.

    For each file, `ts` is divided by 1000 and cast to int32, and `type` is mapped
    through `TYPE_LABELS` and cast to int8.

    Args:
        regex (str): Glob pattern of the parquet files (passed to `glob.glob`).

    Returns:
        cudf DataFrame: All events, sorted by (session, aid), with a fresh index.

    Raises:
        FileNotFoundError: If no file matches the pattern.
    """
    # Sorted so that the concatenation order does not depend on the file system.
    chunk_files = sorted(glob.glob(regex))
    if not chunk_files:
        # Fail early with a clear message instead of an error from concatenating nothing.
        raise FileNotFoundError(f"No parquet file matches '{regex}'")

    dfs = []
    for chunk_file in chunk_files:
        chunk = cudf.read_parquet(chunk_file)
        chunk["ts"] = (chunk["ts"] / 1000).astype("int32")
        chunk["type"] = chunk["type"].map(TYPE_LABELS).astype("int8")
        dfs.append(chunk)

    return cudf.concat(dfs).sort_values(['session', 'aid']).reset_index(drop=True)

In [None]:
# Split to compute features for: "val", "val_c", or any other value for train.
MODE = "val_c"
# Version tag used in the name of the output folder.
SUFFIX = "v3"

In [None]:
# (candidate file, session parquet glob) for each validation mode.
_INPUTS_BY_MODE = {
    "val": ('../output/candidates_val_592.parquet', "../output/val_parquet/*"),
    "val_c": ('../output/candidates_val_c_592.parquet', "../output/val_c_parquet/*"),
}
# Any other mode uses the train inputs.
_TRAIN_INPUTS = ('../output/candidates_train_592.parquet', "../output/train_parquet/*")

CANDIDATE_FILE, PARQUET_FILES = _INPUTS_BY_MODE.get(MODE, _TRAIN_INPUTS)

# Candidate pairs, ordered by (session, candidates).
pairs = cudf.read_parquet(CANDIDATE_FILE).sort_values(['session', 'candidates'])

### Time weighting

In [None]:
def compute_weights(sessions):
    """
    Compute recency weights for every (session, item) pair.

    Events of each session are ranked from most recent (rank 0) to oldest using `ts`.
    Each event gets two weights, which are then summed per (session, aid):
    - logspace_w: 2 ** x - 1 with x going linearly from 1 (most recent event) down to
      0.1 (oldest event); 1 for sessions with a single event.
    - linspace_w: 0.1 + 0.9 * (18 - rank) / 18, replaced by 0.05 for ranks >= 20.

    The input dataframe is not modified.

    Args:
        sessions (DataFrame): Events with columns `session`, `aid` and `ts`.
            The row-wise apply runs over all columns, as in the original code.

    Returns:
        DataFrame: Columns `session`, `candidates` (the aid), `logspace_w`, `linspace_w`,
            sorted by (session, candidates).
    """
    # Most recent event first. The sorted frame has to be kept: sort_values is not
    # in-place, and the rank computed below depends on this order.
    sessions = sessions.sort_values(['session', "ts"], ascending=[True, False]).reset_index(drop=True)

    # Rank of the event inside its session, 0 being the most recent one.
    sessions['w'] = sessions.groupby('session')['aid'].cumcount()

    # Number of events per session, with explicit column names so that the merge does
    # not depend on how the size column happens to be named.
    sizes = sessions.groupby('session')['aid'].size().reset_index()
    sizes.columns = ["session", "n"]
    sessions = sessions.merge(sizes, on="session", how="left")

    sessions["logspace_w"] = sessions.apply(
        lambda x : 1 if x.n == 1 else 2 ** (0.1 + 0.9 * (x.n - x.w - 1) / (x.n - 1)) - 1,
        axis=1
    )
    sessions["linspace_w"] = sessions['w'].apply(
        lambda x : 0.05 if x >= 20 else 0.1 + 0.9 * (18 - x) / 18
    )

    # An item seen several times in a session accumulates the weights of all its events.
    weights = sessions[["session", "aid", "logspace_w", "linspace_w"]].groupby(['session', 'aid']).sum().reset_index()

    weights = weights.sort_values(['session', "aid"]).reset_index(drop=True).rename(columns={"aid": "candidates"})

    return weights

In [None]:
# Load the session events and derive the time weights of each (session, item).
sessions = load_sessions(PARQUET_FILES)
weights = compute_weights(sessions)

In [None]:
# Attach the time weights to the candidates, then restore the (session, candidates) order.
pairs = pairs.merge(weights, how="left", on=["session", "candidates"])
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

# Pairs without a weight get half of the smallest weight of the column.
for _weight_col in ('logspace_w', 'linspace_w'):
    _fill_value = pairs[_weight_col].min() / 2
    pairs[_weight_col] = pairs[_weight_col].fillna(_fill_value).astype("float32")

In [None]:
# Drop the sessions frame, clear numba's pending CUDA deallocations and run the garbage collector.
del sessions
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Covisitation features
TODO :
- time weighted agg, agg last n
- merge rank in matrix

In [None]:
def compute_coocurence_features(pairs, matrix_file, weights):
    """
    Aggregate matrix weights between each candidate and the items of its session.

    For every (session, candidates) pair, the matrix weight between each session item
    and the candidate is looked up (0 when absent), both raw and multiplied by the
    time weights of the session item, then aggregated with mean, sum and max.

    Note: a `group` column is added to `pairs` in place.

    Args:
        pairs (cudf DataFrame): Columns `session`, `candidates` and `aid`, where `aid`
            is exploded into one row per element.
        matrix_file (str): Parquet file whose three columns are renamed to
            `aid`, `candidates`, `w`.
        weights (cudf DataFrame): Columns `session`, `candidates`, `logspace_w`, `linspace_w`.

    Returns:
        cudf DataFrame: `session`, `candidates` and 9 float32 features named
            `{w, logspace_w, linspace_w}_{mean, sum, max}`, sorted by (session, candidates).
    """
    # Sessions are processed by blocks of 100000 ids.
    pairs['group'] = pairs['session'] // 100000

    # Time weights are joined on the session item, not on the candidate.
    item_weights = weights.rename(columns={"candidates": "aid"})

    matrix = cudf.read_parquet(matrix_file)
    matrix.columns = ['aid', 'candidates', 'w']

    chunks = []
    for _, chunk in tqdm(pairs.groupby('group')):
        # One row per (session, candidate, session item).
        exploded = chunk[['session', 'candidates', 'aid']].explode('aid').reset_index(drop=True)

        # Matrix weight of each (item, candidate); missing entries count as 0.
        scored = exploded.merge(matrix, how="left", on=["aid", "candidates"])
        scored = scored.reset_index().fillna(0)

        # Time-weighted versions of the matrix weight.
        scored = scored.merge(item_weights, how="left", on=["session", "aid"])
        for time_col in ('logspace_w', 'linspace_w'):
            scored[time_col] = scored[time_col] * scored['w']

        selected = scored[['candidates', 'session', 'w', 'logspace_w', 'linspace_w']]
        aggregated = selected.groupby(['session', 'candidates']).agg(["mean", "sum", "max"])
        aggregated.columns = ['_'.join(col) for col in aggregated.columns.values]

        aggregated = aggregated.astype("float32")
        chunks.append(aggregated.reset_index())

    features = cudf.concat(chunks, ignore_index=True)
    return features.sort_values(['session', 'candidates']).reset_index(drop=True)

In [None]:
# Folder holding the matrices, and names (without the ".pqt" extension) of the ones to use.
MATRIX_FOLDER = "../output/matrices/"
MATRIX_NAMES = ["matrix_123_temporal_20", "matrix_123_type136_20", "matrix_12__20"]

In [None]:
sessions = load_sessions(PARQUET_FILES)

# Collapse each session to a single row: every other column (including `aid`) becomes a list.
sessions = sessions.sort_values(['session', "aid"]).groupby('session').agg(list).reset_index()
# Attach the list of session aids to every candidate pair.
pairs = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
# Restore the (session, candidates) order after the merge.
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

In [None]:
# Compute the co-occurrence features of each matrix and add them to `pairs`.
for name in MATRIX_NAMES:
    print(f' -> Features from {name}')

    fts = compute_coocurence_features(
        pairs[['session', 'candidates', 'aid']],
        os.path.join(MATRIX_FOLDER, name + ".pqt"),
        weights
    )
    
    # The first two columns are the keys. Features are copied positionally, which relies
    # on `fts` and `pairs` both being ordered by (session, candidates) with one row per pair.
    for c in fts.columns[2:]:
        # Every "w_" is removed from the feature name, e.g. "w_mean" -> "mean".
        pairs[f"{name}_{re.sub('w_', '', c)}"] = fts[c].values

    del fts
    numba.cuda.current_context().deallocations.clear()
    gc.collect()
    
# The list of session aids was only needed for the features above.
pairs.drop('aid', axis=1, inplace=True)

del sessions, weights
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Popularity
TODO :
- Popularity of items in session
- Popularity over different periods  (day / month)
- Time weighted popularity

In [None]:
sessions = load_sessions(PARQUET_FILES)

# One popularity feature per action type: number of events of that type for the candidate.
for i, c in enumerate(CLASSES):
    print(f'-> Popularity for {c}')
    popularity = cudf.DataFrame(sessions.loc[sessions["type"] == i, "aid"].value_counts()).reset_index()
    popularity.columns = ['candidates', f'{c}_popularity']
    # Counts are capped so that they fit in an uint16.
    popularity[f'{c}_popularity'] = np.clip(popularity[f'{c}_popularity'], 0, 2 ** 16 - 1).astype("uint16")

    # Candidates without any event of this type get a popularity of 0.
    pairs = pairs.merge(popularity, how="left", on="candidates").fillna(0)

# cudf merges do not guarantee the row order. Restore the (session, candidates) order,
# like after every other merge, because later features are assigned to `pairs` as
# arrays ordered by (session, candidates).
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

del sessions, popularity
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Session features
- Count views/clicks/carts/orders of session
- Count views/clicks/carts/orders of each candidate

TODO :
- Distance to last view

In [None]:
def count_actions(pairs, sessions):
    """
    Count how many times each candidate appears among the items of its session.

    Args:
        pairs (cudf DataFrame): Columns `session` and `candidates`.
        sessions (cudf DataFrame): Columns `session` and `aid`, where `aid` is
            exploded into one row per element.

    Returns:
        uint8 array: Counts clipped to 255, ordered by (session, candidates).
    """
    merged = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
    # Sessions are processed by blocks of 100000 ids.
    merged['group'] = merged['session'] // 100000

    chunks = []
    for _, chunk in tqdm(merged.groupby('group')):
        exploded = chunk[['session', 'candidates', 'aid']].explode('aid')

        # `aid` becomes 1 when the session item is the candidate, else 0.
        is_candidate = exploded['aid'] == exploded['candidates']
        exploded['aid'] = is_candidate.astype(np.uint16)

        counts = exploded.groupby(["session", "candidates"]).sum().reset_index()
        chunks.append(counts)

    counts = cudf.concat(chunks, ignore_index=True).sort_values(['session', 'candidates'])

    return np.clip(counts['aid'].values, 0, 255).astype(np.uint8)

In [None]:
# One feature per action type, plus "*" for all types together: number of times the
# candidate appears in the session with that type.
for i, c in enumerate(CLASSES + ["*"]):
    print(f'-> Candidate {c if c != "*" else "views"} in session')

    # Reloaded at every iteration since the frame is modified below.
    sessions = load_sessions(PARQUET_FILES)
    if c != "*":
        # Events of other types are masked with -1 so that they do not match a candidate
        # (assumes no candidate id is -1 - TODO confirm).
        sessions.loc[sessions["type"] != i, "aid"] = -1

    # One row per session, with the list of (possibly masked) aids.
    sessions = sessions.groupby('session').agg(list).reset_index()

    # count_actions returns values ordered by (session, candidates) and they are assigned
    # positionally: `pairs` has to be in that same order, with one row per pair.
    pairs[f'candidate_{c}_before'] = count_actions(
        pairs[['session', 'candidates']],
        sessions
    )
    
    del sessions
    numba.cuda.current_context().deallocations.clear()
    gc.collect()
    
#     break

In [None]:
sessions = load_sessions(PARQUET_FILES)


def _count_per_session(events, name):
    """Return the number of events per session, in a column called `name`."""
    counts = events[['session', 'ts']].groupby('session').count().reset_index()
    return counts.rename(columns={"ts": name})


# Total number of events, then number of events of type 0, 1 and 2.
n_views = _count_per_session(sessions, "n_views")
n_clicks = _count_per_session(sessions[sessions['type'] == 0], "n_clicks")
n_carts = _count_per_session(sessions[sessions['type'] == 1], "n_carts")
n_orders = _count_per_session(sessions[sessions['type'] == 2], "n_orders")

# sessions = sessions.merge(n_views, how="left", on="session").fillna(0)
# Sessions without any event of a given type get a count of 0.
sessions_fts = n_views
for _type_counts in (n_clicks, n_carts, n_orders):
    sessions_fts = sessions_fts.merge(_type_counts, how="left", on="session").fillna(0)

# Counts are capped at 255 and stored as uint8.
for c in sessions_fts.columns[1:]:
    sessions_fts[c] = np.clip(sessions_fts[c], 0, 255).astype(np.uint8)

In [None]:
# Attach the per-session counts to every pair, then restore the (session, candidates) order.
pairs = pairs.merge(sessions_fts, on="session", how="left")
pairs = pairs.sort_values(['session', 'candidates'])

### Save

In [None]:
# pairs.info()

In [None]:
def save_by_chunks(pairs, folder):
    """
    Save a dataframe as one parquet file per block of 100000 session ids.

    Files are named `000.parquet`, `001.parquet`, ... following the iteration order of
    the groups. The input dataframe is left unchanged.

    Args:
        pairs (DataFrame): Dataframe to save, with a `session` column.
        folder (str): Output folder, created if it does not exist.
    """
    print(f'-> Saving chunks to {folder}')
    os.makedirs(folder, exist_ok=True)

    # `assign` returns a new frame, so the caller's dataframe does not get a `group` column.
    grouped = pairs.assign(group=pairs['session'] // 100000).groupby('group')

    for i, (_, df) in enumerate(tqdm(grouped)):
        # The helper column is not part of the saved features.
        df.drop('group', axis=1).to_parquet(os.path.join(folder, f'{i:03d}.parquet'))

In [None]:
# Write the features to a folder named after the mode and the version suffix.
save_by_chunks(pairs, f"../output/fts_{MODE}_{SUFFIX}/")

In [None]:
# pairs.to_csv(
#     f"../output/fts_{MODE}_{SUFFIX}.csv", index=False, chunksize=100000
# )
# print(f"Saved to ../output/fts_{MODE}_{SUFFIX}.csv")

In [None]:
# pairs.to_pandas().to_parquet(
#     f"../output/fts_{MODE}_{SUFFIX}.parquet", index=False
# )
# print(f"Saved to ../output/fts_{MODE}_{SUFFIX}.parquet")

Done