**About :** Generates candidates.

**TODO**:
- Matrices from optimized notebook

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Pin the visible GPU *before* any CUDA-backed library (cudf) is imported:
# the CUDA runtime reads CUDA_VISIBLE_DEVICES when it is first initialised,
# so setting it after `import cudf` may be silently ignored.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

import gc
import sys
import cudf
import json
import glob
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

# Keep FutureWarning messages out of the notebook output.
warnings.simplefilter(action="ignore", category=FutureWarning)

# Start pandarallel with 32 workers and progress bars disabled.
pandarallel.initialize(nb_workers=32, progress_bar=False)

In [None]:
from params import *

from data.covisitation import compute_covisitation_matrix
from data.candidates import load_parquets, create_candidates, explode, matrix_to_candids_dict

from utils.metrics import get_coverage
from utils.chris import suggest_clicks, suggest_buys, read_file_to_cache

## Covisitation matrices
- Open question: should the matrices be recomputed on train only, without using the val data?

In [None]:
# Event-type name -> integer code.
type_labels = dict(clicks=0, carts=1, orders=2)

# files = glob.glob("../input/chris/*_parquet/*")

# Full train files first, then validation files (same order as the glob results).
parquet_patterns = ["../output/full_train_parquet/*", "../output/val_parquet/*"]
files = [path for pattern in parquet_patterns for path in glob.glob(pattern)]

# Read every file once and keep the result keyed by its path.
data_cache = {path: read_file_to_cache(path) for path in tqdm(files)}

In [None]:
# mins = []
# maxs = []
# st = 0
# sv = 0
# nv = 0
# nt = 0

# for k in data_cache.keys():
#     if "val" in k or "test" in k:
#         mins.append(data_cache[k]['ts'].min())
#         sv += len(data_cache[k]['session'].unique())
#         nv += len(data_cache[k])
#     else:
#         maxs.append(data_cache[k]['ts'].max())
#         st += len(data_cache[k]['session'].unique())
#         nt += len(data_cache[k])
        
# np.min(mins) > np.max(maxs), st, sv, nt, nv

In [None]:
# Covisitation matrix with weighting="temporal"; n=20 is presumably the number
# of neighbours kept per item - confirm in data.covisitation.
temporal_matrix_args = dict(
    weighting="temporal",
    n=20,
    save_folder="../output/matrices/",
)
compute_covisitation_matrix(files, data_cache, **temporal_matrix_args)

In [None]:
# Covisitation matrix with weighting="type": each event-type code gets its own
# weight (0 -> 1, 1 -> 3, 2 -> 6).
type_matrix_args = dict(
    weighting="type",
    type_weight={0: 1, 1: 3, 2: 6},
    n=20,
    save_folder="../output/matrices/",
)
compute_covisitation_matrix(files, data_cache, **type_matrix_args)

In [None]:
# Covisitation matrix restricted to event types 1 and 2, with an empty
# weighting name (presumably meaning unweighted - confirm in data.covisitation).
cartbuy_matrix_args = dict(
    considered_types=[1, 2],
    weighting="",
    n=20,
    save_folder="../output/matrices/",
)
compute_covisitation_matrix(files, data_cache, **cartbuy_matrix_args)

## Chris Rerank

In [None]:
# df_val = load_parquets("../input/chris/test_parquet/*")

# top_clicks = df_val.loc[df_val["type"] == 0, "aid"].value_counts().index.values[:20]
# top_carts = df_val.loc[df_val["type"] == 1, "aid"].value_counts().index.values[:20]
# top_orders = df_val.loc[df_val["type"] == 2, "aid"].value_counts().index.values[:20]

In [None]:
# clicks_candids = matrix_to_candids_dict(
#     cudf.read_parquet("../output/matrices/matrix_123_temporal_20.pqt")
# )
# type_weighted_candids = matrix_to_candids_dict(
#     cudf.read_parquet("../output/matrices/matrix_123_type_15.pqt")
# )
# cartbuy_candids = matrix_to_candids_dict(
#     cudf.read_parquet("../output/matrices/matrix_12__15.pqt")
# )

In [None]:
# %%time
# pred_df_clicks = df_val.groupby(["session"]).apply(
#     lambda x: suggest_clicks(x, clicks_candids, top_clicks)
# )

In [None]:
# %%time
# pred_df_buys = df_val.groupby(["session"]).apply(
#     lambda x: suggest_buys(x, type_weighted_candids, cartbuy_candids, top_orders)
# )

In [None]:
# %%time

# try:
#     clicks_pred_df = pd.DataFrame(pred_df_clicks.add_suffix("_clicks"), columns=["labels"]).reset_index()
# except:
#     clicks_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_clicks"), columns=["labels"]).reset_index()

# orders_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_orders"), columns=["labels"]).reset_index()
# carts_pred_df = pd.DataFrame(pred_df_buys.add_suffix("_carts"), columns=["labels"]).reset_index()

# pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
# pred_df.columns = ["session_type", "labels_l"]
# pred_df["labels"] = pred_df["labels_l"].apply(lambda x: " ".join(map(str, x)))

In [None]:
# gt = pd.read_parquet("../input/chris/test_labels.parquet")


# df_pred = pred_df[["session_type", "labels_l"]].copy()
# df_pred.columns = ["session_type", "candidates"]
# df_pred["session"] = (
#     df_pred["session_type"].apply(lambda x: x.split("_")[0]).astype(int)
# )
# df_pred["type"] = df_pred["session_type"].apply(lambda x: x.split("_")[1])

# df_pred = df_pred.merge(gt, on=["session", "type"], how="left")

# for col in CLASSES:
#     df_pred_c = df_pred[df_pred["type"] == col]

#     n_preds, n_gts, n_found = get_coverage(
#         df_pred_c["candidates"].values, df_pred_c["ground_truth"].values
#     )
#     print(
#         f"{col}\t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.3f}"
#     )

- clicks	- Found 574.14K GTs with 36.03M candidates (pos_prop=1.59%)	-  Highest reachable Recall : 0.327
- carts	- Found 181.3K GTs with 36.03M candidates (pos_prop=0.50%)	-  Highest reachable Recall : 0.314
- orders	- Found 186.38K GTs with 36.03M candidates (pos_prop=0.52%)	-  Highest reachable Recall : 0.595

- clicks	- Found 922.64K (52.56%) GTs with 35.74M candidates (pos_prop=2.58%)
- carts	- Found 236.14K (40.96%) GTs with 32.3M candidates (pos_prop=0.73%)
- orders	- Found 203.32K (64.90%) GTs with 32.3M candidates (pos_prop=0.63%)

## Val candidates

## Candidates

In [None]:
# Candidate-generation settings, forwarded to create_candidates as
# n_matrix and max_cooc in the cells below.
N_MATRIX = 10
MAX_COOC = 100

In [None]:
# df_val = load_parquets("../input/chris/test_parquet/*")
# Load the validation events and order them by session, then by timestamp.
df_val = (
    load_parquets("../output/val_parquet/*")
    .sort_values(["session", "ts"])
    .reset_index(drop=True)
)

In [None]:
%%time
# Generate candidates for the validation sessions. The result is expected to
# carry the "candidates" column read by the following cells.
df_val = create_candidates(df_val, n_matrix=N_MATRIX, max_cooc=MAX_COOC)

In [None]:
# Length of each row's candidate list.
n_candid = df_val["candidates"].apply(len)

# Clip at 150 so the long tail is collected at the right edge of the histogram.
ax = sns.histplot(n_candid.clip(lower=0, upper=150))
ax.set_title(f"Proportion of sessions with <20 candidates : {(n_candid < 20).mean() :.3f}")
plt.show()

### Coverage

In [None]:
# gt = pd.read_parquet("../input/chris/test_labels.parquet")
# Validation labels; filtered by "type" and merged into df_val in the next cell.
gt = pd.read_parquet("../output/val_labels.parquet")

In [None]:
# Per-class coverage of the validation candidates.
recalls = []
for col in CLASSES:
    gt_col = f"gt_{col}"

    # Attach this class's ground truth only once, so re-running the cell does
    # not merge it again. No `on=` is given: pandas joins on the shared columns.
    if gt_col not in df_val.columns:
        gt_of_class = gt[gt["type"] == col].drop(columns="type")
        df_val = df_val.merge(gt_of_class, how="left")
        df_val = df_val.rename(columns={"ground_truth": gt_col})

    # get_coverage returns (number of candidates, number of GTs, GTs found),
    # as reported in the message below.
    n_preds, n_gts, n_found = get_coverage(
        df_val["candidates"].values, df_val[gt_col].values
    )
    recall = n_found / n_gts

    print(
        f"{col}\t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.3f}"
    )
    recalls.append(recall)

In [None]:
# Weighted average of the per-class recalls computed above. WEIGHTS is
# presumably provided by `from params import *` - confirm. `cv` is reused
# later to tag the saved candidate files.
cv = np.average(recalls, weights=WEIGHTS)
print(f"-> Highest reachable CV : {cv:.3f}")

In [None]:
# -> Highest reachable CV : 0.577

### Explode & saving

In [None]:
# Expand the candidate lists before saving (presumably to one row per
# session/candidate pair - confirm in data.candidates.explode).
df_val = explode(df_val)

In [None]:
# Build the output path once so the file written and the path reported can
# never drift apart. The tag is the reachable CV in thousandths (0.577 -> 577).
val_candidates_path = f"../output/candidates_val_{int(np.round(cv, 3) * 1000)}.parquet"

df_val.to_parquet(val_candidates_path, index=False)
print(f"Saved to {val_candidates_path}")

In [None]:
# Drop the validation frame and force a garbage collection before the train
# candidates are built.
del df_val
gc.collect()

## Train Candidates
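- Warning: this is leaky! The covisitation matrices above are built from `full_train_parquet` and `val_parquet`, which presumably also contain the events these sessions are labelled with.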

In [None]:
# Train labels; replaces the validation `gt` loaded earlier.
gt = pd.read_parquet("../output/train_labels.parquet")

In [None]:
# Load the train events and order them by session, then by timestamp.
df_train = (
    load_parquets("../output/train_parquet/*")
    .sort_values(["session", "ts"])
    .reset_index(drop=True)
)

In [None]:
%%time
# Generate candidates for the train sessions with the same settings as validation.
df_train = create_candidates(df_train, n_matrix=N_MATRIX, max_cooc=MAX_COOC)

In [None]:
# Per-class coverage of the train candidates.
recalls = []
for col in CLASSES:
    gt_col = f"gt_{col}"

    # Attach this class's ground truth only once, so re-running the cell does
    # not merge it again. No `on=` is given: pandas joins on the shared columns.
    if gt_col not in df_train.columns:
        gt_of_class = gt[gt["type"] == col].drop(columns="type")
        df_train = df_train.merge(gt_of_class, how="left")
        df_train = df_train.rename(columns={"ground_truth": gt_col})

    # get_coverage returns (number of candidates, number of GTs, GTs found),
    # as reported in the message below.
    n_preds, n_gts, n_found = get_coverage(
        df_train["candidates"].values, df_train[gt_col].values
    )
    recall = n_found / n_gts

    print(
        f"{col}\t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.3f}"
    )
    recalls.append(recall)

# Stored as `cv_` so the validation `cv` is left untouched.
cv_ = np.average(recalls, weights=WEIGHTS)
print(f"\n-> Highest reachable CV : {cv_:.3f}")

In [None]:
# Expand the candidate lists before saving (presumably to one row per
# session/candidate pair - confirm in data.candidates.explode).
df_train = explode(df_train)

In [None]:
# Build the output path once so the file written and the path reported can
# never drift apart.
# NOTE(review): the tag uses `cv` from the validation section, not the `cv_`
# computed for train - presumably so all candidate files share one tag; confirm.
train_candidates_path = f"../output/candidates_train_{int(np.round(cv, 3) * 1000)}.parquet"

df_train.to_parquet(train_candidates_path, index=False)
print(f"Saved to {train_candidates_path}")

### Val cropped candidates
- Warning: leaky as well (see the note on the train candidates)!

In [None]:
# Labels for the cropped validation set; replaces the previously loaded `gt`.
gt = pd.read_parquet("../output/val_c_labels.parquet")

In [None]:
# Load the cropped validation events and order them by session, then by timestamp.
df_val_c = (
    load_parquets("../output/val_c_parquet/*")
    .sort_values(["session", "ts"])
    .reset_index(drop=True)
)

In [None]:
%%time
# Generate candidates for the cropped validation sessions with the same settings as above.
df_val_c = create_candidates(df_val_c, n_matrix=N_MATRIX, max_cooc=MAX_COOC)

In [None]:
# Per-class coverage of the cropped-validation candidates.
recalls = []
for col in CLASSES:
    gt_col = f"gt_{col}"

    # Attach this class's ground truth only once, so re-running the cell does
    # not merge it again. No `on=` is given: pandas joins on the shared columns.
    if gt_col not in df_val_c.columns:
        gt_of_class = gt[gt["type"] == col].drop(columns="type")
        df_val_c = df_val_c.merge(gt_of_class, how="left")
        df_val_c = df_val_c.rename(columns={"ground_truth": gt_col})

    # get_coverage returns (number of candidates, number of GTs, GTs found),
    # as reported in the message below.
    n_preds, n_gts, n_found = get_coverage(
        df_val_c["candidates"].values, df_val_c[gt_col].values
    )
    recall = n_found / n_gts

    print(
        f"{col}\t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.3f}"
    )
    recalls.append(recall)

# Stored as `cv_` so the validation `cv` is left untouched.
cv_ = np.average(recalls, weights=WEIGHTS)
print(f"\n-> Highest reachable CV : {cv_:.3f}")

In [None]:
# Expand the candidate lists before saving (presumably to one row per
# session/candidate pair - confirm in data.candidates.explode).
df_val_c = explode(df_val_c)

In [None]:
# Build the output path once so the file written and the path reported can
# never drift apart.
# NOTE(review): the tag uses `cv` from the validation section, not the `cv_`
# computed for the cropped set - presumably so all candidate files share one tag; confirm.
val_c_candidates_path = f"../output/candidates_val_c_{int(np.round(cv, 3) * 1000)}.parquet"

df_val_c.to_parquet(val_c_candidates_path, index=False)
print(f"Saved to {val_c_candidates_path}")

Done