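**About:** Trains a matrix-factorization model on consecutive item pairs from the sessions, then generates candidates with a nearest-neighbour search over the learned item embeddings.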

**TODO**:

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import os
import gc
import sys
import cudf
import json
import glob
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from numerize.numerize import numerize

from merlin.io import Dataset
from torch.optim import SparseAdam
from merlin.loader.torch import Loader

warnings.simplefilter(action="ignore", category=FutureWarning)

In [None]:
from params import *

from utils.load import load_sessions
from utils.metrics import get_coverage

### Params

In [None]:
MODE = "val"
NO_CLICKS = False

In [None]:
# Glob patterns of the parquet shards to read for each mode.
# Earlier validation inputs, kept for reference:
#     "../output/full_train_parquet/*" and "../output/val_parquet/*"
if MODE == "val":
    patterns = ("../output/full_train_2_parquet/*", "../output/val_2_parquet/*")
elif MODE == "test":
    patterns = ("../output/full_train_val_parquet/*", "../output/test_parquet/*")
else:
    raise NotImplementedError

# Expand the patterns in order, so matches of the first pattern come first.
files = [path for pattern in patterns for path in glob.glob(pattern)]

In [None]:
# Read every shard with cuDF and stack them into one frame with a fresh
# 0..n-1 index (ignore_index=True).
train_pairs = cudf.concat([cudf.read_parquet(f) for f in files], ignore_index=True)

In [None]:
# Optional "cart/buy only" variant: drop every row whose type is "clicks".
if NO_CLICKS:
    train_pairs = train_pairs[train_pairs['type'] != "clicks"].reset_index(drop=True)

In [None]:
SHIFT = 1
SHIFTS = None

# Pair each row with the aid found SHIFT rows later within the same session.
# NOTE(review): this relies on the rows of a session already being in
# chronological order in the parquet shards - no sort happens here; confirm.
train_pairs["aid_next"] = train_pairs.groupby("session")["aid"].shift(-SHIFT)

# Rows with no successor inside their session have a null aid_next and are
# dropped; only the (aid, aid_next) columns are kept.
train_pairs = (
    train_pairs[["aid", "aid_next"]]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
# SHIFTS =  [1, 2, 3, 4, 5]
# SHIFT = "1-5"

# train_pairs_ = []

# for shift in tqdm(SHIFTS):
#     train_pairs['aid_next'] = train_pairs.groupby('session').aid.shift(-1 * shift)
#     train_pairs_.append(train_pairs[['aid', 'aid_next']].dropna().reset_index(drop=True).to_pandas())

# train_pairs = cudf.from_pandas(pd.concat(train_pairs_, ignore_index=True).drop_duplicates(keep="first"))

In [None]:
# Report how many (aid, aid_next) pairs were built, abbreviated by numerize.
print('Number of pairs', numerize(len(train_pairs)))

In [None]:
# Persist all pairs; the frame is converted to pandas first, so this file is
# written by pandas rather than by cuDF.
train_pairs.to_pandas().to_parquet(
    f"../output/matrix_factorization/{MODE}_pairs.parquet"
)

In [None]:
# The last 10M pairs are written (directly by cuDF) to a second file.
# NOTE(review): these rows are also contained in the full pairs file, so any
# metric computed on this file is measured on pairs seen during training -
# confirm this is intended.
train_pairs.tail(10_000_000).to_parquet(
    f"../output/matrix_factorization/{MODE}_pairs_val.parquet"
)

### Utils
- TODO : Cart -> Buy / Buy -> Buy

In [None]:
import torch
from torch import nn


class MatrixFactorization(nn.Module):
    """Dot-product scorer in which both items share one embedding table.

    Args:
        n_aids (int): Number of rows of the embedding table.
        n_factors (int): Length of each embedding vector.
    """

    def __init__(self, n_aids, n_factors):
        super().__init__()
        # sparse=True makes the table produce sparse gradients.
        self.aid_factors = nn.Embedding(n_aids, n_factors, sparse=True)

    def forward(self, aid1, aid2):
        """Score item pairs.

        Args:
            aid1 (torch.Tensor): Indices of the first items.
            aid2 (torch.Tensor): Indices of the second items.

        Returns:
            torch.Tensor: Element-wise product of the two embeddings summed
                over dim 1, i.e. one dot product per pair.
        """
        left = self.aid_factors(aid1)
        right = self.aid_factors(aid2)
        return torch.sum(left * right, dim=1)


In [None]:
import torch
from torch import nn


class MatrixFactorization2(nn.Module):
    """Dot-product scorer with separate tables for the first and second item.

    Args:
        n_aids (int): Number of rows of each embedding table.
        n_factors (int): Length of each embedding vector.
    """

    def __init__(self, n_aids, n_factors):
        super().__init__()
        # Two independent tables, both producing sparse gradients.
        self.aid_factors = nn.Embedding(n_aids, n_factors, sparse=True)
        self.aid_factors_next = nn.Embedding(n_aids, n_factors, sparse=True)

    def forward(self, aid1, aid2):
        """Score item pairs.

        Args:
            aid1 (torch.Tensor): Indices looked up in `aid_factors`.
            aid2 (torch.Tensor): Indices looked up in `aid_factors_next`.

        Returns:
            torch.Tensor: Element-wise product of the two embeddings summed
                over dim 1, i.e. one dot product per pair.
        """
        current = self.aid_factors(aid1)
        following = self.aid_factors_next(aid2)
        return torch.sum(current * following, dim=1)


In [None]:
class AverageMeter:
    """Keep the most recent value of a metric together with its running mean.

    Args:
        name (str): Label used when the meter is printed.
        fmt (str): Format spec (including the leading colon) applied to both
            the latest value and the average in `__str__`.
    """

    def __init__(self, name, fmt=":f"):
        self.name, self.fmt = name, fmt
        self.reset()

    def reset(self):
        """Set the latest value, average, sum and count back to zero."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` as seen `n` times and refresh the weighted average."""
        self.val = val
        self.sum = self.sum + val * n
        self.count = self.count + n
        self.avg = self.sum / self.count

    def __str__(self):
        # Build a template such as "{name} {val:f} ({avg:f})" and fill it
        # from the instance attributes.
        template = "".join(("{name} {val", self.fmt, "} ({avg", self.fmt, "})"))
        return template.format(**vars(self))

In [None]:
# Merlin datasets over the two pair files written earlier, each wrapped in a
# shuffling loader that yields batches of 65536 rows.
pairs_dir = "../output/matrix_factorization"

train_ds = Dataset(f"{pairs_dir}/{MODE}_pairs.parquet")
train_dl_merlin = Loader(train_ds, batch_size=65536, shuffle=True)

valid_ds = Dataset(f"{pairs_dir}/{MODE}_pairs_val.parquet")
valid_dl_merlin = Loader(valid_ds, batch_size=65536, shuffle=True)

In [None]:
# Size of each embedding vector (passed to the model as n_factors).
DIM = 64

# The embedding tables are built with N_AIDS + 1 rows.
# NOTE(review): presumably the number of distinct aids (or the largest aid)
# in the dataset - confirm where this constant comes from.
N_AIDS = 1855602
# Number of passes over the training loader.
EPOCHS = 20
# Learning rate handed to SparseAdam.
LR = 0.1

In [None]:
# Two-table variant is active; the shared-table model is kept for reference.
# model = MatrixFactorization(N_AIDS + 1, DIM)
model = MatrixFactorization2(N_AIDS + 1, DIM)
model.to("cuda")

# SparseAdam is used because the embedding tables are created with
# sparse=True and therefore produce sparse gradients.
optimizer = SparseAdam(model.parameters(), lr=LR)
# The model outputs raw dot products, so the loss applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Train the pair scorer with in-batch negatives and report, after every
# epoch, the mean training loss and the validation accuracy.
for epoch in range(1, EPOCHS + 1):
    # One meter per epoch: creating it inside the batch loop would reset it
    # on every step and the printed loss would only reflect the last batch.
    losses = AverageMeter("Loss", ":.4e")
    model.train()

    for batch, _ in train_dl_merlin:
        aid1 = batch["aid"].to("cuda")
        aid2 = batch["aid_next"].to("cuda")

        # Positives are the observed (aid, aid_next) pairs. Negatives reuse
        # the same aid with the aid_next column shuffled within the batch; a
        # shuffled pair can occasionally coincide with a true pair.
        shuffle = torch.randperm(aid2.shape[0], device=aid2.device)
        output_pos = model(aid1, aid2)
        output_neg = model(aid1, aid2[shuffle])

        output = torch.cat([output_pos, output_neg])
        targets = torch.cat([torch.ones_like(output_pos), torch.zeros_like(output_pos)])
        loss = criterion(output, targets)
        # Weight by batch size so a smaller final batch does not skew the mean.
        losses.update(loss.item(), aid1.shape[0])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()

    with torch.no_grad():
        accuracy = AverageMeter("accuracy")
        for batch, _ in valid_dl_merlin:
            # Same device handling as in training.
            aid1 = batch["aid"].to("cuda")
            aid2 = batch["aid_next"].to("cuda")

            shuffle = torch.randperm(aid2.shape[0], device=aid2.device)
            output_pos = model(aid1, aid2)
            output_neg = model(aid1, aid2[shuffle])

            # A positive counts as correct when its probability is above 0.5,
            # a negative when it is below 0.5.
            accuracy_batch = (
                torch.cat([output_pos.sigmoid() > 0.5, output_neg.sigmoid() < 0.5])
                .float()
                .mean()
            )
            # .item() keeps the meter in plain Python floats instead of
            # accumulating device tensors.
            accuracy.update(accuracy_batch.item(), aid1.shape[0])

    print(
        f"Epoch {epoch:02d}/{EPOCHS} \t loss={losses.avg:.3f} \t val_acc={accuracy.avg:.3f}"
    )

In [None]:
# embeddings = model.aid_factors.weight.detach().cpu().numpy().astype("float32")

# name = f"embed_{SHIFT}_{DIM}{'_cartbuy' if NO_CLICKS else ''}_{MODE}.npy"
# np.save(f"../output/matrix_factorization/{name}", embeddings)

# print(
#     f"Saved matrix of shape {embeddings.shape} to",
#     f"../output/matrix_factorization/{name}",
# )

In [None]:
# Export both embedding tables as float32 .npy files: "prev" is the table
# used for the first item of a pair, "next" the one used for the second.
for side, attr in (("prev", "aid_factors"), ("next", "aid_factors_next")):
    embeddings = getattr(model, attr).weight.detach().cpu().numpy().astype("float32")

    name = f"embed_{SHIFT}_{DIM}{'_cartbuy' if NO_CLICKS else ''}_{side}_{MODE}.npy"
    np.save(f"../output/matrix_factorization/{name}", embeddings)

    print(
        f"Saved matrix of shape {embeddings.shape} to",
        f"../output/matrix_factorization/{name}",
    )

### kNN

In [None]:
# Glob pattern of the sessions for which candidates are generated.
# NOTE(review): in "val" mode the pairs were built from val_2_parquet while
# this points at val_parquet - confirm the two are meant to differ.
if MODE == "val":
    REGEX = "../output/val_parquet/*"
elif MODE == "test":
    REGEX = "../output/test_parquet/*"
else:
    raise NotImplementedError

# Number of neighbours to retrieve per query item.
N_NEIGHBORS = 50

In [None]:
import cuml

def find_matches(preds, df, n_neighbors=100, metric="cosine"):
    """Find, for every row of `preds`, its nearest neighbours within `preds`.

    The function builds its own cuML matcher, so it honours `n_neighbors`
    and does not refit any matcher defined at notebook level.

    Args:
        preds: Vectors to index and query, one row per row of `df`.
        df: DataFrame whose index labels identify the rows of `preds`.
            A "matches" column is added to it in place.
        n_neighbors (int): Neighbours retrieved per row; a row that retrieves
            itself is removed afterwards.
        metric (str): Distance metric handed to cuML's NearestNeighbors.

    Returns:
        dict: Maps each index label of `df` to the list of labels of its
            neighbours, excluding the label itself.
    """
    matcher = cuml.neighbors.NearestNeighbors(n_neighbors=n_neighbors, metric=metric)
    matcher.fit(preds)

    _, indices = matcher.kneighbors(preds)

    # Translate positional neighbour indices into the labels of df's index.
    ids = df.index[indices.flatten()].values.reshape(-1, n_neighbors)
    df["matches"] = list(ids)
    matches = df[["matches"]].to_dict(orient="dict")["matches"]

    # A vector normally retrieves itself; drop that trivial match.
    return {k: [m for m in row if m != k] for k, row in matches.items()}

In [None]:
# Load the two embedding tables written by the export cell of the training
# section ("prev" = table of the first item, "next" = table of the second).
# The file names are rebuilt from the same settings, so the query vectors
# and the indexed vectors come from the current SHIFT / DIM / NO_CLICKS /
# MODE run rather than from one fixed file.
embed_prefix = (
    f"../output/matrix_factorization/embed_{SHIFT}_{DIM}"
    f"{'_cartbuy' if NO_CLICKS else ''}"
)
x_prev = np.load(f"{embed_prefix}_prev_{MODE}.npy")
x_next = np.load(f"{embed_prefix}_next_{MODE}.npy")

In [None]:
# Reuse the neighbour count from the kNN parameters cell so the two values
# cannot drift apart.
n_neighbors = N_NEIGHBORS

In [None]:
# Index the x_next vectors for cosine-distance neighbour queries.
matcher = cuml.neighbors.NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
matcher.fit(x_next)

In [None]:
# Keep one row per session: after sorting by (session, ts), the 'last'
# aggregation takes the values of the final row of each session.
df = load_sessions(REGEX)
df = df.sort_values(['session', 'ts']).groupby('session').agg('last').reset_index()

In [None]:
# Gather one query vector per session (x_prev is overwritten by the rows
# selected with each session's aid), then look up its neighbours among the
# vectors the matcher was fitted on.
x_prev = x_prev[df['aid'].to_pandas().values]
dists, indices = matcher.kneighbors(x_prev)

In [None]:
# One list of neighbour indices per session.
# NOTE(review): the indices are row positions in the fitted matrix and are
# used as aids downstream - assumes row i of the embedding file is aid i.
df['candidates'] = indices.tolist()

In [None]:
# Move the frame to pandas; the following cells use pandas-only operations
# (row-wise apply, merge with a pandas label frame).
df = df.to_pandas()

In [None]:
# Reload the sessions and collect every column into per-session lists, after
# sorting each session by ts in descending order.
# NOTE(review): presumably the lists keep that sorted order - confirm.
df_ = load_sessions(REGEX)
df_ = df_.sort_values(['session', 'ts'], ascending=[True, False]).groupby('session').agg(list).reset_index()

# Positional assignment through .values.
# NOTE(review): assumes df_ and df list the sessions in the same order.
df['aids'] = df_['aid'].to_pandas().values
# Final candidates: the kNN neighbours followed by the session's own aids.
df['candidates'] = df.apply(lambda x: list(x.candidates) + list(x.aids), axis=1)

In [None]:
# Candidate coverage against the validation labels (skipped in test mode).
if MODE != "test":
    gt = pd.read_parquet("../output/val_labels.parquet")
    recalls = []

    for col in CLASSES:
        gt_col = f"gt_{col}"

        # Attach the ground truth of this class once; later runs of the cell
        # reuse the column that is already there.
        if gt_col not in df.columns:
            labels = gt[gt["type"] == col].drop("type", axis=1)
            df = df.merge(labels, how="left").rename(
                columns={"ground_truth": gt_col}
            )

        n_preds, n_gts, n_found = get_coverage(
            df["candidates"].values, df[gt_col].values
        )
        pos_prop = n_found / n_preds * 100
        recall = n_found / n_gts

        print(
            f"- {col} \t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates "
            f"(pos_prop={pos_prop:.2f}%)\t-  Highest reachable Recall : {recall:.4f}"
        )
        recalls.append(recall)

    # Per-class recalls combined with the weights from params.
    cv = np.average(recalls, weights=WEIGHTS)
    print(f"\n-> Highest reachable CV : {cv:.3f}")

+0
- clicks 	- Found 568.89K GTs with 7.7M candidates (pos_prop=7.39%)	-  Highest reachable Recall : 0.3241
- carts 	- Found 181.67K GTs with 7.7M candidates (pos_prop=2.36%)	-  Highest reachable Recall : 0.3154
- orders 	- Found 187K GTs with 7.7M candidates (pos_prop=2.43%)	-  Highest reachable Recall : 0.5970

+10
- clicks 	- Found 583.5K GTs with 25.71M candidates (pos_prop=2.27%)	-  Highest reachable Recall : 0.3324
- carts 	- Found 184.66K GTs with 25.71M candidates (pos_prop=0.72%)	-  Highest reachable Recall : 0.3206
- orders 	- Found 187.44K GTs with 25.71M candidates (pos_prop=0.73%)	-  Highest reachable Recall : 0.5984

+50

In [None]:
def explode(df, test=False):
    """Turn per-session candidate lists into one row per (session, candidate).

    Args:
        df (pandas.DataFrame): Frame with a "session" column and a list-like
            "candidates" column. Unless `test` is set it must also hold the
            list-like "gt_clicks", "gt_carts" and "gt_orders" columns. If an
            "aid" column is present, "aid" and "type" are dropped from the
            caller's frame in place.
        test (bool): When True the ground-truth columns are left untouched.

    Returns:
        cudf.DataFrame: Unique (session, candidates) rows sorted by both keys.
            When `test` is False, each gt_* column becomes a uint8 flag that
            is 1 when the candidate appears in that session's ground truth.
    """
    if "aid" in df.columns:
        df.drop(["aid", "type"], axis=1, inplace=True)

    gdf = cudf.from_pandas(df)
    gdf = gdf.explode("candidates")
    gdf = gdf.drop_duplicates(keep="first", subset=["session", "candidates"])

    for key in ("candidates", "session"):
        gdf[key] = gdf[key].astype("uint32")

    keys = ["session", "candidates"]
    gdf = gdf.sort_values(keys).reset_index(drop=True)

    if test:
        return gdf

    for col in ("gt_clicks", "gt_carts", "gt_orders"):
        # One row per (session, candidate, ground-truth aid); a missing
        # ground truth becomes -1.
        pairs = gdf[["session", "candidates", col]].explode(col)
        pairs = pairs.reset_index(drop=True).fillna(-1)
        pairs[col] = pairs[col].astype("int64") == pairs["candidates"].astype("int64")

        assert not pairs.isna().any().max()

        # A candidate is a hit if it equals any ground-truth aid of the session.
        hits = pairs.groupby(["session", "candidates"]).max().reset_index()
        hits = hits.sort_values(["session", "candidates"]).reset_index(drop=True)

        assert not hits.isna().any().max()

        # hits is sorted on the same keys as gdf, so the rows line up by index.
        gdf[col] = hits[col].astype("uint8")

    return gdf

In [None]:
# df_candids = explode(df, test=(MODE == "test"))

In [None]:
# df_candids.to_parquet(
#     f"../output/candidates/candidates_matrix_factorization_{MODE}.parquet", index=False
# )
# print(f"Saved to ../output/candidates/candidates_matrix_factorization_{MODE}.parquet")

Done