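**About:** Generates per-session candidate items from the precomputed matrices, measures their coverage against the ground truth, and saves them to parquet.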

**TODO**:

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%load_ext lab_black

In [4]:
import os

# Expose only GPU 0 to CUDA; this is set before the cell that imports cudf.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [5]:
import os
import gc
import sys
import cudf
import json
import glob
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize


# Silence FutureWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Start pandarallel; it provides the `parallel_apply` used in the Main section.
pandarallel.initialize(nb_workers=16, progress_bar=False, use_memory_fs=True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [6]:
from params import *

from data.covisitation import compute_covisitation_matrix
from data.candidates import (
    load_parquets,
    create_candidates,
    explode,
    matrix_to_candids_dict,
)

from utils.metrics import get_coverage
from utils.chris import suggest_clicks, suggest_buys, read_file_to_cache

### Params

In [7]:
# Data split to process; the next cell only accepts "val" or "test".
MODE = "val"

In [8]:
# Glob pattern of the session parquet chunks for each supported split.
_SESSION_GLOBS = {
    "val": "../output/val_parquet/*",
    "test": "../output/test_parquet/*",
}

if MODE not in _SESSION_GLOBS:
    raise NotImplementedError

REGEX = _SESSION_GLOBS[MODE]

In [9]:
# data_cache = {}
# for f in tqdm(files):
#     data_cache[f] = read_file_to_cache(f)

In [10]:
# Maximum number of candidates that suggest_orders returns per session.
ITEM_CT = 75

In [11]:
# Folder holding the matrix parquet files read in the Matrices section.
MATRIX_FOLDER = "../output/matrices/"

In [12]:
# Tag embedded in the name of the saved candidates file. Only the last assignment
# takes effect; the trailing numbers presumably record the ITEM_CT of each version.
SUFFIX = "c-orders-v3"  # 50
SUFFIX = "c-orders-v4"  # 75

### Utils

In [13]:
def matrix_to_candids_dict(matrix):
    """
    Converts a matrix in long format into a candidates dictionary.

    Note: this definition shadows the `matrix_to_candids_dict` imported from
    `data.candidates` at the top of the notebook.

    Args:
        matrix (cudf or pandas DataFrame): Frame with columns "aid_x", "aid_y" and "wgt".

    Returns:
        dict: Maps each "aid_x" to the list of its "aid_y", sorted by decreasing "wgt".
    """
    matrix = matrix.sort_values(["aid_x", "wgt"], ascending=[True, False])

    candids = matrix[["aid_x", "aid_y"]].groupby("aid_x").agg(list)

    # cudf frames are moved to host memory; pandas frames are used as they are.
    if hasattr(candids, "to_pandas"):
        candids = candids.to_pandas()

    # Depending on the backend, each cell is either an array-like exposing
    # `tolist` or already a Python list; both end up as plain lists.
    candids["aid_y"] = candids["aid_y"].apply(
        lambda x: x.tolist() if hasattr(x, "tolist") else list(x)
    )

    return candids["aid_y"].to_dict()

In [14]:
def load_parquets(regex):
    """
    Loads every parquet file matching a glob pattern into one session-sorted frame.

    Note: this definition shadows the `load_parquets` imported from `data.candidates`.

    Args:
        regex (str): Glob pattern of the parquet chunks to read.

    Returns:
        pandas DataFrame: All chunks concatenated and sorted by ("session", "ts"), with
            a "d" column (day of month, int8), "ts" divided by 1000 (int32) and
            "type" mapped through TYPE_LABELS (int8).
    """

    def _read_chunk(path):
        part = cudf.read_parquet(path)
        # The day is derived from the raw timestamp, before it is rescaled below.
        part["d"] = cudf.to_datetime(part.ts * 1e6).dt.day.astype("int8")
        part["ts"] = (part.ts / 1000).astype("int32")
        part["type"] = part["type"].map(TYPE_LABELS).astype("int8")
        return part

    parts = [_read_chunk(path) for path in glob.glob(regex)]
    merged = cudf.concat(parts)

    return merged.sort_values(["session", "ts"], ignore_index=True).to_pandas()

### GTs

In [15]:
# gt = cudf.read_parquet("../output/val_labels.parquet")

In [16]:
# gt_clicks = gt[gt["type"] == "clicks"].explode("ground_truth")
# gt_clicks = gt_clicks.drop("type", axis=1).rename(
#     columns={"ground_truth": "candidates"}
# )
# gt_clicks["gt_clicks"] = 1
# gt_clicks["gt_carts"] = 0
# gt_clicks["gt_orders"] = 0

# gt_carts = gt[gt["type"] == "carts"].explode("ground_truth")
# gt_carts = gt_carts.drop("type", axis=1).rename(columns={"ground_truth": "candidates"})
# gt_carts["gt_clicks"] = 0
# gt_carts["gt_carts"] = 1
# gt_carts["gt_orders"] = 0

# gt_orders = gt[gt["type"] == "orders"].explode("ground_truth")
# gt_orders = gt_orders.drop("type", axis=1).rename(
#     columns={"ground_truth": "candidates"}
# )
# gt_orders["gt_clicks"] = 0
# gt_orders["gt_carts"] = 0
# gt_orders["gt_orders"] = 1

In [17]:
# candidates_gt = cudf.concat([gt_clicks, gt_carts, gt_orders], ignore_index=True)

# candidates_gt = (
#     candidates_gt.groupby(["session", "candidates"])
#     .max()
#     .reset_index()
#     .sort_values(["session", "candidates"])
# )

# candidates_gt.to_parquet(f"../output/candidates/candidates_gt.parquet", index=False)

### Load

#### Sessions

In [18]:
# Events of the selected split, sorted by ("session", "ts").
df_val = load_parquets(REGEX)

#### Popular Items

In [19]:
def _most_frequent_aids(events, type_code, n=100):
    """Returns the `n` most frequent aids among the events of one type, most frequent first."""
    counts = events.loc[events["type"] == type_code, "aid"].value_counts()
    return counts.index.values[:n].tolist()


top_clicks = _most_frequent_aids(df_val, 0)
top_carts = _most_frequent_aids(df_val, 1)
top_orders = _most_frequent_aids(df_val, 2)

#### Matrices

In [20]:
# Each dict maps an aid to its related aids, strongest weight first
# (see matrix_to_candids_dict). Despite the "top_20" prefix, suggest_orders
# slices these lists to various lengths.

# Looked up with the cart/order aids of the session in suggest_orders.
top_20_buy2buy = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_cpu-90_{MODE}.pqt")
)

# Looked up with the cart/order aids of the session in suggest_orders.
top_20_buy2buy2 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_cpu-99_{MODE}.pqt")
)

top_20_orders = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_cpu-95_{MODE}.pqt")
)
# Alias of the same dict object (not a copy); not used by suggest_orders.
top_20_carts = top_20_orders

top_20_test = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-116_{MODE}.pqt")
)

# Not used by suggest_orders.
top_20_test2 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-115_{MODE}.pqt")
)

In [21]:
# Same format as above: aid -> related aids, strongest weight first.

# Not used by suggest_orders.
top_20 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-93_{MODE}.pqt")
)

top_20b = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-217_{MODE}.pqt")
)

top_20c = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-220_{MODE}.pqt")
)

top_20d = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-226_{MODE}.pqt")
)

top_20e = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-232_{MODE}.pqt")
)

top_20f = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-235_{MODE}.pqt")
)

In [22]:
# Same format as above: aid -> related aids, strongest weight first.

# Looked up with the most recent aids of the session in suggest_orders.
top_20_buy = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-239_{MODE}.pqt")
)

# Only used by suggest_orders for sessions with a single distinct aid.
top_20_new = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-700_{MODE}.pqt")
)

# Only used by suggest_orders for sessions with a single distinct aid.
top_20_new2 = matrix_to_candids_dict(
    cudf.read_parquet(MATRIX_FOLDER + f"matrix_gpu-701_{MODE}.pqt")
)

### Chris Functions

In [23]:
# Score multiplier per event type code, applied to the recency weights in
# suggest_orders. Codes follow the Popular Items cell: 0 = clicks, 1 = carts, 2 = orders.
type_weight_multipliers = {0: 1, 1: 6, 2: 3}

In [24]:
def suggest_orders(df):
    """
    Builds the candidate list of one session.

    Meant to be applied to the events of a single session (the Main section calls it
    through `groupby(["session"]).parallel_apply`). It reads the module-level lookups
    `top_20_*`, `type_weight_multipliers`, `top_orders` and `ITEM_CT`.

    Two regimes are used:
    - Sessions with at least 20 distinct aids: session aids are scored by recency and
      event type, matrix neighbours add small bonuses, and the best-scored aids are returned.
    - Other sessions: the session aids (most recent first) are returned first, followed
      by the best-scored matrix neighbours that are not already in the session.

    Args:
        df (pandas DataFrame): Events of one session with columns "aid", "type", "d" and "ts".
            Rows are assumed to be in chronological order (load_parquets sorts by session, ts).

    Returns:
        list: At most ITEM_CT candidate aids.
    """
    aids = df.aid.tolist()
    types = df.type.tolist()
    # Distinct aids of the session, most recent first.
    unique_aids = list(dict.fromkeys(aids[::-1]))

    # Distinct aids seen on the latest day of the session, most recent first.
    mx = df.d.max()
    aids2 = df.loc[df.d == mx].aid.tolist()
    unique_aids4 = list(dict.fromkeys(aids2[::-1]))

    # Distinct aids seen shortly before the last event of the session.
    mx = df.ts.max()
    aids2 = df.loc[df.ts >= mx - 60 * 60 / 2].aid.tolist()
    unique_aids5 = list(dict.fromkeys(aids2[::-1]))  # within 60 * 60 / 2 = 1800 ts units of the last event

    df2 = df.drop_duplicates("d")
    aids2 = df2.aid.tolist()
    unique_aids2 = list(dict.fromkeys(aids2[::-1]))  # first event of each day "d"

    df2 = df.sort_values("ts", ascending=False).drop_duplicates("d")
    aids2 = df2.aid.tolist()
    unique_aids3 = list(dict.fromkeys(aids2))  # last event of each day "d"

    # Distinct aids of type 1 or 2 (carts / orders), most recent first.
    df = df.loc[df["type"].isin([1, 2])]
    unique_buys = list(dict.fromkeys(df.aid.tolist()[::-1]))

    # Regime 1: sessions with at least 20 distinct aids are ranked by score only.
    if len(unique_aids) >= 20:
        # Recency weights growing from 2**0.5 - 1 (first event) to 1 (last event),
        # multiplied by the weight of the event type.
        weights = np.logspace(0.5, 1, len(aids), base=2, endpoint=True) - 1
        aids_temp = Counter()
        for aid, w, t in zip(aids, weights, types):
            aids_temp[aid] += w * type_weight_multipliers[t]
        # Bonus for the first and the last aid of each day.
        for aid in unique_aids2:
            aids_temp[aid] += 0.5
        for aid in unique_aids3:
            aids_temp[aid] += 0.5

        # Neighbours of the cart/order aids. In each loop below, the bonus is doubled
        # when `i` is a multiple of the slice length, i.e. for the first neighbour of
        # each source aid provided every neighbour list reaches the slice length;
        # shorter lists shift that alignment.
        aids3 = list(
            itertools.chain(
                *[
                    top_20_buy2buy[aid][:40]
                    for aid in unique_buys
                    if aid in top_20_buy2buy
                ]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.05
            if i % 40 == 0:
                aids_temp[aid] += 0.05
        aids3 = list(
            itertools.chain(
                *[
                    top_20_buy2buy2[aid][:40]
                    for aid in unique_buys
                    if aid in top_20_buy2buy2
                ]
            )
        )
        for i, aid in enumerate(aids3):
            aids_temp[aid] += 0.1
            if i % 40 == 0:
                aids_temp[aid] += 0.1

        # Neighbours of every aid of the session.
        aids4 = list(
            itertools.chain(
                *[top_20_test[aid][:40] for aid in unique_aids if aid in top_20_test]
            )
        )
        for i, aid in enumerate(aids4):
            aids_temp[aid] += 0.05
            if i % 40 == 0:
                aids_temp[aid] += 0.05
        # Neighbours of the most recent aid only.
        aids5 = list(
            itertools.chain(
                *[top_20c[aid][:20] for aid in unique_aids[:1] if aid in top_20c]
            )
        )
        for i, aid in enumerate(aids5):
            aids_temp[aid] += 0.05
            if i % 20 == 0:
                aids_temp[aid] += 0.05
        # Neighbours of the most recent cart/order aid only.
        aids6 = list(
            itertools.chain(
                *[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]
            )
        )
        for i, aid in enumerate(aids6):
            aids_temp[aid] += 0.05
            if i % 20 == 0:
                aids_temp[aid] += 0.05

        # Neighbours of the last aid of each day, then of the first aid of each day.
        aids7 = list(
            itertools.chain(
                *[top_20b[aid][:5] for aid in unique_aids3 if aid in top_20b]
            )
        )
        for i, aid in enumerate(aids7):
            aids_temp[aid] += 0.25
            if i % 5 == 0:
                aids_temp[aid] += 0.25
        aids7 = list(
            itertools.chain(
                *[top_20b[aid][:5] for aid in unique_aids2 if aid in top_20b]
            )
        )
        for i, aid in enumerate(aids7):
            aids_temp[aid] += 0.125
            if i % 5 == 0:
                aids_temp[aid] += 0.125

        result = [k for k, v in aids_temp.most_common(ITEM_CT)]

        # Popular-items fallback; it only triggers when `result` is empty.
        # NOTE(review): the slice bound `20 - len(result)` suggests padding up to 20
        # items may have been intended (`< 20`) - confirm.
        if len(result) < 1:
            result += top_orders[: 20 - len(result)]

        return result[:ITEM_CT]

    # Regime 2: sessions with fewer than 20 distinct aids.
    # Per-neighbour weights for slices of 10, 55, 20 and 40 neighbours: the first two
    # neighbours of a source aid count double. `weights4` is not used below.
    weights = [2, 2] + [1] * 8  # + [0]*30
    weights2 = [2, 2] + [1] * 53  # + [0]*25
    weights3 = [2, 2] + [1] * 18  # + [0]*70
    weights4 = [2, 2] + [1] * 38  # + [0]*70

    ln = len(unique_aids)

    aids_temp = Counter()
    # In the blocks below, the weight pattern is repeated once per full slice
    # (len(neighbours) // slice length) and `zip` stops at the shorter sequence, so
    # the patterns line up with source aids only when every list reaches the slice length.
    aids2 = list(
        itertools.chain(
            *[top_20_orders[aid][:10] for aid in unique_aids if aid in top_20_orders]
        )
    )
    w2 = weights * int(len(aids2) // 10)
    aids3 = list(
        itertools.chain(
            *[top_20_buy2buy[aid][:10] for aid in unique_buys if aid in top_20_buy2buy]
        )
    )
    w3 = weights * int(len(aids3) // 10)
    aids4 = list(
        itertools.chain(
            *[top_20_test[aid][:10] for aid in unique_aids if aid in top_20_test]
        )
    )
    w4 = weights * int(len(aids4) // 10)
    aids5 = list(
        itertools.chain(
            *[
                top_20_buy2buy2[aid][:10]
                for aid in unique_buys
                if aid in top_20_buy2buy2
            ]
        )
    )
    w5 = weights * int(len(aids5) // 10)
    for i, (aid, w) in enumerate(zip(aids2, w2)):
        # Multiplier decreasing from 1 towards 0.25 with the block index i // 10.
        m = 0.25 + 0.75 * (ln - (i // 10)) / ln
        aids_temp[aid] += w * m
    for i, (aid, w) in enumerate(zip(aids3, w3)):
        aids_temp[aid] += w / 2
    for i, (aid, w) in enumerate(zip(aids4, w4)):
        m = 0.25 + 0.75 * (ln - (i // 10)) / ln
        aids_temp[aid] += w * m
    for i, (aid, w) in enumerate(zip(aids5, w5)):
        aids_temp[aid] += w / 2

    # Neighbours of the most recent aid only.
    aids5 = list(
        itertools.chain(
            *[top_20c[aid][:55] for aid in unique_aids[:1] if aid in top_20c]
        )
    )
    w5 = weights2 * int(len(aids5) // 55)
    for aid, w in zip(aids5, w5):
        aids_temp[aid] += w

    # NEW
    # Extra neighbours for sessions made of a single distinct aid.
    if len(unique_aids) == 1:
        aids5 = list(
            itertools.chain(
                *[
                    top_20_new2[aid][:20]
                    for aid in unique_aids[-1:]
                    if aid in top_20_new2
                ]
            )
        )
        w5 = weights3 * int(len(aids5) // 20)
        for aid, w in zip(aids5, w5):
            aids_temp[aid] += w
        aids5 = list(
            itertools.chain(
                *[top_20_new[aid][:20] for aid in unique_aids[-1:] if aid in top_20_new]
            )
        )
        w5 = weights3 * int(len(aids5) // 20)
        for aid, w in zip(aids5, w5):
            aids_temp[aid] += w

    # Neighbours of the most recent cart/order aid only.
    aids5 = list(
        itertools.chain(
            *[top_20d[aid][:20] for aid in unique_buys[:1] if aid in top_20d]
        )
    )
    w5 = weights3 * int(len(aids5) // 20)
    for aid, w in zip(aids5, w5):
        aids_temp[aid] += w

    # Neighbours of the most recent aids, normalised by how many such aids there are.
    ln2 = len(unique_aids5)
    aids5 = list(
        itertools.chain(
            *[top_20_buy[aid][:20] for aid in unique_aids5 if aid in top_20_buy]
        )
    )
    w5 = weights3 * int(len(aids5) // 20)
    for aid, w in zip(aids5, w5):
        aids_temp[aid] += 2 * w / ln2

    # Neighbours of the aids of the latest day; the bonus shrinks by 0.05 per block
    # of 5 neighbours and is doubled when `i` is a multiple of 5.
    aids4 = list(
        itertools.chain(*[top_20f[aid][:5] for aid in unique_aids4 if aid in top_20f])
    )
    for i, aid in enumerate(aids4):
        w = i // 5
        aids_temp[aid] += 1 / 2 - w * 0.05
        if i % 5 == 0:
            aids_temp[aid] += 1 / 2 - w * 0.05
    # Neighbours of the last aid of each day; the weight shrinks by 0.1 per block of 55.
    aids5 = list(
        itertools.chain(*[top_20e[aid][:55] for aid in unique_aids3 if aid in top_20e])
    )
    w5 = weights2 * int(len(aids5) // 55)
    for i, (aid, w) in enumerate(zip(aids5, w5)):
        w2 = i // 55
        aids_temp[aid] += w - w2 * 0.1
    # Neighbours of the first aid of each day; the weight shrinks by 0.05 per block of 10.
    aids5 = list(
        itertools.chain(*[top_20e[aid][:10] for aid in unique_aids2 if aid in top_20e])
    )
    w5 = weights * int(len(aids5) // 10)
    for i, (aid, w) in enumerate(zip(aids5, w5)):
        w2 = i // 10
        aids_temp[aid] += w / 2.0 - w2 * 0.05

    # Session aids are filtered out after taking the ITEM_CT best scores, so fewer than
    # ITEM_CT - len(unique_aids) new candidates may remain.
    sorted_aids = [k for k, v in aids_temp.most_common(ITEM_CT) if k not in unique_aids]

    # Session aids come first, followed by the scored neighbours.
    result = unique_aids + sorted_aids[: ITEM_CT - len(unique_aids)]

    # Popular-items fallback; it only triggers when `result` is empty.
    # NOTE(review): `< 20` may have been intended - confirm.
    if len(result) < 1:
        result += top_orders[: 20 - len(result)]

    return result[:ITEM_CT]

### Main

In [25]:
# Bucket sessions by blocks of 100,000 session ids; the Main loop handles one bucket at a time.
df_val["chunk"] = df_val["session"] // 100000

In [26]:
%%time
# Run suggest_orders on every session, one chunk of sessions at a time.
chunk_groups = df_val.groupby("chunk")
pbar = tqdm(chunk_groups, total=len(df_val["chunk"].unique()))

preds = []
for _, dfg in pbar:
    pbar.set_description(f"Chunk size {len(dfg)}")
    sessions = dfg.groupby(["session"])
    preds.append(sessions.parallel_apply(lambda x: suggest_orders(x)))

# One list of candidates per session, indexed by session.
pred_df_orders = pd.concat(preds)

del preds
gc.collect()

Chunk size 353331: 100%|██████████| 19/19 [26:21<00:00, 83.22s/it]


CPU times: user 5min 59s, sys: 6min 3s, total: 12min 2s
Wall time: 26min 42s


0

### Coverage

In [27]:
%%time

if MODE != "test":
    # Click candidates are optional: fall back to the order candidates only when
    # `pred_df_clicks` was never computed. Any other error is left to propagate
    # instead of being silently swallowed.
    try:
        clicks_source = pred_df_clicks
    except NameError:
        clicks_source = pred_df_orders

    # add_suffix tags the session index with the target type ("<session>_<type>").
    clicks_pred_df = pd.DataFrame(
        clicks_source.add_suffix("_clicks"), columns=["labels"]
    ).reset_index()
    orders_pred_df = pd.DataFrame(
        pred_df_orders.add_suffix("_orders"), columns=["labels"]
    ).reset_index()
    carts_pred_df = pd.DataFrame(
        pred_df_orders.add_suffix("_carts"), columns=["labels"]
    ).reset_index()

    pred_df = pd.concat([clicks_pred_df, orders_pred_df, carts_pred_df])
    pred_df.columns = ["session_type", "labels_l"]
    # Space-separated string version of each candidate list.
    pred_df["labels"] = pred_df["labels_l"].apply(lambda x: " ".join(map(str, x)))

CPU times: user 2min 24s, sys: 7.81 s, total: 2min 32s
Wall time: 2min 32s


In [28]:
if MODE != "test":
    gt = pd.read_parquet("../output/val_labels.parquet")

    # One row per (session, type) with its candidates, joined with the ground truth.
    df_pred = pred_df[["session_type", "labels_l"]].copy()
    df_pred.columns = ["session_type", "candidates"]
    session_type_parts = df_pred["session_type"].apply(lambda x: x.split("_"))
    df_pred["session"] = session_type_parts.apply(lambda p: p[0]).astype(int)
    df_pred["type"] = session_type_parts.apply(lambda p: p[1])

    df_pred = df_pred.merge(gt, on=["session", "type"], how="left")

    # Highest reachable recall per target type.
    recs = []
    for col in CLASSES:
        is_col = df_pred["type"] == col

        n_preds, n_gts, n_found = get_coverage(
            df_pred.loc[is_col, "candidates"].values,
            df_pred.loc[is_col, "ground_truth"].values,
        )
        print(
            f"- {col} \t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.4f}"
        )

        recs.append(n_found / n_gts)

    cv = np.average(recs, weights=WEIGHTS)
    print(f"\n-> CV : {cv:.4f}")

    # Free the intermediate frames; this cell cannot be re-run without the previous one.
    del clicks_pred_df, orders_pred_df, carts_pred_df, pred_df, df_pred
    gc.collect()

- clicks 	- Found 1.11M GTs with 127.05M candidates (pos_prop=0.87%)	-  Highest reachable Recall : 0.6317
- carts 	- Found 287.71K GTs with 127.05M candidates (pos_prop=0.23%)	-  Highest reachable Recall : 0.4995
- orders 	- Found 220.07K GTs with 127.05M candidates (pos_prop=0.17%)	-  Highest reachable Recall : 0.7026

-> CV : 0.6346


- 20
 - clicks 	- Found 908.67K GTs with 36.03M candidates (pos_prop=2.52%)	-  Highest reachable Recall : 0.5177
 - carts 	- Found 245.23K GTs with 36.03M candidates (pos_prop=0.68%)	-  Highest reachable Recall : 0.4257
 - orders 	- Found 206.18K GTs with 36.03M candidates (pos_prop=0.57%)	-  Highest reachable Recall : 0.6582
- 50
 - clicks 	- Found 1.07M GTs with 87.44M candidates (pos_prop=1.22%)	-  Highest reachable Recall : 0.6076
 - carts 	- Found 275.95K GTs with 87.44M candidates (pos_prop=0.32%)	-  Highest reachable Recall : 0.4790
 - orders 	- Found 216.38K GTs with 87.44M candidates (pos_prop=0.25%)	-  Highest reachable Recall : 0.6908
- 75
 - clicks 	- Found 1.11M GTs with 127.24M candidates (pos_prop=0.87%)	-  Highest reachable Recall : 0.6317
 - carts 	- Found 287.71K GTs with 127.24M candidates (pos_prop=0.23%)	-  Highest reachable Recall : 0.4995
 - orders 	- Found 220.08K GTs with 127.24M candidates (pos_prop=0.17%)	-  Highest reachable Recall : 0.7026

### Save

In [29]:
# One row per session, with its list of candidate aids.
df_candids = pd.DataFrame(pred_df_orders).reset_index()
df_candids.columns = ["session", "candidates"]

In [30]:
if MODE != "test":
    gt = pd.read_parquet(f"../output/{MODE}_labels.parquet")
    gt["ground_truth"] = gt["ground_truth"].apply(lambda x: x.tolist())

    # Attach one ground-truth column per target type; columns that already exist are skipped.
    for col in CLASSES:
        target_col = f"gt_{col}"
        if target_col in df_candids.columns:
            continue

        gt_col = gt[gt["type"] == col].drop("type", axis=1)
        df_candids = df_candids.merge(gt_col, how="left").rename(
            columns={"ground_truth": target_col}
        )

In [31]:
def explode(df, test=False):
    """
    Explodes per-session candidate lists into one row per (session, candidate) pair.

    Note: this definition shadows the `explode` imported from `data.candidates`.

    Args:
        df (pandas DataFrame): One row per session with a "candidates" list column. Unless
            `test` is True, the list columns "gt_clicks", "gt_carts" and "gt_orders"
            are required as well. The input frame is not modified.
        test (bool, optional): Whether to skip the target computation. Defaults to False.

    Returns:
        cudf DataFrame: Unique (session, candidates) pairs sorted by session then candidate.
            Unless `test` is True, the gt_* columns are uint8 flags telling whether the
            candidate belongs to the corresponding ground truth.
    """
    if "aid" in df.columns:
        # Non in-place drop, so that the caller's frame is left untouched.
        df = df.drop(["aid", "type"], axis=1)

    df = cudf.from_pandas(df)
    df = df.explode("candidates")
    df = df.drop_duplicates(keep="first", subset=["session", "candidates"])

    df["candidates"] = df["candidates"].astype("uint32")
    df["session"] = df["session"].astype("uint32")

    df = df.sort_values(["session", "candidates"]).reset_index(drop=True)

    if not test:
        for col in ["gt_clicks", "gt_carts", "gt_orders"]:
            # One row per (candidate, ground-truth item); missing ground truths become
            # -1, which cannot match a candidate since those were cast to uint32.
            df_tgt = (
                df[["session", "candidates", col]].explode(col).reset_index(drop=True)
            ).fillna(-1)
            df_tgt[col] = df_tgt[col].astype("int64") == df_tgt["candidates"].astype(
                "int64"
            )

            assert not df_tgt.isna().any().max()

            # A candidate is positive if it matches any ground-truth item of its session.
            df_tgt = df_tgt.groupby(["session", "candidates"]).max().reset_index()
            df_tgt = df_tgt.sort_values(["session", "candidates"]).reset_index(
                drop=True
            )

            assert not df_tgt.isna().any().max()

            # df_tgt holds the same unique pairs as df, sorted the same way with a
            # fresh index, so the index-aligned assignment matches rows one to one.
            df[col] = df_tgt[col].astype("uint8")

    return df

In [32]:
# Explode to one row per (session, candidate); targets are only computed outside test mode.
df_candids = explode(df_candids, test=(MODE == "test"))

# The output file name embeds both the version tag and the split.
df_candids.to_parquet(
    f"../output/candidates/candidates_{SUFFIX}_{MODE}.parquet", index=False
)
print(f"Saved to ../output/candidates/candidates_{SUFFIX}_{MODE}.parquet")

Saved to ../output/candidates/candidates_c-orders-v4_val.parquet


## Theo's version

### Params

In [None]:
# Overrides the MODE and SUFFIX set at the top of the notebook for this section.
MODE = "test"
SUFFIX = "v5"

In [None]:
# N_MATRIX is not referenced in the cells shown below; MAX_COOC only appears in the
# commented-out create_candidates call.
N_MATRIX = 20
MAX_COOC = 100

In [None]:
# Glob pattern of the session parquet chunks for each supported split.
_PARQUET_GLOBS = {
    "val": "../output/val_parquet/*",
    "val_c": "../output/val_c_parquet/*",  # needs to be recomputed, matrices as well
    "train": "../output/train_parquet/*",
    "test": "../output/test_parquet/*",
}

if MODE not in _PARQUET_GLOBS:
    raise NotImplementedError

PARQUET_FILES = _PARQUET_GLOBS[MODE]

### Load

In [None]:
# load_parquets, as defined in this notebook, already sorts by ("session", "ts");
# the sort is applied once more here.
df = load_parquets(PARQUET_FILES)
df = df.sort_values(["session", "ts"]).reset_index(drop=True)

### Candidates

In [None]:
# %%time  # TODO
# df = create_candidates(df, clicks_candids, type_weighted_candids, max_cooc=MAX_COOC)

In [None]:
# The lookups may never have been created (the cell that would use them is
# commented out), so free them only if they exist instead of raising NameError.
try:
    del clicks_candids, type_weighted_candids
except NameError:
    pass
gc.collect()

### Coverage

In [None]:
# Distribution of the number of candidates per session, clipped to [0, 150] for display.
# Requires a "candidates" column, presumably added by the commented-out
# create_candidates cell - TODO confirm.
n_candid = df["candidates"].apply(len)
sns.histplot(np.clip(n_candid, 0, 150))

plt.title(f"Proportion of sessions with <20 candidates : {(n_candid < 20).mean() :.3f}")
plt.show()

In [None]:
if MODE != "test":
    gt = pd.read_parquet(f"../output/{MODE}_labels.parquet")
    # gt = pd.read_parquet("../input/chris/test_labels.parquet")

    # Highest reachable recall per target type.
    recalls = []
    for col in CLASSES:
        gt_name = f"gt_{col}"

        # Attach the ground truth of this type, unless it is already there.
        if gt_name not in df.columns:
            gt_col = gt[gt["type"] == col].drop("type", axis=1)
            df = df.merge(gt_col, how="left").rename(
                columns={"ground_truth": gt_name}
            )

        n_preds, n_gts, n_found = get_coverage(
            df["candidates"].values, df[gt_name].values
        )

        print(
            f"{col}\t- Found {numerize(n_found)} GTs with {numerize(n_preds)} candidates (pos_prop={n_found / n_preds * 100 :.2f}%)\t-  Highest reachable Recall : {n_found / n_gts :.3f}"
        )
        recalls.append(n_found / n_gts)

In [None]:
if MODE != "test":
    # Average of the per-type recalls computed in the previous cell, weighted by WEIGHTS.
    cv = np.average(recalls, weights=WEIGHTS)
    print(f"-> Highest reachable CV : {cv:.3f}")

### Explode & saving

In [None]:
# One row per (session, candidate); targets are only computed outside test mode.
df = explode(df, test=(MODE=="test"))

In [None]:
# The output file name embeds both the version tag and the split.
df.to_parquet(
    f"../output/candidates/candidates_{SUFFIX}_{MODE}.parquet", index=False
)
print(f"Saved to ../output/candidates/candidates_{SUFFIX}_{MODE}.parquet")

In [None]:
# Release the exploded frame once it has been written to disk.
del df
gc.collect()

Done