**About:** Generates candidates.

**TODO**:

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
# Load the autoreload extension and use mode 2, which reloads modules
# automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [3]:
import os
# Expose only GPU index 1 to CUDA; set in this cell, ahead of the cell that imports cudf.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
import os
import gc
import sys
import cudf
import json
import glob
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize


# Suppress FutureWarning messages for the rest of the session.
warnings.simplefilter(action="ignore", category=FutureWarning)
# Start pandarallel with 32 workers and its progress bars turned off.
pandarallel.initialize(nb_workers=32, progress_bar=False)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from params import *

from data.covisitation import compute_covisitation_matrix
from data.candidates import load_parquets, create_candidates, explode, matrix_to_candids_dict

from utils.metrics import get_coverage
from utils.chris import suggest_clicks, suggest_buys, read_file_to_cache

## Covisitation matrices
- TODO: decide whether the matrices should be recomputed on train only, without using the validation data.

In [6]:
# Folder the covisitation matrices are saved to (passed as save_folder further down).
MATRIX_FOLDER = "../output/matrices_2/"

# Create the folder if it is missing; no error is raised when it already exists.
os.makedirs(MATRIX_FOLDER, exist_ok=True)

In [7]:
# Run mode: "val" or "test". It selects the input parquet folders and is passed
# as the suffix when the matrices are computed.
MODE = "val"

In [12]:
# Pick the parquet folders that feed the covisitation matrices for this mode.
if MODE == "val":
    input_patterns = [
        "../output/full_train_parquet/*",
        "../output/val_parquet/*",
        "../output/val_trimmed_parquet/*",
    ]
elif MODE == "test":
    input_patterns = [
        "../output/full_train_val_parquet/*",
        "../output/test_parquet/*",
    ]
else:
    raise NotImplementedError(f"Unsupported MODE {MODE!r}; expected 'val' or 'test'.")

# glob returns matches in arbitrary order, so each folder listing is sorted to
# make the file list identical from one run to the next. Folder order is kept.
files = [path for pattern in input_patterns for path in sorted(glob.glob(pattern))]

In [14]:
# Read every input file once and keep the result keyed by its path;
# tqdm shows progress over the file list.
data_cache = {path: read_file_to_cache(path) for path in tqdm(files)}

100%|██████████| 156/156 [01:18<00:00,  2.00it/s]


In [15]:
# For each value of n, compute and save four covisitation matrices that differ
# only in their weighting options. The option sets are rebuilt on every pass so
# each call receives fresh objects, and they run in the order listed.
for n in [20, 40]:
    weighting_options = [
        {"weighting": "temporal"},
        {"weighting": "type", "type_weight": {0: 1, 1: 3, 2: 6}},
        {"considered_types": [1, 2], "weighting": ""},
        {"weighting": "type", "type_weight": {0: 0.5, 1: 9, 2: 0.5}},
    ]

    for options in weighting_options:
        compute_covisitation_matrix(
            files,
            data_cache,
            n=n,
            save_folder=MATRIX_FOLDER,
            suffix=MODE,
            **options,
        )

100%|██████████| 16/16 [00:43<00:00,  2.74s/it]
100%|██████████| 16/16 [00:42<00:00,  2.65s/it]
100%|██████████| 16/16 [00:42<00:00,  2.63s/it]
100%|██████████| 16/16 [00:42<00:00,  2.63s/it]


Saving matrix to ../output/matrices_2/matrix_123_temporal_20_val.pqt


100%|██████████| 16/16 [00:41<00:00,  2.62s/it]
100%|██████████| 16/16 [00:43<00:00,  2.69s/it]
100%|██████████| 16/16 [00:42<00:00,  2.68s/it]
100%|██████████| 16/16 [00:42<00:00,  2.66s/it]


Saving matrix to ../output/matrices_2/matrix_123_type136_20_val.pqt


100%|██████████| 16/16 [00:14<00:00,  1.11it/s]
100%|██████████| 16/16 [00:12<00:00,  1.30it/s]
100%|██████████| 16/16 [00:12<00:00,  1.30it/s]
100%|██████████| 16/16 [00:12<00:00,  1.31it/s]


Saving matrix to ../output/matrices_2/matrix_12__20_val.pqt


100%|██████████| 16/16 [00:40<00:00,  2.54s/it]
100%|██████████| 16/16 [00:40<00:00,  2.52s/it]
100%|██████████| 16/16 [00:40<00:00,  2.52s/it]
100%|██████████| 16/16 [00:39<00:00,  2.50s/it]


Saving matrix to ../output/matrices_2/matrix_123_type0.590.5_20_val.pqt


100%|██████████| 16/16 [00:39<00:00,  2.44s/it]
100%|██████████| 16/16 [00:39<00:00,  2.45s/it]
100%|██████████| 16/16 [00:39<00:00,  2.48s/it]
100%|██████████| 16/16 [00:39<00:00,  2.46s/it]


Saving matrix to ../output/matrices_2/matrix_123_temporal_40_val.pqt


100%|██████████| 16/16 [00:39<00:00,  2.49s/it]
100%|██████████| 16/16 [00:39<00:00,  2.49s/it]
100%|██████████| 16/16 [00:39<00:00,  2.49s/it]
100%|██████████| 16/16 [00:39<00:00,  2.48s/it]


Saving matrix to ../output/matrices_2/matrix_123_type136_40_val.pqt


100%|██████████| 16/16 [00:11<00:00,  1.39it/s]
100%|██████████| 16/16 [00:11<00:00,  1.41it/s]
100%|██████████| 16/16 [00:11<00:00,  1.41it/s]
100%|██████████| 16/16 [00:12<00:00,  1.32it/s]


Saving matrix to ../output/matrices_2/matrix_12__40_val.pqt


100%|██████████| 16/16 [00:40<00:00,  2.51s/it]
100%|██████████| 16/16 [00:40<00:00,  2.51s/it]
100%|██████████| 16/16 [00:41<00:00,  2.57s/it]
100%|██████████| 16/16 [00:42<00:00,  2.63s/it]


Saving matrix to ../output/matrices_2/matrix_123_type0.590.5_40_val.pqt


Done