**About:** Computes features for the candidate pairs.

**TODO**:
- Use a for loop to automate the process

In [None]:
cd ../src

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os

# Expose only the GPU with index 1 to CUDA. This is set before the GPU
# libraries are imported in the following cell.
os.environ.update(CUDA_VISIBLE_DEVICES="1")

In [None]:
import os
import gc
import re
import cudf
import glob
import numba
import warnings
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

# Silence FutureWarning messages in the notebook output.
warnings.simplefilter("ignore", FutureWarning)

# Start pandarallel with 32 workers and no progress bar.
pandarallel.initialize(progress_bar=False, nb_workers=32)

# Display up to 500 columns when printing wide dataframes.
pd.set_option("display.max_columns", 500)

In [None]:
from params import *
from data.fe import *
from utils.load import load_sessions

### Load candidates

In [None]:
# Run configuration.
MODE = "val"  # "val", "test"
CANDIDATES_VERSION = "c-orders-v3"  # version tag of the candidate file
FEATURES_VERSION = "6"  # version tag of the features computed here

# Tag used to name the output folder: "<candidates version>.<features version>".
SUFFIX = ".".join((CANDIDATES_VERSION, FEATURES_VERSION))

In [None]:
# Candidate pairs to compute features for, and the session files of the
# current MODE.
CANDIDATE_FILE = f'../output/candidates/candidates_{CANDIDATES_VERSION}_{MODE}.parquet'
PARQUET_FILES = f"../output/{MODE}_parquet/*"

# Additional ("old") session files, selected according to MODE.
# NOTE(review): the comment next to MODE lists "val"/"test", but only "val"
# and "train" are handled here — confirm which modes should be supported.
if MODE == "val":
    OLD_PARQUET_FILES = "../output/full_train_parquet/*"
elif MODE == "train":
    OLD_PARQUET_FILES = "../output/other_parquet/*"
else:
    # Fail with an explicit message instead of a bare exception.
    raise NotImplementedError(
        f'Unsupported MODE {MODE!r}: expected "val" or "train"'
    )

In [None]:
# Read the candidate pairs with cuDF and order them by (session, candidates).
pairs = (
    cudf.read_parquet(CANDIDATE_FILE)
    .sort_values(['session', 'candidates'])
    .reset_index(drop=True)
)

In [None]:
# Only one chunk of the candidate pairs is processed per run.
PART = 2
CHUNK_SIZE = 1_000_000  # PER SESSION INSTEAD ??

# Row positions of chunk number PART, capped at the end of `pairs`.
chunk_start = PART * CHUNK_SIZE
chunk_stop = min(chunk_start + CHUNK_SIZE, len(pairs))
ids = np.arange(chunk_start, chunk_stop)

pairs = pairs.iloc[ids].reset_index(drop=True)
print(pairs.shape)

### Time weighting

In [None]:
# Load the sessions of the current MODE and compute the weights from them.
# `sessions` is deleted again a few cells below once the weights are merged.
sessions = load_sessions(PARQUET_FILES)
weights = compute_weights(sessions)

In [None]:
# Attach the weights to every (session, candidates) pair, then restore the
# (session, candidates) ordering.
pairs = (
    pairs.merge(weights, how="left", on=["session", "candidates"])
    .sort_values(['session', 'candidates'])
    .reset_index(drop=True)
)

# Every column after the first two of `weights` is treated as a weight column
# (presumably the first two are the merge keys — verify against
# compute_weights). Pairs without a match receive half of the column minimum.
for weight_col in weights.columns[2:]:
    fill_value = pairs[weight_col].min() / 2
    pairs[weight_col] = pairs[weight_col].fillna(fill_value).astype("float32")

In [None]:
# The sessions are no longer needed once the weights are merged: drop the
# reference so the memory can be reclaimed.
del sessions
# Flush numba's pending device-memory deallocations.
numba.cuda.current_context().deallocations.clear()
# Run the Python garbage collector.
gc.collect()

### Popularity
TODO :
- Popularity of items in session ?
- build session clusters and find popularity among it

In [None]:
# Popularity features are computed from three sources, each with its own
# column suffix: old + current files (""), old files only ("_old") and
# current files only ("_w").
popularity_sources = (
    ([OLD_PARQUET_FILES, PARQUET_FILES], ""),
    (OLD_PARQUET_FILES, "_old"),
    (PARQUET_FILES, "_w"),
)

for source_files, source_suffix in popularity_sources:
    pairs = compute_popularity_features(pairs, source_files, source_suffix)

In [None]:
# Flush numba's pending device-memory deallocations, then run the Python
# garbage collector.
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Covisitation features
TODO :
- merge rank in matrix

In [None]:
# Folder holding the covisitation matrices.
MATRIX_FOLDER = "../output/matrices/"

# Matrix names: a fixed stem followed by the current MODE.
MATRIX_NAMES = [
    f"{stem}_{MODE}"
    for stem in (
        "matrix_123_temporal_20",
        "matrix_123_type136_20",
        "matrix_12__20",
        "matrix_123_type0.590.5_20",
    )
]

In [None]:
sessions = load_sessions(PARQUET_FILES)

# Collapse each session into a single row of lists; rows are sorted by
# (session, aid) before grouping.
sessions = (
    sessions.sort_values(['session', "aid"])
    .groupby('session')
    .agg(list)
    .reset_index()
)

# Give every pair the aid list of its session, then restore the
# (session, candidates) ordering.
pairs = (
    pairs.merge(sessions[["session", "aid"]], how="left", on="session")
    .sort_values(['session', 'candidates'])
    .reset_index(drop=True)
)

In [None]:
# Compute the co-occurrence features of every matrix in MATRIX_NAMES.
# A leftover debugging `break` used to stop this loop after the first matrix
# and made the per-iteration cleanup unreachable; it has been removed so that
# all matrices are processed.
for name in MATRIX_NAMES:
    print(f' -> Features from {name}')

    fts = compute_coocurence_features(
        pairs[['session', 'candidates', 'aid']],
        os.path.join(MATRIX_FOLDER, name + ".pqt"),
        weights
    )

    # Column prefix: the matrix name without its trailing "_<MODE>" part.
    prefix = name.rsplit('_', 1)[0]

    # Columns after the first two are copied by position, with "w_" stripped
    # from their names.
    # NOTE(review): this assumes `fts` rows are in the same order as `pairs`.
    for c in fts.columns[2:]:
        pairs[f"{prefix}_{re.sub('w_', '', c)}"] = fts[c].values

    # Free the features of this matrix before loading the next one.
    del fts
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

# The per-session aid lists were only needed to compute these features.
pairs.drop('aid', axis=1, inplace=True)

del sessions, weights
numba.cuda.current_context().deallocations.clear()
gc.collect()

#### Popularity x day/weekday/hour
- popularity tomorrow ?

In [None]:
# sessions = load_sessions(PARQUET_FILES)
# sessions = get_datetime_info(sessions)

# sessions = sessions.groupby('session').max().reset_index().drop(['aid', 'ts', 'type'], axis=1)
# pairs = pairs.merge(sessions, on="session", how="left").sort_values(['session', 'candidates']).reset_index(drop=True)

In [None]:
# COLS = ["day", "weekday", "hour"]

# for i, c in enumerate(CLASSES): # + ["*"]
#     if c == "clicks":
#         continue

#     print(f'-> Popularity for {c if c != "*" else "views"}')

#     sessions = load_sessions([OLD_PARQUET_FILES, PARQUET_FILES])
#     sessions = sessions[sessions["type"] == i]
#     sessions = get_datetime_info(sessions)

#     for col in COLS:
#         popularity = sessions[['aid', col, 'type']].groupby(['aid', col]).count().reset_index()

#         popularity.columns = ["candidates", col, f"{col}_{c}_popularity"]
#         popularity[col] = popularity[col].astype(sessions[col].dtype)
#         popularity[f"{col}_{c}_popularity"] = np.clip(popularity[f"{col}_{c}_popularity"], 0, 2 ** 15 - 1).astype('int16')

#         pairs = pairs.merge(popularity, how="left", on=["candidates", col]).fillna(0)
        
# #         break
# #     break
# pairs.drop(['day'], axis=1, inplace=True)

In [None]:
# del sessions, popularity
# numba.cuda.current_context().deallocations.clear()
# gc.collect()

### Session features
- Count views/clicks/carts/orders of session
- Count views/clicks/carts/orders of each candidate

TODO :
- Distance to last view

In [None]:
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

# One feature per entry of CLASSES, plus "*" which keeps every event
# whatever its type.
for type_idx, action in enumerate(CLASSES + ["*"]):
    print(f'-> Candidate {action if action != "*" else "views"} in session')

    # Reloaded on every pass because the masking below overwrites "aid".
    sessions = load_sessions(PARQUET_FILES)

    if action != "*":
        # Events of any other type get their aid replaced by -1.
        sessions.loc[sessions["type"] != type_idx, "aid"] = -1

    # One row per session, with list-valued columns.
    sessions = sessions.groupby('session').agg(list).reset_index()

    pairs[f'candidate_{action}_before'] = count_actions(
        pairs[['session', 'candidates']],
        sessions,
    )

    # Free the per-type session table before the next pass.
    del sessions
    numba.cuda.current_context().deallocations.clear()
    gc.collect()
    
#     break

In [None]:
sessions = load_sessions(PARQUET_FILES)


def _count_events_per_session(events, count_name):
    """Return one row per session with its number of events, named `count_name`."""
    per_session = events[['session', 'ts']].groupby('session').count().reset_index()
    return per_session.rename(columns={"ts": count_name})


# Every event counts as a view; types 0, 1 and 2 are counted as clicks,
# carts and orders.
n_views = _count_events_per_session(sessions, "n_views")
n_clicks = _count_events_per_session(sessions[sessions['type'] == 0], "n_clicks")
n_carts = _count_events_per_session(sessions[sessions['type'] == 1], "n_carts")
n_orders = _count_events_per_session(sessions[sessions['type'] == 2], "n_orders")

# Left-join the per-type counts onto the view counts; a session without any
# event of a given type gets 0.
sessions_fts = n_views
for type_counts in (n_clicks, n_carts, n_orders):
    sessions_fts = sessions_fts.merge(type_counts, how="left", on="session").fillna(0)

# Store every count column as uint8, saturating at 255.
for count_col in sessions_fts.columns[1:]:
    sessions_fts[count_col] = np.clip(sessions_fts[count_col], 0, 255).astype(np.uint8)

In [None]:
# Attach the session-level counts to every pair.
pairs = pairs.merge(sessions_fts, on="session", how="left")
# Restore the (session, candidates) ordering and reset the index, as is done
# after every other sort of `pairs` in this notebook.
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

### Save

In [None]:
# Write the features of this PART to a folder named after MODE and SUFFIX.
save_by_chunks(pairs, f"../output/features/fts_{MODE}_{SUFFIX}/", part=PART)

In [None]:
# Show the first rows of the final feature table.
pairs.head()

Done