**About :** Computes Features.

**TODO**:
- Use a for loop to automate the computation

In [125]:
cd ../src

/workspace/kaggle_otto_rs/src


In [126]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [127]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [128]:
import os
import gc
import re
import cudf
import glob
import numba
import warnings
import numpy as np
import pandas as pd
import seaborn as sns

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

# Silence FutureWarning and UserWarning messages.
warnings.simplefilter(action="ignore", category=FutureWarning)
warnings.simplefilter(action="ignore", category=UserWarning)
# Start pandarallel with 32 workers and without progress bars.
pandarallel.initialize(nb_workers=32, progress_bar=False)
# Display limits used when previewing DataFrames.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 50

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [129]:
from params import *
from data.fe import *
from utils.load import load_sessions

### Load candidates

In [130]:
# Split to process; it selects the candidate file, the session files and the embeddings.
# NOTE(review): the inline hint lists "val" / "test", while the cell that sets
# OLD_PARQUET_FILES only accepts "val" / "train" — confirm which values are intended.
MODE = "val"  #  "val", "test"
# Version tag of the candidates file to load.
CANDIDATES_VERSION = "c-orders-v3"
# Version tag of the feature set produced by this notebook.
FEATURES_VERSION = "8"

# Suffix of the output folder the features are saved to.
SUFFIX = f"{CANDIDATES_VERSION}.{FEATURES_VERSION}"

In [131]:
# Input locations derived from the run configuration.
CANDIDATE_FILE = f'../output/candidates/candidates_{CANDIDATES_VERSION}_{MODE}.parquet'
PARQUET_FILES = f"../output/{MODE}_parquet/*"

# Additional session files, passed to the popularity features together with PARQUET_FILES.
# NOTE(review): the hint next to MODE lists "test", which is not handled here — confirm.
if MODE == "val":
    OLD_PARQUET_FILES = "../output/full_train_parquet/*"
elif MODE == "train":
    OLD_PARQUET_FILES = "../output/other_parquet/*"
else:
    # Fail with an explicit message so an unsupported mode is easy to diagnose.
    raise NotImplementedError(
        f"No OLD_PARQUET_FILES location is defined for MODE={MODE!r} (supported: 'val', 'train')."
    )

In [132]:
# Load the candidate pairs with cudf.
pairs = cudf.read_parquet(CANDIDATE_FILE)
# Reference row order used throughout the notebook: sorted by (session, candidates), fresh index.
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

In [133]:
PART = 0
CHUNK_SIZE = 1_000_000  # PER SESSION INSTEAD ??

# Row window [chunk_start, chunk_end) of the part to process, capped at the table length.
# NOTE(review): the window is defined on rows, so a session can be split between two parts.
chunk_start = PART * CHUNK_SIZE
chunk_end = min(chunk_start + CHUNK_SIZE, len(pairs))
ids = np.arange(chunk_start, chunk_end)

# Keep only the rows of this part and renumber them from 0.
pairs = pairs.iloc[ids].reset_index(drop=True)
print(pairs.shape)

(1000000, 5)


### Time weighting

In [134]:
# Load the sessions of the current split and derive the weight table from them.
sessions = load_sessions(PARQUET_FILES)
weights = compute_weights(sessions)

In [135]:
# Attach the weight columns to every (session, candidates) pair (left join keeps all pairs),
# then restore the reference row order.
pairs = pairs.merge(weights, how="left", on=["session", "candidates"])
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

# For every column of `weights` after the first two: replace missing values by half of the
# column minimum and cast to float32.
# presumably the first two columns are the merge keys — verify against compute_weights
for c in weights.columns[2:]:
    pairs[c] = pairs[c].fillna(pairs[c].min() / 2).astype("float32")

In [136]:
# Drop the sessions table, clear numba's pending CUDA deallocations and run the garbage collector.
del sessions
numba.cuda.current_context().deallocations.clear()
gc.collect()

2160

### Word2vec

In [111]:
# Add the Word2Vec features to the pairs, using the embedding file of the current split.
pairs = compute_w2v_features(pairs, PARQUET_FILES, f'../output/matrix_factorization/word2vec_{MODE}.emb')

-> Computing Word2Vec similarities from matrix word2vec_val.emb


In [112]:
pairs.head()

Unnamed: 0,session,candidates,gt_clicks,gt_carts,gt_orders,w_pos-log,w_type-163,w_lastday,w_time,w_recsys,w2v_sim_1,w2v_sim_2,w2v_sim_3,w2v_sim_wgt_1,w2v_sim_wgt_2,w2v_sim_last,w2v_sim_type_1
0,11098528,2911,0,0,0,0.035887,0.5,0.0,0.05425,0.000703,0.969566,0.648074,6,0.140399,0.208146,0.969566,0.324037
1,11098528,7555,0,0,0,0.035887,0.5,0.0,0.05425,0.000703,0.905576,0.61356,6,0.133495,0.197423,0.905576,0.30678
2,11098528,11830,0,0,1,0.071773,1.0,1.0,0.434294,0.165468,1.0,0.552832,5,0.137038,0.188457,0.608779,0.276416
3,11098528,92401,0,0,1,0.071773,1.0,1.0,0.621335,0.34606,1.0,0.576727,4,0.12539,0.185514,0.611477,0.288364
4,11098528,136418,0,0,0,0.035887,0.5,0.0,0.05425,0.000703,0.953659,0.639578,6,0.138836,0.205593,0.953659,0.319789


### Matrix Factorization

In [114]:
# Folder containing the matrix-factorization embeddings (one ".npy" file per name below).
EMBED_PATH = "../output/matrix_factorization/"

# Embeddings to derive features from. Names containing "_cartbuy" are handled
# specifically in the feature loop (click events are filtered out).
EMBED_NAMES = [
    f'embed_1-9_64_cartbuy_{MODE}',
    f'embed_1_64_{MODE}',
    f'embed_1-5_64_{MODE}',
]

In [115]:
for embed_name in EMBED_NAMES:
    print(f'-> Features from matrix {embed_name}')
    # Feature prefix: the embedding name without its trailing "_<MODE>" part.
    name = embed_name.rsplit('_', 1)[0]

    # Load embeddings, scale every row to unit L2 norm, then append an all-zero row
    # (index embed.shape[0] - 1) that stands for "no item".
    embed_path = os.path.join(EMBED_PATH, embed_name + ".npy")
    embed = np.load(embed_path)
    embed /= np.reshape(np.sqrt(np.sum(embed * embed, axis=1)), (-1, 1))
    embed = np.concatenate((embed, np.zeros((1, embed.shape[1])))).astype(np.float32)

    # Retrieve sessions; "cartbuy" embeddings ignore events of type 0.
    # Events are ordered most recent first within each session.
    sessions = load_sessions(PARQUET_FILES)
    if "_cartbuy" in embed_path:
        sessions = sessions[sessions['type'] != 0]
    sessions = sessions.sort_values(['session', "ts"], ascending=[True, False])

    # Last n events: last_0 is the most recent aid of the session, last_1 the one before, ...
    df_s = sessions[['session', "aid"]].groupby('session').first().reset_index()
    df_s.columns = ['session', 'last_0']

    # Position of each event counted from the most recent one (0-based).
    sessions['n'] = sessions[['session', "aid"]].groupby('session').cumcount()
    for n in range(5):
        if n > 0:
            df_s = df_s.merge(
                sessions[['session', "aid"]][sessions['n'] == n], how="left", on="session"
            ).rename(columns={"aid": f"last_{n}"})
        # Sessions with fewer than n + 1 events point to the zero row.
        df_s[f"last_{n}"] = df_s[f"last_{n}"].fillna(embed.shape[0] - 1).astype("int32")

    pairs = pairs.merge(df_s, how="left", on="session")
    for n in range(5):
        pairs[f"last_{n}"] = pairs[f"last_{n}"].fillna(embed.shape[0] - 1).astype("int32")
        # Dot product between the candidate embedding and the embedding of the n-th last item.
        pairs[f'{name}_last_{n}'] = np.sum(embed[pairs['candidates'].to_pandas().values] * embed[pairs[f'last_{n}'].to_pandas().values], axis=1)
        pairs[f'{name}_last_{n}'] -= (pairs[f'last_{n}'] == embed.shape[0] - 1)  # nan are set to -1
        pairs.drop(f'last_{n}', axis=1, inplace=True)

    # "cartbuy" embeddings use weights computed with no_click=True.
    # NOTE(review): the sessions are reloaded without the type filter here, so the aid
    # lists built below also contain type-0 events — confirm this is intended.
    weights_noclick = None
    if "_cartbuy" in embed_path:
        sessions = load_sessions(PARQUET_FILES)
        weights_noclick = compute_weights(sessions, no_click=True)

    # One row per session holding the list of its aids, sorted by aid. No prior sort by
    # timestamp is needed: the frame is sorted by (session, aid) right here and only the
    # aid list is used afterwards.
    sessions = sessions.sort_values(['session', "aid"]).groupby('session').agg(list).reset_index()
    pairs = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
    pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

    fts = compute_matrix_factorization_features(
        pairs[["session", "candidates", "aid"]],
        embed,
        weights if weights_noclick is None else weights_noclick
    )

    # Copy every feature column (all but the first two) into pairs, prefixed with the
    # embedding name and with "w_" removed from the column name.
    for c in fts.columns[2:]:
        pairs[f"{name}_{re.sub('w_', '', c)}"] = fts[c].values

    del fts, sessions, weights_noclick, df_s, embed
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

    pairs.drop('aid', axis=1, inplace=True)

-> Features from matrix embed_1-9_64_cartbuy_val
-> Features from matrix embed_1_64_val


KeyboardInterrupt: 

### Popularity

In [None]:
# Popularity features from the old + current session files, then from the current files only.
# the third argument is presumably a feature-name suffix — verify in data.fe
pairs = compute_popularity_features(pairs, [OLD_PARQUET_FILES, PARQUET_FILES], "")
pairs = compute_popularity_features(pairs, PARQUET_FILES, "_w")

In [None]:
# Clear numba's pending CUDA deallocations and run the garbage collector.
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Popularity 2

In [137]:
# Reload the sessions of the current split (the earlier table was deleted after the weights step).
sessions = load_sessions(PARQUET_FILES)

In [138]:
# Second family of popularity features, computed from the current sessions for the given mode.
pairs = compute_popularities_new(pairs, sessions, mode=MODE)

SKIP CLIC
-> Popularity for carts
-> Popularity for orders


In [139]:
pairs.groupby('gt_orders').mean()

Unnamed: 0_level_0,session,candidates,gt_clicks,gt_carts,w_pos-log,w_type-163,w_lastday,w_time,w_recsys,popularity_week_carts,popularity_day_carts,popularity_hour_carts,popularity_hour/day_carts,popularity_day/week_carts,popularity_week_orders,popularity_day_orders,popularity_hour_orders,popularity_hour/day_orders,popularity_day/week_orders
gt_orders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
0,11108740.0,888708.479323,0.01128,0.002204,0.058577,0.605996,0.085993,0.118627,0.087436,0.213671,0.244853,0.004422,0.006351,0.798954,0.223968,0.242399,0.003511,0.00186,0.355754
1,11109320.0,938360.709948,0.192968,0.40871,0.44277,5.724331,1.985617,1.406937,1.514609,0.315921,0.490269,0.053245,0.236783,2.475041,0.400702,0.595318,0.019225,0.008237,0.646194


### Covisitation features

In [None]:
# Folder containing the covisitation matrices (one ".pqt" file per matrix name).
MATRIX_FOLDER = "../output/matrices/"
# MATRIX_NAMES = [f"matrix_123_temporal_20_{MODE}", f"matrix_123_type136_20_{MODE}", f"matrix_12__20_{MODE}", f"matrix_123_type0.590.5_20_{MODE}"]

In [None]:
# Covisitation matrices to derive features from; every name ends with the current MODE.
MATRIX_NAMES = [
    f"matrix_123_temporal_20_{MODE}",
    f"matrix_123_type136_20_{MODE}",
    f"matrix_12__20_{MODE}",
    f"matrix_123_type0.590.5_20_{MODE}",
    f"matrix_cpu-90_{MODE}",
    f"matrix_cpu-95_{MODE}",
    f"matrix_cpu-99_{MODE}",
    f"matrix_gpu-116_{MODE}",
    f"matrix_gpu-115_{MODE}",
    f"matrix_gpu-93_{MODE}",
    f"matrix_gpu-217_{MODE}",
    f"matrix_gpu-226_{MODE}",
    f"matrix_gpu-232_{MODE}",
    f"matrix_gpu-239_{MODE}",
    f"matrix_gpu-700_{MODE}",
    f"matrix_gpu-701_{MODE}",
]

In [None]:
sessions = load_sessions(PARQUET_FILES)

# Collapse every session into one row of lists (rows sorted by aid beforehand), attach the
# list of aids to each pair of that session, then restore the reference row order.
sessions = sessions.sort_values(['session', "aid"]).groupby('session').agg(list).reset_index()
pairs = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

In [None]:
for name in MATRIX_NAMES:
    print(f' -> Features from {name}')

    # Features of the current matrix for every (session, candidates) pair.
    matrix_file = os.path.join(MATRIX_FOLDER, name + ".pqt")
    fts = compute_coocurence_features(pairs[['session', 'candidates', 'aid']], matrix_file, weights)

    # Every column after the first two is copied into pairs, named after the matrix
    # (without its trailing "_<MODE>" part) and the column with "w_" removed.
    prefix = name.rsplit('_', 1)[0]
    for c in fts.columns[2:]:
        pairs[f"{prefix}_{re.sub('w_', '', c)}"] = fts[c].values

    # Release the temporary feature table before moving to the next matrix.
    del fts
    numba.cuda.current_context().deallocations.clear()
    gc.collect()

# The aid lists, the sessions and the weights are no longer needed once all matrices are processed.
pairs.drop('aid', axis=1, inplace=True)

del sessions, weights
numba.cuda.current_context().deallocations.clear()
gc.collect()

### Rank ft

In [None]:
def add_rank_feature(pairs, feature):
    """
    Adds the per-session rank of a feature to `pairs` as a uint8 column, in place.

    Within each session, rows are ranked by decreasing `feature` value, starting at 1.
    Rows of a session sharing the same value all receive the smallest rank of their group.
    Ranks larger than 255 are saturated to 255 so that the uint8 cast cannot wrap around
    (the session count features in this notebook clip to the same range before casting).

    The ranks are computed on a copy re-sorted by (session, candidates) with a fresh index
    and written back to `pairs`, so `pairs` is expected to be in that same order with a
    default integer index.

    Args:
        pairs (DataFrame): Table with columns "session", "candidates" and `feature`.
        feature (str): Name of the column to rank.

    Returns:
        None: The column f"{feature}_rank" is added to `pairs`.
    """
    rank_col = f'{feature}_rank'

    df_ft = pairs[["session", "candidates", feature]]
    df_ft = df_ft.sort_values(feature, ascending=False, ignore_index=True)

    # Running count per session gives the rank; int32 keeps the count exact
    # for sessions with more than 255 rows.
    df_ft[rank_col] = 1
    df_ft[rank_col] = df_ft[rank_col].astype("int32")
    df_ft[rank_col] = df_ft.groupby("session")[rank_col].cumsum()

    # Ties: rows are in decreasing order, so the running minimum within
    # (session, value) is the rank of the first row of the tie group.
    df_ft[rank_col] = df_ft.groupby(["session", feature])[rank_col].cummin()

    df_ft = df_ft.drop(feature, axis=1).sort_values(["session", "candidates"], ignore_index=True)

    # Saturate before the narrow cast instead of wrapping (256 would become 0).
    pairs[rank_col] = df_ft[rank_col].clip(upper=255).astype("uint8")

In [None]:
# Every column after the first five gets a rank feature, unless its name contains
# one of the markers below.
skipped_markers = ("_rank", "_sum", "_max")
fts_to_rank = [
    ft for ft in pairs.columns[5:]
    if all(marker not in ft for marker in skipped_markers)
]

In [None]:
# Add one "<feature>_rank" column per selected feature; pairs is modified in place.
for ft in tqdm(fts_to_rank):
    add_rank_feature(pairs, ft)

### Correlations

In [None]:
# FEATURES = pairs.columns[5:]  # [ft for ft in pairs.columns[5:] if "rank" in ft]

# corr = pairs[FEATURES].corr()
# corr = corr.to_pandas()
# corr = corr.values

# mask = np.zeros_like(corr, dtype=bool)
# mask[np.triu_indices_from(mask)] = True
# corr[mask] = 0

In [None]:
# TH = 0.99

# for i in range(len(corr)):
#     for j in range(len(corr)):
#         if corr[i, j] > TH:
#             print(FEATURES[i], FEATURES[j], f'{corr[i, j] :.3f}')

### Session features
- Count views/clicks/carts/orders of session
- Count views/clicks/carts/orders of each candidate

TODO :
- Distance to last view

In [None]:
# Restore the reference row order before assigning the new columns.
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

# One pass per entry of CLASSES, plus "*" which keeps every event regardless of its type.
# assumes the position of a class in CLASSES equals its `type` code — verify in params
for i, c in enumerate(CLASSES + ["*"]):
    print(f'-> Candidate {c if c != "*" else "views"} in session')

    # Sessions are reloaded on every pass because the aid column is overwritten below.
    sessions = load_sessions(PARQUET_FILES)
    
    if c != "*":
        # Events of any other type get aid -1.
        sessions.loc[sessions["type"] != i, "aid"] = -1
    sessions = sessions.groupby('session').agg(list).reset_index()

    pairs[f'candidate_{c}_before'] = count_actions(
        pairs[['session', 'candidates']],
        sessions
    )

    del sessions
    numba.cuda.current_context().deallocations.clear()
    gc.collect()
    
#     break

In [None]:
sessions = load_sessions(PARQUET_FILES)


def _count_per_session(events, column):
    """Counts the rows of `events` per session; returns columns ("session", `column`)."""
    counts = events[['session', 'ts']].groupby('session').count().reset_index()
    return counts.rename(columns={"ts": column})


# Number of events per session: all types, then types 0 / 1 / 2 separately.
n_views = _count_per_session(sessions, "n_views")
n_clicks = _count_per_session(sessions[sessions['type'] == 0], "n_clicks")
n_carts = _count_per_session(sessions[sessions['type'] == 1], "n_carts")
n_orders = _count_per_session(sessions[sessions['type'] == 2], "n_orders")

# Sessions without any event of a given type get a count of 0.
sessions_fts = n_views
for type_counts in (n_clicks, n_carts, n_orders):
    sessions_fts = sessions_fts.merge(type_counts, how="left", on="session").fillna(0)

# Counts are saturated to [0, 255] so that they fit in uint8.
for c in sessions_fts.columns[1:]:
    sessions_fts[c] = np.clip(sessions_fts[c], 0, 255).astype(np.uint8)

In [None]:
# Attach the per-session counts to every pair of the session, then sort by (session, candidates).
pairs = pairs.merge(sessions_fts, on="session", how="left")
pairs = pairs.sort_values(['session', 'candidates'])

### Save

In [None]:
# Save the features of the current part in the output folder named after MODE and SUFFIX.
save_by_chunks(pairs, f"../output/features/fts_{MODE}_{SUFFIX}/", part=PART)

In [None]:
pairs.head()

Done