**About :** Computes Features.

**TODO**:
- for loop to automatize

In [1]:
cd ../src

/workspace/kaggle_otto_rs/src


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [4]:
import os
import gc
import re
import sys
import cudf
import json
import glob
import numba
import pickle
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
from datetime import datetime
from collections import Counter
from pandarallel import pandarallel
from numerize.numerize import numerize

warnings.simplefilter(action="ignore", category=FutureWarning)
pandarallel.initialize(nb_workers=32, progress_bar=False)
pd.options.display.max_columns = 500

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [5]:
from params import *

from data.covisitation import compute_covisitation_matrix
from data.candidates import load_parquets, create_candidates, explode

from utils.load import load_sessions
from utils.metrics import get_coverage
from utils.chris import suggest_clicks, suggest_buys, read_file_to_cache

### Load candidates

In [6]:
MODE = "train"  # "train", "val", "test"  ("val_c", "test_c")
CANDIDATES_VERSION = "v2"
SUFFIX = f"{CANDIDATES_VERSION}.4"

In [7]:
CANDIDATE_FILE = f'../output/candidates/candidates_{CANDIDATES_VERSION}_{MODE}.parquet'
PARQUET_FILES = f"../output/{MODE}_parquet/*"

if MODE == "val":
    OLD_PARQUET_FILES = "../output/full_train_parquet/*"
elif MODE == "train":
    OLD_PARQUET_FILES = "../output/other_parquet/*"
else:
    raise NotImplementedError

pairs = cudf.read_parquet(CANDIDATE_FILE)
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

In [8]:
PART = 2
CHUNK_SIZE = 25_000_000  # PER SESSION INSTEAD ??

ids = np.arange(PART * CHUNK_SIZE, min((PART + 1) * CHUNK_SIZE, len(pairs)))
pairs = pairs.iloc[ids].reset_index(drop=True)
print(pairs.shape)

(25000000, 5)


### Time weighting

In [9]:
def compute_weights(sessions):
    sessions.sort_values(['session', "ts"], ascending=[True, False]).reset_index(drop=True)

    sessions['w'] = sessions.groupby('session')['aid'].cumcount()
    
    sessions = sessions.merge(
        cudf.DataFrame(sessions.groupby('session')['aid'].size()),
        on="session",
        how="left"
    ).rename(columns={0: "n"})
    
    sessions["logspace_w"] = sessions.apply(
        lambda x : 1 if x.n == 1 else 2 ** (0.1 + 0.9 * (x.n - x.w - 1) / (x.n - 1)) - 1,
        axis=1
    )
    sessions["linspace_w"] = sessions['w'].apply(
        lambda x : 0.05 if x >= 20 else 0.1 + 0.9 * (18 - x) / 18
    )
    
    sessions["linspace_w_t163"] = sessions["linspace_w"] * sessions['type'].map({0: 1, 1: 6, 2:3})
    sessions["logspace_w_t163"] = sessions["logspace_w"] * sessions['type'].map({0: 1, 1: 6, 2:3})

    sessions["linspace_w_t191"] = sessions["linspace_w"] * sessions['type'].map({0: 1, 1: 9, 2:1})
    sessions["logspace_w_t191"] = sessions["logspace_w"] * sessions['type'].map({0: 1, 1: 9, 2:1})

    weights = sessions.drop(['ts', 'type', 'w', 'n'], axis=1).groupby(['session', 'aid']).sum().reset_index()
    
    weights = weights.sort_values(['session', "aid"]).reset_index(drop=True).rename(columns={"aid": "candidates"})
    
    return weights

In [10]:
sessions = load_sessions(PARQUET_FILES)
weights = compute_weights(sessions)

In [11]:
pairs = pairs.merge(weights, how="left", on=["session", "candidates"])
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

for c in weights.columns[2:]:
    pairs[c] = pairs[c].fillna(pairs[c].min() / 2).astype("float32")

In [12]:
del sessions
numba.cuda.current_context().deallocations.clear()
gc.collect()

3396

### Popularity
TODO :
- Popularity of items in session ?
- build session clusters and find popularity among it

In [13]:
def compute_popularity_features(pairs, parquet_files, suffix):
    sessions = load_sessions(parquet_files)
    
    for i, c in enumerate(CLASSES):
        print(f'-> Popularity for {c}')
        popularity = cudf.DataFrame(sessions.loc[sessions["type"] == i, "aid"].value_counts()).reset_index()
        popularity.columns = ['candidates', f'{c}_popularity{suffix}']
        popularity[f'{c}_popularity{suffix}'] = np.clip(popularity[f'{c}_popularity{suffix}'], 0, 2 ** 16 - 1).astype("uint16")

        pairs = pairs.merge(popularity, how="left", on="candidates").fillna(0)

        sessions['w'] = (sessions['ts'] - sessions['ts'].min())
        max_ = sessions['w'].max()
        sessions["w_log"] = sessions["w"].apply(lambda x: 2 ** (0.1 + 0.9 * (x - 1) / (max_ - 1)) - 1)
        sessions["w"] = sessions["w"].apply(lambda x: 0.1 + 0.9 * (x - 1) / (max_ - 1))

        popularity_time_weighted = sessions[["aid", "w", "w_log"]] .groupby('aid').sum().reset_index()
        popularity_time_weighted["w"] = popularity_time_weighted["w"].astype("float32")
        popularity_time_weighted["w_log"] = popularity_time_weighted["w_log"].astype("float32")

        popularity_time_weighted.columns = ["candidates", f"{c}_popularity_lin{suffix}", f"{c}_popularity_log{suffix}"]
        pairs = pairs.merge(popularity_time_weighted, how="left", on="candidates").fillna(0)
    
        del popularity, popularity_time_weighted
        numba.cuda.current_context().deallocations.clear()
        gc.collect()

    return pairs

In [14]:
pairs = compute_popularity_features(pairs, [OLD_PARQUET_FILES, PARQUET_FILES], "")
pairs = compute_popularity_features(pairs, OLD_PARQUET_FILES, "_old")
pairs = compute_popularity_features(pairs, PARQUET_FILES, "_w")

-> Popularity for clicks
-> Popularity for carts
-> Popularity for orders
-> Popularity for clicks
-> Popularity for carts
-> Popularity for orders
-> Popularity for clicks
-> Popularity for carts
-> Popularity for orders


In [15]:
numba.cuda.current_context().deallocations.clear()
gc.collect()

59

### Covisitation features
TODO :
- merge rank in matrix

In [16]:
def compute_coocurence_features(pairs, matrix_file, weights):
    pairs['group'] = pairs['session'] // 100000

    weights = weights.rename(columns={"candidates": "aid"})

    mat = cudf.read_parquet(matrix_file)
    mat.columns = ['aid', 'candidates', 'w']

    fts = []
    for _, df in tqdm(pairs.groupby('group')):
        df = df[['session', 'candidates', 'aid']].explode('aid').reset_index(drop=True)

        df = df.merge(mat, how="left", on=["aid", "candidates"]).reset_index().fillna(0)

        df = df.merge(weights, how="left", on=["session", "aid"])
        df['logspace_w'] *= df['w']
        df['linspace_w'] *= df['w']

        df = df[['candidates', 'session', 'w', 'logspace_w', 'linspace_w']].groupby(['session', 'candidates']).agg(["mean", "sum", "max"])
        df.columns = ['_'.join(col) for col in df.columns.values]

        df[df.columns] = df[df.columns].astype("float32")
        fts.append(df.reset_index())

    fts = cudf.concat(fts, ignore_index=True)
    fts = fts.sort_values(['session', 'candidates']).reset_index(drop=True)

    return fts

In [17]:
MATRIX_FOLDER = "../output/matrices/"
MATRIX_NAMES = [f"matrix_123_temporal_20_{MODE}", f"matrix_123_type136_20_{MODE}", f"matrix_12__20_{MODE}", f"matrix_123_type0.590.5_20_{MODE}"]

In [18]:
sessions = load_sessions(PARQUET_FILES)

sessions = sessions.sort_values(['session', "aid"]).groupby('session').agg(list).reset_index()
pairs = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
pairs = pairs.sort_values(['session', 'candidates']).reset_index(drop=True)

In [19]:
for name in MATRIX_NAMES:
    print(f' -> Features from {name}')

    fts = compute_coocurence_features(
        pairs[['session', 'candidates', 'aid']],
        os.path.join(MATRIX_FOLDER, name + ".pqt"),
        weights
    )

    for c in fts.columns[2:]:
        pairs[f"{name.rsplit('_', 1)[0]}_{re.sub('w_', '', c)}"] = fts[c].values

    del fts
    numba.cuda.current_context().deallocations.clear()
    gc.collect()
    
pairs.drop('aid', axis=1, inplace=True)

del sessions, weights
numba.cuda.current_context().deallocations.clear()
gc.collect()

 -> Features from matrix_123_temporal_20_train


10it [00:03,  2.93it/s]


 -> Features from matrix_123_type136_20_train


10it [00:03,  2.99it/s]


 -> Features from matrix_12__20_train


10it [00:03,  3.26it/s]


 -> Features from matrix_123_type0.590.5_20_train


10it [00:03,  3.01it/s]


0

#### Popularity x day/weekday/hour

In [20]:
# Simple function
def get_time(x):
    '''Convert from Unix to Datetime.'''
    return datetime.utcfromtimestamp(x).strftime('%Y-%m-%d %H:%M:%S')

def get_period(x):
    if 7 <= x <= 12:
        return 0  # "morning"
    elif 12 <= x <= 18:
        return 1  # "afernoon"
    elif 18 <= x <= 23:
        return 2  # "evening"
    else:
        return 3  # "night"
    
def get_datetime_info(df):
    df["datetime"] = cudf.to_datetime(df['ts'].to_pandas().parallel_apply(get_time).values)
    df["datetime"] += cudf.DateOffset(hours=2)  # UTC + 2 in germany

    df["day"] = df["datetime"].dt.day + (df["datetime"].dt.month - 8) * 12
    df["weekday"] = df["datetime"].dt.weekday
    df["hour"] = df["datetime"].dt.hour
#     df["period"] = df["hour"].apply(get_period)
    
    df[["day", "hour", "weekday"]] = df[["day", "hour", "weekday"]].astype("uint8")
#     df[["day", "hour", "weekday", "period"]] = df[["day", "hour", "weekday", "period"]].astype("uint8")
    df.drop('datetime', axis=1, inplace=True)
    return df

In [21]:
# sessions = load_sessions(PARQUET_FILES)
# sessions = get_datetime_info(sessions)

# sessions = sessions.groupby('session').max().reset_index().drop(['aid', 'ts', 'type'], axis=1)
# pairs = pairs.merge(sessions, on="session", how="left").sort_values(['session', 'candidates']).reset_index(drop=True)

In [22]:
# COLS = ["day", "weekday", "hour"]

# for i, c in enumerate(CLASSES): # + ["*"]
#     if c == "clicks":
#         continue

#     print(f'-> Popularity for {c if c != "*" else "views"}')

#     sessions = load_sessions([OLD_PARQUET_FILES, PARQUET_FILES])
#     sessions = sessions[sessions["type"] == i]
#     sessions = get_datetime_info(sessions)

#     for col in COLS:
#         popularity = sessions[['aid', col, 'type']].groupby(['aid', col]).count().reset_index()

#         popularity.columns = ["candidates", col, f"{col}_{c}_popularity"]
#         popularity[col] = popularity[col].astype(sessions[col].dtype)
#         popularity[f"{col}_{c}_popularity"] = np.clip(popularity[f"{col}_{c}_popularity"], 0, 2 ** 15 - 1).astype('int16')

#         pairs = pairs.merge(popularity, how="left", on=["candidates", col]).fillna(0)
        
# #         break
# #     break
# pairs.drop(['day'], axis=1, inplace=True)

In [23]:
# del sessions, popularity
# numba.cuda.current_context().deallocations.clear()
# gc.collect()

### Session features
- Count views/clicks/carts/orders of session
- Count views/clicks/carts/orders of each candidate

TODO :
- Distance to last view

In [24]:
def count_actions(pairs, sessions):
    pairs = pairs.merge(sessions[["session", "aid"]], how="left", on="session")
    pairs['group'] = pairs['session'] // 100000

    fts = []
    for _, df in tqdm(pairs.groupby('group')):
        df = df[['session', 'candidates', 'aid']].explode('aid')
        df['aid'] = (df['aid'] == df['candidates']).astype(np.uint16)

        df = df.groupby(
            ["session", "candidates"]
        ).sum().reset_index()
        
        fts.append(df)
    
    ft = cudf.concat(fts, ignore_index=True)
    ft = ft.sort_values(['session', 'candidates'])['aid'].values

    return np.clip(ft, 0, 255).astype(np.uint8)

In [25]:
for i, c in enumerate(CLASSES + ["*"]):
    print(f'-> Candidate {c if c != "*" else "views"} in session')

    sessions = load_sessions(PARQUET_FILES)
    if c != "*":
        sessions.loc[sessions["type"] != i, "aid"] = -1

    sessions = sessions.groupby('session').agg(list).reset_index()

    pairs[f'candidate_{c}_before'] = count_actions(
        pairs[['session', 'candidates']],
        sessions
    )
    
    del sessions
    numba.cuda.current_context().deallocations.clear()
    gc.collect()
    
#     break

-> Candidate clicks in session


10it [00:00, 13.31it/s]


-> Candidate carts in session


10it [00:00, 12.93it/s]


-> Candidate orders in session


10it [00:00, 13.03it/s]


-> Candidate views in session


10it [00:00, 13.26it/s]


In [26]:
sessions = load_sessions(PARQUET_FILES)

n_views = sessions[['session', 'ts']].groupby('session').count().reset_index().rename(columns={"ts": "n_views"})
n_clicks = sessions[sessions['type'] == 0][['session', 'ts']].groupby('session').count().reset_index().rename(columns={"ts": "n_clicks"})
n_carts = sessions[sessions['type'] == 1][['session', 'ts']].groupby('session').count().reset_index().rename(columns={"ts": "n_carts"})
n_orders = sessions[sessions['type'] == 2][['session', 'ts']].groupby('session').count().reset_index().rename(columns={"ts": "n_orders"})

# sessions = sessions.merge(n_views, how="left", on="session").fillna(0)
sessions_fts = n_views.merge(n_clicks, how="left", on="session").fillna(0)
sessions_fts = sessions_fts.merge(n_carts, how="left", on="session").fillna(0)
sessions_fts = sessions_fts.merge(n_orders, how="left", on="session").fillna(0)

for c in sessions_fts.columns[1:]:
    sessions_fts[c] = np.clip(sessions_fts[c], 0, 255).astype(np.uint8)

In [27]:
pairs = pairs.merge(sessions_fts, on="session", how="left")
pairs = pairs.sort_values(['session', 'candidates'])

### Save

In [28]:
def save_by_chunks(pairs, folder):
    print(f'-> Saving chunks to {folder}')
    os.makedirs(folder, exist_ok=True)

    pairs['group'] = pairs['session'] // 100000

    for i, (_, df) in enumerate(tqdm(pairs.groupby('group'))):
        df.drop('group', axis=1, inplace=True)
        df.to_parquet(os.path.join(folder, f'{PART}_{i:03d}.parquet'))

In [29]:
save_by_chunks(pairs, f"../output/features/fts_{MODE}_{SUFFIX}/")

-> Saving chunks to ../output/features/fts_train_v2.4/


10it [00:28,  2.87s/it]


In [30]:
pairs.head()

Unnamed: 0,session,candidates,gt_clicks,gt_carts,gt_orders,logspace_w,linspace_w,linspace_w_t163,logspace_w_t163,linspace_w_t191,logspace_w_t191,clicks_popularity,clicks_popularity_lin,clicks_popularity_log,carts_popularity,carts_popularity_lin,carts_popularity_log,orders_popularity,orders_popularity_lin,orders_popularity_log,clicks_popularity_old,clicks_popularity_lin_old,clicks_popularity_log_old,carts_popularity_old,carts_popularity_lin_old,carts_popularity_log_old,orders_popularity_old,orders_popularity_lin_old,orders_popularity_log_old,clicks_popularity_w,clicks_popularity_lin_w,clicks_popularity_log_w,carts_popularity_w,carts_popularity_lin_w,carts_popularity_log_w,orders_popularity_w,orders_popularity_lin_w,orders_popularity_log_w,matrix_123_temporal_20_mean,matrix_123_temporal_20_sum,matrix_123_temporal_20_max,matrix_123_temporal_20_logspace_mean,matrix_123_temporal_20_logspace_sum,matrix_123_temporal_20_logspace_max,matrix_123_temporal_20_linspace_mean,matrix_123_temporal_20_linspace_sum,matrix_123_temporal_20_linspace_max,matrix_123_type136_20_mean,matrix_123_type136_20_sum,matrix_123_type136_20_max,matrix_123_type136_20_logspace_mean,matrix_123_type136_20_logspace_sum,matrix_123_type136_20_logspace_max,matrix_123_type136_20_linspace_mean,matrix_123_type136_20_linspace_sum,matrix_123_type136_20_linspace_max,matrix_12__20_mean,matrix_12__20_sum,matrix_12__20_max,matrix_12__20_logspace_mean,matrix_12__20_logspace_sum,matrix_12__20_logspace_max,matrix_12__20_linspace_mean,matrix_12__20_linspace_sum,matrix_12__20_linspace_max,matrix_123_type0.590.5_20_mean,matrix_123_type0.590.5_20_sum,matrix_123_type0.590.5_20_max,matrix_123_type0.590.5_20_logspace_mean,matrix_123_type0.590.5_20_logspace_sum,matrix_123_type0.590.5_20_logspace_max,matrix_123_type0.590.5_20_linspace_mean,matrix_123_type0.590.5_20_linspace_sum,matrix_123_type0.590.5_20_linspace_max,candidate_clicks_before,candidate_carts_before,candidate_orders_before,candidate_*_before,n_views,n_clicks,n_carts,n_orders,group
2768,9471468,24827,0,0,0,0.035887,0.025,0.025,0.035887,0.025,0.035887,549,258.187225,212.844559,90,258.187225,212.844559,24,258.187225,212.844559,526,322.226471,280.802887,87,322.226471,280.802887,24,322.226471,280.802887,23,14.407562,12.465124,3,14.407562,12.465124,0,14.407562,12.465124,27.586843,55.173687,55.173687,1.980003,3.960006,3.960006,26.2075,52.415001,52.415001,21.5,43.0,43.0,1.543129,3.086259,3.086259,20.424999,40.849998,40.849998,0.5,1.0,1.0,0.035887,0.071773,0.071773,0.475,0.95,0.95,20.75,41.5,41.5,1.489299,2.978599,2.978599,19.7125,39.424999,39.424999,0,0,0,0,2,2,0,0,94
2769,9471468,109883,0,0,0,1.0,1.0,1.0,1.0,1.0,1.0,1634,913.232849,789.891541,59,913.232849,789.891541,20,913.232849,789.891541,1384,983.86792,908.4505,53,983.86792,908.4505,18,983.86792,908.4505,250,110.457817,92.440056,6,110.457817,92.440056,2,110.457817,92.440056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,0,1,2,2,0,0,94
2770,9471468,166037,0,0,0,0.035887,0.025,0.025,0.035887,0.025,0.035887,36313,21213.896484,17937.714844,7597,21213.896484,17937.714844,2115,21213.896484,17937.714844,33224,24758.595703,22090.992188,6923,24758.595703,22090.992188,2021,24758.595703,22090.992188,3089,2193.597412,1963.644897,674,2193.597412,1963.644897,94,2193.597412,1963.644897,36.145489,72.290977,72.290977,2.594287,5.188574,5.188574,34.338215,68.67643,68.67643,24.5,49.0,49.0,1.75845,3.5169,3.5169,23.275,46.549999,46.549999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15.5,31.0,31.0,1.112489,2.224977,2.224977,14.725,29.450001,29.450001,0,0,0,0,2,2,0,0,94
2771,9471468,167523,0,0,0,0.035887,0.025,0.025,0.035887,0.025,0.035887,632,395.704041,347.879822,33,395.704041,347.879822,12,395.704041,347.879822,519,417.660431,392.349762,32,417.660431,392.349762,11,417.660431,392.349762,113,65.589249,59.397869,1,65.589249,59.397869,1,65.589249,59.397869,61.624081,123.248161,123.248161,61.624081,123.248161,123.248161,61.624081,123.248161,123.248161,32.0,64.0,64.0,32.0,64.0,64.0,32.0,64.0,64.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19.75,39.5,39.5,19.75,39.5,39.5,19.75,39.5,39.5,0,0,0,0,2,2,0,0,94
2772,9471468,271474,0,0,0,0.035887,0.025,0.025,0.035887,0.025,0.035887,1590,811.898987,696.032776,78,811.898987,696.032776,12,811.898987,696.032776,1399,877.221008,786.309265,68,877.221008,786.309265,12,877.221008,786.309265,191,131.351212,119.118126,10,131.351212,119.118126,0,131.351212,119.118126,73.317444,146.634888,146.634888,73.317444,146.634888,146.634888,73.317444,146.634888,146.634888,42.5,85.0,85.0,42.5,85.0,85.0,42.5,85.0,85.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.75,37.5,37.5,18.75,37.5,37.5,18.75,37.5,37.5,0,0,0,0,2,2,0,0,94


Done