In [7]:
from pathlib import Path

import cudf
import xgboost as xgb
from sklearn.model_selection import GroupKFold

In [8]:
# Inputs come from the sibling `preprocess/data` folder; this notebook's own
# artifacts go to a local `data` folder next to it.
working_dir = Path.cwd()
data_path = working_dir.parent / 'preprocess/data'
temp_path = working_dir / 'data'
# Create the local folder on first run; a no-op when it already exists.
temp_path.mkdir(exist_ok=True)

## LB

In [9]:
# Labels are read from the preprocess output (`cv`); candidates and models for
# this section live under the local `lb` folder.
lb_in = data_path / 'cv'
lb_out = temp_path / 'lb'
# Create the output folder on first run; a no-op when it already exists.
lb_out.mkdir(exist_ok=True)

In [10]:
# Load the ground-truth labels and the click candidate table as cuDF frames.
label_file = lb_in / 'test_labels.parquet'
cand_file = lb_out / 'click_cand.parquet'
label = cudf.read_parquet(label_file.as_posix())
cand = cudf.read_parquet(cand_file.as_posix())

In [11]:
# Keep only the click ground truth and turn it into positive rows keyed by
# (session, candidate): drop the now-constant `type` column, rename `aid` so it
# lines up with the candidate table, and tag every remaining row with label=1.
label = (
    label.loc[label['type'] == 'clicks']
    .drop(columns=['type'])
    .rename(columns={'aid': 'candidate'})
    .assign(label=1)
)

# Left join: candidates without a matching ground-truth row come back as NaN and
# are turned into 0 by fillna. Note that fillna(0) applies to every column of the
# merged frame, not only `label`.
# NOTE(review): the saved output of this cell shows a CUDA out-of-memory error
# (rmm std::bad_alloc), so this step may not fit on the GPU for the full
# candidate set.
cand = cand.merge(label, how='left', on=['session', 'candidate']).fillna(0)
cand['past'] = cand['past'].astype(int)

MemoryError: std::bad_alloc: out_of_memory: CUDA error at: /usr/local/include/rmm/mr/device/cuda_memory_resource.hpp:70: cudaErrorMemoryAllocation out of memory

In [None]:
# Folder reserved for model artifacts.
# NOTE(review): the boosters below are still written to `lb_out` (unchanged from
# the original cell) so anything that already loads them keeps working; confirm
# whether `model_out` was the intended destination.
model_out = lb_out / 'model'
model_out.mkdir(exist_ok=True)

# Feature columns: everything except the first two columns and the last one.
# Assumes the frame is laid out as [session, candidate, <features...>, label]
# after the merge -- TODO confirm against the candidate parquet schema.
FEATURES = cand.columns[2 : -1]

# Booster configuration is identical for every fold, so build it once.
xgb_parms = {'objective':'rank:pairwise', 'tree_method':'gpu_hist'}

# GroupKFold keeps all candidates of one session in the same fold.
skf = GroupKFold(n_splits=5)
for fold,(train_idx, valid_idx) in enumerate(skf.split(cand, cand['label'].to_numpy(), groups=cand['session'].to_numpy() )):
    # sklearn yields *positional* row numbers, hence iloc rather than loc.
    # XGBoost ranking needs the rows of each query group to be contiguous and the
    # `group` sizes to be listed in row order. Neither is guaranteed by a cuDF
    # merge / default (unsorted) groupby, so sort each fold by session and compute
    # the sizes with sort=True so both follow the same ascending-session order.
    train_fold = cand.iloc[train_idx].sort_values('session')
    valid_fold = cand.iloc[valid_idx].sort_values('session')

    train_groups = train_fold.groupby('session', sort=True).size().to_arrow().to_pylist()
    val_groups = valid_fold.groupby('session', sort=True).size().to_arrow().to_pylist()

    X_train = train_fold[list(FEATURES)]
    y_train = train_fold['label']
    X_valid = valid_fold[list(FEATURES)]
    y_valid = valid_fold['label']

    dtrain = xgb.DMatrix(X_train, y_train, group=train_groups)
    dvalid = xgb.DMatrix(X_valid, y_valid, group=val_groups)

    # Train for a fixed number of rounds, reporting train/valid metrics every
    # 100 rounds (no early stopping is configured).
    model = xgb.train(xgb_parms,
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=1000,
        verbose_eval=100)
    # One booster file per fold for the click target.
    model.save_model((lb_out / f'XGB_fold{fold}_click.xgb').as_posix())

In [None]:
# Reload the ground-truth labels and load the cart candidate table as cuDF frames.
label_file = lb_in / 'test_labels.parquet'
cand_file = lb_out / 'cart_cand.parquet'
label = cudf.read_parquet(label_file.as_posix())
cand = cudf.read_parquet(cand_file.as_posix())

In [None]:
# Build positive labels for the *cart* candidates.
# The original cell filtered on 'clicks' (copied from the click section), which
# would train the cart ranker against click targets; the cart candidate table
# needs the cart ground truth instead.
# Assumes the label file spells this type as 'carts', matching the plural
# 'clicks' used for the click section -- TODO confirm.
label = label.loc[label['type'] == 'carts']
label = label.drop(columns=['type'])
label['label'] = 1
label = label.rename(columns={'aid': 'candidate'})

# Left join: candidates without a matching ground-truth row come back as NaN and
# are turned into 0 by fillna. Note that fillna(0) applies to every column of the
# merged frame, not only `label`.
cand = cand.merge(label, on=['session', 'candidate'], how='left').fillna(0)
cand['past'] = cand.past.astype(int)

In [None]:
# Folder reserved for model artifacts.
# NOTE(review): the boosters below are still written to `lb_out` (same directory
# as the original cell); confirm whether `model_out` was the intended destination.
model_out = lb_out / 'model'
model_out.mkdir(exist_ok=True)

# Feature columns: everything except the first two columns and the last one.
# Assumes the frame is laid out as [session, candidate, <features...>, label]
# after the merge -- TODO confirm against the candidate parquet schema.
FEATURES = cand.columns[2 : -1]

# Booster configuration is identical for every fold, so build it once.
xgb_parms = {'objective':'rank:pairwise', 'tree_method':'gpu_hist'}

# GroupKFold keeps all candidates of one session in the same fold.
skf = GroupKFold(n_splits=5)
for fold,(train_idx, valid_idx) in enumerate(skf.split(cand, cand['label'].to_numpy(), groups=cand['session'].to_numpy() )):
    # sklearn yields *positional* row numbers, hence iloc rather than loc.
    # XGBoost ranking needs the rows of each query group to be contiguous and the
    # `group` sizes to be listed in row order. Neither is guaranteed by a cuDF
    # merge / default (unsorted) groupby, so sort each fold by session and compute
    # the sizes with sort=True so both follow the same ascending-session order.
    train_fold = cand.iloc[train_idx].sort_values('session')
    valid_fold = cand.iloc[valid_idx].sort_values('session')

    train_groups = train_fold.groupby('session', sort=True).size().to_arrow().to_pylist()
    val_groups = valid_fold.groupby('session', sort=True).size().to_arrow().to_pylist()

    X_train = train_fold[list(FEATURES)]
    y_train = train_fold['label']
    X_valid = valid_fold[list(FEATURES)]
    y_valid = valid_fold['label']

    dtrain = xgb.DMatrix(X_train, y_train, group=train_groups)
    dvalid = xgb.DMatrix(X_valid, y_valid, group=val_groups)

    # Train for a fixed number of rounds, reporting train/valid metrics every
    # 100 rounds (no early stopping is configured).
    model = xgb.train(xgb_parms,
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=1000,
        verbose_eval=100)
    # The original cell reused the '_click' file name here, which overwrote the
    # click boosters saved earlier; cart models get their own name.
    model.save_model((lb_out / f'XGB_fold{fold}_cart.xgb').as_posix())

In [None]:
# Reload the ground-truth labels and load the order candidate table as cuDF frames.
label_file = lb_in / 'test_labels.parquet'
cand_file = lb_out / 'order_cand.parquet'
label = cudf.read_parquet(label_file.as_posix())
cand = cudf.read_parquet(cand_file.as_posix())

In [None]:
# Build positive labels for the *order* candidates.
# The original cell filtered on 'clicks' (copied from the click section), which
# would train the order ranker against click targets; the order candidate table
# needs the order ground truth instead.
# Assumes the label file spells this type as 'orders', matching the plural
# 'clicks' used for the click section -- TODO confirm.
label = label.loc[label['type'] == 'orders']
label = label.drop(columns=['type'])
label['label'] = 1
label = label.rename(columns={'aid': 'candidate'})

# Left join: candidates without a matching ground-truth row come back as NaN and
# are turned into 0 by fillna. Note that fillna(0) applies to every column of the
# merged frame, not only `label`.
cand = cand.merge(label, on=['session', 'candidate'], how='left').fillna(0)
cand['past'] = cand.past.astype(int)

In [None]:
# Folder reserved for model artifacts.
# NOTE(review): the boosters below are still written to `lb_out` (same directory
# as the original cell); confirm whether `model_out` was the intended destination.
model_out = lb_out / 'model'
model_out.mkdir(exist_ok=True)

# Feature columns: everything except the first two columns and the last one.
# Assumes the frame is laid out as [session, candidate, <features...>, label]
# after the merge -- TODO confirm against the candidate parquet schema.
FEATURES = cand.columns[2 : -1]

# Booster configuration is identical for every fold, so build it once.
xgb_parms = {'objective':'rank:pairwise', 'tree_method':'gpu_hist'}

# GroupKFold keeps all candidates of one session in the same fold.
skf = GroupKFold(n_splits=5)
for fold,(train_idx, valid_idx) in enumerate(skf.split(cand, cand['label'].to_numpy(), groups=cand['session'].to_numpy() )):
    # sklearn yields *positional* row numbers, hence iloc rather than loc.
    # XGBoost ranking needs the rows of each query group to be contiguous and the
    # `group` sizes to be listed in row order. Neither is guaranteed by a cuDF
    # merge / default (unsorted) groupby, so sort each fold by session and compute
    # the sizes with sort=True so both follow the same ascending-session order.
    train_fold = cand.iloc[train_idx].sort_values('session')
    valid_fold = cand.iloc[valid_idx].sort_values('session')

    train_groups = train_fold.groupby('session', sort=True).size().to_arrow().to_pylist()
    val_groups = valid_fold.groupby('session', sort=True).size().to_arrow().to_pylist()

    X_train = train_fold[list(FEATURES)]
    y_train = train_fold['label']
    X_valid = valid_fold[list(FEATURES)]
    y_valid = valid_fold['label']

    dtrain = xgb.DMatrix(X_train, y_train, group=train_groups)
    dvalid = xgb.DMatrix(X_valid, y_valid, group=val_groups)

    # Train for a fixed number of rounds, reporting train/valid metrics every
    # 100 rounds (no early stopping is configured).
    model = xgb.train(xgb_parms,
        dtrain=dtrain,
        evals=[(dtrain,'train'),(dvalid,'valid')],
        num_boost_round=1000,
        verbose_eval=100)
    # The original cell reused the '_click' file name here, which overwrote the
    # click boosters saved earlier; order models get their own name.
    model.save_model((lb_out / f'XGB_fold{fold}_order.xgb').as_posix())