In [None]:
!pip install polars

In [None]:
import polars as pl
import pandas as pd
pd.set_option('display.max_rows', 50)
import numpy as np
import math
from lightgbm.sklearn import LGBMRanker
from sklearn.model_selection import KFold, StratifiedKFold, StratifiedGroupKFold
import gc
import math
import lightgbm as lgb
import optuna
from catboost import Pool, CatBoostRanker
import xgboost as xgb
from numba import njit
from numba import prange

In [None]:
# Adapted from https://www.kaggle.com/code/greenwolf/lightgbm-fast-recall-20
@njit(parallel=True) # the only difference from previous version
def numba_recall20(preds, targets, groups):
    """Mean recall@20 over the groups that contain at least one positive.

    ``preds`` and ``targets`` are flat arrays laid out group after group;
    ``groups[i]`` is the number of rows belonging to group ``i``.  For each
    group the (up to) 20 rows with the highest prediction are taken, the
    targets at those rows are summed, and the sum is divided by
    ``min(20, sum of targets in the group)``.  Groups whose target sum is 0
    are skipped.  The per-group ratios are then averaged.

    NOTE(review): presumably targets are 0/1 relevance labels — confirm.
    If no group has a positive target, ``nonempty`` stays 0 and the final
    division is a division by zero.
    """
    total = 0
    nonempty = 0
    # Cumulative sum of group sizes: entry i is the END offset (exclusive) of group i.
    group_starts = np.cumsum(groups)

    # for group_id in range(len(groups)):
    for group_id in prange(len(groups)): # changed range to prange
        group_end = group_starts[group_id]
        group_start = group_end - groups[group_id]
        # Positions within the group, ordered by descending prediction.
        ranks = np.argsort(preds[group_start:group_end])[::-1]
        hits = 0
        for i in range(min(len(ranks), 20)):
            hits += targets[group_start + ranks[i]]

        # Best achievable number of hits in the top 20 for this group.
        actual = min(20, targets[group_start:group_end].sum())
        if actual > 0:
            total += hits / actual
            nonempty += 1

    return total / nonempty


def lgb_numba_recall(preds, lgb_dataset):
    """Custom LightGBM evaluation function wrapping ``numba_recall20``.

    Reads the labels and group sizes from ``lgb_dataset`` and returns the
    ``(metric_name, value, is_higher_better)`` triple.
    """
    labels, group_sizes = lgb_dataset.label, lgb_dataset.group
    score = numba_recall20(preds, labels, group_sizes)
    is_higher_better = True
    return 'numba_recall@20', score, is_higher_better

def xgb_numba_recall(preds, xgb_dataset):
    """Custom XGBoost evaluation function wrapping ``numba_recall20``.

    Pulls labels and group sizes out of ``xgb_dataset`` through its getter
    methods and returns the ``(metric_name, value)`` pair.
    """
    labels = xgb_dataset.get_label()
    group_sizes = xgb_dataset.get_group()
    return 'numba_recall@20', numba_recall20(preds, labels, group_sizes)

def create_dataset(size: int, group_size: int = 40) -> lgb.Dataset:
    """Build a synthetic ranking dataset for exercising the metric.

    Args:
        size: Number of rows to generate.
        group_size: Rows per query group (default 40, the original fixed value).

    Returns:
        An ``lgb.Dataset`` with 10 normally distributed features, random 0/1
        labels and query-group sizes that sum to ``size``.

    When ``size`` is not a multiple of ``group_size`` the leftover rows form a
    final, smaller group.  Previously they were dropped from the group array,
    so the group sizes no longer added up to the number of rows.
    """
    data = np.random.normal(size=(size, 10))
    target = np.random.randint(0, 2, size)
    full_groups, remainder = divmod(size, group_size)
    groups = np.full(full_groups, group_size, dtype=np.int32)
    if remainder:
        # Keep sum(groups) == size so every row belongs to a group.
        groups = np.append(groups, np.int32(remainder))
    return lgb.Dataset(data, target, group=groups)

In [None]:
# Exercise the custom metric on a synthetic dataset: all-zero predictions
# scored against the random labels produced by create_dataset.
# NOTE(review): presumably this also serves to trigger numba's JIT compilation
# before real training starts — confirm.
size = 1_000_000
eval_dataset = create_dataset(size)
# Last expression of the cell, so the returned metric tuple is displayed.
lgb_numba_recall(np.zeros(size), eval_dataset)

In [None]:
def compute_recall(pred_df, valid_session, action_type):
    """Compute recall@20 of ``pred_df`` for one action type against the local labels.

    Args:
        pred_df: Frame with a ``session_type`` column (``"<session>_<type>s"``)
            and a ``labels`` column holding space-separated predicted aids.
        valid_session: polars Series of the session ids to evaluate.
        action_type: ``'click'``, ``'cart'`` or ``'order'``; the label file is
            filtered on ``f'{action_type}s'``.

    Returns:
        Total hits divided by the total of per-session ground-truth counts,
        each count clipped to 20.  The value is also printed.

    Sessions that have ground truth but no row in ``pred_df`` get null labels
    from the left join; they now count as zero hits instead of raising a
    ``TypeError``.
    """

    sub = pred_df.filter(pl.col('session_type').str.contains(action_type))
    sub = (
        sub.with_columns([
            pl.col('session_type').str.split('_').arr.first().cast(pl.Int32).alias('session'),
            pl.col('labels').str.split(' ')
        ])
        .drop('session_type')
    )
    test_labels = pl.read_parquet('input/otto_train_and_test_data_for_local_validation/test_labels.parquet')
    test_labels = test_labels.filter((pl.col('type')==f'{action_type}s')&(pl.col('session').is_in(valid_session.to_list())))
    test_labels = (
        test_labels.with_columns([
            pl.col('ground_truth').arr.lengths().clip(0, 20).alias('gt_count'),
            pl.col('session').cast(pl.Int32)
        ])
    )
    test_labels = test_labels.join(sub, on='session', how='left')

    def _count_hits(row):
        # Rows arrive as positional tuples.  Index 2 is read as the ground
        # truth and index 4 as the predicted labels.
        # NOTE(review): this assumes the parquet columns are ordered
        # (session, type, ground_truth) — confirm against the file.
        ground_truth, predicted = row[2], row[4]
        if predicted is None:
            # No prediction was joined for this session.
            return 0
        return len(set(ground_truth).intersection(set(map(int, predicted))))

    hits = test_labels.apply(_count_hits)
    recall = hits['apply'].sum() / test_labels['gt_count'].sum()

    print(f'{action_type} recall =',recall)

    return recall

def output_feature_importance(ranker, feature):
    """Print each feature name with its importance, most important first.

    Args:
        ranker: Object exposing a ``feature_importances_`` sequence.
        feature: Feature names, in the same order as the importances.

    The ordering is computed once up front; the previous version re-sorted
    the whole importance vector twice on every loop iteration.
    """
    importances = np.asarray(ranker.feature_importances_)
    order = np.argsort(importances)[::-1]  # indices by descending importance
    names = np.array(feature)[order]
    for name, importance in zip(names, importances[order]):
        print(name, importance)

def down_sampling(df, nega_posi_ratio):
    """Keep all positive rows and a seeded random subset of negative rows.

    Args:
        df: Frame with a ``gt`` column; rows with ``gt == 1`` are positives
            and rows with ``gt == 0`` are negatives.
        nega_posi_ratio: Target number of negatives per positive.

    Returns:
        The positives followed by the sampled negatives.

    The requested count is ``nega_posi_ratio * df['gt'].sum()``.  It is capped
    at the number of negatives available, because sampling without
    replacement cannot return more rows than exist.
    """
    positives = df.filter(pl.col('gt')==1)
    negatives = df.filter(pl.col('gt')==0)
    n_wanted = nega_posi_ratio*df['gt'].sum()
    n_sample = min(n_wanted, negatives.height)
    sampled_negative = negatives.sample(n_sample, seed=0)
    return pl.concat([positives, sampled_negative])

def infer_gbranker(test, gbranker, feature, nsplit):
    """Score ``test`` with ``gbranker`` in at most ``nsplit`` row chunks.

    Args:
        test: polars frame containing ``session``, ``aid`` and the feature columns.
        gbranker: Trained model exposing ``predict(..., num_iteration=...)``
            and ``best_iteration``.
        feature: List of feature column names passed to the model.
        nsplit: Maximum number of chunks, used to bound the size of each
            pandas conversion.

    Returns:
        Frame with columns ``session``, ``aid`` and a Float32 ``score``.

    Only non-empty chunks are predicted.  Previously, when ``nsplit`` exceeded
    the number of rows, empty slices were handed to ``predict``.  An empty
    ``test`` yields an empty result with the same three columns.
    """
    n_rows = len(test)
    if n_rows == 0:
        return test[['session', 'aid']].with_columns([pl.Series('score', [], dtype=pl.Float32)])

    chunk_size = math.ceil(n_rows / nsplit)
    chunks = []
    for start in range(0, n_rows, chunk_size):
        end = min(start + chunk_size, n_rows)
        part = test[start:end]
        score = gbranker.predict(part[feature].to_pandas(), num_iteration=gbranker.best_iteration)
        # XGBoost alternative:
        # score = gbranker.predict(xgb.DMatrix(part[feature].to_numpy()), iteration_range=(0, gbranker.best_iteration))
        # with_columns matches the spelling used elsewhere in this notebook.
        chunks.append(part[['session', 'aid']].with_columns([pl.Series(score).alias('score').cast(pl.Float32)]))
    return pl.concat(chunks)

def cross_validation(df, action_type, feature, params):
    """Run 5-fold, session-level cross-validation of a LightGBM ranker.

    Args:
        df: polars frame with ``session``, ``aid``, ``gt`` and the feature columns.
        action_type: ``'click'``, ``'cart'`` or ``'order'``; selects the
            negative down-sampling ratio and the label type used for scoring.
        feature: List of feature column names fed to the model.
        params: Parameter dict passed to ``lgb.train``.

    Returns:
        Mean of the per-fold recalls returned by ``compute_recall``.  The
        per-fold best iteration and the two means are printed.
    """
    # Number of positives per session; used as the stratification target.
    num_gt = df.groupby('session', maintain_order=True).agg(pl.col('gt').sum().cast(pl.UInt16))
    skf = StratifiedKFold(n_splits=5)
    recalls = []
    best_iter = []
    for fold, (train_index, valid_index) in enumerate(skf.split(num_gt['session'], num_gt['gt'])):
        # Split by session so all rows of one session fall in the same fold.
        train_session = num_gt['session'][train_index]
        valid_session = num_gt['session'][valid_index]
        train = df.filter(pl.col('session').is_in(train_session))
        valid = df.filter(pl.col('session').is_in(valid_session))
        
        # Negatives kept per positive, by action type.
        nega_posi_ratio = {'click':10, 'cart':20, 'order':20}
        
        # Only the training fold is down-sampled; validation keeps every candidate.
        train = down_sampling(train, nega_posi_ratio[action_type])
        
        # Group sizes are taken in ascending session order and the rows are
        # sorted by session as well, so the sizes line up with the row layout.
        session_lengths_train = train.groupby('session').count().sort('session')['count'].to_numpy()
        train = train.sort('session')
        
        session_lengths_valid = valid.groupby('session').count().sort('session')['count'].to_numpy()
        valid = valid.sort('session')
        
        # --- CatBoost alternative (disabled) ---
        # train_pool = Pool(data=train[feature].to_numpy(), label=train['gt'].to_numpy(), group_id=train['session'].to_numpy())
        # valid_pool = Pool(data=valid[feature].to_numpy(), label=valid['gt'].to_numpy(), group_id=valid['session'].to_numpy())
        # ranker = CatBoostRanker(**params)
        # ranker.fit(
        #     train_pool,
        #     eval_set=valid_pool,
        #     early_stopping_rounds=50
        # )

        # --- XGBoost alternative (disabled) ---
#         train_dataset = xgb.DMatrix(data=train[feature].to_numpy(), label=train['gt'].to_numpy(), group=session_lengths_train)
#         eval_dataset = xgb.DMatrix(data=valid[feature].to_numpy(), label=valid['gt'].to_numpy(), group=session_lengths_valid)
        
#         ranker = xgb.train(
#             params,
#             train_dataset,
#             evals=[(eval_dataset, 'eval_dataset')],
#             custom_metric=xgb_numba_recall,
#             num_boost_round=10000,
#             early_stopping_rounds=50,
#             verbose_eval=10,
#             maximize=True
#         )
        
        train_dataset = lgb.Dataset(train[feature].to_numpy(), train['gt'].to_numpy(), group=session_lengths_train)
        eval_dataset = lgb.Dataset(valid[feature].to_numpy(), valid['gt'].to_numpy(), group=session_lengths_valid)
        
        # Train with the custom recall@20 metric; stop after 100 rounds without
        # improvement on the validation fold and log every 10 rounds.
        ranker = lgb.train(
            params,
            train_dataset,
            valid_sets=eval_dataset,
            feval=lgb_numba_recall,
            callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False), lgb.log_evaluation(10)]
            # callbacks=[lgb.log_evaluation(10)]
        )      
        print(f'best iteration = {ranker.best_iteration}')
        best_iter.append(ranker.best_iteration)
        
        # if fold == 0:
        #     output_feature_importance(ranker, feature)
        
        # From here on `valid` holds only (session, aid, score).
        valid = infer_gbranker(valid, ranker, feature, 10)
        
        # Top 20 aids per session by descending score, then formatted as
        # "<session>_<action_type>s" with a space-separated aid string.
        pred = valid.sort(['session', 'score'], reverse=[False, True]).groupby('session', maintain_order=True).agg(pl.col('aid').head(20))
        pred = pred.with_columns([pl.col('session').apply(lambda x:str(x)+f'_{action_type}s'), pl.col('aid').apply(lambda x:' '.join(map(str, x.to_list())))])
        pred = pred.rename({'session':'session_type', 'aid':'labels'})
        
        recalls.append(compute_recall(pred, valid_session, action_type))
        
        # Release the fold's frames before the next iteration.
        del train, valid
        gc.collect()
    
    mean_recall = np.mean(recalls)
    mean_best_iter = np.mean(best_iter)
    print(f'mean {action_type} recall = {mean_recall}')
    print(f'mean {action_type} best iteration = {mean_best_iter}')
    return mean_recall
    
def compute_score():
    """Cross-validate each action type and print the weighted overall recall.

    For ``order``, ``cart`` and ``click`` (in that order) this loads
    ``train_<action_type>.parquet``, treats every column except the
    identifier/label columns as a feature, and runs ``cross_validation`` with
    a fixed LightGBM lambdarank configuration.  The three mean recalls are
    combined with weights click 0.10, cart 0.30 and order 0.60.
    """
    excluded_columns = ['session', 'aid', 'gt', 'user_order']
    weight = {'click': 0.10, 'cart': 0.30, 'order': 0.60}

    recall = {}
    # max_depth = {'click':7, 'cart':6, 'order':7}
    for action_type in ('order', 'cart', 'click'):
        train = pl.read_parquet(f'train_{action_type}.parquet')
        feature = train.drop(excluded_columns).columns

        # Rebuilt on every iteration so each cross_validation call receives
        # its own dict.
        params = dict(
            boosting_type='gbdt',
            objective='lambdarank',
            metric='"None"',
            learning_rate=0.05,
            num_boost_round=10000,
            # max_depth=max_depth[action_type],
            max_depth=6,
            num_leaves=32,
            min_child_samples=471,
            reg_alpha=0.06786952863490345,
            reg_lambda=0.0013212485115586014,
            random_state=500,
            bagging_fraction=0.877462547767822,
            feature_fraction=0.37792222260319913,
            bagging_freq=1,
        )

        # CatBoost alternative (disabled):
        # params = {
        #     'loss_function':'YetiRank',
        #     'iterations':10000,
        #     'learning_rate':0.1,
        #     'random_seed':100,
        #     'eval_metric':CatBoostEvalMetric()
        # }

        # XGBoost alternative (disabled):
        # params = {
        #     'booster':'gbtree',
        #     'objective':'rank:pairwise',
        #     'random_state':100,
        #     'learning_rate':0.1
        # }

        recall[action_type] = cross_validation(train, action_type, feature, params)

        # Free the frame before loading the next action type.
        del train
        gc.collect()

    # Weighted sum accumulated in click, cart, order order.
    score = sum(recall[name] * share for name, share in weight.items())
    print('Overall Recall =', score)

In [None]:
class objective(object):
    """Optuna objective that cross-validates one sampled LightGBM configuration.

    An instance is callable with an Optuna trial.  It samples tree-shape,
    regularisation and bagging parameters, builds a lambdarank parameter
    dict, and returns the mean recall from ``cross_validation``.
    """

    def __init__(self, df, action_type, feature):
        """Store the training frame, action type and feature list for later trials."""
        self.df = df
        self.action_type = action_type
        self.feature = feature

    def __call__(self, trial):
        """Sample hyper-parameters from ``trial`` and return the mean CV recall."""

        max_depth = trial.suggest_int('max_depth', 5, 9)
        # num_leaves is searched between 50% and 80% of the 2**max_depth
        # leaves a tree of that depth can hold.
        num_leaves_max = 2**max_depth
        num_leaves_upper = int(0.8*num_leaves_max)
        num_leaves_lower = int(0.5*num_leaves_max)
        num_leaves = trial.suggest_int('num_leaves', num_leaves_lower, num_leaves_upper)
        min_child_samples = trial.suggest_int('min_child_samples', 1, 1000)
        # suggest_float(..., log=True) replaces the deprecated
        # suggest_loguniform: same bounds, log-uniform sampling.
        reg_alpha = trial.suggest_float('reg_alpha', 0.00001, 0.1, log=True)
        reg_lambda = trial.suggest_float('reg_lambda', 0.00001, 0.1, log=True)
        bagging_fraction = trial.suggest_float('bagging_fraction', 0.1, 0.9)
        feature_fraction = trial.suggest_float('feature_fraction', 0.1, 0.9)
        bagging_freq = trial.suggest_int('bagging_freq', 0, 50)
        params = {
            'boosting_type':'gbdt',
            'objective':'lambdarank',
            'metric':'"None"',
            'n_estimators':10000,
            'learning_rate':0.1,
            'random_state':500,
            'max_depth':max_depth,
            'num_leaves':num_leaves,
            'min_child_samples':min_child_samples,
            'reg_alpha':reg_alpha,
            'reg_lambda':reg_lambda,
            'bagging_fraction':bagging_fraction,
            'bagging_freq':bagging_freq,
            'feature_fraction':feature_fraction
        }
        return cross_validation(self.df, self.action_type, self.feature, params)
    
def optimize_parameter(action_type):
    """Run a 50-trial Optuna search for one action type and return the study.

    Loads ``train_<action_type>.parquet``, uses every column except the
    identifier/label columns as a feature, and maximises the value returned
    by the ``objective`` callable.
    """
    non_feature_columns = ['session', 'aid', 'gt', 'user_order']

    train = pl.read_parquet(f'train_{action_type}.parquet')
    feature = train.drop(non_feature_columns).columns

    study = optuna.create_study(direction="maximize")
    study.optimize(objective(train, action_type, feature), n_trials=50)
    return study

In [None]:
# Run the full evaluation: cross-validate every action type and print the
# weighted overall recall.
compute_score()