In [None]:
!pip install polars

In [None]:
import polars as pl
import pandas as pd
pd.set_option('display.max_rows', 50)
import numpy as np
import math
from lightgbm.sklearn import LGBMRanker
import matplotlib.pyplot as plt
%matplotlib inline
from catboost import Pool, CatBoostRanker
import xgboost as xgb
import gc

In [None]:
def down_sampling(df, nega_posi_ratio, seed=0):
    """Keep every positive row and a random subset of the negative rows.

    Args:
        df: polars DataFrame with a ``gt`` column; rows with ``gt == 1`` are
            kept as-is, rows with ``gt == 0`` are sub-sampled.
        nega_posi_ratio: number of negative rows to draw per unit of the
            ``gt`` column sum.
        seed: seed forwarded to ``DataFrame.sample`` (default 0, the value
            that was previously hard-coded).

    Returns:
        polars DataFrame with the positive rows followed by the sampled
        negative rows.
    """
    positives = df.filter(pl.col('gt')==1)
    negatives = df.filter(pl.col('gt')==0)

    # `sum()` yields None on an empty column; treat that as "no positives".
    n_requested = int(nega_posi_ratio * (df['gt'].sum() or 0))
    # Sampling without replacement fails when asked for more rows than exist,
    # so never request more negatives than are available.
    n_sampled = min(n_requested, len(negatives))

    sampled_negative = negatives.sample(n_sampled, seed=seed)
    return pl.concat([positives, sampled_negative])

def infer_gbranker(test, gbranker, feature, nsplit):
    """Score candidate rows with a fitted ranker, processing them in chunks.

    Args:
        test: polars DataFrame holding ``session``, ``aid`` and every column
            named in ``feature``.
        gbranker: fitted model exposing ``predict`` on a pandas DataFrame.
        feature: list of feature column names passed to the model.
        nsplit: number of roughly equal row chunks to score one at a time
            (must be >= 1).

    Returns:
        polars DataFrame with columns ``session``, ``aid`` and a Float32
        ``score``, in the same row order as ``test``.

    Raises:
        ValueError: if ``nsplit`` is smaller than 1.
    """
    if nsplit < 1:
        raise ValueError(f'nsplit must be >= 1, got {nsplit}')

    n_rows = len(test)
    if n_rows == 0:
        # Nothing to score: return an empty frame with the expected schema
        # instead of calling predict / concat on empty inputs.
        return test[['session', 'aid']].with_columns([pl.Series('score', [], dtype=pl.Float32)])

    chunk_size = math.ceil(n_rows / nsplit)
    chunks = []
    # Stepping by chunk_size stops at the last row, so no empty trailing chunk
    # is ever handed to the model when nsplit does not divide n_rows evenly.
    for start in range(0, n_rows, chunk_size):
        end = min(start + chunk_size, n_rows)
        chunk = test[start:end]
        score = gbranker.predict(chunk[feature].to_pandas())
        chunks.append(
            chunk[['session', 'aid']].with_columns([pl.Series('score', score).cast(pl.Float32)])
        )
    return pl.concat(chunks)

In [None]:
%%time
# max_depth = {'click':7, 'cart':6, 'order':7}
# randam forest best parameters
# n_estimators = {'click':600, 'cart':600, 'order':600}
# max_depth = {'click':14, 'cart':15, 'order':15}
# num_leaves = {'click':11267, 'cart':21454, 'order':18287}
# bagging_fraction = {'click':0.12562664982574, 'cart':0.8820039657877258, 'order':0.7327477237093407}
# feature_fraction = {'click':0.6869763784883627, 'cart':0.5869260228060384, 'order':0.4308816079881996}
nega_posi_ratio = {'click':10, 'cart':20, 'order':20}
pred = {}
for action_type in ['order', 'cart', 'click']:
    train = pl.read_parquet(f'train_{action_type}.parquet')
    train = down_sampling(train, nega_posi_ratio[action_type])
    session_lengths_train = train.groupby('session').count().sort('session')['count'].to_list()
    train = train.sort('session')

    feature = train.drop([
                    'session', 
                    'aid', 
                    'gt',
                    'user_order'
                ]).columns   

#     params = {
#         'loss_function':'YetiRank',
#         'iterations':n_estimators[action_type],
#         'learning_rate':0.1,
#         'random_seed':100
#     }
    
#     train_pool = Pool(data=train[feature].to_numpy(), label=train['gt'].to_numpy(), group_id=train['session'].to_numpy())
#     ranker = CatBoostRanker(**params)
#     ranker.fit(
#         train_pool
#     )
    
#     train_dataset = xgb.DMatrix(data=train[feature].to_numpy(), label=train['gt'].to_numpy(), group=session_lengths_train)
    
#     params = {
#         'booster':'gbtree',
#         'objective':'rank:pairwise',
#         'random_state':100,
#         'learning_rate':0.1
#     }
    
#     ranker = xgb.train(
#         params,
#         train_dataset,
#         num_boost_round=n_estimators[action_type]
#     )

    params = {
        'boosting_type':'gbdt',
        'objective':'lambdarank',
        'metric':'"None"',
        'learning_rate':0.05,
        'num_boost_round':500,
        'max_depth':6,
        'num_leaves':32,
        'min_child_samples':471,
        'reg_alpha':0.06786952863490345,
        'reg_lambda':0.0013212485115586014,
        'random_state':500,
        'bagging_fraction': 0.877462547767822,
        'feature_fraction': 0.37792222260319913,
        'bagging_freq': 1
    }
    
    ranker = LGBMRanker(
        **params
        # objective="lambdarank",
        # metric="ndcg",
        # boosting_type="rf",
        # learning_rate=0.1,
        # max_depth=max_depth[action_type],
        # num_leaves=num_leaves[action_type],
        # bagging_fraction=bagging_fraction[action_type],
        # bagging_freq=1,
        # feature_fraction=feature_fraction[action_type],
        # n_estimators=n_estimators[action_type],
        # random_state=400
        # importance_type='gain'
    )
    
    ranker = ranker.fit(
        train[feature].to_pandas(),
        train['gt'].to_pandas(),
        group=session_lengths_train,
    )
    
    del train
    
    test = pl.read_parquet(f'test_{action_type}.parquet')
    test = infer_gbranker(test, ranker, feature, 2)
    
    pred[action_type] = test.sort(['session', 'score'], reverse=[False, True]).groupby('session', maintain_order=True).agg(pl.col('aid').head(20))
    pred[action_type] = pred[action_type].with_columns([pl.col('session').apply(lambda x:str(x)+f'_{action_type}s'), pl.col('aid').apply(lambda x:' '.join(map(str, x.to_list())))])
    pred[action_type].write_parquet(f'pred_{action_type}.parquet')
    del test
    gc.collect()

In [None]:
# Stack the per-type predictions (clicks, carts, orders) and rename the two
# columns to the names used in the submission file.
frames = [pred[kind] for kind in ('click', 'cart', 'order')]
column_names = {'session':'session_type', 'aid':'labels'}
submission = pl.concat(frames).rename(column_names)
submission

In [None]:
# Write the combined predictions to the CSV file that is zipped and uploaded below.
submission.write_csv('submission.csv')

In [None]:
!mkdir /root/.kaggle
!cp kaggle.json /root/.kaggle/

In [None]:
# Compress the submission CSV into submission.zip for upload.
!zip -r submission.zip submission.csv

In [None]:
# Upload submission.zip to the otto-recommender-system competition via the Kaggle CLI.
!kaggle competitions submit -c otto-recommender-system -f submission.zip -m submit_by_kaggle_api