In [1]:
import pandas as pd
import datatable as dt
import optuna
import optuna.integration.lightgbm as lgb
from tqdm.notebook import tqdm
import os, gc, random, time
import numpy as np
import treelite, treelite_runtime
from numba import njit

import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

import lightgbm as lgbm
from lightgbm import LGBMClassifier
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Seed Python's and NumPy's global random generators with one shared constant
# so that code drawing from those generators is repeatable between runs.
SEED = 1111
random.seed(SEED)
np.random.seed(SEED)

In [3]:
%%time
# Read the training CSV with datatable, convert it to pandas and keep only
# rows with date > 85 and a strictly positive weight.
# (Alternative location of the same file on Kaggle:
#  /kaggle/input/jane-street-market-prediction/train.csv)
train = (
    dt.fread('/kaggle/working/input/train.csv')
    .to_pandas()
    .query('date > 85')
    .loc[lambda frame: frame['weight'] > 0]
    .reset_index(drop=True)
)

# Model inputs: feature_1 .. feature_129 (feature_0 is left out).
features = [f'feature_{i}' for i in range(1, 130)]
# Targets that get a model each ('resp_4' is binarised below but not trained on).
resp_cols = ['resp_1', 'resp_2', 'resp_3', 'resp']

# Mean-impute every feature except the first entry of `features`; the inference
# loop relies on f_mean lining up with features[1:].
imputed_cols = features[1:]
col_means = train[imputed_cols].mean()
train[imputed_cols] = train[imputed_cols].fillna(col_means)
f_mean = col_means.values

# 'action' must be derived before 'resp' is overwritten with its 0/1 label.
train['action'] = (train['resp'] > 0).astype('int')

# Turn each response into a 0/1 label: 1 when response * weight is positive.
# Only rows with weight > 0 remain, so this matches the sign of the response.
for target in ('resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4'):
    train[target] = ((train[target].values * train['weight']) > 0).astype(int)

CPU times: user 3min 8s, sys: 34.6 s, total: 3min 43s
Wall time: 22.4 s


In [None]:
# Feature matrix: one row per sample, one column per entry of `features`.
X_train = train[features].to_numpy()
# Label matrix: one 0/1 column per target in `resp_cols`, in that order.
y_train = (train[resp_cols] > 0).astype('int').values
# The DataFrame is no longer needed once the arrays exist; release its memory.
del train
gc.collect()

In [None]:
# LightGBM training parameters shared by every per-target model.
params = {
    "num_leaves": 300,          # maximum number of leaves per tree
    "max_bin": 450,             # maximum number of histogram bins per feature
    "feature_fraction": 0.52,   # fraction of features sampled for each tree
    "bagging_fraction": 0.52,   # fraction of rows sampled when bagging
    # Bagging is only performed when bagging_freq is non-zero; with the
    # default of 0 the bagging_fraction above is silently ignored.
    "bagging_freq": 1,
    "objective": "binary",
    "learning_rate": 0.05,
    "boosting_type": "gbdt",
    "metric": "auc",
}

In [None]:
# Train one binary LightGBM model per target column of y_train.
models = []

# `early_stopping_rounds=` / `verbose_eval=` were removed from lightgbm.train in
# LightGBM 4.0, so both are expressed as callbacks. `log_evaluation` appeared in
# LightGBM 3.3; older releases expose the same callback as `print_evaluation`.
_log_evaluation = getattr(lgbm, "log_evaluation", None) or lgbm.print_evaluation

for i in range(y_train.shape[1]):
    # Stratified 80/20 split on the current target. random_state is pinned so
    # re-running this cell reproduces the same split.
    x_tr, x_val, y_tr, y_val = train_test_split(
        X_train,
        y_train[:, i],
        test_size=0.2,
        stratify=y_train[:, i],
        random_state=SEED,
    )

    d_train = lgbm.Dataset(x_tr, label=y_tr)
    d_eval = lgbm.Dataset(x_val, label=y_val, reference=d_train)

    # Fresh callbacks per model: the early-stopping callback keeps internal
    # state and must not be shared between training runs.
    clf = lgbm.train(
        params,
        d_train,
        num_boost_round=1000,
        valid_sets=[d_train, d_eval],
        callbacks=[
            lgbm.early_stopping(stopping_rounds=50),  # stop after 50 rounds without improvement
            _log_evaluation(period=50),               # log metrics every 50 rounds
        ],
    )

    models.append(clf)

In [None]:
# Only the most recently trained model (the one for the last target in
# resp_cols) is written to disk.
models[-1].save_model('lgbm_model.txt')

# Treelite

In [None]:
%%time
# Optional Treelite path. When the flag is True, the LightGBM text model saved
# as 'lgbm_model.txt' is compiled into a shared library and loaded as
# `predictor`. While the flag is False nothing below runs and `predictor`
# is never defined.
usr_treelite = False
if usr_treelite:
    # Parse the saved model file, which is in LightGBM's format.
    model = treelite.Model.load('lgbm_model.txt', model_format='lightgbm')
    toolchain = 'gcc'
    # NOTE(review): 'parallel_comp' presumably splits the generated source into
    # 128 units for a faster build; confirm against the Treelite docs.
    model.export_lib(toolchain=toolchain, libpath='./mymodel.so', params={'parallel_comp': 128}, verbose=True)
    # Load the compiled library for prediction.
    predictor = treelite_runtime.Predictor('./mymodel.so', verbose=True)

# Submit

In [5]:
@njit
def fast_fillna(array, values):
    """Return `array` with its NaN entries replaced from `values`.

    The sum of the array serves as a cheap NaN probe: when the sum is not
    NaN the input is returned as-is, otherwise every NaN position takes the
    corresponding entry of `values`.
    """
    if not np.isnan(array.sum()):
        return array
    return np.where(np.isnan(array), values, array)

In [6]:
# Create the environment object used by the submission loop: its iter_test()
# yields (test_df, pred_df) pairs and its predict() receives each filled-in
# pred_df.
import janestreet
env = janestreet.make_env()

In [7]:
# Submission loop: predict an action for every test row handed out by `env`.
f = np.median   # aggregator applied across the per-target model outputs
th = 0.500      # act when the aggregated probability reaches this threshold

for (test_df, pred_df) in tqdm(env.iter_test()):
    if test_df['weight'].item() > 0:
        x_tt = test_df.loc[:, features].values

        # Impute missing values with the training means. Column 0 is skipped
        # because f_mean was computed over features[1:].
        tail = x_tt[:, 1:]
        missing = np.isnan(tail)
        if missing.any():
            x_tt[:, 1:] = np.where(missing, f_mean, tail)

        if usr_treelite:
            # Compiled single-model path.
            # (Older treelite_runtime API: treelite_runtime.Batch.from_npy2d(x_tt))
            pred = predictor.predict(treelite_runtime.DMatrix(x_tt))
        else:
            # Aggregate the predictions of all per-target LightGBM models.
            pred = f(np.stack([model.predict(x_tt) for model in models]), axis=0).T

        # `pred` is a one-element array; pull out the scalar explicitly, since
        # implicit array-to-scalar conversion is deprecated in NumPy >= 1.25.
        pred_df.action = int(np.asarray(pred).item() >= th)
    else:
        # Zero-weight rows are always passed.
        pred_df.action = 0
    env.predict(pred_df)

0it [00:00, ?it/s]

KeyboardInterrupt: 