In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import pickle
import xgboost as xgb
import numpy as np
import pandas as pd

import time
from tqdm.notebook import tqdm

import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation / misuse warnings from the
# libraries used below - consider narrowing the filter.
warnings.filterwarnings("ignore")

# Dataset processing

In [None]:
def load_df(path):
    """
    Read the training set and collect its feature column names.

    path - directory prefix; 'train.parquet' is appended to it verbatim,
           so it must already end with a path separator
    returns (df, features): the loaded dataframe and the list of its
    columns whose name contains 'feature', in dataframe column order
    """
    df = pd.read_parquet(path + 'train.parquet')

    features = []
    for column in df.columns:
        if 'feature' in column:
            features.append(column)

    return df, features


# load data and features
# The drive is mounted at the fixed location '/content/drive', so address the
# data through that mount point rather than through the current working
# directory, which may change during the session.
path = '/content/drive/MyDrive/'
df, features = load_df(path)
print(df.shape)
print(len(features))

(2390491, 138)
130


In [None]:
def add_actions(df, features):
    """
    Filter, normalize and label the trading opportunities.

    df - pandas.dataframe with at least 'weight', 'resp' and the feature columns,
    features - list of feature column names; every entry except the first
               is imputed and standardized

    Returns a new dataframe (the input is not modified) that keeps only rows
    with weight > 0, has a fresh 0..n-1 index, and carries an extra 'action'
    column equal to 1 where resp > 0 and 0 otherwise.
    """
    # the first feature is left as-is
    # NOTE(review): presumably because it is not a continuous feature - confirm
    scaled_cols = features[1:]

    # statistics are taken over ALL rows, i.e. before the weight filter below
    unfiltered = df[scaled_cols]
    col_means, col_stds = unfiltered.mean(), unfiltered.std()

    # drop all trading opportunities that are not taken into account for
    # utility score calculation
    kept = df.query('weight > 0').reset_index(drop = True)

    # impute missing values with the column mean, then standardize each feature
    imputed = kept[scaled_cols].fillna(col_means)
    kept[scaled_cols] = (imputed - col_means) / col_stds

    # the correct action for each trading opportunity: trade iff resp is positive
    kept['action'] = (kept['resp'] > 0).astype('int')
    return kept

# filter / normalize the rows and attach the target 'action' column
df = add_actions(df, features)

# contiguous row-order split: first 90% train, next 5% test, last 5% validation
n_samples = df.shape[0]
train_end = int(0.9 * n_samples)
test_end = int(0.95 * n_samples)

df_train = df.iloc[:train_end]
df_test = df.iloc[train_end:test_end]
df_valid = df.iloc[test_end:]

# model inputs are the feature columns, the target is the 'action' label
X_train, y_train = df_train[features], df_train['action']
X_valid, y_valid = df_valid[features], df_valid['action']
X_test, y_test = df_test[features], df_test['action']

In [None]:
def utility_score(df, action):
    """
    Compute the utility score of a set of trading decisions.

    df - pandas.dataframe with 'date' (non-negative integers), 'weight'
         and 'resp' columns,
    action - numpy array with len df.shape[0]

    For each date the weighted return of the taken actions is summed
    (Pi = sum(weight * resp * action)); then
        t = sum(Pi) / sqrt(sum(Pi ** 2)) * sqrt(250 / number_of_dates)
        u = clip(t, 0, 6) * sum(Pi)

    Returns u. When there are no rows, or every daily sum is zero (for
    example when no action is taken), t is undefined and 0.0 is returned
    instead of NaN.
    """
    date = df['date'].values
    weight = df['weight'].values
    resp = df['resp'].values

    count_i = len(np.unique(date))
    if count_i == 0:
        return 0.0

    # bincount groups by date value; dates that do not occur contribute 0
    Pi = np.bincount(date, weight * resp * action)
    total = np.sum(Pi)
    norm = np.sqrt(np.sum(Pi ** 2))
    if norm == 0:
        # all daily sums are zero: avoid the 0 / 0 division
        return 0.0

    t = total / norm * np.sqrt(250 / count_i)
    u = np.clip(t, 0, 6) * total
    return u

# Hyperparameter tuning

In [None]:
# default parameters

clf = xgb.XGBClassifier(
    tree_method='gpu_hist'
)

%time clf.fit(X_train, y_train)
file_name = "default.pkl"
pickle.dump(clf, open(file_name, "wb"))
xgb_model = pickle.load(open(file_name, "rb"))
preds = xgb_model.predict(X_valid)
utility = utility_score(df_valid, preds)
print(utility)

CPU times: user 1min 11s, sys: 864 ms, total: 1min 12s
Wall time: 1min 11s
1498.3463045168141


In [None]:
# linear booster

clf = xgb.XGBClassifier(
    booster='gblinear',
    tree_method='gpu_hist'
)

%time clf.fit(X_train, y_train)
file_name = "linear.pkl"
pickle.dump(clf, open(file_name, "wb"))
xgb_model = pickle.load(open(file_name, "rb"))
preds = xgb_model.predict(X_valid)
utility = utility_score(df_valid, preds)
print(utility)

CPU times: user 4min 23s, sys: 5.53 s, total: 4min 29s
Wall time: 4min 28s
1343.8330975604604


In [None]:
# dart booster

clf = xgb.XGBClassifier(
    booster='dart',
    tree_method='gpu_hist'
)

%time clf.fit(X_train, y_train)
file_name = "dart.pkl"
pickle.dump(clf, open(file_name, "wb"))
xgb_model = pickle.load(open(file_name, "rb"))
preds = xgb_model.predict(X_valid)
utility = utility_score(df_valid, preds)
print(utility)

CPU times: user 6min 39s, sys: 1.95 s, total: 6min 41s
Wall time: 6min 39s
1491.8275908008322


In [None]:
# n_estimators

estimators = range(300, 900, 200)
for n in estimators:
    clf = xgb.XGBClassifier(
        n_estimators=n,
        tree_method='gpu_hist'
    )

    %time clf.fit(X_train, y_train)
    file_name = "n_est" + str(n) + ".pkl"
    pickle.dump(clf, open(file_name, "wb"))
    xgb_model = pickle.load(open(file_name, "rb"))
    preds = xgb_model.predict(X_valid)
    utility = utility_score(df_valid, preds)
    print("utility for n_estimators {} is {}".format(n, utility))

CPU times: user 3min 8s, sys: 834 ms, total: 3min 9s
Wall time: 3min 8s
utility for n_estimators 300 is 1569.7038068926515
CPU times: user 5min 3s, sys: 877 ms, total: 5min 4s
Wall time: 5min 2s
utility for n_estimators 500 is 1591.3569392226661
CPU times: user 7min 2s, sys: 957 ms, total: 7min 3s
Wall time: 7min 1s
utility for n_estimators 700 is 1581.5742408259798


In [None]:
# max depth

max_depths = [3, 7, 11, 13]
for depth in max_depths:
    clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=depth,
        tree_method='gpu_hist'
    )

    %time clf.fit(X_train, y_train)
    file_name = "depth" + str(depth) + ".pkl"
    pickle.dump(clf, open(file_name, "wb"))
    xgb_model = pickle.load(open(file_name, "rb"))
    preds = xgb_model.predict(X_valid)
    utility = utility_score(df_valid, preds)
    print("utility for max depth {} is {}".format(depth, utility))

CPU times: user 5min 4s, sys: 940 ms, total: 5min 5s
Wall time: 5min 4s
utility for max depth 3 is 1591.3569392226661
CPU times: user 6min 51s, sys: 893 ms, total: 6min 52s
Wall time: 6min 50s
utility for max depth 7 is 1230.877361456051
CPU times: user 9min 45s, sys: 1.22 s, total: 9min 47s
Wall time: 9min 45s
utility for max depth 11 is 786.8524075961818
CPU times: user 12min 45s, sys: 1.41 s, total: 12min 47s
Wall time: 12min 47s
utility for max depth 13 is 810.2427474952403


In [None]:
clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=5,
        tree_method='gpu_hist'
    )

%time clf.fit(X_train, y_train)
file_name = "depth" + str(5) + ".pkl"
pickle.dump(clf, open(file_name, "wb"))
xgb_model = pickle.load(open(file_name, "rb"))
preds = xgb_model.predict(X_valid)
utility = utility_score(df_valid, preds)
print("utility for max depth {} is {}".format(5, utility))

CPU times: user 5min 55s, sys: 1.3 s, total: 5min 56s
Wall time: 5min 54s
utility for max depth 5 is 1436.3668865598925


In [None]:
# learning rate

learning_rate = [0.001, 0.01, 0.1]
for lr in learning_rate:
    clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=3,
        leatning_rate=lr,
        tree_method='gpu_hist'
    )

    %time clf.fit(X_train, y_train)
    file_name = "lr" + str(lr) + ".pkl"
    pickle.dump(clf, open(file_name, "wb"))
    xgb_model = pickle.load(open(file_name, "rb"))
    preds = xgb_model.predict(X_valid)
    utility = utility_score(df_valid, preds)
    print("utility for lr {} is {}".format(lr, utility))

CPU times: user 5min 4s, sys: 937 ms, total: 5min 5s
Wall time: 5min 3s
utility for lr 0.001 is 1591.3569392226661
CPU times: user 5min 4s, sys: 846 ms, total: 5min 5s
Wall time: 5min 3s
utility for lr 0.01 is 1591.3569392226661
CPU times: user 5min 3s, sys: 814 ms, total: 5min 4s
Wall time: 5min 2s
utility for lr 0.1 is 1591.3569392226661


In [None]:
subsamples = [0.8, 0.85, 0.9, 0.95]
for subsample in subsamples:
    clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=3,
        leatning_rate=0.01,
        subsample=subsample,
        tree_method='gpu_hist'
    )

    %time clf.fit(X_train, y_train)
    file_name = "subsample" + str(subsample) + ".pkl"
    pickle.dump(clf, open(file_name, "wb"))
    xgb_model = pickle.load(open(file_name, "rb"))
    preds = xgb_model.predict(X_valid)
    utility = utility_score(df_valid, preds)
    print("utility for subsample {} is {}".format(subsample, utility))

CPU times: user 4min 43s, sys: 942 ms, total: 4min 44s
Wall time: 4min 43s
utility for subsample 0.8 is 1660.680341669935
CPU times: user 4min 53s, sys: 845 ms, total: 4min 54s
Wall time: 4min 52s
utility for subsample 0.85 is 1521.3487816444795
CPU times: user 4min 54s, sys: 856 ms, total: 4min 55s
Wall time: 4min 53s
utility for subsample 0.9 is 1650.3450043821158
CPU times: user 5min 1s, sys: 901 ms, total: 5min 2s
Wall time: 5min 1s
utility for subsample 0.95 is 1691.098231945731


In [None]:
colsample_bytrees = [0.5, 0.7, 0.9]
for colsample_bytree in colsample_bytrees:
    clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=3,
        leatning_rate=0.01,
        subsample=0.95,
        colsample_bytree=colsample_bytree,
        missing=-999,
        tree_method='gpu_hist'
    )

    %time clf.fit(X_train, y_train)
    file_name = "colsample_bytree" + str(colsample_bytree) + ".pkl"
    pickle.dump(clf, open(file_name, "wb"))
    xgb_model = pickle.load(open(file_name, "rb"))
    preds = xgb_model.predict(X_valid)
    utility = utility_score(df_valid, preds)
    print("utility for colsample_bytree {} is {}".format(colsample_bytree, utility))

CPU times: user 4min 58s, sys: 972 ms, total: 4min 59s
Wall time: 4min 58s
utility for colsample_bytree 0.5 is 1596.3770293180833
CPU times: user 5min 1s, sys: 839 ms, total: 5min 2s
Wall time: 5min 1s
utility for colsample_bytree 0.7 is 1638.7770882556533
CPU times: user 5min 3s, sys: 803 ms, total: 5min 4s
Wall time: 5min 2s
utility for colsample_bytree 0.9 is 1560.4173988946209


In [None]:
# test the best model
clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=3,
        leatning_rate=0.01,
        subsample=0.95,
        colsample_bytree=0.7,
        missing=-999,
        tree_method='gpu_hist'
    )

%time clf.fit(X_train, y_train)
file_name = "best.pkl"
pickle.dump(clf, open(file_name, "wb"))
xgb_model = pickle.load(open(file_name, "rb"))
preds = xgb_model.predict(X_test)
utility = utility_score(df_test, preds)
print("utility score of the best model is {}".format(utility))

CPU times: user 5min 2s, sys: 916 ms, total: 5min 3s
Wall time: 5min 2s
utility score of the best model is 850.8818457293037
