In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

#import janestreet
#env = janestreet.make_env() # initialize the environment

#!pip install datatable # Internet is not activated in this competition
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl
import datatable as dt

import pickle
MODEL_FILE = '/kaggle/working/model.pickle'

#import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

INPUT_DIR = '/kaggle/input/jane-street-market-prediction/'

Processing /kaggle/input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: datatable
Successfully installed datatable-0.11.0


/kaggle/input/jane-street-market-prediction/example_test.csv
/kaggle/input/jane-street-market-prediction/example_sample_submission.csv
/kaggle/input/jane-street-market-prediction/features.csv
/kaggle/input/jane-street-market-prediction/train.csv
/kaggle/input/jane-street-market-prediction/janestreet/__init__.py
/kaggle/input/jane-street-market-prediction/janestreet/competition.cpython-37m-x86_64-linux-gnu.so
/kaggle/input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl


Thanks to following notebooks :  
https://www.kaggle.com/drcapa/jane-street-market-prediction-starter-xgb/output?select=example_sample_submission.csv

# Load data

In [2]:
%%time
# Thanks to his notebook for this fast loading : https://www.kaggle.com/carlmcbrideellis/jane-street-eda-of-day-0-and-feature-importance
train_data_datatable = dt.fread('../input/jane-street-market-prediction/train.csv')
df = train_data_datatable.to_pandas()

# Thanks to this notebook to gain memory usage : https://www.kaggle.com/jorijnsmit/one-liner-to-halve-your-memory-usage
float64_cols = df.select_dtypes(include='float64').columns
mapper = {col_name: np.float32 for col_name in float64_cols}
df = df.astype(mapper)

CPU times: user 33.9 s, sys: 12.7 s, total: 46.5 s
Wall time: 35.6 s


In [3]:
df['resp'].sum()

976.0646

# Calculate target to predict

In [4]:
df[df['resp'] > 0]['resp'].quantile(0.20)

0.0022803622763603927

In [5]:
df[df['resp'] > 0.0022803622763603927].shape[0] / df[df['resp'] > 0].shape[0]

0.799999834021315

In [6]:
df['resp'].max()

0.4484615921974182

In [7]:
df['resp_positive'] = ((df['resp'])>0)*1

# Split train test

In [8]:
tscv = TimeSeriesSplit(n_splits=10)

for (train_index, test_index) in tscv.split(df):
    pass

df_train, df_test = df.loc[train_index], df.loc[test_index]
y_train, y_test = df.loc[train_index]['resp_positive'], df.loc[test_index]['resp_positive']

In [9]:
df_train.shape

(2173174, 139)

In [10]:
df_test.shape

(217317, 139)

In [11]:
df_test.shape[0] / df.shape[0]

0.09090893879123578

In [12]:
(df_train.shape[0] + df_test.shape[0]) == df.shape[0]

True

In [13]:
#df_train, df_test, y_train, y_test = train_test_split(df, df['resp_positive'], test_size = 0.1, stratify=df['resp_positive'], random_state=42)

In [14]:
df_test.reset_index(drop=True, inplace=True)

In [15]:
df_train.reset_index(drop=True, inplace=True)

# Data clean

In [16]:
cols_with_missing_train = [col for col in df_train.columns if df_train[col].isnull().any()]

In [17]:
#df_medians = df_train[cols_with_missing_train].median()
#df_train.loc[:, cols_with_missing_train].fillna(df_medians, inplace=True)
df_train.loc[:, cols_with_missing_train].fillna(-999, inplace=True)

In [18]:
#df_test.loc[:, cols_with_missing_train].fillna(df_medians, inplace=True)
df_test.loc[:, cols_with_missing_train].fillna(-999, inplace=True)

# Feature definition

In [19]:
df

Unnamed: 0,date,weight,resp_1,resp_2,resp_3,resp_4,resp,feature_0,feature_1,feature_2,...,feature_122,feature_123,feature_124,feature_125,feature_126,feature_127,feature_128,feature_129,ts_id,resp_positive
0,0,0.000000,0.009916,0.014079,0.008773,0.001390,0.006270,1,-1.872746,-2.191242,...,1.168391,8.313582,1.782433,14.018213,2.653056,12.600291,2.301488,11.445807,0,1
1,0,16.673515,-0.002828,-0.003226,-0.007319,-0.011114,-0.009792,-1,-1.349537,-1.704709,...,-1.178850,1.777472,-0.915459,2.831612,-1.417010,2.297459,-1.304614,1.898684,1,0
2,0,0.000000,0.025134,0.027607,0.033406,0.034380,0.023970,-1,0.812780,-0.256156,...,6.115747,9.667908,5.542871,11.671595,7.281757,10.060014,6.638248,9.427299,2,1
3,0,0.000000,-0.004730,-0.003273,-0.000461,-0.000476,-0.003200,-1,1.174379,0.344640,...,2.838853,0.499251,3.033731,1.513488,4.397532,1.266037,3.856384,1.013469,3,0
4,0,0.138531,0.001252,0.002165,-0.001215,-0.006219,-0.002604,1,-3.172026,-3.093182,...,0.344850,4.101145,0.614252,6.623456,0.800129,5.233243,0.362636,3.926633,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2390486,499,0.000000,0.000142,0.000142,0.005829,0.020342,0.015396,1,-1.649365,-1.169997,...,-1.260055,1.947725,-1.994399,-1.685163,-2.866165,-0.216130,-1.892048,0.901585,2390486,1
2390487,499,0.000000,0.000012,0.000012,-0.000935,-0.006326,-0.004718,1,2.432943,5.284504,...,1.064936,3.119762,-0.419796,-0.208975,-0.146749,0.730166,0.648452,2.068737,2390487,0
2390488,499,0.000000,0.000499,0.000499,0.007605,0.024907,0.016591,1,-0.622475,-0.963682,...,-0.640334,-2.279663,-0.950259,-4.388417,-1.669922,-3.288939,-1.336142,-2.814239,2390488,1
2390489,499,0.283405,-0.000156,-0.000156,-0.001375,-0.003702,-0.002004,-1,-1.463757,-1.107228,...,-1.780962,0.881246,-2.202140,-1.912601,-3.341684,-0.571187,-2.185795,0.627452,2390489,0


In [20]:
FEATURES_LIST = ['feature_'+str(i) for i in range(130)] + ['weight']

# Utility calculation function

In [21]:
def utility_function(df_test, df_test_predictions):
    df_test.loc[:, 'utility_pj'] = df_test['weight'] * df_test['resp'] * df_test_predictions
    df_test_utility_pi = df_test.groupby('date').sum('utility_pj')['utility_pj']
    nb_unique_dates = df_test_utility_pi.shape[0]
    t = (df_test_utility_pi.sum() / np.sqrt(df_test_utility_pi.pow(2).sum())) * (np.sqrt(250 / np.abs(nb_unique_dates)))
    u = min(max(t, 0), 6) * df_test_utility_pi.sum()
    
    return(u)

# Train model

In [22]:
model = XGBClassifier()

In [23]:
'''
param_search = { 'random_state': [42],
                 'max_depth': [50, 100, 500],
                 'n_estimators': [50, 500, 1000],
                 'tree_method': ['gpu_hist'],
               }
'''
param_search = { 'random_state': [42],
                 'max_depth': [12],
                 'n_estimators': [500],
                'learning_rate': [0.05],
                'subsample': [0.9],
                'colsample_bytree': [0.7],
                 'tree_method': ['gpu_hist'],
               }


In [24]:
%%time
tscv_subsplits = TimeSeriesSplit(n_splits=3)

grid_search = GridSearchCV(estimator=model, cv=tscv_subsplits,
                        param_grid=param_search, verbose=10)

grid_search.fit(df_train[FEATURES_LIST], y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, n_estimators=500, random_state=42, subsample=0.9, tree_method=gpu_hist 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, n_estimators=500, random_state=42, subsample=0.9, tree_method=gpu_hist, score=0.514, total= 2.1min
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, n_estimators=500, random_state=42, subsample=0.9, tree_method=gpu_hist 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min remaining:    0.0s


[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, n_estimators=500, random_state=42, subsample=0.9, tree_method=gpu_hist, score=0.510, total= 2.4min
[CV] colsample_bytree=0.7, learning_rate=0.05, max_depth=12, n_estimators=500, random_state=42, subsample=0.9, tree_method=gpu_hist 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.5min remaining:    0.0s


[CV]  colsample_bytree=0.7, learning_rate=0.05, max_depth=12, n_estimators=500, random_state=42, subsample=0.9, tree_method=gpu_hist, score=0.514, total= 2.9min


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.4min remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  7.4min finished


CPU times: user 11min 16s, sys: 17.5 s, total: 11min 34s
Wall time: 9min 37s


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=3),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, mon...
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                           

In [25]:
print('Model training ended')

Model training ended


In [26]:
model = grid_search

In [27]:
model.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.05, max_delta_step=0, max_depth=12,
              min_child_weight=1, missing=nan,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=500, n_jobs=0, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [28]:
model.best_estimator_.feature_importances_

array([0.00823211, 0.00560844, 0.00588764, 0.00989601, 0.0101052 ,
       0.01110377, 0.01129792, 0.00674406, 0.00705924, 0.00555958,
       0.00643467, 0.00606411, 0.00641348, 0.0056863 , 0.005995  ,
       0.00561384, 0.00616039, 0.00709071, 0.00724734, 0.00596976,
       0.00703349, 0.00655146, 0.00682776, 0.00679222, 0.00664621,
       0.00685698, 0.00684351, 0.0092739 , 0.00737781, 0.00634544,
       0.00631369, 0.0070104 , 0.00682822, 0.00655606, 0.00661923,
       0.00671735, 0.00673991, 0.00987545, 0.00996683, 0.01009811,
       0.01034685, 0.0144034 , 0.01567014, 0.01464951, 0.01400632,
       0.01462375, 0.00600775, 0.00638606, 0.00658803, 0.00655731,
       0.00664358, 0.00641142, 0.00587919, 0.00684816, 0.00658931,
       0.00905232, 0.00630558, 0.00814724, 0.00718965, 0.00669577,
       0.01131259, 0.01326098, 0.01196003, 0.01300935, 0.00873492,
       0.0074709 , 0.0083607 , 0.00785076, 0.00866328, 0.00776158,
       0.00772263, 0.00798759, 0.00686969, 0.00632943, 0.00622

# Save model

In [29]:
print('Saving model')

Saving model


In [30]:
with open(MODEL_FILE, 'wb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

In [31]:
df_grid_search_results = pd.concat([pd.DataFrame(grid_search.cv_results_["params"]),pd.DataFrame(grid_search.cv_results_["mean_test_score"], columns=["mean_test_score"])],axis=1)
df_grid_search_results = pd.concat([df_grid_search_results,pd.DataFrame(grid_search.cv_results_["std_test_score"], columns=["std_test_score"])],axis=1)
df_grid_search_results = pd.concat([df_grid_search_results,pd.DataFrame(grid_search.cv_results_["mean_fit_time"], columns=["mean_fit_time"])],axis=1)
df_grid_search_results = pd.concat([df_grid_search_results,pd.DataFrame(grid_search.cv_results_["mean_score_time"], columns=["mean_score_time"])],axis=1)

In [32]:
df_grid_search_results

Unnamed: 0,colsample_bytree,learning_rate,max_depth,n_estimators,random_state,subsample,tree_method,mean_test_score,std_test_score,mean_fit_time,mean_score_time
0,0.7,0.05,12,500,42,0.9,gpu_hist,0.512633,0.002082,96.639476,50.88363


# Test predictions

In [33]:
print('Testing predictions')

Testing predictions


In [34]:
df_test_predictions = model.predict(df_test[FEATURES_LIST])

In [35]:
print('Calculating utility score')

Calculating utility score


In [36]:
utility_score = utility_function(df_test, df_test_predictions)

In [37]:
utility_score

1883.1550978776543

In [38]:
print('Calculating accuracy score')

Calculating accuracy score


In [39]:
accuracy_score(y_test, df_test_predictions)

0.5241697612243865

# Make random predictions

In [40]:
print('Random predictions test')

Random predictions test


In [41]:
action_1_proba = df_test[df_test['resp_positive'] == 1].shape[0] / df_test.shape[0]
print(action_1_proba)

0.4981800779506435


In [42]:
np.random.seed(42)

In [43]:
df_predictions_test_random = pd.Series(np.asarray(np.random.rand(df_test.shape[0]) > (1 - action_1_proba), dtype=int))

In [44]:
df_predictions_test_random.reset_index(drop=True, inplace=True)

In [45]:
df_predictions_test_random.mean()

0.4991648145336076

In [46]:
accuracy_score(y_test, df_predictions_test_random)

0.500683333563412

In [47]:
df_predictions_test_random.shape

(217317,)

# Calculate utility

In [48]:
df_test.reset_index(drop=True, inplace=True)

In [49]:
utility_function(df_test, df_predictions_test_random)

79.16867371626394