In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier


import janestreet
env = janestreet.make_env() # initialize the environment

#!pip install datatable # Internet is not activated in this competition
!pip install ../input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl
import datatable as dt


import xgboost as xgb

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

INPUT_DIR = '/kaggle/input/jane-street-market-prediction/'

Processing /kaggle/input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl
Installing collected packages: datatable
Successfully installed datatable-0.11.0


/kaggle/input/python-datatable/datatable-0.11.0-cp37-cp37m-manylinux2010_x86_64.whl
/kaggle/input/jane-street-market-prediction/example_test.csv
/kaggle/input/jane-street-market-prediction/example_sample_submission.csv
/kaggle/input/jane-street-market-prediction/features.csv
/kaggle/input/jane-street-market-prediction/train.csv
/kaggle/input/jane-street-market-prediction/janestreet/__init__.py
/kaggle/input/jane-street-market-prediction/janestreet/competition.cpython-37m-x86_64-linux-gnu.so


In [2]:
import xgboost as xgb

In [3]:
import pickle
MODEL_FILE = '/kaggle/working/model.pickle'

Thanks to following notebooks :  
https://www.kaggle.com/drcapa/jane-street-market-prediction-starter-xgb/output?select=example_sample_submission.csv

# Load data

In [4]:
%%time
# Thanks to his notebook for this fast loading : https://www.kaggle.com/carlmcbrideellis/jane-street-eda-of-day-0-and-feature-importance
train_data_datatable = dt.fread('../input/jane-street-market-prediction/train.csv')
df = train_data_datatable.to_pandas()

# Thanks to this notebook to gain memory usage : https://www.kaggle.com/jorijnsmit/one-liner-to-halve-your-memory-usage
float64_cols = df.select_dtypes(include='float64').columns
mapper = {col_name: np.float32 for col_name in float64_cols}
df = df.astype(mapper)

CPU times: user 34.3 s, sys: 13.1 s, total: 47.4 s
Wall time: 34.6 s


In [5]:
df['resp'].sum()

976.0646

# Calculate target to predict

In [6]:
df[df['resp'] > 0]['resp'].quantile(0.20)

0.0022803622763603927

In [7]:
df[df['resp'] > 0.0022803622763603927].shape[0] / df[df['resp'] > 0].shape[0]

0.799999834021315

In [8]:
df['resp'].max()

0.4484615921974182

In [9]:
df['resp_positive'] = ((df['resp'])>0)*1

# Split train test

In [10]:
df_train, df_test, y_train, y_test = train_test_split(df, df['resp_positive'], test_size = 0.1, stratify=df['resp_positive'], random_state=42)

In [11]:
df_test.reset_index(drop=True, inplace=True)

# Data clean

In [12]:
cols_with_missing_train = [col for col in df_train.columns if df_train[col].isnull().any()]

In [13]:
#df_medians = df_train[cols_with_missing_train].median()
#df_train.loc[:, cols_with_missing_train].fillna(df_medians, inplace=True)
df_train.loc[:, cols_with_missing_train].fillna(-999, inplace=True)

In [14]:
#df_test.loc[:, cols_with_missing_train].fillna(df_medians, inplace=True)
df_test.loc[:, cols_with_missing_train].fillna(-999, inplace=True)

# Feature definition

In [15]:
FEATURES_LIST = ['feature_'+str(i) for i in range(130)]

# Utility calculation function

In [16]:
def utility_function(df_test, df_test_predictions):
    df_test.loc[:, 'utility_pj'] = df_test['weight'] * df_test['resp'] * df_test_predictions
    df_test_utility_pi = df_test.groupby('date').sum('utility_pj')['utility_pj']
    nb_unique_dates = df_test_utility_pi.shape[0]
    t = (df_test_utility_pi.sum() / np.sqrt(df_test_utility_pi.pow(2).sum())) * (np.sqrt(250 / np.abs(nb_unique_dates)))
    u = min(max(t, 0), 6) * df_test_utility_pi.sum()
    
    return(u)

# Train model

In [17]:
%%time
model = xgb.XGBClassifier(random_state=42, max_depth=50, n_estimators=50, tree_method = 'gpu_hist')
#model = xgb.XGBClassifier(random_state=42, max_depth=50, n_estimators=50, disable_default_eval_metric=True)
model.fit(df_train[FEATURES_LIST], y_train)

CPU times: user 8min 54s, sys: 8.79 s, total: 9min 3s
Wall time: 9min 8s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=50,
              min_child_weight=1, missing=nan,
              monotone_constraints='(0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0)',
              n_estimators=50, n_jobs=0, num_parallel_tree=1, random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [18]:
df_test_predictions = model.predict(df_test[FEATURES_LIST])

In [19]:
utility_score = utility_function(df_test, df_test_predictions)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


In [20]:
utility_score

9868.183429858607

In [21]:
accuracy_score(y_test, df_test_predictions)

0.657485881614725

# Save model

In [22]:
with open(MODEL_FILE, 'wb') as f:
    pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

# Make random predictions

In [23]:
action_1_proba = df_test[df_test['resp_positive'] == 1].shape[0] / df_test.shape[0]
print(action_1_proba)

0.5040702781844802


In [24]:
np.random.seed(42)

In [25]:
df_predictions_test_random = pd.Series(np.asarray(np.random.rand(df_test.shape[0]) > (1 - action_1_proba), dtype=int))

In [26]:
df_predictions_test_random.reset_index(drop=True, inplace=True)

In [27]:
df_predictions_test_random.mean()

0.5049320225894164

In [28]:
accuracy_score(y_test, df_predictions_test_random)

0.4997197239071324

In [29]:
df_predictions_test_random.shape

(239050,)

# Calculate utility

In [30]:
df_test.reset_index(drop=True, inplace=True)

In [31]:
utility_function(df_test, df_predictions_test_random)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


-0.0