# $\text{Jane Street Market Prediction}$

## $\text{Getting the data}$

## For Oscar:

In [5]:
# Data
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500) #arbitrary large number, I wanna see all columns


# Plotting
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

#utilities
import os
from collections import Counter
import joblib


#MOdels and evalutation
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix
import lightgbm as lgb
import tensorflow as tf

In [6]:
%%time
working_dir = "/Users/oscarengelbrektson/Documents/Minerva/Spring 2021 - San Francisco/Quantitative Trading/Jane Street Competition/data/"

train_data = pd.read_csv(working_dir + "train.csv")
test_data_sample = pd.read_csv(working_dir + "example_test.csv")
feature_data = pd.read_csv(working_dir + "features.csv")


CPU times: user 56.1 s, sys: 11.1 s, total: 1min 7s
Wall time: 1min 9s


## For Taha

In [1]:
# getting access to Google Drive to retrieve data
#from google.colab import drive
#drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# changing the working directory
#%cd /content/gdrive/My Drive/Kaggle

/content/gdrive/My Drive/Kaggle


In [3]:
'''
!kaggle competitions download -c jane-street-market-prediction
!unzip \*.zip  && rm *.zip''';

In [7]:
%%time
train_data = pd.read_csv('C:/Users/Taha/Desktop/Spring 2021/JaneStreet/train.csv')

Wall time: 2min 7s


In [9]:
# difference columns between train and test set
#set(train_data.columns) - set(test_sample.columns)

## $\text{LightGMB}$

### $\text{Data Processing}$

In [7]:
#Get outcome variable by transforming resp > 0
train_data["action"] = train_data["resp"].apply(lambda x: int(x>0))

In [8]:
#Train-validation-test split : 300-100-100
train_set = train_data[train_data.date < 300]
validation_set = train_data[(train_data.date >= 300) & (train_data.date < 400)]
test_set = train_data[train_data.date >= 400]

train_set.date.nunique(), validation_set.date.nunique(), test_set.date.nunique()

(300, 100, 100)

In [9]:
outcomes = ['resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'action', 'ts_id']
predictors = list(filter(lambda x: x not in outcomes, train_data.columns))
outcome = ['action']

In [10]:
X_train, X_validation, y_train, y_validation = train_set[predictors],  validation_set[predictors], train_set[outcome], validation_set[outcome]
X_test, y_test = test_set[predictors], test_set[outcome]

## Loading the models

In [18]:
lgbm = joblib.load('final_lgbm_model.pkl')
lstm = tf.keras.models.load_model('lstm_model.h5')

## Making predictions on the validation set

In [12]:
y_pred_lgbm = lgbm.predict_proba(X_validation)
y_pred_lstm = lstm.predict_proba(X_validation)

Instructions for updating:
Please use `model.predict()` instead.


ValueError: in user code:

    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:1462 predict_function  *
        return step_function(self, iterator)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:1452 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:1445 run_step  **
        outputs = model.predict_step(data)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:1418 predict_step
        return self(x, training=False)
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/base_layer.py:975 __call__
        input_spec.assert_input_compatibility(self.input_spec, inputs,
    /Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/input_spec.py:176 assert_input_compatibility
        raise ValueError('Input ' + str(input_index) + ' of layer ' +

    ValueError: Input 0 of layer sequential is incompatible with the layer: expected ndim=3, found ndim=2. Full shape received: [None, 132]


In [None]:
#Store in Dataframe
validation_set["lgbm_predicted_prob"] = y_pred_lgbm[:, 1]
validation_set["lstm_predicted_prob"] = y_pred_lstm[:, 1]

## Showing how the utility changes as a function of the weight assigned to each models predictions

In [None]:
def utility_score_last(date, weight, resp, action):
    '''
    Takes four 1-d arrays of equal size:
    Date: int
    weight: float >= 0
    resp: float
    action: binary
    
    and returns jane street utility score, u
    '''
    count_i = date.nunique() # Get number of days
    P_i = np.bincount(date, weight * resp * action) # Compute P_i
    t = np.sum(P_i) / np.sqrt(np.sum(P_i ** 2)) * np.sqrt(250 / count_i) # Compute t
    u = np.clip(t, 0, 6) * np.sum(P_i) # Combine to get utility score
    return u

def get_utility_from_df(df, lgbm_weight=0.5):
    '''
    Takes a dataframe and a decision threshold, 
    computes the total utility given the decision threshold for converting predicted probabilities to actions
    '''
    weighted_predictions = (lgbm_weight*df.lgbm_predicted_prob + (1-lgbm_weight)*df.lstm_predicted_prob)
    #Transform predictions to actions by Round to 1 or 0
    return utility_score_last(df.date, df.weight, df.resp, int(np.round(weighted_predictions)))

In [None]:
lgbm_weights =  np.linspace(0, 1, 200)
utility_by_weight = [get_utility_from_df(validation_set, lgbm_weight) for lgbm_weight in lgbm_weights]

In [None]:
best_lgbm_weights = lgbm_weights[utility_by_weight.index(max(utility_by_weight))]

In [None]:
# Visualize
plt.figure(figsize=(16, 8))
plt.plot(lgbm_weights, utility_by_weight, label="Best utility: %s"%np.round(max(utility_by_threshold), 3))
plt.axvline(best_lgbm_weights, 
            color="red", linestyle="--",label="Best lgbm weight: %s"%np.round(best_threshold, 3))
plt.ylabel("Utility")
plt.xlabel("Weight assigned to LGBM")
plt.legend(loc=4)
plt.show()

In [None]:
plt.scatter(validation_set.lgbm_predicted_prob, validation_set.resp, label="r-squared: {}".format(np.corrcoef(validation_set.lgbm_predicted_prob,
                                                                                             validation_set.resp)[0,1]))
plt.xlabel("Predicted probability")
plt.ylabel("resp")

plt.legend(loc=0)
plt.show()

In [None]:
plt.scatter(validation_set.lgbm_predicted_prob, validation_set.resp, label="r-squared: {}".format(np.corrcoef(validation_set.lgbm_predicted_prob,
                                                                                             validation_set.weight)[0,1]))
plt.xlabel("Predicted probability")
plt.ylabel("resp")

plt.legend(loc=0)
plt.show()

# Compute test set utility

In [None]:
y_pred_lgbm = lgbm.predict_proba(X_test)
y_pred_lstm = lstm.predict_proba(X_test)

In [None]:
#Store in Dataframe
test_set["lgbm_predicted_prob"] = y_pred_lgbm[:, 1]
test_set["lstm_predicted_prob"] = y_pred_lstm[:, 1]

In [None]:
get_utility_from_df(test_set, best_lgbm_weights)