# $\text{Jane Street Market Prediction}$

## $\text{Getting the data}$

## For Oscar:

In [1]:
# Data
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500) #arbitrary large number, I wanna see all columns


# Plotting
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

#utilities
import os
from collections import Counter
import joblib


# Models and evaluation
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score, confusion_matrix
import lightgbm as lgb

In [2]:
%%time
# Directory that holds train.csv, example_test.csv and features.csv.
# This is a machine-specific absolute path; it already ends with "/", so
# os.path.join produces exactly the same file paths as string concatenation.
working_dir = "/Users/oscarengelbrektson/Documents/Minerva/Spring 2021 - San Francisco/Quantitative Trading/Jane Street Competition/data/"

# Read each CSV into its own DataFrame.
train_data = pd.read_csv(os.path.join(working_dir, "train.csv"))
test_data_sample = pd.read_csv(os.path.join(working_dir, "example_test.csv"))
feature_data = pd.read_csv(os.path.join(working_dir, "features.csv"))


CPU times: user 53.7 s, sys: 5.26 s, total: 59 s
Wall time: 59.3 s


## For Taha

In [1]:
# getting access to Google Drive to retrieve data
#from google.colab import drive
#drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# changing the working directory
#%cd /content/gdrive/My Drive/Kaggle

/content/gdrive/My Drive/Kaggle


In [3]:
# Disabled shell commands kept for reference: they are wrapped in a string
# literal so nothing is executed, and the trailing ";" suppresses the cell's
# output. The text shows how the competition archive was downloaded and unzipped.
'''
!kaggle competitions download -c jane-street-market-prediction
!unzip \*.zip  && rm *.zip''';

In [7]:
%%time
# Load the training CSV from Taha's machine-specific absolute path.
# Running this cell rebinds `train_data`, replacing whatever an earlier
# cell had loaded under the same name.
train_data = pd.read_csv('C:/Users/Taha/Desktop/Spring 2021/JaneStreet/train.csv')

Wall time: 2min 7s


In [9]:
# difference columns between train and test set
#set(train_data.columns) - set(test_sample.columns)

## $\text{LightGBM}$

### $\text{Data Processing}$

In [3]:
# Binary outcome: 1 when resp is strictly positive, otherwise 0.
# The comparison is vectorized instead of calling a Python lambda once per row;
# NaN compares as False and therefore maps to 0, exactly as the lambda did.
train_data["action"] = (train_data["resp"] > 0).astype("int64")

In [6]:
# Split by date: date < 300 -> train, 300 <= date < 400 -> validation,
# date >= 400 -> test.
train_set = train_data.loc[train_data["date"].lt(300)]
validation_set = train_data.loc[train_data["date"].ge(300) & train_data["date"].lt(400)]
test_set = train_data.loc[train_data["date"].ge(400)]

# Number of distinct dates in each split (displayed as the cell output).
tuple(part["date"].nunique() for part in (train_set, validation_set, test_set))

(300, 100, 100)

In [8]:
# Columns that must not be used as model inputs: the resp columns, the
# derived `action` label and `ts_id`.
non_predictor_columns = {'resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4', 'action', 'ts_id'}

# Keep the DataFrame's own column order. Building this list from a set
# difference gave an order that depends on string hashing, which can change
# from one kernel session to the next; a model pickled in one session could
# then be fed the same columns in a different order in another.
predictors = [column for column in train_data.columns if column not in non_predictor_columns]
outcome = ['action']

In [12]:
# Feature matrices: the predictor columns of each split.
X_train = train_set[predictors]
X_validation = validation_set[predictors]
X_test = test_set[predictors]

# Targets: `outcome` is a one-element list, so each of these is a
# single-column DataFrame rather than a Series.
y_train = train_set[outcome]
y_validation = validation_set[outcome]
y_test = test_set[outcome]

## Loading the models

In [None]:
# Load the previously saved models from the current working directory.
# NOTE(review): both names are loaded from the same file,
# 'final_lgbm_model.pkl', so `lstm` is a second copy of whatever that file
# holds rather than a separately trained model. TODO: confirm the intended
# path of the LSTM artifact.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
lgbm = joblib.load('final_lgbm_model.pkl')
lstm = joblib.load('final_lgbm_model.pkl')

## Making predictions on the validation set

In [16]:
# Predicted probabilities for every validation row from each loaded model.
# Later cells read column 1 of each result, so both outputs are expected to be
# 2-D with at least two columns.
# NOTE(review): assumes `lstm` exposes a scikit-learn style predict_proba -- confirm.
y_pred_lgbm = lgbm.predict_proba(X_validation)
y_pred_lstm = lstm.predict_proba(X_validation)

In [None]:
# Store column 1 of each model's predict_proba output next to the validation rows.
# validation_set was produced by filtering train_data, so assigning new columns
# to it in place triggers pandas' SettingWithCopyWarning. DataFrame.assign
# builds a new frame instead, and rebinding the name keeps the cell safe to
# re-run.
validation_set = validation_set.assign(
    lgbm_predicted_prob=y_pred_lgbm[:, 1],
    lstm_predicted_prob=y_pred_lstm[:, 1],
)

## Showing how the utility changes as a function of the weight assigned to each model's predictions

In [None]:
def utility_score_last(date, weight, resp, action):
    '''
    Compute the Jane Street utility score, u, for a set of trades.

    Parameters
    ----------
    date : 1-d array-like of non-negative ints (one entry per trade)
    weight : 1-d array-like of floats >= 0
    resp : 1-d array-like of floats
    action : 1-d array-like of 0/1 values

    All four inputs must have the same length. pandas Series, numpy arrays and
    lists are accepted; values are combined by position.

    Returns
    -------
    float
        u = clip(t, 0, 6) * sum(P_i), where P_i is the per-date sum of
        weight * resp * action and
        t = sum(P_i) / sqrt(sum(P_i ** 2)) * sqrt(250 / number of distinct dates).
        Returns 0.0 when there are no rows or when every P_i is zero (e.g. no
        trade is taken), instead of propagating a 0/0 NaN.
    '''
    date = np.asarray(date)
    if date.size == 0:
        return 0.0

    per_trade_return = np.asarray(weight) * np.asarray(resp) * np.asarray(action)

    count_i = np.unique(date).size  # Number of distinct dates
    # bincount sums the per-trade returns for each date value; dates that do not
    # occur contribute a zero entry, which affects neither sum below.
    P_i = np.bincount(date, weights=per_trade_return)

    total = np.sum(P_i)
    norm = np.sqrt(np.sum(P_i ** 2))
    if norm == 0:
        return 0.0

    t = total / norm * np.sqrt(250 / count_i)
    return float(np.clip(t, 0, 6) * total)


def get_utility_from_df(df, lgbm_weight=0.5):
    '''
    Blend the two models' predicted probabilities and score the resulting actions.

    Parameters
    ----------
    df : DataFrame with columns date, weight, resp, lgbm_predicted_prob and
        lstm_predicted_prob.
    lgbm_weight : float, weight given to lgbm_predicted_prob; the remaining
        (1 - lgbm_weight) goes to lstm_predicted_prob.

    Returns
    -------
    float
        Utility score of the actions obtained by rounding the blended
        probability of each row to 0 or 1.
    '''
    weighted_predictions = (lgbm_weight * df.lgbm_predicted_prob
                            + (1 - lgbm_weight) * df.lstm_predicted_prob)
    # Round element-wise: int() on a multi-row Series raises TypeError.
    # np.round rounds halves to the nearest even value, so exactly 0.5 becomes 0.
    actions = np.round(weighted_predictions).astype(int)
    return utility_score_last(df.date, df.weight, df.resp, actions)

In [None]:
# Candidate blend weights: 200 evenly spaced values from 0 to 1 inclusive.
lgbm_weights = np.linspace(0, 1, num=200)

# Validation-set utility for each candidate weight, in the same order.
utility_by_weight = list(
    map(lambda candidate: get_utility_from_df(validation_set, candidate), lgbm_weights)
)

In [None]:
# Weight with the highest validation utility (first one on ties).
# np.nanargmax skips NaN entries; the built-in max() gives position-dependent
# results when a NaN utility is present (e.g. from a 0/0 in the t statistic),
# which could silently select the wrong weight.
# Raises ValueError if every utility is NaN.
best_lgbm_weights = lgbm_weights[int(np.nanargmax(utility_by_weight))]

In [None]:
# Visualize validation utility as a function of the weight given to LGBM.
# The legend uses utility_by_weight / best_lgbm_weights, the names this
# notebook actually defines; the previous utility_by_threshold and
# best_threshold do not exist here and raised NameError.
fig, ax = plt.subplots(figsize=(16, 8))
ax.plot(lgbm_weights, utility_by_weight,
        label="Best utility: %s" % np.round(np.nanmax(utility_by_weight), 3))
ax.axvline(best_lgbm_weights,
           color="red", linestyle="--",
           label="Best lgbm weight: %s" % np.round(best_lgbm_weights, 3))
ax.set_title("Validation utility by LGBM weight")
ax.set_ylabel("Utility")
ax.set_xlabel("Weight assigned to LGBM")
ax.legend(loc=4)
plt.show()

In [None]:
# Scatter of LGBM predicted probability against resp on the validation set.
# np.corrcoef(...)[0, 1] is the Pearson correlation coefficient r, not r**2,
# so the legend is labelled accordingly.
prob_resp_corr = np.corrcoef(validation_set.lgbm_predicted_prob, validation_set.resp)[0, 1]

fig, ax = plt.subplots()
ax.scatter(validation_set.lgbm_predicted_prob, validation_set.resp,
           label="Pearson r: {}".format(prob_resp_corr))
ax.set_title("LGBM predicted probability vs. resp (validation)")
ax.set_xlabel("Predicted probability")
ax.set_ylabel("resp")

ax.legend(loc=0)
plt.show()

In [None]:
# Scatter of LGBM predicted probability against weight on the validation set.
# The correlation in the legend was already computed against `weight`, while
# the points and y-label showed `resp` (duplicating the previous figure); the
# plotted variable and label now match the statistic.
# np.corrcoef(...)[0, 1] is the Pearson correlation coefficient r, not r**2.
prob_weight_corr = np.corrcoef(validation_set.lgbm_predicted_prob, validation_set.weight)[0, 1]

fig, ax = plt.subplots()
ax.scatter(validation_set.lgbm_predicted_prob, validation_set.weight,
           label="Pearson r: {}".format(prob_weight_corr))
ax.set_title("LGBM predicted probability vs. weight (validation)")
ax.set_xlabel("Predicted probability")
ax.set_ylabel("weight")

ax.legend(loc=0)
plt.show()

# Train best model on entire train + validation dataset before running on test

In [None]:
# Stack the train rows on top of the validation rows (row-wise concatenation).
train_and_validation_set = pd.concat((train_set, validation_set), axis="index")

In [None]:
# Build the final LightGBM classifier from the tuned hyperparameters.
# Integer-valued settings are rounded and cast because `best_params` may hold
# them as floats.
# NOTE(review): `best_params` is not defined anywhere in the visible notebook,
# so this cell fails on a fresh kernel. The saved repr further down shows the
# values that were used: learning_rate=0.06157343657751557, max_bin=164,
# max_depth=6, min_data_in_leaf=10, num_leaves=191 -- confirm and define
# `best_params` before this cell.
final_lgbm_model = lgb.LGBMClassifier(max_depth=int(round(best_params["max_depth"])),
                                            learning_rate = best_params["learning_rate"],
                                            num_leaves = int(round(best_params["num_leaves"])),
                                            min_data_in_leaf = int(round(best_params["min_data_in_leaf"])),
                                            max_bin=int(round(best_params["max_bin"])),
                                            objective = 'binary', 
                                            boosting= 'gbdt',
                                            nthread=10,
                                            seed = 42,
                                            verbose = -1)

In [None]:
# Fit on the combined train + validation rows.
# The first argument previously read `ttrain_and_validation_setrain_data`, a
# garbled name that raises NameError; it is the predictor columns of
# train_and_validation_set.
# ravel() flattens the single-column outcome frame into the 1-d label array,
# equivalent to the previous reshape(len(...),).
final_lgbm_model.fit(train_and_validation_set[predictors],
                     train_and_validation_set[outcome].values.ravel(),
                     verbose=True)

### Save model as pickle file to avoid having to retrain

In [131]:
# joblib is already imported in the notebook's first cell; the import here is
# kept so this cell also runs on its own.
import joblib
# save model
# NOTE(review): the file name written here ('final_lgbm_model1.pkl') differs
# from the 'final_lgbm_model.pkl' that the model-loading cell reads -- confirm
# which artifact is the intended one.
joblib.dump(final_lgbm_model, 'final_lgbm_model1.pkl')
# load model
# Reading the file straight back gives a second object, gbm_pickle, restored
# from disk.
gbm_pickle = joblib.load('final_lgbm_model1.pkl')

In [132]:
gbm_pickle

LGBMClassifier(boosting='gbdt', learning_rate=0.06157343657751557, max_bin=164,
               max_depth=6, min_data_in_leaf=10, nthread=10, num_leaves=191,
               objective='binary', seed=42, verbose=-1)

# Compute test set utility

In [None]:
# Predicted probabilities for the held-out test rows from the refitted model.
# The next cell reads column 1 of this array.
y_preds = final_lgbm_model.predict_proba(X_test)

In [None]:
# Attach the predicted probabilities to the test rows.
# test_set was produced by filtering train_data, so in-place column assignment
# triggers pandas' SettingWithCopyWarning; DataFrame.assign builds a new frame
# and rebinding the name keeps the cell safe to re-run.
# get_utility_from_df reads both lgbm_predicted_prob and lstm_predicted_prob,
# so the second model's test-set probabilities are stored too, mirroring what
# is done for the validation set.
test_set = test_set.assign(
    lgbm_predicted_prob=y_preds[:, 1],
    lstm_predicted_prob=lstm.predict_proba(X_test)[:, 1],
)

In [None]:
# Test-set utility using the blend weight selected on the validation set.
# The second argument of get_utility_from_df is the LGBM weight; the previous
# `best_threshold` is not defined in this notebook and raised NameError.
get_utility_from_df(test_set, best_lgbm_weights)

# Abandoned approach using TimeSeriesSplit:
Abandoned because it was not computationally feasible: it exhausted my computer's RAM when we tried to use it for hyperparameter selection.

In [10]:
from sklearn.model_selection import TimeSeriesSplit

# Number of rows in the fold / holdout part of each TimeSeriesSplit split
X_train = train_set[predictors]
y_train = train_set[outcome]

ts = TimeSeriesSplit(n_splits=4)
for fold_rows, holdout_rows in ts.split(X_train, y_train):
    print(len(fold_rows), len(holdout_rows))

280617 280613
561230 280613
841843 280613
1122456 280613


In [11]:
# Number of distinct dates in the fold / holdout part of each TimeSeriesSplit split

ts = TimeSeriesSplit(n_splits=4)
for fold_index, holdout_index in ts.split(X_train, y_train):
    n_fold_dates = X_train.iloc[fold_index]["date"].nunique()
    n_holdout_dates = X_train.iloc[holdout_index]["date"].nunique()
    print('Train: {}, Test: {}'.format(n_fold_dates, n_holdout_dates))

Train: 44, Test: 53
Train: 96, Test: 75
Train: 170, Test: 68
Train: 237, Test: 64


In [None]:
import gc

# Time-series cross-validation of a LightGBM model on the training split.
# For every fold it trains with early stopping, records feature importances,
# stores rounded out-of-fold predictions, reports the fold accuracy, and adds
# the fold's predictions on validation_set to a running average.
#
# NOTE: `params` must exist before this cell runs; in this notebook it is
# defined in the cell that follows this one, so run that cell first.

n_folds = 4
folds = TimeSeriesSplit(n_splits=n_folds)

# Select the predictor / outcome columns once instead of re-slicing the full
# training frame inside every fold.
X_all = train_set[predictors]
y_all = train_set[outcome]

splits = folds.split(X_all, y_all)

y_preds = np.zeros(validation_set.shape[0])
y_oof = np.zeros(train_set.shape[0])
mean_score = []

feature_importances = pd.DataFrame()
feature_importances['feature'] = predictors

for fold_n, (train_index, valid_index) in enumerate(splits):

    print('Fold:', fold_n+1)

    X_train, X_valid = X_all.iloc[train_index], X_all.iloc[valid_index]
    y_train, y_valid = y_all.iloc[train_index], y_all.iloc[valid_index]

    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_valid, label=y_valid)

    model = lgb.train(params, d_train, 2500, valid_sets = [d_train, d_valid], early_stopping_rounds = 50, verbose_eval=100)

    feature_importances[f'fold_{fold_n + 1}'] = model.feature_importance()

    # Rounded predictions, i.e. hard 0/1 labels for the holdout rows
    y_pred_valid = np.round(model.predict(X_valid, num_iteration=model.best_iteration))

    y_oof[valid_index] = y_pred_valid

    # This is accuracy on the rounded labels, not AUC, so it is reported as
    # accuracy. Arguments follow scikit-learn's (y_true, y_pred) order.
    val_score = accuracy_score(y_valid, y_pred_valid)

    print(f'val accuracy score is {val_score}')

    mean_score.append(val_score)

    # Average the folds' predictions on the separate validation set
    y_preds += model.predict(validation_set[predictors], num_iteration=model.best_iteration) / n_folds

    # Free the per-fold frames before the next iteration
    del X_train, X_valid, y_train, y_valid
    gc.collect()

print('Mean accuracy over folds is', np.mean(mean_score))

# Release the column selections made for this cell
del X_all, y_all
gc.collect()

In [None]:
# LightGBM training parameters, passed to lgb.train as `params`.
# The 'binary' objective is the one the original note describes as
# minimizing logloss.
params = dict(
    learning_rate=0.01,
    boosting='gbdt',
    objective='binary',
    num_leaves=200,
    min_data_in_leaf=10,
    max_bin=200,
    max_depth=6,
    seed=2018,
    nthread=10,
)