## First Edition: XGBoost for Jane Street data

In [None]:
# conda install -c nvidia -c rapidsai -c numba -c conda-forge -c defaults cudf


In [None]:
# load pkgs
import gc
import itertools
import os
import shutil
import sys
import warnings
# warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
# import cudf  # turn GPU to use
import psutil
from tqdm.notebook import tqdm

# Machine Learning pkgs
import datatable as dt
import optuna
import sklearn.metrics
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# plotting stuff
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from pandas.plotting import lag_plot

colorMap = sns.light_palette("blue", as_cmap=True)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Walk the read-only input tree lazily and print the full path of every file found
kaggle_inputs = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in kaggle_inputs:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Data description

- This dataset contains an anonymized set of features, feature_{0...129}, representing real stock market data. Each row in the dataset represents a trading opportunity, for which you will be predicting an action value: 1 to make the trade and 0 to pass on it. 
- Each trade has an associated weight and resp, which together represent a return on the trade. 
- The date column is an integer which represents the day of the trade, while ts_id represents a time ordering. 
- In addition to anonymized feature values, you are provided with metadata about the features in features.csv.
- In the training set, train.csv, you are provided a resp value, as well as several other resp_{1,2,3,4} values that represent returns over different time horizons. These variables are not included in the test set. Trades with weight = 0 were intentionally included in the dataset for completeness, although such trades will not contribute towards the scoring evaluation.

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Return a copy of ``df`` whose numeric columns use narrower dtypes.

    Columns of dtype int16/int32/int64/float16/float32/float64 are cast to the
    narrowest type of the same family (int8..int64, float16..float64) whose
    representable range strictly contains the column's min and max. Every
    other column is passed through unchanged.

    Note that float columns may end up as float16: the range check protects
    against overflow only, not against loss of precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is not modified.
    verbose : bool, default True
        When true, print the resulting memory usage and the reduction.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and possibly smaller dtypes.
    """
    numerics = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    start_mem = df.memory_usage().sum() / 1024**2

    def helper(dfCol):
        # Compare by dtype *name*: set membership is hash-based, and a dtype
        # object does not hash like its string name, so testing the dtype
        # object itself against `numerics` would never match.
        colType = str(dfCol.dtype)
        if colType not in numerics:
            return dfCol

        cMin = dfCol.min()
        cMax = dfCol.max()
        if colType.startswith('int'):
            candidates, limits, fallback = (np.int8, np.int16, np.int32, np.int64), np.iinfo, None
        else:
            candidates, limits, fallback = (np.float16, np.float32), np.finfo, np.float64

        # First (narrowest) candidate whose open range contains the column wins
        for target in candidates:
            bounds = limits(target)
            if cMin > bounds.min and cMax < bounds.max:
                return dfCol.astype(target)
        # Integers that fit nowhere stay as they are; floats fall back to float64
        return dfCol if fallback is None else dfCol.astype(fallback)

    # Operate on the argument, not on a notebook-level global
    df = df.apply(helper, axis=0)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Avoid dividing by zero for a frame that reports no memory at all
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
%%time

# load data
# All three files live in the same competition folder; resolve them from one
# absolute directory instead of mixing absolute and cwd-relative paths.
DATA_DIR = '/kaggle/input/jane-street-market-prediction'

train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
meta_features = pd.read_csv(os.path.join(DATA_DIR, 'features.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'example_test.csv'))

# reduce memory
# train  = reduce_mem_usage(pd.read_csv('/kaggle/input/jane-street-market-prediction/train.csv'))
# meta_features = reduce_mem_usage(pd.read_csv('../input/jane-street-market-prediction/features.csv'))

print(f'train data shape is {train.shape}.')
# print(f'test data shape is {test.shape}.')
print(f'features shape is {meta_features.shape}.')
# Last expression of the cell: show the first rows of train
train.head()

In [None]:
# print(psutil.virtual_memory().percent)

In [None]:
# train.describe()

In [None]:
# print(train.columns)  #date+weight+resp1-4+resp+features0-129+ts-id = 138 meta features
# print()

# # check unique tsid
# if len(train.ts_id.unique()) == 2390491:
#     print('Each ts_id is unique')

### Data Cleaning

1. Ignore all the rows with weight = 0 since trades with weight = 0 were intentionally included in the dataset for completeness, although such trades will not contribute towards the scoring evaluation.
2. Sort the data based on date and ts_id order (time series property)
3. Check the distribution of missing values for features 0 - 129 to gain insight into how to fill them
4. Correlation analysis on resp features? (potential to drop some features?)
5. Check imbalance data (for binary classification problem)
6. Generating 0 or 1 values on the basis of resp features and storing it to 'action' column
7. date > 80

### fill N/A per column

In [None]:
# def get_feature_mean(df):
#     """ 
#     """
#     feature_list = []
#     for column in df.columns:
#         feature_median = train[column].mean(axis=1)
#         df[column].fillna(feature_median)
#         feature_list.append(feature_median)
#     return df, feature_list

# filled_train = get_feature_mean()

# featrue_mean = train.apply(lambda x: pd.DataFrame.mean(x, skipna=True), axis=0)

In [None]:
# Column-wise mean of train, shown as the cell output
train.mean()

In [None]:
# for col_name, col_mean in zip(train.columns, featrue_mean):
#     train[col_name].fillna(col_mean)
# Replace NaNs in place with the mean of their own column.
# NOTE(review): this runs before the missing-value summary and the median fill
# in the cells below, so those presumably no longer see any NaNs - confirm
# that this ordering is intended.
train.fillna(train.mean(), inplace=True)

In [None]:
# Keep only the rows with a strictly positive weight and renumber the index
has_weight = train['weight'] > 0
train = train[has_weight].reset_index(drop=True)
print(f'Shape of weight=0 removed train dataset: {train.shape}')

# Order the rows by day first, then by ts_id within the day
train = train.sort_values(by=['date', 'ts_id'])

In [None]:
# Summarize the missing values per column
# Count NaNs once: a full isna() scan of the training frame is costly to repeat
nan_counts = train.isna().sum()
features_w_missing = list(train.columns[nan_counts > 0])
features_wo_mssing = list(train.columns[nan_counts == 0])
print(f'Number of features with missing value = {len(features_w_missing)}')

N_FEATURES = len(features_w_missing)
# Columns that have at least one NaN, most affected first
nan_val = nan_counts[nan_counts > 0].sort_values(ascending=False)
# print(nan_val)

if N_FEATURES:
    fig, axs = plt.subplots(figsize=(20, 18))
    sns.barplot(y=nan_val.index[0:N_FEATURES], x=nan_val.values[0:N_FEATURES], alpha=0.7)
    plt.title(f'Missing values of train dataset (Top {N_FEATURES} features)')
    plt.xlabel('Number of Missing values')
    plt.show()
else:
    # Nothing to draw: avoid handing an empty series to the bar plot
    print('No missing values found in train dataset; skipping the missing-value plot.')

In [None]:
# Break the train df into parts (w missing values, 4 levels) and wo missing values

# Count NaNs per column once instead of rescanning the frame for every threshold
nan_counts = train.isna().sum()

features_w_missing_g0_list = list(train.columns[nan_counts >= 100000])
print(features_w_missing_g0_list)
features_w_missing_g1_list = list(train.columns[(nan_counts >= 40000) & (nan_counts < 100000)])
print(features_w_missing_g1_list)
features_w_missing_g2_list = list(train.columns[(nan_counts >= 3000) & (nan_counts < 40000)])
print(features_w_missing_g2_list)
features_w_missing_g3_list = list(train.columns[(nan_counts < 3000) & (nan_counts > 0)])
print(features_w_missing_g3_list)

# Total number of columns that fall into one of the four groups
print(len(features_w_missing_g0_list)+len(features_w_missing_g1_list)+len(features_w_missing_g2_list)+len(features_w_missing_g3_list))

# Select the columns by label. `train.columns & some_list` depended on the `&`
# operator of an Index acting as a set intersection, which pandas deprecated
# and later removed; the lists above come straight from train.columns, so they
# can be used as selectors directly.
train_w_missing_g0 = train[features_w_missing_g0_list]
print(f'shape of group 0 is {train_w_missing_g0.shape}.')
train_w_missing_g1 = train[features_w_missing_g1_list]
print(f'shape of group 1 is {train_w_missing_g1.shape}.')
train_w_missing_g2 = train[features_w_missing_g2_list]
print(f'shape of group 2 is {train_w_missing_g2.shape}.')
train_w_missing_g3 = train[features_w_missing_g3_list]
print(f'shape of group 3 is {train_w_missing_g3.shape}.')

# features_wo_mssing is built in another cell, so keep true intersection
# semantics here: labels that are no longer in train are ignored, not an error.
train_wo_missing = train[train.columns.intersection(features_wo_mssing)]
print(f'shape of wo-missing values data is {train_wo_missing.shape}.')

In [None]:
# Check the distribution of data df group 0
# (itertools.cycle below needs `import itertools` in the notebook's import cell.)

n_grid_cols = 5
# Keep the 3-row layout, but add rows when the group holds more than 3 x 5
# columns so that no feature is ever indexed past the end of the grid.
n_grid_rows = max(3, -(-train_w_missing_g0.shape[1] // n_grid_cols))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(20, 15))
palette = itertools.cycle(sns.color_palette())

# axes.flat walks the grid row by row, i.e. feature i lands on axes[i // 5, i % 5]
for ax, column in zip(axes.flat, train_w_missing_g0.columns):
    sns.distplot(train_w_missing_g0[column], ax=ax, color=next(palette))

# axes.set_title('Group 0 Distribution', size=12)

- QQ Plot

In [None]:
# Check the distribution of data df group 1
# (itertools.cycle below needs `import itertools` in the notebook's import cell.)

n_grid_cols = 6
# Keep the 3-row layout, but add rows when the group holds more than 3 x 6
# columns so that no feature is ever indexed past the end of the grid.
n_grid_rows = max(3, -(-train_w_missing_g1.shape[1] // n_grid_cols))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(20, 15))
palette = itertools.cycle(sns.color_palette())

# axes.flat walks the grid row by row, i.e. feature i lands on axes[i // 6, i % 6]
for ax, column in zip(axes.flat, train_w_missing_g1.columns):
    sns.distplot(train_w_missing_g1[column], ax=ax, color=next(palette))
# axes.set_title('Group 1 Distribution', size=12)

In [None]:
# Check the distribution of data df group 2
# (itertools.cycle below needs `import itertools` in the notebook's import cell.)

n_grid_cols = 8
# Keep the 4-row layout, but add rows when the group holds more than 4 x 8
# columns so that no feature is ever indexed past the end of the grid.
n_grid_rows = max(4, -(-train_w_missing_g2.shape[1] // n_grid_cols))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(25, 20))
palette = itertools.cycle(sns.color_palette())

# axes.flat walks the grid row by row, i.e. feature i lands on axes[i // 8, i % 8]
for ax, column in zip(axes.flat, train_w_missing_g2.columns):
    sns.distplot(train_w_missing_g2[column], ax=ax, color=next(palette))

In [None]:
# Check the distribution of data df group 3
# (itertools.cycle below needs `import itertools` in the notebook's import cell.)

n_grid_cols = 5
# Keep the 5-row layout, but add rows when the group holds more than 5 x 5
# columns so that no feature is ever indexed past the end of the grid.
n_grid_rows = max(5, -(-train_w_missing_g3.shape[1] // n_grid_cols))
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, figsize=(20, 20))
palette = itertools.cycle(sns.color_palette())

# axes.flat walks the grid row by row, i.e. feature i lands on axes[i // 5, i % 5]
for ax, column in zip(axes.flat, train_w_missing_g3.columns):
    sns.distplot(train_w_missing_g3[column], ax=ax, color=next(palette))

In [None]:
# Based on the plots above, a simple way to deal with missing values is to
# fill each feature with its own median.

# Per-column medians of train, kept in a notebook-level variable
MEDIAN = train.median()  # group var
# NOTE(review): x_train is rebound by train_test_split in a later cell, and
# train was already mean-filled in an earlier cell, so this median-filled copy
# looks unused - confirm whether `train` itself was meant to be filled here.
x_train = train.fillna(MEDIAN)

- label and simulate the outliers 

### Creating Train and Test DataFrame

In [None]:
# Binary target stored in the 'action' column:
# 1 where resp is strictly positive, 0 otherwise
is_profitable = train['resp'].gt(0)
train['action'] = is_profitable.astype('int')
train.head()

- A transaction is considered successful if its resp is greater than 0. This criterion should be justified.

In [None]:
# Inputs are the columns whose name contains 'feature'; the target is 'action'
feature_mask = train.columns.str.contains('feature')
X = train.loc[:, feature_mask]
y = train['action']

del train  # free some space 

# Random 80/20 split of X and y into training and validation parts (fixed seed)
x_train, x_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Filling missing values this way is one possible approach, but there may be better ones. We should consider the distribution of each feature, and possibly the correlations between features. For example, could regression be used for features that are skewed? If a feature is close to Gaussian, the mean looks like a reasonable fill value. This needs discussion.

### XGBoost 

In [None]:
# Wrap both splits in XGBoost's DMatrix container, attaching the labels to each
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dvalid = xgb.DMatrix(data=x_valid, label=y_valid)

In [None]:
# check imbalance of responses

sns.set_palette("hls")
# Share of each action value in the training split, computed once
action_share = y_train.value_counts() / len(y_train)
# x and y are passed by keyword (newer seaborn releases reject them as
# positional arguments) and as plain values so that the label index of the
# counts cannot interfere with how the bars are paired up.
ax = sns.barplot(x=action_share.index, y=action_share.values)
ax.set_title("Proportion of trades with action=0 and action=1")
ax.set_ylabel("Percentage")
ax.set_xlabel("Action")
sns.despine();

In [None]:
def objective(trial):
    """Optuna objective: train one XGBoost booster and score it on validation data.

    Reads the notebook-level ``dtrain``, ``dvalid`` and ``y_valid``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy of the rounded predictions on the validation split; higher
        is better, so the study running this objective should maximise it.
    """
    # The notebook only does `from sklearn.<module> import ...`, which never
    # binds the name `sklearn`; import the metric explicitly.
    from sklearn.metrics import accuracy_score

    # The native xgb.train API takes the number of trees as num_boost_round.
    # Left inside `params`, 'n_estimators' is not used by xgb.train and the
    # default number of rounds is trained instead, so sample it separately.
    n_estimators = trial.suggest_int('n_estimators', 400, 600)

    # params specifies the XGBoost hyperparameters to be tuned
    params = {
        'max_depth': trial.suggest_int('max_depth', 10, 20),
        'learning_rate': trial.suggest_uniform('learning_rate', 0.01, .1),
        'subsample': trial.suggest_uniform('subsample', 0.50, 1),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.50, 1),
        'gamma': trial.suggest_int('gamma', 0, 10),
        'tree_method': 'gpu_hist',
        'objective': 'binary:logistic'
    }

    bst = xgb.train(params, dtrain, num_boost_round=n_estimators)
    # binary:logistic yields probabilities; round them to 0/1 labels
    preds = bst.predict(dvalid)
    pred_labels = np.rint(preds)

    # trials are evaluated based on their accuracy on the validation split
    return accuracy_score(y_valid, pred_labels)

In [None]:
# The objective returns an accuracy, so the study has to maximise it;
# create_study() without a direction minimises and would keep the worst trial.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)

print('Best trial: score {}, params {}'.format(study.best_trial.value, study.best_trial.params))

In [None]:
# Start from the tuned values and pin the two settings that were held fixed
# during the search (gpu_hist is really fast)
best_params = dict(study.best_trial.params)
best_params.update(tree_method='gpu_hist', objective='binary:logistic')

# free some space
del x_train, x_valid, y_train, y_valid, dtrain, dvalid

In [None]:
# Build the XGBoost classifier from the best hyperparameters found by the study
# (best_params is unpacked into keyword arguments; fitting happens in the next cell)
clf = xgb.XGBClassifier(**best_params)

In [None]:
%time clf.fit(X, y)  # timed fit on all of X and y, not only on the x_train split

### Fitting classifier on test data


In [None]:
# NOTE: this rebinds `tqdm`, replacing the tqdm.notebook variant imported in the first cell
from tqdm import tqdm
import janestreet
env = janestreet.make_env() # initialize the environment
iter_test = env.iter_test() # an iterator which loops over the test set

In [None]:
for test_df, pred_df in tqdm(iter_test):
    # Default to passing on the trade; only rows with a positive weight are scored by the model
    chosen_action = 0
    if test_df['weight'].item() > 0:
        # Same column selection as for training: names containing 'feature'
        is_feature = test_df.columns.str.contains('feature')
        chosen_action = clf.predict(test_df.loc[:, is_feature])
    pred_df.action = chosen_action
    # Hand the prediction for this batch back to the environment
    env.predict(pred_df)