
# Jane Street Market Prediction: A Complete EDA

This is a simple exploratory data analysis (EDA) of the files provided for the [Jane Street Market Prediction](https://www.kaggle.com/c/jane-street-market-prediction) time series competition.

## Import necessary libraries

In [None]:
!pip install dabl
!pip install datatable

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import datatable as dt

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


import plotly.express as px
import plotly.graph_objects as go
colorMap = sns.light_palette("blue", as_cmap=True)


import dabl
import warnings
warnings.filterwarnings('ignore')

colorMap = sns.light_palette("blue", as_cmap=True)

import missingno as msno

In [None]:
!wc -l ../input/jane-street-market-prediction/train.csv

In [None]:
%%time

# Read the training CSV with datatable's fread and convert the result to a
# pandas DataFrame; the %%time cell magic reports how long the cell took.
train_data = dt.fread('../input/jane-street-market-prediction/train.csv').to_pandas()

# Save memory !!! 🚀🚀🚀 Compute faster 🚀🚀🚀

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    Each column is handled according to its dtype:

    * ``object`` columns are converted to ``category``;
    * signed / unsigned integer columns are cast to the narrowest signed /
      unsigned integer type whose range contains the column's min and max;
    * float columns are cast to the narrowest allowed float type whose range
      contains the column's min and max (NaN values are ignored when the
      range is measured);
    * every other column (bool, datetime, category, nullable extension
      dtypes, ...) is left untouched.

    A column is never widened: if no narrower type fits (for example because
    the column contains ``inf`` or is entirely NaN) it keeps its dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        When True, float columns may be stored as ``float16``. That type has
        only about three significant decimal digits, so pass False to stop at
        ``float32`` when precision matters.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first.
    signed_types = (np.int8, np.int16, np.int32)
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Checking the
        # dtype kind (instead of the dtype's name prefix) keeps bool and
        # unsigned columns from being turned into floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        if col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        elif col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        else:
            candidates, limits = unsigned_types, np.iinfo

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Stop as soon as the candidate would not be smaller.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            info = limits(candidate)
            # Inclusive bounds: a value equal to the type's limit still fits.
            # Comparisons with NaN are False, so empty/all-NaN columns are kept.
            if c_min >= info.min and c_max <= info.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Downcast the column dtypes of the loaded frame and rebind the result.
train_data = reduce_mem_usage(train_data)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# Report how many distinct values the 'date' column takes.
print(" Total days avaiallbe in dataset:- ", train_data['date'].unique().size)

# EDA starts 

In [None]:
# Number of rows (counted via ts_id) per date, with the count column named txn_count.
days_txn = (
    train_data.groupby(['date'])
    .agg({'ts_id': 'count'})
    .reset_index()
    .rename(columns={'ts_id': 'txn_count'})
)

In [None]:
fig, ax = plt.subplots(figsize=(25, 10))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally. ax=ax draws explicitly on the axes created above.
sns.lineplot(x=days_txn['date'], y=days_txn['txn_count'], ax=ax)
ax.set_xlabel("Days", fontsize=18)
ax.set_ylabel("Transaction count", fontsize=18);

# Sum of Weights across dates

In [None]:
# Total weight per date.
days_txn_wght = train_data.groupby(['date']).agg({'weight':'sum'}).reset_index()
fig, ax = plt.subplots(figsize=(25, 10))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally. ax=ax draws explicitly on the axes created above.
sns.lineplot(x=days_txn_wght['date'], y=days_txn_wght['weight'], ax=ax)
ax.set_xlabel("Days", fontsize=18)
ax.set_ylabel("Sum of weight", fontsize=18);

# Mean of Weights across dates

In [None]:
# Average weight per date.
days_txn_wght = train_data.groupby(['date']).agg({'weight':'mean'}).reset_index()
fig, ax = plt.subplots(figsize=(25, 10))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them
# positionally. ax=ax draws explicitly on the axes created above.
sns.lineplot(x=days_txn_wght['date'], y=days_txn_wght['weight'], ax=ax)
ax.set_xlabel("Days", fontsize=18)
ax.set_ylabel("Mean of weight", fontsize=18);

### There is a strong trend and seasonality in the trade data

In [None]:
# Share of rows whose weight is exactly zero, as a percentage of all rows.
# The %i format truncates the value to a whole number.
percent_zeros = (100 / len(train_data)) * (train_data['weight'].to_numpy() == 0).sum()
print('Percentage of zero weights is: %i' % percent_zeros + "%")

## Weight

> *Each trade has an associated `weight` and `resp`, which together represent a return on the trade.
Trades with `weight = 0` were intentionally included in the dataset for completeness, although such trades will not contribute towards the scoring evaluation. So we can ignore roughly 17% of the data when modelling.*

In [None]:
# Largest value in the 'weight' column, printed with two decimals.
max_weight = train_data['weight'].max()
print('The maximum weight was: %.2f' % max_weight)

In [None]:
# Show every row whose weight equals the maximum found above.
train_data.loc[train_data['weight'] == max_weight]

### The trade with the maximum weight happens on day 446

## Features
> "*This dataset contains an anonymized set of features, `feature_{0...129}`, representing real stock market data.*"

However, `feature_0` seems to me to be a little unusual, as it is composed solely of the integers `+1` or `-1`:

## Curious nature of `feature_0`

In [None]:
# Count how often each distinct value of feature_0 occurs.
train_data['feature_0'].value_counts()

In [None]:
fig, ax = plt.subplots(figsize=(8, 3))
ax.set_xlabel("Trade", fontsize=18)
ax.set_ylabel("feature_0 (cumulative)", fontsize=18)
# Running total of feature_0 over the rows, drawn on the axes created above.
feature_0 = train_data['feature_0'].cumsum()
feature_0.plot(lw=3, ax=ax);

## Trade Action

`action`: 1 to make the trade and 0 to pass on it. In view of this, let us add a new binary column called `action` to our training dataset, such that `action=1` if `resp` is positive and `action=0` otherwise.

In [None]:
# 1 where resp is strictly positive, 0 otherwise (boolean mask times 1).
train_data['action'] = train_data['resp'].gt(0) * 1

## Let's look at the trade transactions on day 5

In [None]:
def plot_txn_day_cumsum(day):
    """Plot the running totals of ``resp`` and ``resp_1`` .. ``resp_4``.

    ``day`` is any mapping/frame that provides those five columns; each one
    is cumulatively summed and drawn as a line on a new 25x10 figure.
    """
    _, axes = plt.subplots(figsize=(25, 10))
    response_columns = ('resp', 'resp_1', 'resp_2', 'resp_3', 'resp_4')
    cumulative_returns = [
        pd.Series(day[name]).cumsum() for name in response_columns
    ]
    axes.set_xlabel("Trade", fontsize=18)
    axes.set_title("Cumulative return for resp and time horizons 1, 2, 3, and 4", fontsize=18)
    # Each curve is drawn on the current axes, i.e. the ones created above.
    for curve in cumulative_returns:
        curve.plot(lw=3)
    plt.legend(loc="upper left");

### Trade Day - 5

In [None]:
# Cumulative responses for the rows with date == 5.
plot_txn_day_cumsum(train_data.loc[train_data['date'] == 5])

### Trade Day - 0

In [None]:
# Cumulative responses for the rows with date == 0.
plot_txn_day_cumsum(train_data.loc[train_data['date'] == 0])

### Trade Day - 1

In [None]:
# Cumulative responses for the rows with date == 1.
plot_txn_day_cumsum(train_data.loc[train_data['date'] == 1])

In [None]:
# Cumulative responses for the rows with date == 499.
plot_txn_day_cumsum(train_data.loc[train_data['date'] == 499])

### It has been observed that the cumulative sum of the trade response varies from day to day

In [None]:
# missingno matrix plot of the rows with date == 1, to inspect missing values.
msno.matrix(train_data.loc[train_data['date'] == 1])

In [None]:
# Same missingno matrix plot for the rows with date == 100, for comparison.
msno.matrix(train_data.loc[train_data['date'] == 100])

### Observation :- 

 A number of fields have missing values; viewed as a whole, the missing-value pattern looks similar across trading days.

# Pearson Correlation between features

In [None]:
# Pairwise correlation of all columns (pandas' default method), shown as a heatmap.
plt.figure(figsize=(20, 20))
correlation_matrix = train_data.corr()
sns.heatmap(correlation_matrix)

## Observation :- 

* Features 80-117 have high correlation with each other 
* Feature 17-25 & Feature 29-33 have high correlation


# Let's see what the feature distributions look like

In [None]:
# Taken from this notebook: https://www.kaggle.com/blurredmachine/jane-street-market-eda-viz-prediction

# Day to plot and the total number of feature columns.
date = 0
n_features = 130

# feature_1 .. feature_129; the range starts at 1, so feature_0 is left out.
cols = [f'feature_{i}' for i in range(1, n_features)]
# One histogram per feature for the selected day, stepped through with an
# animation slider. NOTE(review): 'variable' is presumably the column name
# plotly assigns when several columns are passed as x -- confirm in the docs.
hist = px.histogram(
    train_data[train_data["date"] == date], 
    x=cols, 
    animation_frame='variable', 
    range_y=[0, 600], 
    range_x=[-7, 7]
)

hist.show()

# Automatic Data Visualisation with Dabl

In [None]:
# Let dabl plot the date == 5 rows against the target column "resp".
dabl.plot(train_data.loc[train_data['date'] == 5], target_col="resp")

# Day - 5 Trade Txn Scatter Plot 

In [None]:
# resp against ts_id for the rows with date == 5, with an OLS trendline and a
# violin plot of resp in the margin. The same date filter is evaluated three
# times: once for the frame and once for each axis.
fig_1 = px.scatter(train_data.loc[train_data['date'] == 5], x=train_data.loc[train_data['date'] == 5]['ts_id'], y=train_data.loc[train_data['date'] == 5]['resp'], 
                   trendline="ols", marginal_y="violin",
                   title=("Scatter plot of resp with respect to ts_id for day 5"))
fig_1.show()

In [None]:
# Same scatter plot for the rows with date == 446; fig_1 is reused, so the
# day-5 figure object is replaced.
fig_1 = px.scatter(train_data.loc[train_data['date'] == 446], x=train_data.loc[train_data['date'] == 446]['ts_id'], y=train_data.loc[train_data['date'] == 446]['resp'], 
                   trendline="ols", marginal_y="violin",
                   title=("Scatter plot of resp with respect to ts_id for day 446"))
fig_1.show()

####  Observation - Trade response is above zero for Day-446 compared to Day 0 

## Subset data preparation for Feature importance and model explainability

In [None]:
# Feature matrix: the date == 5 rows, restricted to the columns whose name
# contains 'feature'.
X_train = train_data.loc[train_data['date'] == 5].loc[:, train_data.columns.str.contains('feature')]
# Fill missing values with the mean of the corresponding column.
X_train = X_train.fillna(X_train.mean())
# The regression target is the raw response `resp` of the same rows
# (not the binary `action` column).
y_train = train_data.loc[train_data['date'] == 5]['resp']

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# max_features=1.0 lets every split consider all features. That is what the
# former 'auto' setting meant for regressors; 'auto' was deprecated in
# scikit-learn 1.1 and removed in 1.3, where passing it raises an error.
regressor = RandomForestRegressor(max_features=1.0)
regressor.fit(X_train, y_train)

## Feature Importance by PermutationImportance

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

In [None]:
# Wrap the fitted forest in eli5's PermutationImportance and fit it on the
# same X_train / y_train the forest was trained on, so the importances are
# measured on training data rather than on held-out data.
perm_import = PermutationImportance(regressor, random_state=1).fit(X_train, y_train)

### visualize the results - Show top 20 features

In [None]:
# Display the 20 top-ranked features, labelled with the X_train column names.
eli5.show_weights(perm_import, top=20, feature_names = X_train.columns.tolist())

## Observation :- For day 5, the most important features are features 43, 6, 35, 45, 64, etc.

<font color="red" size=5>Please upvote this kernel if you like it. It motivates me to create kernels with great content  :) </font>

## Model Explainability with SHAP

In [None]:
import shap 

# load JS visualization code to notebook
shap.initjs()

# Build a tree explainer for the fitted forest and compute SHAP values for
# every row of X_train.
explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X_train)

# visualize the first prediction's explanation, i.e. row 0 of X_train
# (pass matplotlib=True to avoid Javascript)
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])

# Feature interaction with model output

In [None]:
# Rank the feature indexes by their importance in the model
# (sum of absolute SHAP values over the training rows), most important first.
explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X_train)

top_inds = np.argsort(-np.sum(np.abs(shap_values), axis=0))

# Draw a SHAP dependence plot for every feature, in order of importance.
for feature_index in top_inds:
    shap.dependence_plot(feature_index, shap_values, X_train)

# Conclusion :- 

This notebook covers a detailed trend analysis of the response and weights at day level, and the interaction of the target variable with the independent variables.

# Future work

1. Feature Analysis 
2. Dimensionality reduction
3. Data Imputation

For Model explainability you can visit another kernel for more details 

https://www.kaggle.com/praveengovi/jane-street-model-interpretability-shap
