In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras


In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Training set of the competition, read from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/Deep Learning/Optiver - Trading at the Close/optiver-trading-at-the-close/train.csv'
df_train = pd.read_csv(train_csv_path)

In [None]:
# Show the first rows of the training frame.
df_train.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,time_id,row_id
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,0,0_0_0
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,0,0_0_1
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,0,0_0_2
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,0,0_0_3
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,0,0_0_4


In [None]:
def ana_data(data):
    """Print a quick exploratory summary of a DataFrame.

    Prints, in order and each under its own banner line: the column index,
    the ``DataFrame.info()`` report, ``describe()`` statistics, the number of
    null values per column and the number of unique values per column.

    Args:
        data: The pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    import io  # local import so this cell does not rely on the imports cell

    # DataFrame.info() writes its report to a buffer and returns None, so the
    # report is captured explicitly. Interpolating the bare ``data.info``
    # attribute (without calling it) only prints the bound-method repr.
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)

    print("========================Columns============================")
    print(f'\n{data.columns}\n\n')
    print("========================Info============================")
    print(f'\n{info_buffer.getvalue()}\n\n')
    print("========================Descritive============================")
    print(f'\n{data.describe()}\n\n')
    print("========================Null Value============================")
    print(f'\n{data.isnull().sum()}\n\n')
    print("========================Unique Value============================")
    print(f'\n{data.nunique()}\n\n')

# Print the exploratory summary for the training frame.
ana_data(df_train)


Index(['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_size',
       'imbalance_buy_sell_flag', 'reference_price', 'matched_size',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price',
       'ask_size', 'wap', 'target', 'time_id', 'row_id'],
      dtype='object')



<bound method DataFrame.info of          stock_id  date_id  seconds_in_bucket  imbalance_size  \
0               0        0                  0      3180602.69   
1               1        0                  0       166603.91   
2               2        0                  0       302879.87   
3               3        0                  0     11917682.27   
4               4        0                  0       447549.96   
...           ...      ...                ...             ...   
5237975       195      480                540      2440722.89   
5237976       196      480                540       349510.47   
5237977       197      480                540            0.00   
5237978       198      4

In [None]:
# Impute missing targets with the column mean. The result is assigned back to the
# column instead of calling fillna(inplace=True) on df_train['target']: that chained
# in-place form is deprecated in pandas 2.x and, under copy-on-write, only modifies
# a temporary Series and leaves df_train unchanged.
df_train['target'] = df_train['target'].fillna(df_train['target'].mean())

In [None]:
# Example test file, read from the mounted Drive folder; preview the first rows.
test_csv_path = "/content/drive/MyDrive/Deep Learning/Optiver - Trading at the Close/optiver-trading-at-the-close/example_test_files/test.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,time_id,row_id,currently_scored
0,0,478,0,3753451.43,-1,0.999875,11548975.43,,,0.999875,22940.0,1.00005,9177.6,1.0,26290,478_0_0,False
1,1,478,0,985977.11,-1,1.000245,3850033.97,,,0.99994,1967.9,1.000601,19692.0,1.0,26290,478_0_1,False
2,2,478,0,599128.74,1,1.000584,4359198.25,,,0.999918,4488.22,1.000636,34955.12,1.0,26290,478_0_2,False
3,3,478,0,2872317.54,-1,0.999802,27129551.64,,,0.999705,16082.04,1.000189,10314.0,1.0,26290,478_0_3,False
4,4,478,0,740059.14,-1,0.999886,8880890.78,,,0.99972,19012.35,1.000107,7245.6,1.0,26290,478_0_4,False


In [None]:
# Row count and shape of the example test frame.
len(df_test), df_test.shape

(33000, (33000, 17))

In [None]:
def feature_engineering(X):
    """Return a copy of ``X`` with four derived columns appended.

    Columns are added in this order:
        liquidity_imbalance: (bid_size - ask_size) / (bid_size + ask_size)
        matched_imbalance: (imbalance_size - matched_size) / (matched_size + imbalance_size)
        price_spread: ask_price - bid_price
        market_urgency: price_spread * liquidity_imbalance

    Args:
        X: DataFrame containing the size and price columns named above.

    Returns:
        A new DataFrame; ``X`` itself is not modified.
    """
    enriched = X.copy()

    # features referenced from https://www.kaggle.com/code/zulqarnainali/explained-singel-model-optiver
    ratio_features = (
        ("liquidity_imbalance", "(bid_size-ask_size)/(bid_size+ask_size)"),
        ("matched_imbalance", "(imbalance_size-matched_size)/(matched_size+imbalance_size)"),
    )
    for feature_name, expression in ratio_features:
        enriched[feature_name] = enriched.eval(expression)

    # The spread is computed once and reused for the urgency feature.
    spread = enriched["ask_price"].sub(enriched["bid_price"])
    enriched["price_spread"] = spread
    enriched['market_urgency'] = spread.mul(enriched['liquidity_imbalance'])

    return enriched


In [None]:
def pipeline(dataframe, isDataTrain=False):
    """
    Run the whole preprocessing pipeline on a raw competition frame.

    Steps: drop the identifier columns (plus the target or the
    ``currently_scored`` column), impute missing values with column means,
    add the columns produced by ``feature_engineering`` and scale the result.

    Args:
        dataframe: Raw frame to process. It is not modified.
        isDataTrain: True when ``dataframe`` is the training frame (it must
            contain a ``target`` column); False for a test frame (it must
            contain a ``currently_scored`` column).

    Returns:
        When ``isDataTrain`` is True, the tuple
        ``(X_train, X_test, y_train, y_test)`` of an 80/20 split made with
        ``random_state=42``. Otherwise the scaled feature matrix for the
        whole frame.
    """
    tar_col = 'target' if isDataTrain else 'currently_scored'

    # Drop the columns that are not used as model inputs in a single call.
    drop_cols = ['stock_id', 'date_id', 'time_id', 'row_id', tar_col]
    features = dataframe.drop(columns=drop_cols)

    # Scaling: min-max for seconds_in_bucket, standardisation for
    # imbalance_size and for every column from position 3 onwards. The column
    # at position 2 is passed through unscaled by ``remainder``.
    ct = ColumnTransformer(
        [('minMaxScale', MinMaxScaler(), ['seconds_in_bucket']),
         ('scale1', StandardScaler(), ['imbalance_size']),
         ('scale2', StandardScaler(), slice(3, None))],
        remainder='passthrough'
    )

    if isDataTrain:
        y = dataframe['target'].values

        # Split BEFORE fitting anything: imputation means and scaler
        # statistics are learned from the training part only, so no
        # information from the hold-out rows leaks into training.
        X_train, X_test, y_train, y_test = train_test_split(
            features, y, test_size=0.2, random_state=42
        )

        # fillna returns a new frame; the chained
        # ``frame[col].fillna(..., inplace=True)`` form is deprecated and has
        # no effect under pandas copy-on-write.
        fill_values = X_train.mean(numeric_only=True)
        X_train = feature_engineering(X_train.fillna(fill_values))
        X_test = feature_engineering(X_test.fillna(fill_values))

        return ct.fit_transform(X_train), ct.transform(X_test), y_train, y_test

    # NOTE(review): on this path the imputation means and the scalers are
    # fitted on the frame being transformed, not reused from training, so the
    # scaling can differ from what a model saw during training. Reusing the
    # training statistics would require passing a fitted transformer in.
    features = features.fillna(features.mean(numeric_only=True))
    features = feature_engineering(features)
    return ct.fit_transform(features)


In [None]:
# Preprocess the training frame; isDataTrain=True makes pipeline() return the four split parts.
X_train, X_test, y_train, y_test = pipeline(df_train, True)

In [None]:
# First 2000 rows of the training split (features and matching targets).
X_train_new, y_train_new = X_train[:2000], y_train[:2000]

In [None]:
# y_train.isnull().any()

In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor with default hyper-parameters. fit() returns the fitted
# estimator, so construction and training are chained into one statement.
lgb = LGBMRegressor().fit(X_train, y_train)

# Predictions for the hold-out split.
y_pred = lgb.predict(X_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.617160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3628
[LightGBM] [Info] Number of data points in the train set: 4190384, number of used features: 16
[LightGBM] [Info] Start training from score -0.051737


In [None]:
# Preprocess the example test frame (isDataTrain=False: no target, no split).
# NOTE(review): this rebinds df_test from the raw DataFrame to the value returned
# by pipeline(), so re-running this cell passes that returned value back into
# pipeline() instead of the raw frame.
df_test = pipeline(df_test, False)

In [None]:
# physical_divices = tf.config.list_physical_devices("GPU")
# tf.config.experimental.set_memory_growth(physical_divices[0],True)

In [None]:
# Fully connected regressor: two hidden ReLU layers (32 and 16 units) followed by
# a single linear output unit, trained with Adam on mean squared error.
# The input width is read from the training matrix instead of being hard-coded
# to 16, so the model keeps matching the features if the preprocessing changes.
n_features = X_train.shape[1]

nn_model = tf.keras.Sequential([

    tf.keras.layers.Dense(32, activation='relu', input_shape=(n_features,)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)

])

nn_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.01), loss='mean_squared_error')

In [None]:
# Print the layer-by-layer summary of the network.
nn_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_9 (Dense)             (None, 32)                544       
                                                                 
 dense_10 (Dense)            (None, 16)                528       
                                                                 
 dense_11 (Dense)            (None, 1)                 17        
                                                                 
Total params: 1089 (4.25 KB)
Trainable params: 1089 (4.25 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Train for 100 epochs without progress output, validating on the first
# 500 rows of the hold-out split.
history = nn_model.fit(
    X_train,
    y_train,
    validation_data=(X_test[:500], y_test[:500]),
    batch_size=32,
    epochs=100,
    verbose=0,
)

In [None]:
# Display the object returned by nn_model.fit().
history

In [None]:
# Evaluate the network on the first 500 rows of the hold-out split.
nn_model.evaluate(X_test[:500],y_test[:500],batch_size=32)