In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
import warnings
# NOTE(review): this suppresses every warning for the whole session, so
# diagnostics from pandas / scikit-learn raised by later cells are never shown.
# Consider narrowing the filter to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

***
### Load & Clean Data

In [2]:
# Load the raw training table; the path is relative to the notebook's working directory.
train_csv = pd.read_csv('train.csv')

In [3]:
# Build the working frame from the raw table without touching `train_csv`:
#   1. drop rows whose `target` is missing (they cannot be used for fitting
#      or scoring),
#   2. drop the identifier columns `row_id` and `time_id`.
# `dropna(subset=...)` inspects only the `target` column instead of building a
# null mask for the whole frame, and the chain returns a fresh frame, so no
# in-place edit is applied to a boolean-filtered slice.
data = (
    train_csv
    .dropna(subset=['target'])
    .drop(columns=['row_id', 'time_id'])
)
display(data.head())

Unnamed: 0,stock_id,date_id,seconds_in_bucket,imbalance_size,imbalance_buy_sell_flag,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target
0,0,0,0,3180602.69,1,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704
1,1,0,0,166603.91,-1,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986
2,2,0,0,302879.87,-1,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995
3,3,0,0,11917682.27,-1,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102
4,4,0,0,447549.96,-1,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849


***
### Initial Model

##### Replacing NaN with 0

In [4]:
# Baseline design matrix: every column except the label.
df = data.copy()
X = df.drop(columns=['target'])
y = df['target']

# Random hold-out split (30% test) with a fixed seed so the reported score is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Impute missing values with 0. The filled frames are rebound rather than
# modified in place, so the frames handed back by the splitter are never
# mutated through what may be a slice of `X`.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [5]:
# Fit an ordinary least-squares model on the training split.
reg = LinearRegression()
reg = reg.fit(X_train, y_train)

In [6]:
# Score the hold-out split: mean absolute error between predictions and labels.
pred = reg.predict(X_test)
abs_err = (pred - y_test).abs()
print("MAE:", abs_err.mean())

MAE: 6.310901168038444


##### Replacing NaN with the Training-Set Mean

In [7]:
# Same features and split as the zero-fill baseline; only the imputation differs.
df = data.copy()
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Column means are computed on the training split only and reused for the
# test split, so no test-set statistics reach the model. The filled frames are
# rebound instead of being modified in place.
mu = X_train.mean()
X_train = X_train.fillna(mu)
X_test = X_test.fillna(mu)

In [8]:
# Refit the linear model on the mean-imputed training split.
reg = LinearRegression().fit(X=X_train, y=y_train)

In [9]:
# Hold-out mean absolute error for the mean-imputed model.
pred = reg.predict(X_test)
abs_err = (pred - y_test).abs()
print("MAE:", abs_err.mean())

MAE: 6.310142024255629


***
### Feature Engineering

In [10]:
# Derive auction / order-book features, then remove the two raw imbalance
# columns they were built from. Every expression below reads the columns of
# `data` as they are before this cell runs.
# NOTE: the cell is not re-runnable on its own - once `imbalance_size` and
# `imbalance_buy_sell_flag` have been dropped, a second run raises KeyError.
data = data.assign(
    # Signed imbalance: size multiplied by the buy/sell flag.
    imbalance_auction=data['imbalance_size'] * data['imbalance_buy_sell_flag'],
    # Imbalance relative to the matched size.
    imbalance_auction_proportion_matched=data['imbalance_size'] / data['matched_size'],
    # Share of bid size in the total of bid and ask size.
    imbalance_order_book=data['bid_size'] / (data['bid_size'] + data['ask_size']),
    spread=data['ask_price'] - data['bid_price'],
    mid_price=(data['ask_price'] + data['bid_price']) / 2,
    # Complementary 0/1 indicators for the first 300 seconds of the bucket
    # and the time after it.
    bef_300=np.where(data['seconds_in_bucket'] <= 300, 1, 0),
    aft_300=np.where(data['seconds_in_bucket'] > 300, 1, 0),
).drop(columns=['imbalance_size', 'imbalance_buy_sell_flag'])
display(data.head())

Unnamed: 0,stock_id,date_id,seconds_in_bucket,reference_price,matched_size,far_price,near_price,bid_price,bid_size,ask_price,ask_size,wap,target,imbalance_auction,imbalance_auction_proportion_matched,imbalance_order_book,spread,mid_price,bef_300,aft_300
0,0,0,0,0.999812,13380276.64,,,0.999812,60651.5,1.000026,8493.03,1.0,-3.029704,3180602.69,0.237708,0.87717,0.000214,0.999919,1,0
1,1,0,0,0.999896,1642214.25,,,0.999896,3233.04,1.00066,20605.09,1.0,-5.519986,-166603.91,0.101451,0.135625,0.000764,1.000278,1,0
2,2,0,0,0.999561,1819368.03,,,0.999403,37956.0,1.000298,18995.0,1.0,-8.38995,-302879.87,0.166475,0.666468,0.000895,0.99985,1,0
3,3,0,0,1.000171,18389745.62,,,0.999999,2324.9,1.000214,479032.4,1.0,-4.0102,-11917682.27,0.648061,0.00483,0.000215,1.000107,1,0
4,4,0,0,0.999532,17860614.95,,,0.999394,16485.54,1.000016,434.1,1.0,-7.349849,-447549.96,0.025058,0.974343,0.000622,0.999705,1,0


***
### Model with Feature Engineering

In [11]:
# Split the feature-engineered frame with the same seed and test fraction as
# the earlier models so the scores are comparable.
df = data.copy()
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Mean imputation using training-split statistics only; the filled frames are
# rebound rather than modified in place.
mu = X_train.mean()
X_train = X_train.fillna(mu)
X_test = X_test.fillna(mu)

In [12]:
# Linear model on the engineered, mean-imputed features.
reg = LinearRegression()
reg = reg.fit(X_train, y_train)

In [13]:
# Hold-out mean absolute error for the feature-engineered model.
pred = reg.predict(X_test)
abs_err = (pred - y_test).abs()
print("MAE:", abs_err.mean())

MAE: 6.308015695259861


***
### Model with Feature Selection

In [14]:
# Rebuild the split used for feature selection (same seed and test fraction as
# the other experiments).
df = data.copy()
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Mean imputation from the training split; results are rebound so the frames
# returned by the splitter are not modified in place.
mu = X_train.mean()
X_train = X_train.fillna(mu)
X_test = X_test.fillna(mu)

In [15]:
# Recursive feature elimination down to 16 features, removing one per round
# (step=1). Despite its name, `rfecv` is a plain RFE selector with a fixed
# feature count - no cross-validation is involved.
reg = LinearRegression()
rfecv = RFE(estimator=reg, n_features_to_select=16, step=1)
rfecv = rfecv.fit(X_train, y_train)

# Project both splits onto the retained columns.
X_rfecv_train = rfecv.transform(X_train)
X_rfecv_test = rfecv.transform(X_test)

# Fit `reg` on the reduced matrix so it can score `X_rfecv_test` directly.
reg = reg.fit(X_rfecv_train, y_train)

In [16]:
# Show the names of the features kept by the selector.
display(X.columns[rfecv.get_support()])

Index(['stock_id', 'date_id', 'seconds_in_bucket', 'reference_price',
       'far_price', 'near_price', 'bid_price', 'bid_size', 'ask_price', 'wap',
       'imbalance_auction_proportion_matched', 'imbalance_order_book',
       'spread', 'mid_price', 'bef_300', 'aft_300'],
      dtype='object')

In [17]:
# Hold-out mean absolute error using only the selected features.
pred = reg.predict(X_rfecv_test)
abs_err = (pred - y_test).abs()
print("MAE:", abs_err.mean())

MAE: 6.309346923401272


***
### Separate Models Before and After 300 s

In [18]:
df = data.copy()
X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Boolean row masks for the two regimes. They are built once from `X_train`
# and reused for `y_train`, which carries the same index, so features and
# targets are selected by the very same rows. This avoids the label-membership
# lookup (`index.isin`), which is only correct when index labels are unique.
train_is_bef = X_train['bef_300'] == 1
train_is_aft = X_train['aft_300'] == 1

# The "before" model does not use far_price / near_price.
# NOTE(review): presumably because they are not populated early in the bucket
# (the sample rows at seconds_in_bucket == 0 show NaN) - confirm.
X_train_bef = X_train[train_is_bef].drop(columns=['far_price', 'near_price'])
X_train_aft = X_train[train_is_aft]

# Regime-specific training means; kept in `mu_1` / `mu_2` so the test split
# can be imputed with the same values.
mu_1 = X_train_bef.mean()
mu_2 = X_train_aft.mean()
X_train_bef = X_train_bef.fillna(mu_1)
X_train_aft = X_train_aft.fillna(mu_2)

y_train_bef = y_train[train_is_bef]
y_train_aft = y_train[train_is_aft]

In [19]:
# One linear model per regime: rows flagged bef_300 and rows flagged aft_300.
reg_bef = LinearRegression().fit(X=X_train_bef, y=y_train_bef)
reg_aft = LinearRegression().fit(X=X_train_aft, y=y_train_aft)

In [20]:
# Split the hold-out rows by regime with boolean masks; the same masks select
# the matching targets from `y_test`, so no label-membership lookup is needed.
test_is_bef = X_test['bef_300'] == 1
test_is_aft = X_test['aft_300'] == 1

X_test_bef = X_test[test_is_bef].drop(columns=['far_price', 'near_price'])
X_test_aft = X_test[test_is_aft]

# Impute with the training-split means of the corresponding regime.
X_test_bef = X_test_bef.fillna(mu_1)
X_test_aft = X_test_aft.fillna(mu_2)

pred_bef = reg_bef.predict(X_test_bef)
pred_aft = reg_aft.predict(X_test_aft)
diff_bef = abs(pred_bef - y_test[test_is_bef])
diff_aft = abs(pred_aft - y_test[test_is_aft])

# Pooled MAE over the whole hold-out split. The sums are vectorised (the
# built-in sum() walks the Series one element at a time); summing the raw
# arrays keeps NaN propagation, so a missing value still surfaces in the score.
total_abs_err = diff_bef.to_numpy().sum() + diff_aft.to_numpy().sum()
print("MAE:", total_abs_err / len(X_test))

MAE: 6.307613587273323


***
### Sequenced By Stock

In [21]:
# Per-stock experiment: far_price / near_price gaps are filled with 0, then any
# row that still contains a missing value is dropped.
df = data.copy()
df['far_price'] = df['far_price'].fillna(0)
df['near_price'] = df['near_price'].fillna(0)
df.dropna(inplace=True)

X = df.drop(columns=['target'])
y = df['target']

# NOTE(review): in the lines visible here, `X_id` is overwritten on every
# iteration and not used afterwards, and the split below runs on the full
# `X` / `y` rather than on a per-stock subset - confirm whether the split (and
# any model fit) was meant to live inside this loop.
for s_id in df['stock_id'].unique():
    X_id = X[X['stock_id'] == s_id]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)