In [1]:
import importlib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from datetime import datetime
from scipy.special import logsumexp

from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import mean_squared_error, classification_report

In [2]:
train = pd.read_csv("caltech-cs155-2020/train.csv")
test = pd.read_csv("caltech-cs155-2020/test.csv")
df = pd.concat([train,test],sort=False)
print(df.shape)
print(df.columns)
df.tail()

(784239, 28)
Index(['id', 'last_price', 'mid', 'opened_position_qty ',
       'closed_position_qty', 'transacted_qty', 'd_open_interest', 'bid1',
       'bid2', 'bid3', 'bid4', 'bid5', 'ask1', 'ask2', 'ask3', 'ask4', 'ask5',
       'bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol', 'ask1vol',
       'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol', 'y'],
      dtype='object')


Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
191854,784234,5150.6,5150.8,,,0.0,0,5150.6,5150.4,5150.2,...,5,3,4,2,1,3,1,3,2,
191855,784235,5122.6,5122.7,1.0,2.0,3.0,-1,5122.6,5122.4,5122.2,...,1,5,1,20,16,17,4,1,16,
191856,784236,5192.8,5192.3,,,0.0,0,5192.0,5191.8,5191.2,...,2,1,1,1,1,2,1,2,1,
191857,784237,5152.0,5151.9,4.0,2.0,6.0,2,5151.8,5151.6,5151.4,...,5,6,8,37,7,1,2,1,1,
191858,784238,5200.0,5200.3,2.0,0.0,2.0,2,5200.0,5199.0,5198.8,...,5,1,2,1,1,1,4,1,3,


In [3]:
bid_cols = ['bid1','bid2', 'bid3', 'bid4', 'bid5']
bid_vol_cols = ['bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol']
ask_cols = ['ask1', 'ask2', 'ask3', 'ask4', 'ask5',]
ask_vol_cols = ['ask1vol','ask2vol', 'ask3vol', 'ask4vol', 'ask5vol']

group_cols = {"bid_cols":bid_cols,"bid_vol_cols":bid_vol_cols,"ask_cols":ask_cols,"ask_vol_cols":ask_vol_cols}

In [4]:
for group in group_cols.keys():
    print(group)
    df[f"{group}_max"] = df[group_cols[group]].max(axis=1)
    df[f"{group}_min"] = df[group_cols[group]].min(axis=1)
    df[f"{group}_spread"] = df[f"{group}_max"].div(df[f"{group}_min"])
#     df[f"{group}_logsumexp"] = df[grou/_cols[group]].apply(logsumexp)
    
    df[f"{group}_max"] = df[group_cols[group]].max(axis=1)
    
df["last_price_div__mid"] = df["last_price"].div(df["mid"])

bid_cols
bid_vol_cols
ask_cols
ask_vol_cols


In [5]:
df["date"] = pd.to_datetime("1.1.2019")
df["date"] = df["date"] + pd.to_timedelta(df["id"]/2,unit="s") # 500 ms per row

df["date"].describe()

count                         784239
unique                        784239
top       2019-01-02 14:21:54.500000
freq                               1
first            2019-01-01 00:00:00
last             2019-01-05 12:55:19
Name: date, dtype: object

# Define functions

In [11]:
def get_normalized_X_and_train_y(df, col):
    train_opq = df.loc[~df[col].isna()]
    print(f"train shape {train.shape[0]}")
    test_opq = df.loc[df[col].isna()]
    print(f"test shape {test.shape[0]}")
    print(train_opq.shape, test_opq.shape)

    X_opq = train_opq.drop(["opened_position_qty ", "closed_position_qty", "d_open_interest","id","date","y"],axis=1)
    from scipy import stats
    normalized_X_opq = pd.DataFrame(stats.zscore(X_opq, axis=0))
    normalized_X_opq.columns = [X_opq.columns]
    y_opq = train_opq[col]

    print(normalized_X_opq.shape, y_opq.shape)


    test_X_opq = test_opq.drop(["opened_position_qty ", "closed_position_qty", "d_open_interest", "id","date","y"],axis=1)

    normalized_test_X_opq = pd.DataFrame(stats.zscore(test_X_opq, axis=0))
    normalized_test_X_opq.columns = [test_X_opq.columns]
    normalized_test_X_opq.index = test_X_opq.index
    print(normalized_test_X_opq.shape)
#     print(normalized_test_X_opq.columns)
    return normalized_X_opq, y_opq, normalized_test_X_opq, train_opq, test_opq

In [12]:
def get_df_with_preds_for_nans(df, col):
    from sklearn import linear_model
    clf = linear_model.SGDClassifier(max_iter=10, tol=1e-3)
    clf.fit(normalized_X_opq, y_opq)

    pred_opq = clf.predict(normalized_test_X_opq)
    test_opq[col] = pred_opq
    df = pd.concat([train_opq,test_opq],sort=False)
    df = df.sort_values('id')
    return df

In [13]:
df

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid_vol_cols_min,bid_vol_cols_spread,ask_cols_max,ask_cols_min,ask_cols_spread,ask_vol_cols_max,ask_vol_cols_min,ask_vol_cols_spread,last_price_div__mid,date
0,0,3842.4,3842.6,,,103.0,0,3842.4,3842.0,3841.8,...,1,14.00,3844.0,3842.8,1.000312,10,1,10.0,0.999948,2019-01-01 00:00:00.000
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,1,11.00,3844.8,3843.8,1.000260,13,1,13.0,0.999844,2019-01-01 00:00:00.500
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,21.00,3845.8,3844.8,1.000260,16,1,16.0,0.999922,2019-01-01 00:00:01.000
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,2,6.50,3844.8,3843.8,1.000260,11,1,11.0,1.000104,2019-01-01 00:00:01.500
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,2,7.00,3845.0,3843.4,1.000416,15,1,15.0,1.000026,2019-01-01 00:00:02.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191854,784234,5150.6,5150.8,,,0.0,0,5150.6,5150.4,5150.2,...,1,5.00,5152.4,5151.0,1.000272,3,1,3.0,0.999961,2019-01-05 12:55:17.000
191855,784235,5122.6,5122.7,1.0,2.0,3.0,-1,5122.6,5122.4,5122.2,...,1,20.00,5123.6,5122.8,1.000156,17,1,17.0,0.999980,2019-01-05 12:55:17.500
191856,784236,5192.8,5192.3,,,0.0,0,5192.0,5191.8,5191.2,...,1,2.00,5193.4,5192.6,1.000154,2,1,2.0,1.000096,2019-01-05 12:55:18.000
191857,784237,5152.0,5151.9,4.0,2.0,6.0,2,5151.8,5151.6,5151.4,...,4,9.25,5152.8,5152.0,1.000155,7,1,7.0,1.000019,2019-01-05 12:55:18.500


In [14]:
normalized_X_opq, y_opq, normalized_test_X_opq, train_opq, test_opq = get_normalized_X_and_train_y(df, "opened_position_qty ")

train shape 592380
test shape 191859
(558123, 42) (226116, 42)
(558123, 36) (558123,)
(226116, 36)


In [15]:
df = get_df_with_preds_for_nans(df, "opened_position_qty ")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [16]:
df

Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid_vol_cols_min,bid_vol_cols_spread,ask_cols_max,ask_cols_min,ask_cols_spread,ask_vol_cols_max,ask_vol_cols_min,ask_vol_cols_spread,last_price_div__mid,date
0,0,3842.4,3842.6,15.0,,103.0,0,3842.4,3842.0,3841.8,...,1,14.00,3844.0,3842.8,1.000312,10,1,10.0,0.999948,2019-01-01 00:00:00.000
1,1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,...,1,11.00,3844.8,3843.8,1.000260,13,1,13.0,0.999844,2019-01-01 00:00:00.500
2,2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,...,1,21.00,3845.8,3844.8,1.000260,16,1,16.0,0.999922,2019-01-01 00:00:01.000
3,3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,...,2,6.50,3844.8,3843.8,1.000260,11,1,11.0,1.000104,2019-01-01 00:00:01.500
4,4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,...,2,7.00,3845.0,3843.4,1.000416,15,1,15.0,1.000026,2019-01-01 00:00:02.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191854,784234,5150.6,5150.8,1.0,,0.0,0,5150.6,5150.4,5150.2,...,1,5.00,5152.4,5151.0,1.000272,3,1,3.0,0.999961,2019-01-05 12:55:17.000
191855,784235,5122.6,5122.7,1.0,2.0,3.0,-1,5122.6,5122.4,5122.2,...,1,20.00,5123.6,5122.8,1.000156,17,1,17.0,0.999980,2019-01-05 12:55:17.500
191856,784236,5192.8,5192.3,1.0,,0.0,0,5192.0,5191.8,5191.2,...,1,2.00,5193.4,5192.6,1.000154,2,1,2.0,1.000096,2019-01-05 12:55:18.000
191857,784237,5152.0,5151.9,4.0,2.0,6.0,2,5151.8,5151.6,5151.4,...,4,9.25,5152.8,5152.0,1.000155,7,1,7.0,1.000019,2019-01-05 12:55:18.500


In [17]:
normalized_X_opq, y_opq, normalized_test_X_opq, train_opq, test_opq = get_normalized_X_and_train_y(df, "closed_position_qty")

train shape 592380
test shape 191859
(558123, 42) (226116, 42)
(558123, 36) (558123,)
(226116, 36)


In [18]:
df = get_df_with_preds_for_nans(df, "closed_position_qty")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [None]:
df

In [None]:
train = df.loc[~df.y.isna()]
print(f"train shape {train.shape[0]}")
test = df.loc[df.y.isna()]
print(f"test shape {test.shape[0]}")

train_no_nan = train.fillna(-100)
test_no_nan = test.fillna(-100)

# we don't know if the test set has a temporal split, so we'll just try a random split for now
X = train_no_nan.drop(["id","date","y"],axis=1)
y = train_no_nan["y"]

In [None]:
import torch
import torch.nn as nn
import second_neural_network
importlib.reload(second_neural_network)
model = second_neural_network.second_neural_network()

In [None]:
import get_average_val_err

importlib.reload(get_average_val_err)

print(get_average_val_err.get_val_err(10, train_no_nan, model))