In [1]:
import importlib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))
        
from datetime import datetime
from scipy.special import logsumexp

from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import mean_squared_error, classification_report

In [2]:
# Read both competition splits and stack them into one frame so that feature
# engineering is applied to train and test rows in a single pass.
train = pd.read_csv("caltech-cs155-2020/train.csv")
test = pd.read_csv("caltech-cs155-2020/test.csv")
# sort=False keeps the columns in their original order instead of sorting them.
df = pd.concat([train, test], sort=False)
print(df.shape, df.columns, sep="\n")
df.tail()

(784239, 28)
Index(['id', 'last_price', 'mid', 'opened_position_qty ',
       'closed_position_qty', 'transacted_qty', 'd_open_interest', 'bid1',
       'bid2', 'bid3', 'bid4', 'bid5', 'ask1', 'ask2', 'ask3', 'ask4', 'ask5',
       'bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol', 'ask1vol',
       'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol', 'y'],
      dtype='object')


Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
191854,784234,5150.6,5150.8,,,0.0,0,5150.6,5150.4,5150.2,...,5,3,4,2,1,3,1,3,2,
191855,784235,5122.6,5122.7,1.0,2.0,3.0,-1,5122.6,5122.4,5122.2,...,1,5,1,20,16,17,4,1,16,
191856,784236,5192.8,5192.3,,,0.0,0,5192.0,5191.8,5191.2,...,2,1,1,1,1,2,1,2,1,
191857,784237,5152.0,5151.9,4.0,2.0,6.0,2,5151.8,5151.6,5151.4,...,5,6,8,37,7,1,2,1,1,
191858,784238,5200.0,5200.3,2.0,0.0,2.0,2,5200.0,5199.0,5198.8,...,5,1,2,1,1,1,4,1,3,


In [3]:
# Column names for the five bid/ask price columns and their volume columns.
bid_cols = [f"bid{level}" for level in range(1, 6)]
bid_vol_cols = [f"{price_col}vol" for price_col in bid_cols]
ask_cols = [f"ask{level}" for level in range(1, 6)]
ask_vol_cols = [f"{price_col}vol" for price_col in ask_cols]

# Group name -> list of columns belonging to that group.
group_cols = dict(
    bid_cols=bid_cols,
    bid_vol_cols=bid_vol_cols,
    ask_cols=ask_cols,
    ask_vol_cols=ask_vol_cols,
)

In [4]:
# Per-group summary features: for each column group add the row-wise maximum,
# the row-wise minimum, and their ratio (stored as "<group>_spread").
# Each statistic is computed exactly once per group.
for group, cols in group_cols.items():
    print(group)
    group_values = df[cols]
    df[f"{group}_max"] = group_values.max(axis=1)
    df[f"{group}_min"] = group_values.min(axis=1)
    # NOTE(review): the "spread" is max / min, a ratio rather than a
    # difference; a zero minimum would make it infinite - confirm this
    # cannot happen for the volume groups.
    df[f"{group}_spread"] = df[f"{group}_max"].div(df[f"{group}_min"])

# `last_price` divided by `mid`.
df["last_price_div__mid"] = df["last_price"].div(df["mid"])

bid_cols
bid_vol_cols
ask_cols
ask_vol_cols


In [5]:
# Build a synthetic timestamp per row: a fixed start date plus half a second
# for every unit of `id`.
row_offset = pd.to_timedelta(df["id"] / 2, unit="s")  # 500 ms per row
df["date"] = pd.to_datetime("1.1.2019") + row_offset

df["date"].describe()

count                         784239
unique                        784239
top       2019-01-02 14:21:54.500000
freq                               1
first            2019-01-01 00:00:00
last             2019-01-05 12:55:19
Name: date, dtype: object

In [6]:
# Split the combined frame back apart using the label column: rows with a
# value in `y` form the training set, rows where `y` is missing form the
# test set (presumably exactly the rows that came from test.csv).
has_label = df["y"].notna()

# Take explicit copies: a later cell assigns a new column to `test`, and doing
# that on a slice of `df` triggers pandas' SettingWithCopyWarning.
train = df.loc[has_label].copy()
print(f"train shape {train.shape[0]}")
test = df.loc[~has_label].copy()
print(f"test shape {test.shape[0]}")

train shape 592380
test shape 191859


In [7]:
# Replace every missing value in both splits with the sentinel -100.
train_no_nan, test_no_nan = (split.fillna(-100) for split in (train, test))

In [8]:
# It is unknown whether the test set is split temporally, so a random split is
# used for now.
y = train_no_nan["y"]
# Everything except the identifier, the synthetic timestamp and the label is a feature.
X = train_no_nan.drop(columns=["id", "date", "y"])

In [9]:
# Display the feature matrix (the rendered output is truncated for large frames).
X

Unnamed: 0,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,bid4,...,bid_vol_cols_max,bid_vol_cols_min,bid_vol_cols_spread,ask_cols_max,ask_cols_min,ask_cols_spread,ask_vol_cols_max,ask_vol_cols_min,ask_vol_cols_spread,last_price_div__mid
0,3842.4,3842.6,-100.0,-100.0,103.0,0,3842.4,3842.0,3841.8,3841.0,...,14,1,14.0,3844.0,3842.8,1.000312,10,1,10.0,0.999948
1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,3842.0,...,11,1,11.0,3844.8,3843.8,1.000260,13,1,13.0,0.999844
2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,3843.0,...,21,1,21.0,3845.8,3844.8,1.000260,16,1,16.0,0.999922
3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,3842.0,...,13,2,6.5,3844.8,3843.8,1.000260,11,1,11.0,1.000104
4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,3841.8,...,14,2,7.0,3845.0,3843.4,1.000416,15,1,15.0,1.000026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592375,4110.2,4110.3,1.0,1.0,2.0,1,4110.2,4110.0,4109.4,4109.2,...,3,1,3.0,4112.6,4110.4,1.000535,10,2,5.0,0.999976
592376,4109.4,4110.5,6.0,5.0,11.0,1,4109.2,4109.0,4108.6,4108.2,...,6,2,3.0,4113.0,4111.8,1.000292,9,3,3.0,0.999732
592377,4109.4,4110.5,-100.0,-100.0,0.0,0,4109.2,4109.0,4108.6,4108.2,...,6,2,3.0,4113.0,4111.8,1.000292,9,3,3.0,0.999732
592378,4109.4,4110.5,-100.0,-100.0,0.0,0,4109.2,4109.0,4108.6,4108.2,...,6,2,3.0,4113.0,4111.8,1.000292,9,3,3.0,0.999732


In [10]:
# Sanity check: the feature matrix and the target should have the same number of rows.
X.shape, y.shape

((592380, 39), (592380,))

In [12]:
import torch
import torch.nn as nn
import second_neural_network
import pickle

# Reload so that edits to the local module are picked up without a kernel restart.
importlib.reload(second_neural_network)
model = second_neural_network.second_neural_network()

# Serialize the newly constructed model. The context manager guarantees the
# file is flushed and closed, even if pickling raises.
with open("model_second_neural_network1", 'wb') as filehandler:
    pickle.dump(model, filehandler)

In [None]:
import randomforestorder
import pickle

# Reload so that edits to the local module are picked up without a kernel restart.
importlib.reload(randomforestorder)
model = randomforestorder.RandomForestOrder()

# Serialize the newly constructed model. The context manager guarantees the
# file is flushed and closed, even if pickling raises.
with open("model_randomforestorder1", 'wb') as filehandler:
    pickle.dump(model, filehandler)

In [18]:
import ensembleorder
import pickle

# Reload so that edits to the local module are picked up without a kernel restart.
importlib.reload(ensembleorder)
model = ensembleorder.EnsembleOrder()

# Serialize the newly constructed model. The context manager guarantees the
# file is flushed and closed, even if pickling raises.
with open("model_ensembleorder1", 'wb') as filehandler:
    pickle.dump(model, filehandler)

In [19]:
import numpy as np
a = [1, 2, 3]
b = [10, 20, 30]
# Stack the two lists as rows and average down the columns (element-wise mean).
np.array([a, b]).mean(axis=0)

array([ 5.5, 11. , 16.5])

In [22]:
# Inspect the `batch_s` attribute of the model's first sub-model
# (presumably `model` is the EnsembleOrder instance created above - confirm).
model.model1.batch_s

32

In [27]:
# Inspect the `num_epochs` attribute of the model's second sub-model
# (presumably `model` is the EnsembleOrder instance created above - confirm).
model.model2.num_epochs

10

In [None]:
import get_average_val_err

# Reload so that edits to the helper module take effect in this kernel.
importlib.reload(get_average_val_err)

# First argument is 3 - presumably the number of folds, since the log output
# reports "Fold 1 of 3".
val_err = get_average_val_err.get_val_err(3, train_no_nan, model)
print(val_err)

592380
Fold  1  of  3  test indices: [     0      1      2 ... 197457 197458 197459]
len(val_index_list):  197460
Train Epoch: 1  Loss: 0.7275
Train Epoch: 2  Loss: 0.6762
Train Epoch: 3  Loss: 0.5720
Train Epoch: 4  Loss: 0.5813
Train Epoch: 5  Loss: 0.7654
Train Epoch: 6  Loss: 0.4887
Train Epoch: 7  Loss: 0.7004
Train Epoch: 8  Loss: 0.4799
Train Epoch: 9  Loss: 0.4075
Train Epoch: 10  Loss: 0.5353
x train tensor:  tensor([[-2.1965e+00, -2.1984e+00,  6.0399e-01,  ..., -5.4239e-01,
         -2.0939e-01,  4.7410e-01],
        [-2.1965e+00, -2.1965e+00,  6.2575e-01,  ...,  5.0589e-01,
         -6.8383e-01, -3.0243e-03],
        [-2.1928e+00, -2.1947e+00,  6.2575e-01,  ...,  5.0589e-01,
         -6.8383e-01,  4.7407e-01],
        ...,
        [ 1.7876e+00,  1.8080e+00, -1.5719e+00,  ...,  1.5542e+00,
         -7.5161e-01, -4.9756e+00],
        [ 1.7876e+00,  1.8080e+00, -1.5719e+00,  ...,  1.5542e+00,
         -7.5161e-01, -4.9756e+00],
        [ 1.7876e+00,  1.8080e+00, -1.5719e+00,  .

In [24]:
# Scratch check: flattening a (3, 1) column vector gives a 1-D array.
a = np.array([1, 2, 3]).reshape(-1, 1)
a.shape
np.ravel(a)

array([1, 2, 3])

In [None]:
# Predict on the same feature columns used for training, then write the
# (id, Predicted) pairs to the submission file.
test_features = test_no_nan.drop(columns=["id", "date", "y"])
test["Predicted"] = model.predict(test_features)
test.loc[:, ["id", "Predicted"]].to_csv("submission_test_second_neural_network2.csv", index=False)

In [None]:
# NOTE(review): `X_transformed` is only assigned in a later cell; on a fresh
# top-to-bottom run this cell would raise NameError - confirm the intended cell order.
X_transformed

In [None]:
# FactorAnalysis is not imported anywhere else in this notebook, so import it
# here; without it this cell fails with NameError.
from sklearn.decomposition import FactorAnalysis

# Fit a factor-analysis model on the feature matrix and project the features onto it.
# NOTE(review): the earlier shape output shows X with 39 columns, while 64
# components are requested - confirm that n_components is intended.
transformer = FactorAnalysis(n_components=64, random_state=0)
X_transformed = transformer.fit_transform(X)
X_transformed.shape

In [None]:
# Display the factor-analysis projection computed in the previous cell.
X_transformed

In [None]:
# Display the original (untransformed) feature matrix for comparison.
X

In [None]:
import numpy as np
array = np.array([4, 2, 7, 1])
# Applying argsort twice turns values into ranks: the first pass gives the
# sorting permutation, the second gives each element's position in it.
order = np.argsort(array)
ranks = np.argsort(order)

In [None]:
# View the ranks as a single-column 2-D array (shape (n, 1)).
ranks.reshape((-1,1))