In [1]:
import importlib
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        
from datetime import datetime
from scipy.special import logsumexp

from catboost import Pool, cv, CatBoostClassifier, CatBoostRegressor
from sklearn.metrics import mean_squared_error, classification_report

In [2]:
train = pd.read_csv("caltech-cs155-2020/train.csv")
test = pd.read_csv("caltech-cs155-2020/test.csv")
df = pd.concat([train,test],sort=False)
print(df.shape)
print(df.columns)
df.tail()

(784239, 28)
Index(['id', 'last_price', 'mid', 'opened_position_qty ',
       'closed_position_qty', 'transacted_qty', 'd_open_interest', 'bid1',
       'bid2', 'bid3', 'bid4', 'bid5', 'ask1', 'ask2', 'ask3', 'ask4', 'ask5',
       'bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol', 'ask1vol',
       'ask2vol', 'ask3vol', 'ask4vol', 'ask5vol', 'y'],
      dtype='object')


Unnamed: 0,id,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,...,bid2vol,bid3vol,bid4vol,bid5vol,ask1vol,ask2vol,ask3vol,ask4vol,ask5vol,y
191854,784234,5150.6,5150.8,,,0.0,0,5150.6,5150.4,5150.2,...,5,3,4,2,1,3,1,3,2,
191855,784235,5122.6,5122.7,1.0,2.0,3.0,-1,5122.6,5122.4,5122.2,...,1,5,1,20,16,17,4,1,16,
191856,784236,5192.8,5192.3,,,0.0,0,5192.0,5191.8,5191.2,...,2,1,1,1,1,2,1,2,1,
191857,784237,5152.0,5151.9,4.0,2.0,6.0,2,5151.8,5151.6,5151.4,...,5,6,8,37,7,1,2,1,1,
191858,784238,5200.0,5200.3,2.0,0.0,2.0,2,5200.0,5199.0,5198.8,...,5,1,2,1,1,1,4,1,3,


In [3]:
bid_cols = ['bid1','bid2', 'bid3', 'bid4', 'bid5']
bid_vol_cols = ['bid1vol', 'bid2vol', 'bid3vol', 'bid4vol', 'bid5vol']
ask_cols = ['ask1', 'ask2', 'ask3', 'ask4', 'ask5',]
ask_vol_cols = ['ask1vol','ask2vol', 'ask3vol', 'ask4vol', 'ask5vol']

group_cols = {"bid_cols":bid_cols,"bid_vol_cols":bid_vol_cols,"ask_cols":ask_cols,"ask_vol_cols":ask_vol_cols}

In [4]:
for group in group_cols.keys():
    print(group)
    df[f"{group}_max"] = df[group_cols[group]].max(axis=1)
    df[f"{group}_min"] = df[group_cols[group]].min(axis=1)
    df[f"{group}_spread"] = df[f"{group}_max"].div(df[f"{group}_min"])
#     df[f"{group}_logsumexp"] = df[grou/_cols[group]].apply(logsumexp)
    
    df[f"{group}_max"] = df[group_cols[group]].max(axis=1)
    
df["last_price_div__mid"] = df["last_price"].div(df["mid"])

bid_cols
bid_vol_cols
ask_cols
ask_vol_cols


In [5]:
df["date"] = pd.to_datetime("1.1.2019")
df["date"] = df["date"] + pd.to_timedelta(df["id"]/2,unit="s") # 500 ms per row

df["date"].describe()

count                         784239
unique                        784239
top       2019-01-02 14:21:54.500000
freq                               1
first            2019-01-01 00:00:00
last             2019-01-05 12:55:19
Name: date, dtype: object

In [6]:
train = df.loc[~df.y.isna()]
print(f"train shape {train.shape[0]}")
test = df.loc[df.y.isna()]
print(f"test shape {test.shape[0]}")

train shape 592380
test shape 191859


In [7]:
train_no_nan = train.fillna(-100)
test_no_nan = test.fillna(-100)

In [8]:
# we don't know if the test set has a temporal split, so we'll just try a random split for now
X = train_no_nan.drop(["id","date","y"],axis=1)
y = train_no_nan["y"]

In [9]:
X

Unnamed: 0,last_price,mid,opened_position_qty,closed_position_qty,transacted_qty,d_open_interest,bid1,bid2,bid3,bid4,...,bid_vol_cols_max,bid_vol_cols_min,bid_vol_cols_spread,ask_cols_max,ask_cols_min,ask_cols_spread,ask_vol_cols_max,ask_vol_cols_min,ask_vol_cols_spread,last_price_div__mid
0,3842.4,3842.6,-100.0,-100.0,103.0,0,3842.4,3842.0,3841.8,3841.0,...,14,1,14.0,3844.0,3842.8,1.000312,10,1,10.0,0.999948
1,3842.8,3843.4,6.0,49.0,55.0,-43,3843.0,3842.8,3842.4,3842.0,...,11,1,11.0,3844.8,3843.8,1.000260,13,1,13.0,0.999844
2,3844.0,3844.3,7.0,77.0,84.0,-69,3843.8,3843.6,3843.2,3843.0,...,21,1,21.0,3845.8,3844.8,1.000260,16,1,16.0,0.999922
3,3843.8,3843.4,3.0,34.0,37.0,-30,3843.0,3842.8,3842.4,3842.0,...,13,2,6.5,3844.8,3843.8,1.000260,11,1,11.0,1.000104
4,3843.2,3843.1,3.0,38.0,41.0,-35,3842.8,3842.4,3842.0,3841.8,...,14,2,7.0,3845.0,3843.4,1.000416,15,1,15.0,1.000026
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592375,4110.2,4110.3,1.0,1.0,2.0,1,4110.2,4110.0,4109.4,4109.2,...,3,1,3.0,4112.6,4110.4,1.000535,10,2,5.0,0.999976
592376,4109.4,4110.5,6.0,5.0,11.0,1,4109.2,4109.0,4108.6,4108.2,...,6,2,3.0,4113.0,4111.8,1.000292,9,3,3.0,0.999732
592377,4109.4,4110.5,-100.0,-100.0,0.0,0,4109.2,4109.0,4108.6,4108.2,...,6,2,3.0,4113.0,4111.8,1.000292,9,3,3.0,0.999732
592378,4109.4,4110.5,-100.0,-100.0,0.0,0,4109.2,4109.0,4108.6,4108.2,...,6,2,3.0,4113.0,4111.8,1.000292,9,3,3.0,0.999732


In [10]:
X.shape, y.shape

((592380, 39), (592380,))

In [11]:
from sklearn.datasets import load_digits
from sklearn.decomposition import FactorAnalysis
X, y = load_digits(return_X_y=True)
transformer = FactorAnalysis(n_components=7, random_state=0)
X_transformed = transformer.fit_transform(X)
X_transformed.shape

(1797, 7)

In [12]:
X.shape

(1797, 64)

In [13]:
y.shape

(1797,)

In [36]:
import torch
import torch.nn as nn
import second_neural_network
importlib.reload(second_neural_network)
model = second_neural_network.second_neural_network()

In [37]:
import get_average_val_err

importlib.reload(get_average_val_err)

print(get_average_val_err.get_val_err(10, train_no_nan, model))

592380
Fold  1  of  10  test indices: [    0     1     2 ... 59235 59236 59237]
len(val_index_list):  59238
Train Epoch: 1  Loss: 0.0091
Train Epoch: 2  Loss: 0.5524
Train Epoch: 3  Loss: 0.3352
Train Epoch: 4  Loss: 0.4086
Train Epoch: 5  Loss: 0.5421
validation error:  0.6485635727480674
Fold  2  of  10  test indices: [ 59238  59239  59240 ... 118473 118474 118475]
len(val_index_list):  59238
Train Epoch: 1  Loss: 1.6097
Train Epoch: 2  Loss: 0.6599
Train Epoch: 3  Loss: 0.2742
Train Epoch: 4  Loss: 0.2478
Train Epoch: 5  Loss: 0.8830
validation error:  0.6534842077788663
Fold  3  of  10  test indices: [118476 118477 118478 ... 177711 177712 177713]
len(val_index_list):  59238
Train Epoch: 1  Loss: 0.9412
Train Epoch: 2  Loss: 0.8627
Train Epoch: 3  Loss: 0.1922
Train Epoch: 4  Loss: 0.5959
Train Epoch: 5  Loss: 0.7785
validation error:  0.6551097842149405
Fold  4  of  10  test indices: [177714 177715 177716 ... 236949 236950 236951]
len(val_index_list):  59238
Train Epoch: 1  Loss: 

In [38]:
test["Predicted"] = model.predict(test_no_nan.drop(["id","date","y"],axis=1))
test[["id","Predicted"]].to_csv("submission_test_second_neural_network2.csv",index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
X_transformed

array([[-0.10390271,  0.28198204,  1.93908236, ..., -0.38873227,
        -0.30377059,  0.7951861 ],
       [-0.96261823,  0.18862239, -1.73202491, ..., -0.02628507,
        -0.33583478, -1.4748951 ],
       [-1.07572294, -0.31865454, -1.38943619, ..., -0.99384465,
        -0.52678795,  0.52332889],
       ...,
       [-0.68171725,  0.05397779, -0.80913822, ..., -0.01853757,
        -0.92232969, -0.28710684],
       [-0.34632883, -0.12407529,  1.22216715, ...,  0.21910061,
        -0.29530191, -0.7910442 ],
       [ 0.66672038, -0.92771996, -0.0150591 , ..., -0.84866807,
         0.21509133,  0.51120728]])

In [15]:
transformer = FactorAnalysis(n_components=64, random_state=0)
X_transformed = transformer.fit_transform(X)
X_transformed.shape

(1797, 64)

In [16]:
X_transformed

array([[-0.06488965,  1.65622217, -0.78845486, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.57454077, -1.62102338,  0.36935249, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.51612897, -0.77575138,  0.24773356, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.79600586, -0.55058492,  0.47146544, ...,  0.        ,
         0.        ,  0.        ],
       [-0.33631286,  0.97600191, -0.85748027, ...,  0.        ,
         0.        ,  0.        ],
       [-0.0298756 ,  0.49774478,  0.8988633 , ...,  0.        ,
         0.        ,  0.        ]])

In [17]:
X

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [25]:
import numpy as np
array = np.array([4,2,7,1])
order = array.argsort()
ranks = order.argsort()

In [26]:
ranks.reshape((-1,1))

array([[2],
       [1],
       [3],
       [0]])