In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

### data preparation

In [None]:
# Load the training table and keep an ndarray copy for positional slicing.
data = pd.read_csv("data-training.csv")
data_array = np.array(data)

# Preview the first two rows.
print(data.iloc[:2])

# Histogram (no KDE overlay) of column 60, which later cells use as the target.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; check the
# installed version before relying on it.
sns.distplot(data_array[:, 60], kde=False)


In [None]:
# Remove rows with any missing value, then exact duplicate rows.
# The original index labels are kept (no reset_index).
new_data = data.dropna(axis=0).drop_duplicates()
print('data size:', new_data.shape)

In [None]:
# Row-order split (no shuffling): rows [0, 2000000) train, rows [2000000, 2700000) test.
train_data = new_data.iloc[:2000000]
test_data = new_data.iloc[2000000:2700000]

In [None]:
# Level-0 ask minus level-0 bid for every row, then its row-to-row change.
diff_1st = data['askRate0'] - data['bidRate0']
diff_2nd = np.diff(diff_1st)

# Scatter that change against the row-to-row change of y.
y_step = np.diff(data['y'])
plt.scatter(diff_2nd, y_step)
plt.show()

In [None]:
# Features are the first 60 columns (selected by position); the target is column 'y'.
train_x = np.array(train_data.iloc[:, 0:60])
train_y = np.array(train_data['y'])
test_x = np.array(test_data.iloc[:, 0:60])
test_y = np.array(test_data['y'])

### score

In [None]:
# Load a previously cached `normed_data` array from disk.
# NOTE(review): no visible cell writes 'normed_data.pickle' — confirm where the
# cache is produced. This cell also needs the `pickle` module in scope.
# pickle.load can execute arbitrary code; only open files you trust.
with open('normed_data.pickle', 'rb') as f:
    normed_data = pickle.load(f)

### askrate0 diff

In [None]:
# float32 scratch buffer: one row per input row, 30 columns.
# NOTE(review): not referenced again in the visible cells.
data_diff = np.zeros(shape=(len(data_array), 30), dtype=np.float32)

In [None]:
# Every comparison pairs row t (data_array[:-1]) with row t+1 (data_array[1:]),
# so each flag has one entry fewer than data_array has rows.

# Columns 30:60 are identical in both rows.
flag1 = (data_array[:-1, 30:60] == data_array[1:, 30:60]).all(axis=1)

# Columns 1:15 (and 16:30) of row t reappear one slot to the left in row t+1.
flag2 = (data_array[:-1, 1:15] == data_array[1:, 0:14]).all(axis=1)
flag2_ = (data_array[:-1, 16:30] == data_array[1:, 15:29]).all(axis=1)

# Columns 0:14 (and 15:29) of row t reappear one slot to the right in row t+1.
flag3 = (data_array[:-1, 0:14] == data_array[1:, 1:15]).all(axis=1)
flag3_ = (data_array[:-1, 15:29] == data_array[1:, 16:30]).all(axis=1)

# Keep transitions where 30:60 is unchanged and 0:30 shifted by exactly one slot,
# in either direction.
shifted_left = np.logical_and(flag2, flag2_)
shifted_right = np.logical_and(flag3, flag3_)
flag = np.logical_and(flag1, np.logical_or(shifted_left, shifted_right))

In [None]:
# Pull the flagged (t, t+1) row pairs out of `data` by index label.
nb = len(data) - 1
data_t1 = np.array(data.loc[np.arange(1, nb + 1)[flag]])
data_t0 = np.array(data.loc[np.arange(0, nb)[flag]])

# Change in column 0 between the paired rows (dividing by the column-0 minus
# column-30 gap was tried and is disabled).
diff_ask = data_t1[:, 0] - data_t0[:, 0]
# Column 0 minus column 30 at time t.
diff_ask_bid = data_t0[:, 0] - data_t0[:, 30]
# Change in the last column between the paired rows.
diff_y = data_t1[:, -1] - data_t0[:, -1]

In [None]:
# Restrict to pairs whose column-0 change is exactly 0.5, then plot the
# column-0 / column-30 gap against the change in y.
ask_moved_half = diff_ask == 0.5
plt.scatter(diff_ask_bid[ask_moved_half], diff_y[ask_moved_half])
plt.show()

In [None]:
# Change in column 0 versus change in y over all flagged row pairs.
plt.scatter(x=diff_ask, y=diff_y)
plt.show()

In [None]:
# Pearson correlation (and its p-value) between the two per-pair changes.
scipy.stats.pearsonr(x=diff_ask, y=diff_y)

### data normalization

In [None]:
%%time
# Replace NaN with 0.0 (and +/-inf with large finite values) in place.
# The second positional parameter of np.nan_to_num is `copy`, not the fill value,
# so the arguments are spelled out by keyword; `copy=False` keeps the original
# in-place behaviour and avoids duplicating the large array.
data_array = np.nan_to_num(data_array, copy=False, nan=0.0)

In [None]:
%%time

# Build `normed_data` (float32, same shape as `data_array`):
#   * columns 0:15 and 30:45 (the "ask" / "bid" price columns, per the naming
#     below) are expressed relative to column 0 of the same row;
#   * columns 15:30 and 45:60 (the matching sizes) become the change in size
#     against the last size recorded for that price.
# Columns from 60 onward are never written here and therefore stay 0.
normed_data = np.zeros_like(data_array, dtype=np.float32)
normed_data[:, :15] = data_array[:, :15] - np.expand_dims(data_array[:, 0], axis=1)
normed_data[:, 30:45] = data_array[:, 30:45] - np.expand_dims(data_array[:, 0], axis=1)

# Last size seen at each price, pre-seeded with 0 on a 0.5-step grid over [1500, 1800).
asksize_book = {x: 0 for x in np.arange(1500, 1800, 0.5)}
bidsize_book = {x: 0 for x in np.arange(1500, 1800, 0.5)}

# Price -> size mapping of the previous row only.
temp_asksize_book = {}
temp_bidsize_book = {}
for i in range(data_array.shape[0]):
    for j in range(15):
        # for ask
        if data_array[i, j] not in temp_asksize_book and j < 12:
            # Price absent from the previous row and within the first 12 levels:
            # keep the raw size.
            normed_data[i, j+15] = data_array[i, j+15]
        else:
            # Otherwise store the change against the last size recorded for this
            # price. `.get(..., 0)` treats a never-seen price (outside the seeded
            # grid, e.g. 0 from NaN filling) as size 0 instead of raising KeyError.
            normed_data[i, j+15] = data_array[i, j+15] - asksize_book.get(data_array[i, j], 0)
        # update ask size book
        asksize_book[data_array[i, j]] = data_array[i, j+15]

    for j in range(30, 45):
        # for bid (j < 42 is the same "first 12 levels" rule as on the ask side)
        if data_array[i, j] not in temp_bidsize_book and j < 42:
            normed_data[i, j+15] = data_array[i, j+15]
        else:
            normed_data[i, j+15] = data_array[i, j+15] - bidsize_book.get(data_array[i, j], 0)
        # update bid size book
        bidsize_book[data_array[i, j]] = data_array[i, j+15]

    # update temp book: remember this row's price -> size mapping for the next row
    temp_asksize_book = {data_array[i, k]: data_array[i, k+15] for k in range(15)}
    temp_bidsize_book = {data_array[i, k+30]: data_array[i, k+45] for k in range(15)}
        
    

In [None]:
# Zero out, in place, every entry below -1000.
# NOTE(review): the -1000 threshold is a magic number; presumably it removes
# artefacts of the normalisation above — confirm.
normed_data[normed_data < -1000] = 0

# model

### linear model

In [None]:
# Number of leading rows used for training; the remaining rows are the test set.
nb_train = 2_000_000

In [None]:
# Baseline: least squares on the 60 raw feature columns, split by row order.
x0 = data_array[:nb_train, 0:60]
train_y = data_array[:nb_train, 60]

x1 = data_array[nb_train:, 0:60]
test_y = data_array[nb_train:, 60]

# `LinearRegression(normalize=True)` was deprecated in scikit-learn 1.0 and removed
# in 1.2. Feature scaling is now an explicit pipeline step; `reg.predict` is used
# exactly as before.
reg = make_pipeline(StandardScaler(), LinearRegression()).fit(x0, train_y)

In [None]:
# Predict on both splits and report the metric for each.
# NOTE(review): `score` is not defined in any visible cell; it must already
# exist in the kernel for this cell to run.
train_pred = reg.predict(x0)
test_pred = reg.predict(x1)

for template, target, pred in (
    ('train score: {:.5f}', train_y, train_pred),
    ('test score: {:.5f}', test_y, test_pred),
):
    print(template.format(score(target, pred)))

### linear model with normalized data

In [None]:
# Reload the cached `normed_data` array, replacing whatever is in memory.
# NOTE(review): no visible cell writes 'normed_data.pickle' — confirm where the
# cache is produced. This cell also needs the `pickle` module in scope.
# pickle.load can execute arbitrary code; only open files you trust.
with open('normed_data.pickle', 'rb') as f:
    normed_data = pickle.load(f)

In [None]:
nb_train = 2000000
# train data
train_x = normed_data[:nb_train, :60]
# NOTE(review): the target is read from column 60 of `normed_data`. The in-notebook
# normalisation cell never writes that column (it stays 0), so this relies on the
# pickled array carrying y — confirm.
train_y = normed_data[:nb_train, 60]
# test data
test_x = normed_data[nb_train:, :60]
test_y = normed_data[nb_train:, 60]

# The `normalize` keyword was removed from LinearRegression in scikit-learn 1.2;
# `normalize=False` was the default behaviour, so dropping it changes nothing.
reg = LinearRegression().fit(train_x, train_y)

In [None]:
# Predict on both splits and report the metric for each.
# NOTE(review): `score` is not defined in any visible cell; it must already
# exist in the kernel for this cell to run.
train_pred = reg.predict(train_x)
test_pred = reg.predict(test_x)

for template, target, pred in (
    ('train score: {:.5f}', train_y, train_pred),
    ('test score: {:.5f}', test_y, test_pred),
):
    print(template.format(score(target, pred)))

### linear model with multi-step (lagged) normalized data

In [None]:
def tile(x, n):
    """Stack every row of a 2-D array with its ``n - 1`` predecessors.

    Output row ``k`` describes input row ``k + n - 1``. For each input column
    it holds ``n`` consecutive values: the current row first, then the value
    one row earlier, and so on back to ``n - 1`` rows earlier.

    Args:
        x: 2-D array of shape ``(rows, cols)``.
        n: Window length (number of consecutive rows combined per output row).

    Returns:
        float32 array of shape ``(rows - n + 1, cols * n)``.
    """
    n_windows = x.shape[0] - n + 1
    stacked = np.zeros((n_windows, x.shape[1], n), dtype=np.float32)
    for lag in range(n):
        # Rows shifted back by `lag`, trimmed so every lag has n_windows rows.
        first = n - 1 - lag
        stacked[:, :, lag] = x[first:first + n_windows]
    return stacked.reshape((n_windows, -1))

In [None]:
# Number of consecutive rows stacked into each sample.
n = 8
# `tile` drops the first n-1 rows of whatever it is given, so the targets are
# offset by n-1 to stay aligned with the stacked features.
# train data
train_x = tile(normed_data[:nb_train, :60], n)
train_y = data_array[n-1:nb_train, 60]
# test data
test_x = tile(normed_data[nb_train:, :60], n)
test_y = data_array[nb_train+n-1:, 60]

# The `normalize` keyword was removed from LinearRegression in scikit-learn 1.2;
# `normalize=False` was the default, so dropping it changes nothing. `tile`
# already returns 2-D arrays, so no reshape is needed before fit/predict.
reg = LinearRegression().fit(train_x, train_y)

# test
train_pred = reg.predict(train_x)
test_pred = reg.predict(test_x)

# NOTE(review): `score` is not defined in any visible cell; it must already
# exist in the kernel for this cell to run.
print('train score: {:.5f}'.format(score(train_y, train_pred)))
print('test score: {:.5f}'.format(score(test_y, test_pred)))

### linear model with pca

In [None]:
# Project the current train/test features onto 4 principal components fitted on
# the training split only, then fit a linear model on the projection.
# `random_state` is fixed because PCA may choose a randomized SVD solver for
# inputs of this size, which would otherwise vary between runs.
pca = PCA(n_components=4, random_state=0).fit(train_x)

x0 = pca.transform(train_x)
x1 = pca.transform(test_x)

reg = LinearRegression().fit(x0, train_y)

In [None]:
# Share of the total variance captured by each of the retained components.
pca.explained_variance_ratio_

In [None]:
# Predict on both PCA-projected splits and report the metric for each.
# NOTE(review): `score` is not defined in any visible cell; it must already
# exist in the kernel for this cell to run.
train_pred = reg.predict(x0)
test_pred = reg.predict(x1)

for template, target, pred in (
    ('train score: {:.5f}', train_y, train_pred),
    ('test score: {:.5f}', test_y, test_pred),
):
    print(template.format(score(target, pred)))