# Data Preparation & Feature Engineering

In [1]:
import numpy as np
import pandas as pd
import random

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Multiply
from tensorflow.keras.layers import Add
from tensorflow.keras.layers import Subtract
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Reshape
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import RepeatVector
from tensorflow.keras.layers import Lambda
from tensorflow.keras import Model
from tensorflow.keras import backend
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.optimizers import Adam

import os
import math
from collections import deque

from sklearn.preprocessing import MinMaxScaler

In [2]:
def read_data(path):
    dataset = pd.read_csv(path)
    dataset = dataset.drop(['open_interest','amount','open_interest','datetime'], axis=1)
    dataset = dataset.dropna(axis=0)
    print('data shape:', dataset.shape)
    return dataset

file_path = 'data/jd.csv'
data = read_data(file_path)

data shape: (1038090, 8)


In [6]:
data

Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,selling_price_scaled,buying_price_scaled
1,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,0.343254,0.344214
2,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,0.349206,0.373887
3,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,0.375992,0.382789
4,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,0.370040,0.380811
5,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,0.370040,0.378833
...,...,...,...,...,...,...,...,...,...,...
1048570,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,0.040675,0.040554
1048571,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,0.040675,0.040554
1048572,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,0.040675,0.040554
1048573,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,0.040675,0.040554


In [4]:
def scale_price(dataset):
    scaler=MinMaxScaler(feature_range=(0,1))
    buying_price_scaled = scaler.fit_transform(dataset.filter(['ask_price1']).values)
    selling_price_scaled=scaler.fit_transform(dataset.filter(['bid_price1']).values)
    dataset['selling_price_scaled'] = selling_price_scaled
    dataset['buying_price_scaled'] = buying_price_scaled
    print('data shape:', dataset.shape)
    return dataset

In [7]:
def VWAP(dataset, tic):
    last = dataset['last_price']
    volume = dataset['volume']
    vwap = np.zeros((len(last),1))
    for i in range(0, len(last)):
        if i < tic:
            vs = np.sum(volume[0 : i+1])
            ps=0
            for j in range(0,i+1):
                s=last.iloc[j]*volume.iloc[j]
                ps += s
            vwap[i][0] = ps/vs
        else:
            vs = np.sum(last.iloc[(i+1-tic) : i+1])
            ps=0
            for j in range(i-tic+1, i+1):
                s=last.iloc[j]*volume.iloc[j]
                ps += s
            vwap[i][0] = ps/vs
    return vwap

In [33]:
#std_day 标准差天数，std_multi 标准差倍数
def VWAP_up(dataset, std_tik, std_multi):
    return 
    

SyntaxError: unexpected EOF while parsing (<ipython-input-33-d5df632e419d>, line 4)

In [5]:
def data_processing(dataset):
#     temp= []
#     for i in range(0,len(dataset)):
#         temp.append(i)
#     dataset['i']=temp
#     dataset = dataset.set_index('i')
    dataset = scale_price(dataset)
    
    return dataset

data = data_processing(data)

data shape: (1038090, 10)


In [8]:
vwap = VWAP(data, 40)
vwap

array([[ 3800.        ],
       [ 3822.63235294],
       [ 3826.70394737],
       ...,
       [75375.61248165],
       [75383.93834498],
       [75392.23885973]])

In [9]:
# data.index.astype('object')
data.index.dtype


dtype('int64')

In [10]:
data.iloc[0]

last_price              3800.000000
highest                 3800.000000
lowest                  3800.000000
bid_price1              3797.000000
bid_volume1                2.000000
ask_price1              3800.000000
ask_volume1               16.000000
volume                    22.000000
selling_price_scaled       0.343254
buying_price_scaled        0.344214
Name: 1, dtype: float64

In [15]:
def moving_average(df, col_name, window_size):
    col = df[col_name]
    res = []
    for i in range(0, len(col)):
        if i < window_size:
            mean = np.mean(col[0:i+1])
            res.append(mean)
        else:
            mean = np.mean(col[i-window_size:i+1])
            res.append(mean)
    df[col_name+'_MA'+str(window_size)] = res
    return df
data = moving_average(data,'ask_price1', 500)
data

Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,selling_price_scaled,buying_price_scaled,ask_price1_MA500
1,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,0.343254,0.344214,3800.000000
2,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,0.349206,0.373887,3815.000000
3,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,0.375992,0.382789,3823.000000
4,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,0.370040,0.380811,3826.500000
5,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,0.370040,0.378833,3828.200000
...,...,...,...,...,...,...,...,...,...,...,...
1048570,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,0.040675,0.040554,3493.095808
1048571,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,0.040675,0.040554,3493.087824
1048572,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,0.040675,0.040554,3493.079840
1048573,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,0.040675,0.040554,3493.073852


In [22]:
def moving_max(df, col_name, window_size, block_size):
    col = df[col_name]
    res = []
    for i in range(0, len(col)):
        if i< window_size:
            if i< block_size:
                max = np.max(col[0:i+1])
                res.append(max)
            else:
                block_max = []
                for j in range(1, (window_size//block_size)+1):
                    max= np.max(col[i-(j*block_size):i-((j-1)*block_size)+1])
                    block_max.append(max)
                block_mean = np.mean(block_max)
                res.append(block_mean)
        else:
            block_max = []
            for j in range(1, (window_size//block_size)+1):
                max= np.max(col[i-(j*block_size):i-((j-1)*block_size)+1])
                block_max.append(max)
            block_mean = np.mean(block_max)
            res.append(block_mean)
    df[col_name+'_MMax'+str(window_size)+'x'+str(block_size)] = res
    return df

data = moving_max(data,'ask_price1', 500, 50)
data

Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,selling_price_scaled,buying_price_scaled,ask_price1_MA500,ask_price1_MMax500x50
1,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,0.343254,0.344214,3800.000000,3800.0
2,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,0.349206,0.373887,3815.000000,3830.0
3,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,0.375992,0.382789,3823.000000,3839.0
4,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,0.370040,0.380811,3826.500000,3839.0
5,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,0.370040,0.378833,3828.200000,3839.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1048570,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,0.040675,0.040554,3493.095808,3494.2
1048571,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,0.040675,0.040554,3493.087824,3494.2
1048572,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,0.040675,0.040554,3493.079840,3494.2
1048573,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,0.040675,0.040554,3493.073852,3494.2


In [24]:
data.to_csv('data/jd_modified2.csv', index=False)

In [45]:
data = pd.read_csv('data/jd_modified2.csv')
data = data.drop(['selling_price_scaled','buying_price_scaled'], axis=1)


In [46]:
data

Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,ask_price1_MA500,ask_price1_MMax500x50
0,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,3800.000000,3800.0
1,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,3815.000000,3830.0
2,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,3823.000000,3839.0
3,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,3826.500000,3839.0
4,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,3828.200000,3839.0
...,...,...,...,...,...,...,...,...,...,...
1038085,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,3493.095808,3494.2
1038086,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,3493.087824,3494.2
1038087,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,3493.079840,3494.2
1038088,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,3493.073852,3494.2


In [47]:

def normalize_data(df, col_name, window_size, out_name):
    scaler=MinMaxScaler(feature_range=(0,1))
    print(out_name)
    col = df[col_name]
    res = []
    for i in range(0, len(col)):
        if i < window_size:
            max = np.max(col[0:i+1])
            min = np.min(col[0:i+1])
#             std = np.std(col[0:i+1])
            dif = max-min
            if dif==0:
                dif = 1
            temp = (col[i]-min)/dif
            res.append(temp)
        else:
            max = np.max(col[i-window_size:i+1])
            min = np.min(col[i-window_size:i+1])
#             std = np.std(col[i-window_size:i+1])
            temp = (col[i]-min)/(max-min)
            res.append(temp)
        if i% 500000==0:
            print(i%500000)
    df[out_name] = res
    return df

data = normalize_data(data, 'ask_price1', 50, 'buy_scaled_50')
data = normalize_data(data, 'ask_price1', 200, 'buy_scaled_200')
data = normalize_data(data, 'ask_price1', 500, 'buy_scaled_500')
data = normalize_data(data, 'bid_price1', 50, 'sell_scaled_50')
data = normalize_data(data, 'bid_price1', 200, 'sell_scaled_200')
data = normalize_data(data, 'bid_price1', 500, 'sell_scaled_500')
data

buy_scaled_50
0




0
0
buy_scaled_200
0
0
0
buy_scaled_500
0
0
0
sell_scaled_50
0
0
0
sell_scaled_200
0
0
0
sell_scaled_500
0
0
0


Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,ask_price1_MA500,ask_price1_MMax500x50,buy_scaled_50,buy_scaled_200,buy_scaled_500,sell_scaled_50,sell_scaled_200,sell_scaled_500
0,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,3800.000000,3800.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,3815.000000,3830.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,3823.000000,3839.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,3826.500000,3839.0,0.948718,0.948718,0.948718,0.818182,0.818182,0.818182
4,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,3828.200000,3839.0,0.897436,0.897436,0.897436,0.818182,0.818182,0.818182
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038085,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,3493.095808,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455
1038086,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,3493.087824,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455
1038087,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,3493.079840,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455
1038088,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,3493.073852,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455


In [53]:
data = normalize_data(data, 'ask_price1_MA500', 500, 'buy_MA500')
data

buy_MA500
0
0
0


Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,ask_price1_MA500,ask_price1_MMax500x50,buy_scaled_50,buy_scaled_200,buy_scaled_500,sell_scaled_50,sell_scaled_200,sell_scaled_500,buy_MA500
0,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,3800.000000,3800.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0
1,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,3815.000000,3830.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0
2,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,3823.000000,3839.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.0
3,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,3826.500000,3839.0,0.948718,0.948718,0.948718,0.818182,0.818182,0.818182,1.0
4,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,3828.200000,3839.0,0.897436,0.897436,0.897436,0.818182,0.818182,0.818182,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038085,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,3493.095808,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.0
1038086,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,3493.087824,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.0
1038087,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,3493.079840,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.0
1038088,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,3493.073852,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.0


In [76]:
# np.mean(data['buy_MA500'])
count = 0
for i in range(len(b)) :
    if b[i] == 0 or b[i]==1:
        count+=1
count
# data['buy_MA500'][r]

4

In [77]:
scaler=MinMaxScaler(feature_range=(0,1))
b = scaler.fit_transform(data.filter(['ask_price1_MA500']).values)
b

array([[0.34744333],
       [0.36257197],
       [0.37064058],
       ...,
       [0.03789105],
       [0.03788501],
       [0.03787696]])

# ----------- 分割 -----------


In [9]:
data = dataset.head(500000).values
data.shape
data[0]

array([3.80000000e+03, 3.80000000e+03, 3.80000000e+03, 3.79700000e+03,
       2.00000000e+00, 3.80000000e+03, 1.60000000e+01, 3.43253968e-01,
       3.44213650e-01])

In [20]:
data_price = dataset['ask_price1'].head(500000).values
data_price.shape

(500000,)

In [11]:
l = len(data)
l

500000

In [80]:
def normalize_data_v2(df, col_name, col_name2, window_size, out_name):
    scaler=MinMaxScaler(feature_range=(0,1))
    print(out_name)
    col = df[col_name]
    col2 = df[col_name2]
    res = []
    for i in range(0, len(col)):
        if i < window_size:
            max = np.max(col[0:i+1])
            min = np.min(col[0:i+1])
#             std = np.std(col[0:i+1])
            dif = max-min
            if dif==0:
                dif = 1
            temp = (col[i]-min)/dif
            res.append(temp)
        else:
            max = np.max(col[i-window_size:i+1])
            min = np.min(col[i-window_size:i+1])
#             std = np.std(col[i-window_size:i+1])
            temp = (col2[i]-min)/(max-min)
            res.append(temp)
        if i% 500000==0:
            print(i%500000)
    df[out_name] = res
    return df


In [81]:
data = normalize_data_v2(data, 'ask_price1','ask_price1_MA500', 500, 'buy_MA500')
data

buy_MA500
0
0
0


Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,ask_price1_MA500,ask_price1_MMax500x50,buy_scaled_50,buy_scaled_200,buy_scaled_500,sell_scaled_50,sell_scaled_200,sell_scaled_500,buy_MA500
0,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,3800.000000,3800.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,3815.000000,3830.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,3823.000000,3839.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,3826.500000,3839.0,0.948718,0.948718,0.948718,0.818182,0.818182,0.818182,0.948718
4,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,3828.200000,3839.0,0.897436,0.897436,0.897436,0.818182,0.818182,0.818182,0.897436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038085,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,3493.095808,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.554164
1038086,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,3493.087824,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.553439
1038087,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,3493.079840,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552713
1038088,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,3493.073852,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552168


In [82]:
data = normalize_data_v2(data, 'ask_price1','ask_price1_MMax500x50', 500, 'buy_MA500_win')
data

buy_MA500_win
0
0
0


Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,ask_price1_MA500,ask_price1_MMax500x50,buy_scaled_50,buy_scaled_200,buy_scaled_500,sell_scaled_50,sell_scaled_200,sell_scaled_500,buy_MA500,buy_MA500_win
0,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,3800.000000,3800.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,3815.000000,3830.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,3823.000000,3839.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
3,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,3826.500000,3839.0,0.948718,0.948718,0.948718,0.818182,0.818182,0.818182,0.948718,0.948718
4,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,3828.200000,3839.0,0.897436,0.897436,0.897436,0.818182,0.818182,0.818182,0.897436,0.897436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038085,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,3493.095808,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.554164,0.654545
1038086,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,3493.087824,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.553439,0.654545
1038087,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,3493.079840,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552713,0.654545
1038088,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,3493.073852,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552168,0.654545


In [83]:
data = moving_average(data,'ask_price1', 2500)
data

Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,ask_price1_MA500,ask_price1_MMax500x50,buy_scaled_50,buy_scaled_200,buy_scaled_500,sell_scaled_50,sell_scaled_200,sell_scaled_500,buy_MA500,buy_MA500_win,ask_price1_MA2500
0,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,3800.000000,3800.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3800.000000
1,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,3815.000000,3830.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,3815.000000
2,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,3823.000000,3839.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,3823.000000
3,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,3826.500000,3839.0,0.948718,0.948718,0.948718,0.818182,0.818182,0.818182,0.948718,0.948718,3826.500000
4,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,3828.200000,3839.0,0.897436,0.897436,0.897436,0.818182,0.818182,0.818182,0.897436,0.897436,3828.200000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038085,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,3493.095808,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.554164,0.654545,3491.385846
1038086,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,3493.087824,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.553439,0.654545,3491.389844
1038087,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,3493.079840,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552713,0.654545,3491.394242
1038088,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,3493.073852,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552168,0.654545,3491.398641


In [84]:
data = normalize_data_v2(data, 'ask_price1','ask_price1_MA2500', 500, 'buy_MA2500')
data

buy_MA2500
0
0
0


Unnamed: 0,last_price,highest,lowest,bid_price1,bid_volume1,ask_price1,ask_volume1,volume,ask_price1_MA500,ask_price1_MMax500x50,buy_scaled_50,buy_scaled_200,buy_scaled_500,sell_scaled_50,sell_scaled_200,sell_scaled_500,buy_MA500,buy_MA500_win,ask_price1_MA2500,buy_MA2500
0,3800.0,3800.0,3800.0,3797.0,2,3800.0,16,22,3800.000000,3800.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3800.000000,0.000000
1,3827.0,3830.0,3796.0,3803.0,5,3830.0,8,114,3815.000000,3830.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,3815.000000,1.000000
2,3830.0,3839.0,3796.0,3830.0,6,3839.0,1,168,3823.000000,3839.0,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,3823.000000,1.000000
3,3823.0,3839.0,3796.0,3824.0,1,3837.0,2,180,3826.500000,3839.0,0.948718,0.948718,0.948718,0.818182,0.818182,0.818182,0.948718,0.948718,3826.500000,0.948718
4,3827.0,3839.0,3796.0,3824.0,1,3835.0,2,197,3828.200000,3839.0,0.897436,0.897436,0.897436,0.818182,0.818182,0.818182,0.897436,0.897436,3828.200000,0.897436
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1038085,3493.0,3502.0,3452.0,3492.0,15,3493.0,169,75536,3493.095808,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.554164,0.654545,3491.385846,0.398713
1038086,3493.0,3502.0,3452.0,3492.0,18,3493.0,167,75539,3493.087824,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.553439,0.654545,3491.389844,0.399077
1038087,3493.0,3502.0,3452.0,3492.0,20,3493.0,167,75539,3493.079840,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552713,0.654545,3491.394242,0.399477
1038088,3493.0,3502.0,3452.0,3492.0,22,3493.0,157,75549,3493.073852,3494.2,1.000000,1.000000,0.545455,1.000000,1.000000,0.545455,0.552168,0.654545,3491.398641,0.399876


In [87]:
for col in data.columns: 
    print(col)

last_price
highest
lowest
bid_price1
bid_volume1
ask_price1
ask_volume1
volume
ask_price1_MA500
ask_price1_MMax500x50
buy_scaled_50
buy_scaled_200
buy_scaled_500
sell_scaled_50
sell_scaled_200
sell_scaled_500
buy_MA500
buy_MA500_win
ask_price1_MA2500
buy_MA2500


In [89]:
data_ = data[['buy_scaled_50',
"buy_scaled_200",
"buy_scaled_500",
"sell_scaled_50",
"sell_scaled_200",
"sell_scaled_500",
"buy_MA500",
'buy_MA500_win','buy_MA2500']]


In [91]:
data_.to_csv('data/jd_scaled.csv')

In [93]:
data_.shape[1]

9

In [196]:
data_array = data_.values
sshit = data_array[0:256]
sshit.reshape(-1,256,9,1)
sshit.shape

(256, 9)

In [118]:
price_array = data['ask_price1'].values
price_array

array([3800., 3830., 3839., ..., 3493., 3493., 3493.])

In [124]:
shit = data['bid_volume1'].values
shit[10000]

30

# Training

In [116]:
window_size = 256
epochs = 1
batch_size = 32

In [15]:
# get_states(data_price, 600, window_size+1)

In [190]:
class Agent:

    """Summary of class here.

    This is the agent class that contains all the methods and attributes of the agent

    Attributes:
        state_size: an int indicating the number of state of the information form the envoriment
        action_size: an int indicating the number of actions that the
        ...

    """

    def __init__(self, state_size, action_size, train, model_name='', is_eval=False):
        self.state_size = state_size
        self.action_size = action_size      # buy, sell, sit
        self.memory = deque(maxlen=100)
        self.inventory_price = 0
        self.inventory_number = 0
        self.inventory_list = []
        self.is_eval = is_eval
        self.traini = train

        self.gamma = 0.95
        self.epsolon = 1.0
        self.epsolon_min = 0.01
        self.epsolon_decay = 0.995

#         self.model = load_model("models/" + model_name) if is_eval else self._model()
        self.model = load_model("models/" + model_name) if is_eval else self.dqn_model(self.traini)


    def _model(self):
        model = Sequential()
        model.add(Dense(units=64, input_dim=self.state_size, activation='relu'))
        model.add(Dense(units=32, activation='relu'))
        model.add(Dense(units=8, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(lr=0.001))        
        return model
    
    def dqn_model(self, state):
        
        dropout_rate = 0.1
        inputs = Input(shape=(state.shape[0], state.shape[1], 1))
        
        x = Conv2D(16, 
                   (4, state.shape[1]), 
                   input_shape = (state.shape[0], state.shape[1], 1), 
                   padding='valid', 
                   data_format='channels_last', 
                   activation = 'relu')(inputs)
        y1 = Conv2D(16, 
                   (4, state.shape[1]), 
                   input_shape = (state.shape[0], state.shape[1], 1), 
                   padding='valid', 
                   data_format='channels_last', 
                   activation = 'relu')(inputs)
        y2 = Conv2D(16, 
                   (1, 1), 
                   padding='valid', 
                   activation = 'relu')(y1)
        y3 = Conv2D(16, 
                   (1, 1), 
                   padding='valid', 
                   activation = 'relu')(y2)
        y = Conv2D(16, (1, 1), padding='valid', activation = 'relu')(y3)
        
        out = Multiply()([x,y])
        
        z1 = Reshape((state.shape[0]-1,16), input_shape=(state.shape[0]-1,16))(out)
        z2 = Conv1D(8, 3, input_shape=(state.shape[0]-1,16), activation = 'relu')(z1)
        z3 = Conv1D(4, 2, activation = 'relu')(z2)
        z4 = Dropout(dropout_rate)(z3)
        z5 = Flatten()(z4)
        z6 = Dense(3, activation='linear')(z5)
        model = Model(inputs=inputs, outputs=z6)
        
        model.compile(loss='mae', optimizer = 'nadam')
        return model
        

    def act(self, state):

        if not self.is_eval and np.random.rand()<= self.epsolon:
            return random.randrange(self.action_size)

        options = self.model.predict(state)

        return np.argmax(options[0])

    def expReplay(self, batch_size):

        mini_batch = []
        l = len(self.memory)
        for i in range(1-batch_size+1,1):
            mini_batch.append(self.memory[1])

        for state, action, reward, next_state, done in mini_batch:
            target = reward
            for i in range(len(reward)):
                print(reward)
                if not done:
                    target[i] = reward[i]+self.gamma*np.argmax(self.model.predict(next_state)[0])
                    
            target_f = self.model.predict(state)
            target_f[action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsolon> self.epsolon_min:
            self.epsolon *= self.epsolon_decay

In [191]:
agent = Agent(window_size,3,data_array[0:window_size])

In [192]:
agent.model.summary()

Model: "model_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_12 (InputLayer)           [(None, 256, 9, 1)]  0                                            
__________________________________________________________________________________________________
conv2d_44 (Conv2D)              (None, 253, 1, 16)   592         input_12[0][0]                   
__________________________________________________________________________________________________
conv2d_45 (Conv2D)              (None, 253, 1, 16)   272         conv2d_44[0][0]                  
__________________________________________________________________________________________________
conv2d_46 (Conv2D)              (None, 253, 1, 16)   272         conv2d_45[0][0]                  
____________________________________________________________________________________________

In [185]:
agent.model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 64)                16448     
_________________________________________________________________
dense_20 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_21 (Dense)             (None, 8)                 264       
_________________________________________________________________
dense_22 (Dense)             (None, 3)                 27        
Total params: 18,819
Trainable params: 18,819
Non-trainable params: 0
_________________________________________________________________


In [193]:
import time
t1 = time.time()
l = len(data_array)

for e in range(epochs):
    
    
    state = data_array[0:window_size]
    state.reshape(-1,window_size, 9,1)
    print(state.shape)
    total_profit = 0
    
    price = data['ask_price1'].values
    ask_vol = data['ask_volume1'].values
    
    bid_price = data['bid_price1'].values
    bid_vol = data['bid_volume1'].values
    
    agent.inventory_price = 0
    agent.inventory_number = 0
    agent.inventory_list = []
    
    action_buffer = 1 #to store the previous action. Start at 1 = sell
    price_buffer = 0
    
    for t in range(window_size+1, l):
        
        if t%20 == 0:
            t2 = time.time()
            print('Day:', str(t), 'time:', str(t2-t1))
            t1 = t2
        
        action = agent.act(state)
        
        next_state = data_array[t-window_size: t]
        next_state.reshape(-1,window_size, 9,1)
        
        reward = [0,0,0]
        
        #0 buy, 1 sell, 2 sit
        if action == 0 :
            if action_buffer == 1:
                action_buffer = 0
                reward[0] = -price_buffer + price[t] -0.1
                reward[1] = -0.1
                reward[2] = 0
                price_buffer = price[t]
                #buy activity
                buy_in_num = min(ask_vol[t],10)
                agent.inventory_number += ask_vol[t] #ask volume1
                agent.inventory_price += buy_in_num * price[t]
                agent.inventory_list.append([buy_in_num, price[t]])
                print (t, "buy: ", str(buy_in_num),
                       "| amount:", str(price[t]))
                
            else:
                print(t, 'wrong action')
                reward[0] = -1
                reward[1] = -price[t] + price_buffer -0.1
                reward[2] = 0
                
        elif action ==1 :
            if action_buffer == 1:
                print(t, 'wrong action')
                reward[0] = -price_buffer + price[t] -0.1
                reward[1] = -0.1
                reward[2] = 0
            else:
                action_buffer = 1
                reward[0] = -0.1
                reward[1] = -price[t] + price_buffer -0.1
                reward[2] = 0
                price_buffer = price[t]
                
                #sell activity
                bid_p = bid_price[t]
                bid_n = bid_vol[t]
                print(bid_n, agent.inventory_number)
                max_num = min(bid_n, agent.inventory_number)
                remain_num = min(bid_n, agent.inventory_number)
                
                price_in = []
            
                while remain_num > 0:
                    price_in.append(agent.inventory_list[0][0]*agent.inventory_list[0][1])
                    if remain_num >= agent.inventory_list[0][1]:
                        remain_num -=agent.inventory_list[0][1]
                        agent.inventory_list.pop(0)
                    else:
                        agent.inventory_list[0][1]-=remain_num
                        remain_num = 0
            
                agent.inventory_number-=max_num
           
                cost = np.sum(price_in)
                bid = bid_p*max_num
                profit = bid-cost
                reward = max(profit, 0)
                total_profit+=profit
            
                print (t, "sell:", str(bid_p), 
                       "| amount:", str(max_num), 
                       "| profit:", str(profit), 
                       "| total:", str(total_profit))
                
                
        else:
            if action_buffer ==1:
                reward[0] = -price_buffer + price[t] -0.1
                reward[1] = -0.1
                reward[2] = 0
            else:
                action_buffer =1
                reward[0] = -0.1
                reward[1] = -price[t] + price_buffer -0.1
                reward[2] = 0
        
        '''
        '''
        if t==l-1:
            done = True
        else:
            done = False
            
        agent.memory.append((state, action, reward, next_state, done))
        state = next_state
        
        if done:
            print('---------------------------------------')
            print("Total Profit:", str(total_profit))
            print('---------------------------------------')
        
        if len(agent.memory)>batch_size:
            agent.expReplay(batch_size)
            
        if t%500 == 0:
            print('### SAVING ###')
            agent.model.save('model/model_'+str(t))
    
    

(256, 9)
257 buy:  4 | amount: 3816.0
259 wrong action
Day: 260 time: 0.0009348392486572266
260 wrong action
261 wrong action
262 buy:  2 | amount: 3818.0
1 6
263 sell: 3820.0 | amount: 1 | profit: -11444.0 | total: -11444.0
264 buy:  1 | amount: 3820.0
265 wrong action
91 6
266 sell: 3818.0 | amount: 6 | profit: 7648.0 | total: -3796.0
267 wrong action
269 buy:  2 | amount: 3821.0
270 wrong action
271 wrong action
274 wrong action
275 wrong action
276 wrong action
278 buy:  1 | amount: 3822.0
1 3
279 sell: 3822.0 | amount: 1 | profit: -11414.0 | total: -15210.0
Day: 280 time: 0.0018749237060546875
281 wrong action
283 buy:  4 | amount: 3823.0
286 buy:  7 | amount: 3823.0
4 13
287 sell: 3821.0 | amount: 4 | profit: 52.0 | total: -15158.0
289 wrong action
[-0.1, -2.1, 0]


ValueError: Error when checking input: expected input_12 to have 4 dimensions, but got array with shape (256, 9)

In [157]:
data_price

NameError: name 'data_price' is not defined

In [21]:
data_price

array([3800., 3830., 3839., ..., 4155., 4155., 4154.])

In [129]:
data_array = dataset.values
data_array[0]

array([3.80000000e+03, 3.80000000e+03, 3.80000000e+03, 3.79700000e+03,
       2.00000000e+00, 3.80000000e+03, 1.60000000e+01, 3.43253968e-01,
       3.44213650e-01])

# Test


In [115]:
def new_get_state(data, t, n):
    d = t - n
    if d >= 0:
        block = data[d:t]
    else:
        block = -d * [data[0]] + data[0:t + 1]

        
    max = np.max(block)
    min = np.min(block)
    mean = np.mean(block)
#     print(sigmoid(max-min))
    
    temp1 = []
    new_block = []
    for i in range(len(block)):
        temp1.append(block[i])
        if i % 50 == 0:
            new_block.append(np.mean(temp1))
            temp1 = [] 
    res = []

    max_n = np.max(new_block)
    min_n = np.max(new_block)
    mean_n = np.mean(new_block)
    std_n = np.std(new_block)
    
    print(mean_n, std_n)
    
#     for i in range(len(new_block) - 1):

#         dif = new_block[i + 1] - new_block[i]
#         temp = sigmoid(dif)
#         res.append(temp)

#     for i in range(1, len(new_block)):
#     max = (max-min)/std_n
#     min = (min-min)/std_n
#     mean = (mean-min)/std_n
#     print(max,min)
    res.append(max)
    res.append(min)
    res.append(mean)
    new_block = (new_block-min)/std_n
    for b in new_block:
        res.append(b)
        
    
    '''
    res[0] = max
    res[1] = min
    res[2] = mean
    '''
    return np.array([res])

In [116]:
test_state = new_get_state(data_price, 500, window_size+1)
test_state

7620.550909090909 7.772470596506282
5.017709557824569 0.0


array([[  5.01770956,   0.        , 980.71273266, 977.81006768,
        980.10792134, 981.31217163, 981.88599175, 980.30862972,
        980.16710458, 980.4115571 , 980.35494704, 980.57366771,
        980.64057051, 981.42281856]])

In [117]:
test_state = new_get_state(data_price, 5000, window_size+1)
test_state

3815.7218181818184 1.4678234856834917
3.4064041410754173 0.0


array([[   3.40640414,    0.        , 2599.75948142, 2597.72379798,
        2598.4732069 , 2598.43233005, 2600.63967993, 2600.46254691,
        2600.13553212, 2599.56325622, 2599.27711827, 2599.41337444,
        2601.13020213, 2600.10828088]])

In [125]:
something = dataset['bid_volume1']-dataset['ask_volume1']
np.mean(dataset['ask_volume1'])

40.3257472858808

In [None]:
def

# Evaluation

# Storage

In [14]:
def get_states(data, t, n):
    d = t-n
#     print(d)
    if d>=0:
        block = data[d:t]
    else:
        block = -d*[data[0]]+data[0:t+1]
#     print (block.shape)
    
    temp1 = []
    new_block = []
    for i in range(len(block)):
        temp1.append(block[i])
        if i%50==0:
            new_block.append(np.mean(temp1))
            temp1 = []
    
#     print(len(new_block))
    
    res = []
    for i in range(11-1):
#         temp = sigmoid(block[i+1]-block[i])
#         print(new_block[i+1])
        dif = new_block[i+1]-new_block[i]
        
        temp = sigmoid(dif)
#         print(temp)
        res.append(temp)
    return np.array([res])

In [13]:
def sigmoid(x):
    return 1/(1+math.exp(-x))