**Imports and workspace setting**

In [None]:
# Install guide for external libs: https://docs.aws.amazon.com/sagemaker/latest/dg/nbi-add-external.html

# %conda install -y -c plotly python-kaleido
%pip install -U kaleido

# restart kernel

In [1]:
import pandas as pd
import numpy as np
import io
import pickle

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from PIL import Image

from s3fs.core import S3FileSystem
s3 = S3FileSystem()

In [2]:
# Variables
# Pixel size applied via fig.update_layout(width=..., height=...) before a figure is exported.
IMAGE_WIDTH = 256
IMAGE_HEIGHT = 20
# NOTE(review): PRICE_UPPER_LIMIT is not referenced by any visible cell — confirm it is still needed.
PRICE_UPPER_LIMIT = 1

**Load data from local machine**

In [3]:
def prepare_x(data):
    """Extract the feature rows of a raw data matrix, one snapshot per output row.

    Takes the first 40 rows of the 2-D array ``data`` and transposes them, so the
    result has one row per input column. A new array is returned.
    """
    feature_rows = data[:40, :]
    return np.array(np.transpose(feature_rows))

def get_label(data):
    """Extract the label rows of a raw data matrix, one snapshot per output row.

    Takes the last 5 rows of the 2-D array ``data`` and returns them transposed,
    i.e. with one row per input column and 5 columns.
    """
    label_rows = data[-5:, :]
    return np.transpose(label_rows)

def data_classification(X, Y, T):
    """Cut a feature matrix into overlapping windows of ``T`` consecutive rows.

    Parameters
    ----------
    X : 2-D array with a ``.shape`` of (N, D)
        One row per time step.
    Y : array-like
        Targets, indexed along the first axis like the rows of ``X``.
    T : int
        Window length in rows.

    Returns
    -------
    (windows, targets)
        ``windows`` is a float array of shape (N - T + 1, T, D, 1) where
        ``windows[k]`` holds rows ``k .. k + T - 1`` of ``X``.
        ``targets`` is ``Y[T - 1:N]``, i.e. the target of the last row of
        each window.
    """
    n_rows, n_features = X.shape
    features = np.array(X)
    targets = np.array(Y)[T - 1:n_rows]

    n_windows = n_rows - T + 1
    windows = np.zeros((n_windows, T, n_features))
    for start in range(n_windows):
        windows[start] = features[start:start + T, :]

    # Append a trailing singleton axis to the window array.
    return windows[..., np.newaxis], targets

In [4]:
# please change the data_path to your local path
data_path = '../FI-2010-lob-dataset/NoAuction'

# Training file (CF_7)
dec_train = np.loadtxt(
    data_path + '/3.NoAuction_DecPre/NoAuction_DecPre_Training/Train_Dst_NoAuction_DecPre_CF_7.txt'
)

# Test files CF_7, CF_8 and CF_9, loaded in that order and stacked column-wise
dec_test1, dec_test2, dec_test3 = (
    np.loadtxt(data_path + '/3.NoAuction_DecPre/NoAuction_DecPre_Testing/Test_Dst_NoAuction_DecPre_CF_' + fold + '.txt')
    for fold in ('7', '8', '9')
)
dec_test = np.hstack([dec_test1, dec_test2, dec_test3])

# limit order book features: first 40 rows of each file, transposed to one row per snapshot
train_lob, test_lob = prepare_x(dec_train), prepare_x(dec_test)

# labels: last 5 rows of each file, transposed the same way
train_label, test_label = get_label(dec_train), get_label(dec_test)

In [5]:
# Sanity check: (number of training snapshots, 40 order-book columns)
train_lob.shape

(254750, 40)

In [37]:
# Sanity check: (number of test snapshots across the three test files, 40 order-book columns)
test_lob.shape

(139587, 40)

**Understanding Data frame**

> train_lob [firstIndex] [secondIndex]

1. The first index selects an order-book snapshot in time.
2. The second index runs from 0 to 39: four consecutive columns per price level (ask price, ask volume, bid price, bid volume), for 10 levels.

In [38]:
# Wrap the raw array in a DataFrame; columns get the default integer labels 0..39,
# which the column-selection cell below relies on.
df = pd.DataFrame(test_lob)    # test_lob for testing data, train_lob for training data
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,0.2666,0.00129,0.2654,0.00225,0.2669,0.00246,0.2653,0.01033,0.267,0.0005,...,0.2644,0.00169,0.269,0.0124,0.2641,0.00282,0.27,0.002,0.2638,0.00156
1,0.2669,0.00397,0.2656,0.00144,0.267,0.0005,0.2654,0.00225,0.2671,0.00143,...,0.2641,0.00282,0.27,0.002,0.2638,0.00156,0.2709,0.01117,0.2634,0.00167
2,0.2665,0.00229,0.2654,0.00225,0.2666,0.00307,0.2653,0.01033,0.2667,0.00307,...,0.2638,0.00156,0.2688,0.005,0.2634,0.00167,0.269,0.0124,0.263,0.0002
3,0.2669,0.00397,0.2654,0.00225,0.267,0.0005,0.2653,0.01033,0.2671,0.00143,...,0.2638,0.00156,0.27,0.002,0.2634,0.00167,0.2709,0.01117,0.263,0.0002
4,0.2665,0.00287,0.2654,0.00225,0.2666,0.00307,0.2653,0.01033,0.2669,0.00397,...,0.2638,0.00156,0.269,0.0124,0.2634,0.00167,0.2697,0.01481,0.263,0.0002


In [39]:
# Columns repeat in groups of four: ask price, ask volume, bid price, bid volume.
# The ask side is already in natural order, so it is taken as-is.
dfAskPrices = df.loc[:, list(range(0, 40, 4))]
dfAskVolumes = df.loc[:, list(range(1, 40, 4))]

# The bid side is stored in reversed natural order: select it, then flip the
# column order so it reads the same way as the ask side.
dfBidPrices = df.loc[:, list(range(2, 40, 4))].iloc[:, ::-1]
dfBidVolumes = df.loc[:, list(range(3, 40, 4))].iloc[:, ::-1]

# Bids on the left, asks on the right: one complete order-book row per snapshot
dfPrices = dfBidPrices.join(dfAskPrices, how='outer')
dfVolumnes = dfBidVolumes.join(dfAskVolumes, how='outer')

# Relabel the 20 columns 1..20
dfPrices.columns = range(1, 21)
dfVolumnes.columns = range(1, 21)

dfVolumnes.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
0,0.00156,0.00282,0.00169,0.00263,0.00425,0.00121,0.02,0.00289,0.01033,0.00225,0.00129,0.00246,0.0005,0.00143,0.0018,0.01,0.00021,0.005,0.0124,0.002
1,0.00167,0.00156,0.00282,0.00169,0.00263,0.02,0.00702,0.01033,0.00225,0.00144,0.00397,0.0005,0.00143,0.01,0.00021,0.005,0.0142,0.01481,0.002,0.01117
2,0.0002,0.00167,0.00156,0.00282,0.00169,0.00263,0.02,0.00702,0.01033,0.00225,0.00229,0.00307,0.00307,0.00704,0.0005,0.00143,0.01,0.00021,0.005,0.0124
3,0.0002,0.00167,0.00156,0.00282,0.00169,0.00263,0.02,0.00702,0.01033,0.00225,0.00397,0.0005,0.00143,0.01,0.00021,0.005,0.0124,0.01481,0.002,0.01117
4,0.0002,0.00167,0.00156,0.00282,0.00169,0.00263,0.02,0.00702,0.01033,0.00225,0.00287,0.00307,0.00397,0.0005,0.00143,0.01,0.00021,0.005,0.0124,0.01481


In [40]:
# Per-level summary statistics of the volumes (columns 1-10 are bids, 11-20 are asks)
dfVolumnes.describe()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
count,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0,139587.0
mean,0.021203,0.018281,0.016937,0.017601,0.019014,0.02275,0.025199,0.022515,0.013808,0.011497,0.013381,0.015589,0.024614,0.026867,0.023904,0.019888,0.019636,0.020113,0.022743,0.023053
std,0.0437,0.032188,0.022647,0.02014,0.019891,0.022313,0.02495,0.024297,0.016522,0.013648,0.017035,0.018731,0.026964,0.026791,0.022692,0.020565,0.022513,0.027149,0.038488,0.042069
min,4e-05,4e-05,4e-05,4e-05,4e-05,4e-05,4e-05,2e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05,1e-05
25%,0.00274,0.003,0.0035,0.00406,0.005,0.006,0.00498,0.00368,0.003,0.00249,0.00284,0.00385,0.0045,0.0056,0.006,0.005,0.00409,0.00361,0.00325,0.0032
50%,0.00941,0.0096,0.01,0.01095,0.013,0.017,0.0189,0.01405,0.00741,0.00598,0.0066,0.00839,0.01514,0.019,0.0171,0.01236,0.011,0.0106,0.01089,0.01
75%,0.021,0.02059,0.021,0.02235,0.02548,0.03258,0.0373,0.0335,0.01898,0.015455,0.0174,0.02078,0.0362,0.0403,0.03543,0.02878,0.02721,0.02648,0.0266,0.02473
max,0.38378,0.36888,0.36572,0.31006,0.31006,0.2814,0.2814,0.2814,0.26666,0.15021,0.2129,0.2281,0.26089,0.27047,0.2405,0.31,0.36897,0.37329,0.39216,0.401


In [41]:
# Spot check: the 20 prices of one snapshot, to confirm the bid/ask reordering
dfPrices.iloc[3].tolist()

[0.263,
 0.2634,
 0.2638,
 0.2641,
 0.2644,
 0.2646,
 0.265,
 0.2651,
 0.2653,
 0.2654,
 0.2669,
 0.267,
 0.2671,
 0.2677,
 0.2681,
 0.2688,
 0.269,
 0.2697,
 0.27,
 0.2709]

**Volume max value**

In [42]:
# Largest volume over all snapshots and all 20 levels of the currently loaded frame
MAX_VOLUME = dfVolumnes.to_numpy().max()
MAX_VOLUME

0.401

In [43]:
# Overrides the value computed in the previous cell with a fixed ceiling;
# MAX_VOLUME is used as the upper bound of the x-axis range when exporting images.
# 0.6001 for training set, 0.401 for test set
# NOTE(review): the line above says 0.6001 but the constant below is 0.60001 — confirm which is intended.
MAX_VOLUME = 0.60001

**Get price list, order counts, volumes from the order-book data frame**

In [44]:
def getPriceAndOrderCountLists(snapshot, isNormalized=False, isGrayScale=False):
    """Return the plotting inputs for one order-book snapshot.

    Parameters
    ----------
    snapshot : int
        Positional row index into the module-level ``dfVolumnes`` frame.
    isNormalized : bool
        Accepted for call compatibility; it currently has no effect on the result.
    isGrayScale : bool
        If True, use gray for the first 10 levels and white for the last 10;
        otherwise use 'lightslategrey' and 'crimson'.

    Returns
    -------
    tuple
        ``(prices, counts, volumes, colors, mid)`` where ``prices`` is the level
        index 0..19 (not the real price), ``counts`` is always an empty list,
        ``volumes`` is the snapshot's 20 volumes, ``colors`` has one entry per
        level and ``mid`` is always 0.0.
    """
    if isGrayScale:
        # CSS rgb() channels range over 0-255, so white is 255 (256 is out of range).
        colors = ['rgb(128, 128, 128)'] * 10 + ['rgb(255, 255, 255)'] * 10
    else:
        colors = ['lightslategrey'] * 10 + ['crimson'] * 10

    # Levels are plotted by index so every bar has the same height; to plot against
    # real prices, return dfPrices.iloc[snapshot].tolist() as the first element.
    return list(range(20)), [], dfVolumnes.iloc[snapshot].tolist(), colors, 0.0

In [45]:
# Inspect the five values returned for one snapshot, one per output line
prices, counts, volumes, colors, mid = getPriceAndOrderCountLists(23000, isNormalized=False)
print(prices, counts, volumes, colors, sep='\n')
print('mid is ' + str(mid))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]
[]
[0.0019, 0.00445, 0.02117, 0.04341, 0.02153, 0.0721, 0.06376, 0.05429, 0.005, 0.02125, 0.004, 0.01999, 0.04955, 0.0435, 0.02818, 0.01556, 0.05525, 0.02456, 0.04379, 0.03349]
['lightslategrey', 'lightslategrey', 'lightslategrey', 'lightslategrey', 'lightslategrey', 'lightslategrey', 'lightslategrey', 'lightslategrey', 'lightslategrey', 'lightslategrey', 'crimson', 'crimson', 'crimson', 'crimson', 'crimson', 'crimson', 'crimson', 'crimson', 'crimson', 'crimson']
mid is 0.0


**Visualize orderbook snapshot (different angles) - Utility functions**

In [46]:
# Remove title
# Remove black theme (template='none')
# Remove gap between bars (bargap = 0)
# Remove grid lines
# Remove axis and labels
# xaxis=dict(range=[0, 12000], showgrid=False, zeroline= False, visible= False),
# yaxis=dict(range=[30.0, 30.3], showgrid=False, zeroline= False, visible= False),

# Bar width -> constant? [TO_DO]

def makePlainFigure(fig, axisVisibility):
    """Strip a figure down to bare bars on a black background (modifies ``fig`` in place).

    Removes the title, resets the template to 'none', closes the gap between
    bars and hides grid and zero lines. When ``axisVisibility`` is False the
    axes are hidden and all four margins are set to zero as well.
    """
    def bare_axis():
        return {'showgrid': False, 'zeroline': False, 'visible': axisVisibility}

    base_layout = {
        'title': None,
        'template': 'none',
        'bargap': 0,
        'plot_bgcolor': 'rgb(0, 0, 0)',   # black plotting area
        'paper_bgcolor': 'rgb(0, 0, 0)',  # black surrounding paper
        'xaxis': bare_axis(),
        'yaxis': bare_axis(),
    }
    fig.update_layout(**base_layout)

    if axisVisibility:
        return

    # No axes to draw, so let the bars fill the whole image.
    fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})

In [47]:
def plotOrderbookAt(timeSnapshot, trueVolumes=True, isNormalized=False, isGrayScale=False):
    """Build a horizontal bar chart of one order-book snapshot.

    Parameters
    ----------
    timeSnapshot : int
        Snapshot index, forwarded to ``getPriceAndOrderCountLists``.
    trueVolumes : bool
        True plots the volumes; False plots the order counts.
    isNormalized, isGrayScale : bool
        Forwarded unchanged to ``getPriceAndOrderCountLists``.

    Returns
    -------
    plotly.graph_objects.Figure
        Dark-themed figure with a title and axis titles. Width and height are
        not set here; callers size the figure themselves.
    """
    levels, orderCounts, orderVolumes, barColors, _ = getPriceAndOrderCountLists(
        timeSnapshot, isNormalized, isGrayScale
    )
    barLengths = orderVolumes if trueVolumes else orderCounts

    bars = go.Bar(x=barLengths, y=levels, orientation='h', marker_color=barColors)
    fig = go.Figure(data=[bars])
    fig.update_layout(**{
        'title': 'Orderbook snapshot',
        'xaxis_title': "Order Volume",
        'yaxis_title': "Price levels",
        'template': 'plotly_dark',
    })
    return fig

**Visualize orderbook status - RGB**

In [48]:
# Colour rendering of snapshot 23000 with title and axes left visible
# (call makePlainFigure(figCounts, False) here to strip them).
figCounts = plotOrderbookAt(23000, trueVolumes=True, isNormalized=True, isGrayScale=False)
figCounts.update_layout(
    xaxis_title="Order Volume",
    # Levels sit at y = 0..19; pad half a bar on each side so none is clipped.
    # Add xaxis={'range': [0, MAX_VOLUME]} for a volume scale shared across snapshots.
    yaxis={'range': [-0.5, 19.5]},
)
figCounts.show()

**Normalized plot with gray scale**

In [49]:
timeSnapshot = 23000

# Grayscale rendering with title, axes and margins stripped
figCounts = plotOrderbookAt(timeSnapshot, isNormalized=True, isGrayScale=True)
makePlainFigure(figCounts, False)

# Levels sit at y = 0..19; pad half a bar on each side so none is clipped.
# Optional extras: xaxis={'range': [0, MAX_VOLUME]}, width=IMAGE_WIDTH, height=IMAGE_HEIGHT.
figCounts.update_layout(yaxis={'range': [-0.5, 19.5]})
figCounts.show()

**Animate raw orderbook data - Order count, Volume, Normalized volume**

In [30]:
def animateOrderbook(startIndex, trueVolumes=True, isNormalized=False, isGrayScale=False, numFrames=100):
    """Build an animated bar chart that steps through consecutive snapshots.

    Parameters
    ----------
    startIndex : int
        Snapshot shown initially and used for the first frame.
    trueVolumes : bool
        True plots the volumes; False plots the order counts.
    isNormalized, isGrayScale : bool
        Forwarded unchanged to ``getPriceAndOrderCountLists``.
    numFrames : int
        Number of consecutive snapshots to animate, starting at ``startIndex``
        (default 100, the previously hard-coded value).

    Returns
    -------
    plotly.graph_objects.Figure
        Figure with a "Play" button and one frame per snapshot.
    """
    def barAt(snapshot):
        # Fetch the snapshot once and reuse it for y, x and colours.
        prices, counts, volumes, colors, _ = getPriceAndOrderCountLists(
            snapshot, isNormalized, isGrayScale
        )
        return go.Bar(y=prices,
                      x=volumes if trueVolumes else counts,
                      orientation='h',
                      marker_color=colors)

    playButton = dict(label="Play", method="animate", args=[None])
    layout = go.Layout(
        title="Volume of buy, sell price levels of an orderbook",
        xaxis_title="Volume",
        yaxis_title="Price levels",
        template='plotly_dark',
        hovermode="closest",
        updatemenus=[dict(type="buttons",
                          showactive=True,
                          x=0.01,
                          xanchor="left",
                          y=1.15,
                          yanchor="top",
                          font={"color": 'blue'},
                          buttons=[playButton])],
    )
    frames = [go.Frame(data=[barAt(k)])
              for k in range(startIndex, startIndex + numFrames)]

    return go.Figure(data=[barAt(startIndex)], layout=layout, frames=frames)

In [31]:
# Grayscale animation starting at snapshot 23000, stripped of title, axes and margins
fig = animateOrderbook(23000, trueVolumes=True, isNormalized=True, isGrayScale=True)
makePlainFigure(fig, False)

# Levels sit at y = 0..19; pad half a bar on each side so none is clipped.
# Add xaxis={'range': [0, MAX_VOLUME]} to keep the volume scale fixed between frames.
fig.update_layout(yaxis={'range': [-0.5, 19.5]})

fig.show()

**Generating byte images (byte array)**    
https://plotly.com/python/static-image-export/

In [50]:
def plotly_fig2array_gray(fig):
    """Render a Plotly figure to PNG and return it as a single-channel PIL image.

    Despite the name, the return value is a ``PIL.Image.Image`` in mode 'L'
    (8-bit grayscale), not a numpy array; wrap it in ``np.asarray`` to get one.
    """
    png_stream = io.BytesIO(fig.to_image(format="png"))
    return Image.open(png_stream).convert('L')

In [51]:
# Export-sized grayscale rendering of snapshot 34000
figCounts = plotOrderbookAt(34000, isNormalized=True, isGrayScale=True)
makePlainFigure(figCounts, False)

# Add xaxis={'range': [0, MAX_VOLUME]} for a volume scale shared across snapshots.
figCounts.update_layout(
    yaxis={'range': [-0.5, 19.5]},
    width=IMAGE_WIDTH,
    height=IMAGE_HEIGHT,
)
figCounts.show()

# Rasterise the figure and display the pixel values
img = plotly_fig2array_gray(figCounts)
np.asarray(img)

array([[255, 255, 255, ...,   0,   0,   0],
       [255, 255, 255, ...,   0,   0,   0],
       [255, 255, 255, ...,   0,   0,   0],
       ...,
       [128, 128, 128, ...,   0,   0,   0],
       [128, 128, 128, ...,   0,   0,   0],
       [128, 128, 128, ...,   0,   0,   0]], dtype=uint8)

In [52]:
# Visual check of the rasterised grayscale image produced in the previous cell
px.imshow(np.asarray(img))

**Save images to local machine**

In [53]:
def saveFigureToLocal(location, figure):
    """Render ``figure`` as a grayscale PNG and write it to the local path ``location``.

    The parent directory of ``location`` is created if it does not exist.
    The image is encoded in memory first, so a rendering or encoding failure
    does not leave a partial file behind.
    """
    import os

    img = plotly_fig2array_gray(figure)

    buf = io.BytesIO()
    img.save(buf, format='PNG')

    parent = os.path.dirname(location)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The with-block closes the file; no explicit close() is needed.
    with open(location, 'wb') as f:
        f.write(buf.getvalue())

In [54]:
# Output folder; switch to the commented line when the training frame is loaded.
# Kept in its own variable so the input `data_path` is not overwritten.
image_dir = '../FI-2010-lob-dataset/NoAuction/3.NoAuction_DecPre/NoAuction_DecPre_Testing/Test_Dst_NoAuction_DecPre_CF_7_8_9_FreeScale'
# image_dir = '../FI-2010-lob-dataset/NoAuction/3.NoAuction_DecPre/NoAuction_DecPre_Training/Train_Dst_NoAuction_DecPre_CF_7_FreeScale'

# Export every snapshot of the currently loaded frame, whichever data set it holds.
for timeSnapshot in range(len(dfVolumnes)):
    fig = plotOrderbookAt(timeSnapshot=timeSnapshot, isNormalized=True, isGrayScale=True)

    makePlainFigure(fig, axisVisibility=False)
    # "FreeScale": no fixed x-axis range, so each image is scaled to its own snapshot.
    fig.update_layout(
        yaxis=dict(range=[-0.5, 19.5]),
        height=IMAGE_HEIGHT,
        width=IMAGE_WIDTH,
    )

    saveFigureToLocal('{}/lob_snapshot_{}.PNG'.format(image_dir, timeSnapshot), fig)

**Saving images to S3**

In [120]:
def saveFigureToS3(name, figure, bucket='update-bucket-name-here'):
    """Render ``figure`` as a grayscale PNG and upload it to S3.

    Parameters
    ----------
    name : str
        Object name; it is stored under the 'Prageeth/' prefix.
    figure : plotly figure
        Figure to render.
    bucket : str
        Target bucket. The default is a placeholder that must be replaced.
    """
    key = 'Prageeth/' + name

    img = plotly_fig2array_gray(figure)

    buf = io.BytesIO()
    img.save(buf, format='PNG')

    # The with-block closes the file; no explicit close() is needed.
    with s3.open('{}/{}'.format(bucket, key), 'wb') as f:
        f.write(buf.getvalue())

In [121]:
# Upload the first five snapshots as fixed-scale images (x-axis capped at MAX_VOLUME).
for timeSnapshot in range(0, 5):
    fig = plotOrderbookAt(timeSnapshot=timeSnapshot, isNormalized=True, isGrayScale=True)

    makePlainFigure(fig, axisVisibility=False)
    fig.update_layout(
        yaxis=dict(range=[-0.5, 19.5]),
        xaxis=dict(range=[0, MAX_VOLUME]),
        height=IMAGE_HEIGHT,
        # Set the width too, so the uploaded images match the size used by the other export cells.
        width=IMAGE_WIDTH,
    )

    saveFigureToS3('lob_images/FI2010_DecPre/images/train_cf7/lob_snapshot_{}.PNG'.format(timeSnapshot), fig)

**Saving labels to S3**

In [66]:
# Cast the labels to int and shift them down by one
# (presumably from classes 1..3 to 0..2 — verify against the dataset documentation).
# One row per snapshot; the 5 columns are the last 5 rows of the raw file (see get_label).
labels = test_label.astype(int) - 1
labels

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [2, 2, 1, 1, 1],
       ...,
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

In [67]:
def saveLabelsToS3(npyArray, name):
    """Pickle ``npyArray`` and upload it to S3 under the key ``name``.

    The bucket name is a placeholder that must be replaced before use.
    """
    bucket = 'update-bucket-name-here'
    payload = pickle.dumps(npyArray)
    target = '{}/{}'.format(bucket, name)
    with s3.open(target, 'wb') as handle:
        handle.write(payload)

In [68]:
# Upload the labels built above; use the commented line instead when `labels` comes from train_label.
# saveLabelsToS3(labels, 'Prageeth/lob_images/FI2010_DecPre/labels/train_cf7/price_movement_labels.pkl')
saveLabelsToS3(labels, 'Prageeth/lob_images/FI2010_DecPre/labels/test_cf789/price_movement_labels.pkl')

In [69]:
def readLabelsFromS3(name):
    """Download and load a label array stored on S3 under the key ``name``.

    The bucket name is a placeholder that must be replaced before use.

    WARNING: the object is loaded with ``allow_pickle=True``, which can execute
    arbitrary code while unpickling. Only read objects that this notebook (or
    another trusted source) wrote.
    """
    bucket = 'update-bucket-name-here'
    # Open inside a with-block so the S3 file handle is closed after loading.
    with s3.open('{}/{}'.format(bucket, name), 'rb') as f:
        return np.load(f, allow_pickle=True)

In [70]:
# Read the uploaded labels back (this rebinds `labels`); the commented line reads the training labels instead.
# labels = readLabelsFromS3('Prageeth/lob_images/FI2010_DecPre/labels/train_cf7/' + 'price_movement_labels.pkl')
labels = readLabelsFromS3('Prageeth/lob_images/FI2010_DecPre/labels/test_cf789/' + 'price_movement_labels.pkl')

In [71]:
# Round-trip check: this should display the same array as before the upload
labels

array([[1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [2, 2, 1, 1, 1],
       ...,
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])