In [27]:
import pandas as pd
import numpy as np
def llprint(message):
    sys.stdout.write(message)
    sys.stdout.flush()
    
    
def t_s(time):
    t = time.split(":")
    return float(t[0])*3600+float(t[1])*60+float(t[2])

In [30]:
def preProcessData(Quote_dir,Trade_dir):
    df_quote = pd.read_csv(Quote_dir)
    df_trade = pd.read_csv(Trade_dir)
    
    print('Start preprocess!')
    quote_name = {'TIME_M':0, 'EX':1, 'BID':2, 'BIDSIZ':3, 'ASK':4, 'ASKSIZ':5}
    trade_name = {'TIME_M':0, 'SIZE':1, 'PRICE':2}
    quote = df_quote[['TIME_M', 'EX', 'BID', 'BIDSIZ', 'ASK', 'ASKSIZ']].values
    trade = df_trade[['TIME_M', 'SIZE', 'PRICE']].values
    
    ## Timestamp processing
    vt_s = np.vectorize(t_s)
    quote[:, quote_name['TIME_M']] = vt_s(quote[:, quote_name['TIME_M']])
    trade[:, trade_name['TIME_M']] = vt_s(trade[:, trade_name['TIME_M']])
    
    ## Given start and end time, cut the trade and quote data
    def time_selection(data):
        start_time = t_s("09:30:00")
        end_time = t_s("16:00:00")
        time_line = data[:,0]
        for i in range(len(time_line)):
            if time_line[i] >= start_time:
                begin = i
                break
        for i in range(len(time_line)-1, 0, -1):
            if time_line[i] < end_time:
                end = i
                break
        return data[begin:end+1,:]
    
    df_arr = time_selection(quote)
    df_trade_arr = time_selection(trade)
    print('Data reading complete!')
    
    ## 
    market_depth5 = []
    message = []

    bids = {}
    asks = {}

    # all exchanges
    exs = np.unique(df['EX'])
    for ex in exs:
        bids[ex] = {}
        asks[ex] = {}

    index_trade = 0
    n_quote = len(df_arr)
    n_trade = len(df_trade_arr)
    
    for i in range(n_quote-1):
#     for i in range(1000):
        llprint("\rLooping trade ... %d/%d" % (index_trade, n_trade))

        slice_df = df_arr[i]

        # store elements
        time = slice_df[1]
        ex = slice_df[2]
        bid = float(slice_df[3])
        bid_size = int(slice_df[4])
        ask = float(slice_df[5])
        ask_size = int(slice_df[6])

        # refresh bid and ask
        if bid>0:
            bids[ex][bid] = bid_size
        if ask>0:
            asks[ex][ask] = ask_size

    #     #refresh all exchange bids and asks
    #     for ex_t in exs:
    #         bids[ex_t] = {key:val for key, val in bids[ex_t].items() if key <= bid}
    #         asks[ex_t] = {key:val for key, val in asks[ex_t].items() if key >= ask}

        # refresh one exchange bid & ask: due to the seperate of exchange
        bids[ex] = {key:val for key, val in bids[ex].items() if key <= bid}
        asks[ex] = {key:val for key, val in asks[ex].items() if key >= ask}

        # refresh combine bids and asks: combine bid & ask from all exchange together
        # if there is no trade data, stop
        if(index_trade>=n_trade):
            break
            
        # if initial trade time less than quote time, should append np.nan
        while(df_trade_arr[index_trade][1]<=time):
            market_depth5.append([np.nan]*10)
            index_trade+=1
        
        # store the trade time
        trade_time = df_trade_arr[index_trade][1]    
        
        
        # if the quote happen just before the trade, store the quote
        if (time<trade_time and df_arr[i+1][1]>=trade_time):
            index_trade += 1
            order_type = np.random.choice([1,4],1)[0]
            direction = np.random.choice([1,-1],1)[0]

            # update when trade comes in by exchange
            bids_depth = {}
            asks_depth = {}
            for ex_t in exs:
                for key,val in bids[ex_t].items():
                    if (key not in bids_depth):
                        bids_depth[key] = val
                    else:
                        bids_depth[key] += val
                for key,val in asks[ex_t].items():
                    if (key not in asks_depth):
                        asks_depth[key] = val
                    else:
                        asks_depth[key] += val
            # sort bids and ask
            bids_depth = {key:bids_depth[key] for key in np.array(sorted(bids_depth.keys(),reverse=True))}
            asks_depth = {key:asks_depth[key] for key in np.array(sorted(asks_depth.keys()))}

            # solve bid & ask intercept
            bids_t = list(bids_depth.items())
            asks_t = list(asks_depth.items())

            # if trade price intercept with bid or ask, make trade
            while(len(bids_t)>0 and bids_t[0][0]> df_trade_arr[index_trade-1][7]):
                order_type = 4 # execution
                direction = -1 #bid greater than trade: sell
                bids_t = bids_t[1:]
    
            while(len(asks_t)>0 and asks_t[0][0]< df_trade_arr[index_trade-1][7]):
                order_type = 4
                direction = 1 #ask less than trade price,
                asks_t = asks_t[1:]

            # if bid & ask price intercept, make trade 
            while(len(bids_t)>0 and len(asks_t)>0 and bids_t[0][0]>=asks_t[0][0]):
                if(bids_t[0][1]>asks_t[0][1]):
                    bids_t[0] = (bids_t[0][0], bids_t[0][1] - asks_t[0][1])
                    asks_t = asks_t[1:]
                    direction = -1 # bid greater than ask
                elif(bids_t[0][1]<asks_t[0][1]):
                    asks_t[0] = (asks_t[0][0], asks_t[0][1] - bids_t[0][1])
                    bids_t = bids_t[1:]
                    direction = 1 # ask less than bid
                else:
                    bids_t = bids_t[1:]
                    asks_t = asks_t[1:]
            # transfer all to dict
            bids_depth = dict(bids_t)
            asks_depth = dict(asks_t)
            # can get level 5 bids and asks：the structure are bid1 vol1 ask1 vol1 bid2 vol2...
            bids_tmp = list(np.array(list(bids_depth.items())).ravel())
            asks_tmp = list(np.array(list(asks_depth.items())).ravel())
            level = []
            
            # rearange
            for j in range(5):
                # chaoyin modify
                if len(asks_tmp)>=2*j+2:
                    level += asks_tmp[2*j:2*j+2]
                else:
                    level += [np.nan,np.nan]
                if len(bids_tmp)>=2*j+2:
                    level += bids_tmp[2*j:2*j+2]
                else:
                    level += [np.nan,np.nan]
            #check the trade price is between bids and ask
    #         print(level)
    #         print("trade price:", df_trade_arr[index_trade-1][7])
            market_depth5.append(level)
        
            trade_slice = df_trade_arr[index_trade-1]
            message.append([trade_slice[1].round(8),order_type,trade_slice[9],trade_slice[6],trade_slice[7],direction])
            # if more than one trade between two quotes
            while(index_trade<n_trade and df_arr[i+1][1]>=df_trade_arr[index_trade][1]):
                index_trade += 1
                #check the trade price is between bids and ask
    #             print(level)
    #             print("trade price:", df_trade_arr[index_trade-1][7])
                market_depth5.append(level)
                trade_slice = df_trade_arr[index_trade-1]
                message.append([trade_slice[1].round(8),order_type,trade_slice[9],trade_slice[6],trade_slice[7],direction])
            
            
    return market_depth5, message

In [31]:
quote, trade = preProcessData('../Data/Quote.csv', '../Data/Trade.csv')

Start preprocess!
Data reading complete!


NameError: name 'df' is not defined

In [23]:
trade

Unnamed: 0,DATE,TIME_M,EX,SYM_ROOT,SYM_SUFFIX,TR_SCOND,SIZE,PRICE,TR_CORR,TR_SEQNUM,TR_ID,TR_SOURCE,TR_RF
0,20161108,4:00:00.018807965,P,AAPL,,@ T,100,110.26,0,1032,1,N,
1,20161108,4:09:43.466642142,Q,AAPL,,@ TI,7,110.34,0,1037,1,N,
2,20161108,4:12:17.128231245,P,AAPL,,@FT,100,110.20,0,1038,2,N,
3,20161108,4:12:24.702567832,P,AAPL,,@FT,406,110.29,0,1039,3,N,
4,20161108,4:12:40.223930123,P,AAPL,,@ TI,49,110.29,0,1040,4,N,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1323520,20161109,18:46:40.175106572,D,MSFT,,@ T,100,60.06,0,3081979,38066,N,Q
1323521,20161109,18:48:26.180468568,Q,MSFT,,@ TI,10,60.15,0,3081983,76792,N,
1323522,20161109,19:17:19.938645001,Q,MSFT,,@FT,200,60.15,0,3082149,76793,N,
1323523,20161109,19:20:54.476011922,P,MSFT,,@ TI,10,60.11,0,3082154,40409,N,


In [8]:
quote = quote[['TIME_M', 'EX', 'BID', 'BIDSIZ', 'ASK', 'ASKSIZ']].values

In [17]:
quote[:,0]

array(['4:00:00.009695203', '4:00:00.010298339', '4:00:00.050014967', ...,
       '20:00:00.073421765', '20:00:00.073460480', '20:00:00.073525224'],
      dtype=object)

In [24]:
vt_s = np.vectorize(t_s)
quote[:,0] = vt_s(quote[:,0])