In [1]:
import csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf
import os
from datetime import datetime

%matplotlib inline

# Functions

In [2]:
def import_dict(dict_loc):
    """ Loads a dictionary from a csv """
    df = pd.read_csv(dict_loc)
    df.index = df.iloc[:,0]
    df = df.drop(df.columns[0], axis = 1)
    return df.to_dict()

def convertDateToQuarter(date):
    quarter = (date.month-1)//3 + 1
    return (str(date.year) + 'Q' + str(quarter))

# Data Import

## Directories

In [3]:
# 605 and 606 csv directory
dir_605 = '../data/605/'
dir_606 = '../data/606/'

## Raw Data

In [4]:
## Import 605 data
# Find market center csvs
marketcenter_csv_list = [x for x in os.listdir(dir_605) if x[-4:] == '.csv']
# Get .csv directories
marketcenter_csv_dirs = [dir_605 + x for x in marketcenter_csv_list]
# Read .csv files
csv_df_list_605 = [pd.read_csv(file, sep = ',') for file in marketcenter_csv_dirs]
# Merge each marketcenter's data
rawdata_605 = pd.concat(csv_df_list_605)
# Clean up
del(csv_df_list_605)

In [5]:
## Import 606 Data

csv_df_list_606 = []
# Find broker folders
broker_folders = [x for x in os.listdir(dir_606) if '.' not in x]
# Merge .csv's for each broker
for broker in broker_folders:
    # Get file locations of csv's for each broker
    directory = dir_606 + broker
    broker_csv_list  = [x for x in os.listdir(directory)]
    broker_csv_dirs  = [dir_606 + broker + '/' + csv for csv in broker_csv_list]
    # Read csv's as dataframes
    csv_df_list_606_broker = [pd.read_csv(file) for file in broker_csv_dirs]
    csv_df_list_606.append(pd.concat(csv_df_list_606_broker))
    
# Merge each broker's data
rawdata_606 = pd.concat(csv_df_list_606)
# Clean up
del(csv_df_list_606)

## Dictionaries

In [6]:
symbol_dict = import_dict('../data/keys/symbols.csv')['Exchange']
mktctr_mpid_dict = import_dict('../data/keys/mpids.csv')['MPID']
ordertype_dict = {11: 'Market', 12: 'Limit'}
broker_vol_dict = import_dict('../data/keys/broker_volumes.csv')

# Data Prep

## Broker Data

In [7]:
# Import Data
data_606 = rawdata_606.copy()

# Fix Routing Venue labels
data_606['RoutingVenue'] = data_606['RoutingVenue'].apply(
    lambda x: mktctr_mpid_dict.get(x.strip(), "Unknown_" + str(x.strip())))

# Convert date to quarter
data_606['Quarter'] = data_606['Date'].apply(
    lambda x: convertDateToQuarter(datetime.strptime(str(x), '%Y%m')))
data_606['Quarter'] = pd.PeriodIndex(data_606['Quarter'], freq='Q').values
data_606 = data_606.drop('Date', axis=1)

# Change column names
data_606 = data_606.rename(
    columns={'RoutingVenue': 'MarketCenter', 'Pct': 'MktShare'})

# Add binary var for presence of rebates
data_606['Rebate_Dummy'] = (data_606['Rebate'].apply(lambda x: (x > 0))
                            | data_606['Broker'].apply(lambda x: x == 'TD_Ameritrade')).apply(lambda x: int(x))


data_606.head()

Unnamed: 0,Broker,Exchange,OrderType,MktShare,Rebate,MarketCenter,Quarter,Rebate_Dummy
0,Aurora Capital,NYSE,Total,0.419,0.0,KCG,2014Q1,0
1,Aurora Capital,NYSE,Total,0.3597,0.0,UBSS,2014Q1,0
2,Aurora Capital,NYSE,Total,0.2174,0.0,ATDF,2014Q1,0
3,Aurora Capital,NASDAQ,Total,0.4343,0.0,KCG,2014Q1,0
4,Aurora Capital,NASDAQ,Total,0.3743,0.0,ATDF,2014Q1,0


## Market Center Data

In [8]:
# Import data
data_605 = rawdata_605.copy()

# Quarter column
data_605['Quarter'] = data_605['idate'].apply(lambda x: convertDateToQuarter(datetime.strptime(str(x), '%Y%m')))
data_605['Quarter'] = pd.PeriodIndex(data_605['Quarter'], freq='Q').values
data_605 = data_605.drop('idate', axis = 1)

# Temporary Variables for Aggregation
data_605['PrImp_TotalT']     = data_605['PrImpShares']    * data_605['PrImp_AvgT']
data_605['PrImp_TotalAmt']   = data_605['PrImpShares']    * data_605['PrImp_AvgAmt']
data_605['ATQ_TotalT']       = data_605['ATQShares']      * data_605['ATQ_AvgT']
data_605['OTQ_TotalT']       = data_605['OTQShares']      * data_605['OTQ_AvgT']
data_605['AvgRealSpread_T']  = data_605['AvgRealSpread']  * data_605['ExecShares']
data_605['AvgEffecSpread_T'] = data_605['AvgEffecSpread'] * data_605['ExecShares']

data_605 = data_605.groupby(['MarketCenter', 'Quarter', 'Exchange', 'OrderCode']) \
        .sum().reset_index()

# Reconstruct original variables
data_605['PrImp_AvgT']     = data_605['PrImp_TotalT']     / data_605['PrImpShares']
data_605['PrImp_AvgAmt']   = data_605['PrImp_TotalAmt']   / data_605['PrImpShares'] 
data_605['ATQ_AvgT']       = data_605['ATQ_TotalT']       / data_605['ATQShares']
data_605['OTQ_AvgT']       = data_605['OTQ_TotalT']       / data_605['OTQShares']
data_605['AvgRealSpread']  = data_605['AvgRealSpread_T']  / data_605['ExecShares']
data_605['AvgEffecSpread'] = data_605['AvgEffecSpread_T'] / data_605['ExecShares'] 
data_605['PrImp_Pct']      = data_605['PrImpShares']      / data_605['ExecShares']
data_605['ATQ_Pct']        = data_605['ATQShares']        / data_605['ExecShares']
data_605['OTQ_Pct']        = data_605['OTQShares']        / data_605['ExecShares']

## New Vars

# Absolute
data_605['OrderType']    = data_605['OrderCode'].apply(lambda x: ordertype_dict.get(x, 'Other'))
data_605['PrImp_ExpAmt'] = data_605['PrImp_AvgAmt'] * data_605['PrImp_Pct']
data_605['All_AvgT']     = (data_605['PrImp_TotalT'] + data_605['ATQ_TotalT'] + data_605['OTQ_TotalT']) \
                            / data_605['ExecShares']
data_605 = data_605.rename(columns = {'idate': 'Date'})

# Relative values
data_605_grouped = data_605.groupby(['Exchange', 'OrderType', 'Quarter'])

data_605['MktCtrAvg_PrImp_Pct']  = data_605_grouped['PrImp_Pct'].transform("mean")
data_605['Rel_PrImp_Pct']        = data_605['PrImp_Pct'] - data_605['MktCtrAvg_PrImp_Pct']
data_605['MktCtrAvg_PrImp_AvgT'] = data_605_grouped['PrImp_AvgT'].transform("mean")
data_605['Rel_PrImp_AvgT']       = data_605['PrImp_AvgT'] - data_605['MktCtrAvg_PrImp_AvgT']    
data_605['MktCtrAvg_PrImp_ExpAmt'] = data_605_grouped['PrImp_ExpAmt'].transform("mean")
data_605['Rel_PrImp_ExpAmt']       = data_605['PrImp_ExpAmt'] - data_605['MktCtrAvg_PrImp_ExpAmt']   
data_605['MktCtrAvg_All_AvgT'] = data_605_grouped['All_AvgT'].transform("mean")
data_605['Rel_All_AvgT']       = data_605['All_AvgT'] - data_605['MktCtrAvg_All_AvgT']   

data_605.head()

Unnamed: 0,MarketCenter,Quarter,Exchange,OrderCode,CoveredOrders,CoveredShares,CancelledShares,MktCtrExecShares,AwayExecShares,ExecShares_0_9,...,PrImp_ExpAmt,All_AvgT,MktCtrAvg_PrImp_Pct,Rel_PrImp_Pct,MktCtrAvg_PrImp_AvgT,Rel_PrImp_AvgT,MktCtrAvg_PrImp_ExpAmt,Rel_PrImp_ExpAmt,MktCtrAvg_All_AvgT,Rel_All_AvgT
0,ARCA,2015Q3,NASDAQ,11,167917,72636881,10384868,40788074,21463939,62072012,...,0.019544,0.671599,0.745672,-0.306562,0.438198,0.636061,0.010542,0.009003,0.474635,0.196964
1,ARCA,2015Q3,NASDAQ,12,11963921,3132598691,428908786,1832975495,73922935,1906190605,...,0.001719,0.156047,0.417352,-0.316506,0.395793,-0.372792,0.00279,-0.001072,21.679366,-21.523319
2,ARCA,2015Q3,NASDAQ,13,2049676,429176278,245833327,139132834,12298742,151102939,...,,0.0,0.0,0.0,,,,,0.0,0.0
3,ARCA,2015Q3,NASDAQ,14,30626255,7740311867,6336374765,1310626905,28550727,685596002,...,,0.0,0.0,0.0,,,,,0.0,0.0
4,ARCA,2015Q3,NASDAQ,15,87537916,54307617429,53536207786,551030623,5705856,162391561,...,,0.0,0.0,0.0,,,,,0.0,0.0


# Merge Datasets

In [9]:
data_merged = data_605.merge(data_606)

data_merged.set_index(['Quarter'])
data_merged['Broker_Size'] = data_merged['Broker'].apply(lambda x: broker_vol_dict['Size'].get(x))


print('Total Observations: ' + str(len(data_merged)))
print('Brokers: ' + str(len(set(list(data_merged['Broker'])))))
print('Market Centers: ' + str(len(set(list(data_merged['MarketCenter'])))))

data_merged.head()

Total Observations: 8740
Brokers: 21
Market Centers: 10


Unnamed: 0,MarketCenter,Quarter,Exchange,OrderCode,CoveredOrders,CoveredShares,CancelledShares,MktCtrExecShares,AwayExecShares,ExecShares_0_9,...,Rel_PrImp_AvgT,MktCtrAvg_PrImp_ExpAmt,Rel_PrImp_ExpAmt,MktCtrAvg_All_AvgT,Rel_All_AvgT,Broker,MktShare,Rebate,Rebate_Dummy,Broker_Size
0,ARCA,2015Q3,NASDAQ,11,167917,72636881,10384868,40788074,21463939,62072012,...,0.636061,0.010542,0.009003,0.474635,0.196964,Barclays Capital,0.0,1.0,1,23958270000.0
1,ARCA,2015Q3,NASDAQ,11,167917,72636881,10384868,40788074,21463939,62072012,...,0.636061,0.010542,0.009003,0.474635,0.196964,Cowen Execution,0.0,0.0,0,50187100.0
2,ARCA,2015Q3,NASDAQ,11,167917,72636881,10384868,40788074,21463939,62072012,...,0.636061,0.010542,0.009003,0.474635,0.196964,Credit Suisse,0.0,1.0,1,32667210000.0
3,ARCA,2015Q3,NASDAQ,11,167917,72636881,10384868,40788074,21463939,62072012,...,0.636061,0.010542,0.009003,0.474635,0.196964,Deutsche,0.0151,1.0,1,20450090000.0
4,ARCA,2015Q3,NASDAQ,12,11963921,3132598691,428908786,1832975495,73922935,1906190605,...,-0.372792,0.00279,-0.001072,21.679366,-21.523319,Barclays Capital,0.1375,1.0,1,23958270000.0


# Data Export

## Panel

## First Difference

In [10]:
def tempfunc(data, broker, marketcenter, exchange, ordertype):
    
    data = data.drop('Rebate_Dummy', axis = 1)
    data['Broker'] = broker
    data['MarketCenter'] = marketcenter
    data['Exchange'] = exchange
    data['OrderType'] = ordertype
    data['Rebate_Dummy'] = data_606.query(query).iloc[0]['Rebate_Dummy']
    
    return data

def getDifferencedData(data_merged, query_command):
    
    if query_command != "":
        data_merged = data_merged.query(query_command)

    data_merged_diff = data_merged.groupby(['Broker', 'MarketCenter', 'Exchange', 'OrderType']).diff(1).dropna()
    
    return data_merged_diff

data_merged_diff_list = []
data_merged_diff_lag_list = []

brokers = list(set(list(data_merged['Broker'])))
marketcenters = list(set(list(data_merged['MarketCenter'])))
exchanges = list(set(list(data_merged['Exchange'])))
ordertypes = ['Market','Limit']

for broker in brokers:
    for marketcenter in marketcenters:
        for exchange in exchanges:
            for ordertype in ordertypes:
                query = "Broker == '" + broker + "' & OrderType == '" + ordertype \
                + "' & Exchange == '" + exchange + "' & MarketCenter == '" + marketcenter + "'"
                data_merged_diff_temp = getDifferencedData(data_merged, query)
                #data_merged_diff_lag_temp = getDifferencedData(data_merged_lag, query)
                
                if len(data_merged_diff_temp) > 0:
                    data_merged_diff_temp = tempfunc(data_merged_diff_temp, broker, marketcenter, 
                                                     exchange, ordertype)
                    #data_merged_diff_lag_temp = tempfunc(data_merged_diff_lag_temp, broker, marketcenter, 
                    #                                 exchange, ordertype)
                    
                    data_merged_diff_list.append(data_merged_diff_temp)
                    #data_merged_diff_lag_list.append(data_merged_diff_lag_temp)

## With Binaries

In [11]:
data_clean = data_merged.dropna().copy()

# {Market Center, Broker, Order Type, Exchange} dummies
temp_dict = {}

for mktctr in list(set(list(data_clean['MarketCenter'])))[:-1]:
    
    print('Processing: ' + mktctr)
    
    column_label = (mktctr + '_ind')
    data_clean[column_label] = data_clean['MarketCenter'].apply(lambda x: int(x == mktctr))
    
    # query only brokers that send to this market center
    query_txt = 'MarketCenter == "' + mktctr + '"'
    
    # all brokers except the last to prevent multicollinearity
    for brk in list(set(list(data_clean.query(query_txt)['Broker'])))[:-1]: 
        
        # Add column
        column_label = mktctr + '_' + brk.replace(' ', '_') + '_ind'
        data_clean[column_label] = data_clean[mktctr + '_ind'] & data_clean['Broker'].apply(lambda x: int(x == brk))
        
        query_txt2 = 'MarketCenter == "' + mktctr + '" & Broker == "' + brk + '"'
        
        for exc in list(set(list(data_clean.query(query_txt2)['Exchange'])))[:-1]: 
            
            # Add column
            column_label = mktctr + '_' + brk.replace(' ', '_') + '_' + exc + '_ind'
            data_clean[column_label] = data_clean[mktctr + '_ind'] & data_clean['Broker'].apply(lambda x: int(x == brk)) & \
                data_clean['Exchange'].apply(lambda x: int(x == exc))
                
            query_txt3 = 'MarketCenter == "' + mktctr + '" & Broker == "' + brk + '" & Exchange == "' + exc + '"'    
                
            for ot in list(set(list(data_clean.query(query_txt3)['OrderType'])))[:-1]:     
                
                # Add column
                column_label = mktctr + '_' + brk.replace(' ', '_') + '_' + exc + '_' + ot + '_ind'
                data_clean[column_label] = data_clean[mktctr + '_ind'] & data_clean['Broker'].apply(lambda x: int(x == brk)) & \
                    data_clean['Exchange'].apply(lambda x: int(x == exc)) & data_clean['OrderType'].apply(lambda x: int(x == ot))
        
        
print('Complete')

Processing: BNYC
Processing: UBSS
Processing: SGMA
Processing: CDRG
Processing: VRTU
Processing: G1ES
Processing: WOLV
Processing: CITI
Processing: ARCA
Complete


## Fixed Effects

In [12]:
data_merged_demeaned = data_merged.dropna() - data_merged.dropna().groupby(
    ['Broker', 'MarketCenter', 'Exchange', 'OrderType']).transform("mean")

data_merged_demeaned[['Broker', 'Exchange', 'MarketCenter', 'OrderType', 'Quarter']
                     ] = data_merged.dropna()[['Broker', 'Exchange', 'MarketCenter', 'OrderType', 'Quarter']]

data_merged_demeaned['Rebate_Dummy'] = data_merged.dropna()['Rebate_Dummy']

## To CSV

In [13]:
# Panel
data_merged.to_csv('../data/processed/regression_data_levels.csv', index=False)
# Differenced
pd.concat(data_merged_diff_list).to_csv(
    '../data/processed/regression_data_fdiffs.csv', index=False)
# With Binary Vars
data_clean.to_csv(
    '../data/processed/regression_data_levels_binaries.csv', index=False)
# Demeaned
data_merged_demeaned.to_csv(
    '../data/processed/regression_data_levels_demeaned.csv', index=False)

# Statistics

In [25]:
data_605_q = data_605[data_605.apply(lambda x: x.Quarter == pd.Period('2015Q3'), axis = 1)]
print(data_605.columns)
data_605_q.query('Exchange == "NASDAQ" & OrderCode == 11')[['MarketCenter', 'MktCtrExecShares', 'PrImp_Pct', 'PrImp_AvgAmt', 'All_AvgT']]

Index(['MarketCenter', 'Quarter', 'Exchange', 'OrderCode', 'CoveredOrders',
       'CoveredShares', 'CancelledShares', 'MktCtrExecShares',
       'AwayExecShares', 'ExecShares_0_9', 'ExecShares_10_29',
       'ExecShares_30_59', 'ExecShares_60_299', 'ExecShares_5_30',
       'AvgRealSpread', 'AvgEffecSpread', 'PrImpShares', 'PrImp_AvgAmt',
       'PrImp_AvgT', 'ATQShares', 'ATQ_AvgT', 'OTQShares', 'OTQ_AvgAmt',
       'OTQ_AvgT', 'ExecShares', 'PrImp_TotalT', 'PrImp_TotalAmt',
       'ATQ_TotalT', 'OTQ_TotalT', 'AvgRealSpread_T', 'AvgEffecSpread_T',
       'PrImp_Pct', 'ATQ_Pct', 'OTQ_Pct', 'OrderType', 'PrImp_ExpAmt',
       'All_AvgT', 'MktCtrAvg_PrImp_Pct', 'Rel_PrImp_Pct',
       'MktCtrAvg_PrImp_AvgT', 'Rel_PrImp_AvgT', 'MktCtrAvg_PrImp_ExpAmt',
       'Rel_PrImp_ExpAmt', 'MktCtrAvg_All_AvgT', 'Rel_All_AvgT'],
      dtype='object')


Unnamed: 0,MarketCenter,MktCtrExecShares,PrImp_Pct,PrImp_AvgAmt,All_AvgT
0,ARCA,40788074,0.43911,0.044509,0.671599
177,BNYC,187389877,0.624134,0.007284,1.276349
357,CDRG,1684972233,0.812235,0.010775,0.161658
942,CITI,0,,,
1661,G1ES,848647217,0.877246,0.011175,0.217041
2060,SGMA,654205209,0.846468,0.011098,0.212152
2645,UBSS,894802777,0.874837,0.012816,0.309009
