In [90]:
import csv
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.formula.api as smf
import os
from datetime import datetime

%matplotlib inline

# Functions

In [91]:
def import_dict(dict_loc):
    """Load a csv as a nested dictionary.

    The first column of the csv becomes the lookup key; every remaining
    column becomes one entry of the returned dictionary, shaped as
    ``{column_name: {first_column_value: cell_value}}``.
    """
    table = pd.read_csv(dict_loc)
    key_column = table.columns[0]
    # set_index moves the key column out of the data and into the index
    return table.set_index(key_column).to_dict()

def convertDateToQuarter(date):
    """Return the calendar quarter of `date` as a string such as '2015Q1'.

    `date` only needs `.year` and `.month` attributes.
    """
    # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4
    quarter_number = (date.month + 2) // 3
    return '{}Q{}'.format(date.year, quarter_number)

# Data Import

## Directories

In [92]:
# Folders that hold the raw 605 and 606 csv files (paths are relative to the notebook)
dir_605, dir_606 = '../data/605/', '../data/606/'

## Raw Data

In [93]:
## Import 605 data
# Every '.csv' file directly inside `dir_605` is read and stacked into `rawdata_605`.

# Find market center csvs.
# os.listdir returns entries in arbitrary order, so sort to make the row order
# of the concatenated frame reproducible across machines and runs.
marketcenter_csv_list = sorted(x for x in os.listdir(dir_605) if x.endswith('.csv'))
# Get .csv directories
marketcenter_csv_dirs = [os.path.join(dir_605, x) for x in marketcenter_csv_list]
# Read .csv files
csv_df_list_605 = [pd.read_csv(file, sep = ',') for file in marketcenter_csv_dirs]
# Merge each marketcenter's data (pd.concat raises ValueError if no csv was found)
rawdata_605 = pd.concat(csv_df_list_605)
# Clean up: the per-file frames are no longer needed once concatenated
del(csv_df_list_605)

In [94]:
## Import 606 Data
# Each sub-folder of `dir_606` is treated as one broker; all files inside a
# broker folder are read and stacked, then all brokers are stacked into `rawdata_606`.

csv_df_list_606 = []
# Find broker folders: entries without a '.' in their name that really are directories.
# Sorted because os.listdir returns entries in arbitrary order.
broker_folders = sorted(
    x for x in os.listdir(dir_606)
    if '.' not in x and os.path.isdir(os.path.join(dir_606, x))
)
# Merge the files for each broker
for broker in broker_folders:
    # Get file locations of csv's for each broker
    directory = os.path.join(dir_606, broker)
    # Hidden entries (e.g. .DS_Store, .ipynb_checkpoints) and sub-folders are
    # skipped: pd.read_csv cannot parse them.
    broker_csv_dirs = [
        os.path.join(directory, file_name)
        for file_name in sorted(os.listdir(directory))
        if not file_name.startswith('.')
        and os.path.isfile(os.path.join(directory, file_name))
    ]
    if not broker_csv_dirs:
        # pd.concat raises ValueError on an empty list; an empty broker
        # folder simply contributes no rows.
        continue
    # Read csv's as dataframes and stack them into one frame per broker
    csv_df_list_606_broker = [pd.read_csv(file) for file in broker_csv_dirs]
    csv_df_list_606.append(pd.concat(csv_df_list_606_broker))

# Merge each broker's data (pd.concat raises ValueError if nothing was read)
rawdata_606 = pd.concat(csv_df_list_606)
# Clean up: the per-broker frames are no longer needed once concatenated
del(csv_df_list_606)

## Dictionaries

In [95]:
# Lookup tables read from the key files. Each one is a plain dict keyed by the
# first column of its csv; the trailing ['...'] picks a single value column.
symbol_dict      = import_dict('../data/keys/symbols.csv')['Exchange']
mktctr_mpid_dict = import_dict('../data/keys/mpids.csv')['MPID']
# Numeric order code -> order type label
ordertype_dict = {
    11: 'Market',
    12: 'Limit',
}
# Kept whole: one inner dict per value column of broker_volumes.csv
broker_vol_dict  = import_dict('../data/keys/broker_volumes.csv')

# Data Prep

## Broker Data

### Prepare Raw Data

In [96]:
# Build `data_606` from the raw 606 data: relabel routing venues, convert
# dates to quarters, aggregate market share and flag rebates.

# Import Data (work on a copy so rawdata_606 stays untouched)
data_606 = rawdata_606.copy()

# Fix Routing Venue labels: translate the stripped venue name through
# mktctr_mpid_dict; names missing from the dict are tagged "(Unknown) <name>".
data_606['RoutingVenue'] = data_606['RoutingVenue'].apply(
    lambda x: mktctr_mpid_dict.get(x.strip(), "(Unknown) " + str(x.strip())))

# Drop unknown routing venues.
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
is_known_venue = data_606['RoutingVenue'].apply(lambda x: not x.startswith('(Unk'))
data_606 = data_606[is_known_venue].copy()

# Convert date (parsed as YYYYMM) to a quarterly pandas Period
data_606['Quarter'] = data_606['Date'].apply(
    lambda x: convertDateToQuarter(datetime.strptime(str(x), '%Y%m')))
data_606['Quarter'] = pd.PeriodIndex(data_606['Quarter'], freq='Q').values
data_606 = data_606.drop('Date', axis=1)

# Change column names
data_606 = data_606.rename(
    columns={'RoutingVenue': 'MarketCenter', 'Pct': 'MktShare'})

# Merge known marketcenters of same firm: rows that now share all six keys are summed.
# NOTE(review): groupby drops rows whose key is NaN by default, so rows with a
# missing Rebate would vanish here -- confirm Rebate is always populated.
data_606 = data_606.groupby(['Broker', 'Exchange', 'OrderType', 'Quarter', 'Rebate', 'MarketCenter']).sum().reset_index()

# Add binary var for presence of rebates: 1 when Rebate > 0, and always 1 for
# the broker 'TD Ameritrade' regardless of its Rebate value.
# NOTE(review): the reason for the TD Ameritrade override is not visible here.
data_606['Rebate_Dummy'] = (
    (data_606['Rebate'] > 0) | (data_606['Broker'] == 'TD Ameritrade')
).astype(int)

# Filter 606 data to market centers with data available in the 605 data.
# .copy() again, because the next cell adds a column to this frame.
mktctrs_available = rawdata_605['MarketCenter'].unique()
data_606 = data_606[data_606['MarketCenter'].isin(mktctrs_available)].copy()

### Fill in missing 0's

In [97]:
# Fill in explicit zero-share rows: when a broker / market center / exchange /
# order type combination appears in one of the two preceding quarters but not
# in the current quarter, add a row for the current quarter with MktShare = 0.

# Columns that together identify one observation series
obs_key_cols = ['Broker', 'MarketCenter', 'Exchange', 'OrderType']

data_606['Obs_id'] = data_606['Broker'] + '-' + data_606['MarketCenter'] + '-' + data_606['Exchange'] + '-' + data_606['OrderType']

dates_set = pd.Series(list(data_606['Quarter'].unique())).sort_values()

# Rebate_Dummy taken from each broker's first row.
# NOTE(review): this assumes the flag does not vary within a broker -- confirm.
rebate_dummy_dict = (data_606.drop_duplicates(subset='Broker')
                     .set_index('Broker')['Rebate_Dummy'].to_dict())


def _obs_keys(mask):
    """Return the set of key tuples (in `obs_key_cols` order) among the rows selected by `mask`."""
    return set(data_606.loc[mask, obs_key_cols].itertuples(index=False, name=None))


# Rows are collected in a list and concatenated once at the end; growing the
# frame row by row is quadratic and DataFrame.append no longer exists in pandas 2.
fill_rows = []

# Every quarter is visited; the earliest one has no preceding rows and adds nothing.
for quarter in dates_set:

    print(quarter, end = ' ')
    mask_1 = (data_606['Quarter'] <  quarter) & (data_606['Quarter'] >= (quarter - 2)) # the two preceding quarters
    mask_2 = (data_606['Quarter'] == quarter)

    # Keys seen recently but absent this quarter. Key tuples are compared
    # instead of splitting Obs_id on '-', which would mis-assign the columns
    # for any name that itself contains a hyphen. Sorted so the row order does
    # not depend on set iteration order.
    missing_keys = sorted(_obs_keys(mask_1) - _obs_keys(mask_2))

    # add missing id's
    print('(%d)' % len(missing_keys), end = ', ')
    for broker, market_center, exchange, order_type in missing_keys:
        fill_rows.append({'Broker': broker, 'MarketCenter': market_center, 'Exchange': exchange,
                          'OrderType': order_type, 'Quarter': quarter,
                          'Obs_id': '-'.join((broker, market_center, exchange, order_type)),
                          'Rebate': np.nan,  # not known for a filled-in row
                          'Rebate_Dummy': rebate_dummy_dict.get(broker, np.nan), 'MktShare': 0.0})

if fill_rows:
    # Same columns, same order as data_606; anything not set above becomes NaN
    fill_df = pd.DataFrame(fill_rows, columns=data_606.columns)
    data_606_new = pd.concat([data_606, fill_df], ignore_index=True)
    # Keep Quarter in its original dtype so it still lines up as a merge key
    quarter_dtype = data_606['Quarter'].dtype
    if data_606_new['Quarter'].dtype != quarter_dtype:
        data_606_new['Quarter'] = data_606_new['Quarter'].astype(quarter_dtype)
else:
    data_606_new = data_606.copy()

data_606 = data_606_new.copy()
data_606.head()

2010Q1 (0), 2010Q2 (4), 2010Q3 (0), 2010Q4 (4), 2011Q1 (4), 2011Q2 (4), 2011Q3 (4), 2011Q4 (4), 2012Q1 (4), 2012Q2 (12), 2012Q3 (8), 2012Q4 (4), 2013Q1 (0), 2013Q2 (0), 2013Q3 (0), 2013Q4 (4), 2014Q1 (4), 2014Q2 (32), 2014Q3 (48), 2014Q4 (24), 2015Q1 (76), 2015Q2 (148), 2015Q3 (92), 2015Q4 (72), 2016Q1 (140), 2016Q2 (144), 2016Q3 (488), 2016Q4 (336), 2017Q1 (64), 2017Q2 (184), 2017Q3 (152), 2017Q4 (684), 

Unnamed: 0,Broker,Exchange,OrderType,Quarter,Rebate,MarketCenter,MktShare,Rebate_Dummy,Obs_id
0,AXA,NASDAQ,Limit,2014Q1,0.0,CDRG,0.1997,0,AXA-CDRG-NASDAQ-Limit
1,AXA,NASDAQ,Limit,2014Q1,0.0,FBCO,0.1205,0,AXA-FBCO-NASDAQ-Limit
2,AXA,NASDAQ,Limit,2014Q1,0.0,G1ES,0.0293,0,AXA-G1ES-NASDAQ-Limit
3,AXA,NASDAQ,Limit,2014Q2,0.0,CDRG,0.1906,0,AXA-CDRG-NASDAQ-Limit
4,AXA,NASDAQ,Limit,2014Q2,0.0,FBCO,0.1436,0,AXA-FBCO-NASDAQ-Limit


## Market Center Data

In [98]:
# Build `data_605` from the raw 605 data: aggregate to one row per
# MarketCenter / Quarter / Exchange / OrderCode, rebuild the share-weighted
# averages, and add values relative to the cross-market-center mean.

# Import data (work on a copy so rawdata_605 stays untouched)
data_605 = rawdata_605.copy()

# Quarter column: idate is parsed as YYYYMM and mapped to a quarterly Period
data_605['Quarter'] = data_605['idate'].apply(lambda x: convertDateToQuarter(datetime.strptime(str(int(x)), '%Y%m')))
data_605['Quarter'] = pd.PeriodIndex(data_605['Quarter'], freq='Q').values
data_605 = data_605.drop('idate', axis = 1)

# (total column, weight column, average column).
# An average cannot be summed across rows, so each one is turned into a total
# (average * weight), summed by the groupby, then divided by the summed weight.
weighted_cols = [
    ('PrImp_TotalT',     'PrImpShares', 'PrImp_AvgT'),
    ('PrImp_TotalAmt',   'PrImpShares', 'PrImp_AvgAmt'),
    ('ATQ_TotalT',       'ATQShares',   'ATQ_AvgT'),
    ('OTQ_TotalT',       'OTQShares',   'OTQ_AvgT'),
    ('AvgRealSpread_T',  'ExecShares',  'AvgRealSpread'),
    ('AvgEffecSpread_T', 'ExecShares',  'AvgEffecSpread'),
]

# Temporary Variables for Aggregation
for total_col, weight_col, avg_col in weighted_cols:
    data_605[total_col] = data_605[weight_col] * data_605[avg_col]

data_605 = data_605.groupby(['MarketCenter', 'Quarter', 'Exchange', 'OrderCode']) \
        .sum().reset_index()

# Reconstruct original variables (a zero weight gives NaN or inf here)
for total_col, weight_col, avg_col in weighted_cols:
    data_605[avg_col] = data_605[total_col] / data_605[weight_col]

# Share of executed shares falling in each category
for prefix in ['PrImp', 'ATQ', 'OTQ']:
    data_605[prefix + '_Pct'] = data_605[prefix + 'Shares'] / data_605['ExecShares']

## New Vars

# Absolute
# Order codes missing from ordertype_dict are labelled 'Other'
data_605['OrderType']    = data_605['OrderCode'].apply(lambda x: ordertype_dict.get(x, 'Other'))
data_605['PrImp_ExpAmt'] = data_605['PrImp_AvgAmt'] * data_605['PrImp_Pct']
data_605['All_AvgT']     = (data_605['PrImp_TotalT'] + data_605['ATQ_TotalT'] + data_605['OTQ_TotalT']) \
                            / data_605['ExecShares']

# Relative values: each metric minus its unweighted mean over all rows that
# share the same Exchange / OrderType / Quarter
data_605_grouped = data_605.groupby(['Exchange', 'OrderType', 'Quarter'])

for metric in ['PrImp_Pct', 'PrImp_AvgT', 'PrImp_ExpAmt', 'All_AvgT']:
    data_605['MktCtrAvg_' + metric] = data_605_grouped[metric].transform("mean")
    data_605['Rel_' + metric]       = data_605[metric] - data_605['MktCtrAvg_' + metric]

data_605.head()

Unnamed: 0,MarketCenter,Quarter,Exchange,OrderCode,CoveredOrders,CoveredShares,CancelledShares,MktCtrExecShares,AwayExecShares,ExecShares_0_9,...,PrImp_ExpAmt,All_AvgT,MktCtrAvg_PrImp_Pct,Rel_PrImp_Pct,MktCtrAvg_PrImp_AvgT,Rel_PrImp_AvgT,MktCtrAvg_PrImp_ExpAmt,Rel_PrImp_ExpAmt,MktCtrAvg_All_AvgT,Rel_All_AvgT
0,BNYC,2015Q1,NASDAQ,11,213581,43704397,70249,43628268,0,43526470,...,0.005705,0.240511,0.877085,0.061611,0.082561,0.115073,0.011198,-0.005493,0.10555,0.134962
1,BNYC,2015Q1,NASDAQ,12,37042,7770176,4851,7750829,0,7713717,...,0.00059,2.094708,0.661344,0.215484,0.188143,0.141117,0.005108,-0.004517,1.566054,0.528654
2,BNYC,2015Q1,NASDAQ,13,12616,2573500,501881,1976798,0,1048397,...,,0.0,0.0,0.0,,,,,0.0,0.0
3,BNYC,2015Q1,NASDAQ,14,11996,2589515,641176,1838962,0,504154,...,,0.0,0.0,0.0,,,,,0.0,0.0
4,BNYC,2015Q1,NASDAQ,15,36132,7574355,2619472,4075705,0,404016,...,,0.0,0.0,0.0,,,,,0.0,0.0


# Merge Datasets

In [99]:
# Join the market center (605) and broker (606) frames into one panel.
# merge() with no `on=` joins on every column name the two frames share,
# and keeps only rows present in both (inner join).
data_merged = (
    data_605.merge(data_606)
    .query('OrderCode < 13')      # keep order codes 11 and 12 (see ordertype_dict)
    .drop('Obs_id', axis = 1)     # helper id from the zero-fill step, not needed further
)
# Quarter deliberately stays a regular column: later cells filter on
# data_merged['Quarter'] and export with index=False.

# Look up each broker's size; brokers missing from the table get NaN
data_merged['Broker_Size'] = data_merged['Broker'].map(broker_vol_dict['Size'])


print('Total Observations: ' + str(len(data_merged)))
print('Brokers: ' + str(data_merged['Broker'].nunique()))
print('Market Centers: ' + str(data_merged['MarketCenter'].nunique()))

data_merged.head()

Total Observations: 5964
Brokers: 30
Market Centers: 8


Unnamed: 0,MarketCenter,Quarter,Exchange,OrderCode,CoveredOrders,CoveredShares,CancelledShares,MktCtrExecShares,AwayExecShares,ExecShares_0_9,...,Rel_PrImp_AvgT,MktCtrAvg_PrImp_ExpAmt,Rel_PrImp_ExpAmt,MktCtrAvg_All_AvgT,Rel_All_AvgT,Broker,Rebate,MktShare,Rebate_Dummy,Broker_Size
0,BNYC,2015Q1,NASDAQ,11,213581,43704397,70249,43628268,0,43526470,...,0.115073,0.011198,-0.005493,0.10555,0.134962,Insigneo Securities,0.0,0.7766,0,
1,BNYC,2015Q1,NASDAQ,12,37042,7770176,4851,7750829,0,7713717,...,0.141117,0.005108,-0.004517,1.566054,0.528654,Insigneo Securities,0.0,0.6279,0,
5,BNYC,2015Q1,NYSE,11,396811,81015802,108105,80902077,0,80833088,...,0.070049,0.006949,-0.002302,0.094201,0.074817,Insigneo Securities,0.0,0.7379,0,
6,BNYC,2015Q1,NYSE,12,45238,9529134,5233,9506679,0,9478187,...,0.016105,0.003145,-0.002832,1.285204,-0.294374,Insigneo Securities,0.0,0.5228,0,
10,BNYC,2015Q1,Other,11,257920,55831472,108233,55715793,0,55640416,...,-0.023154,0.008315,-0.003278,0.331655,-0.098908,Insigneo Securities,0.0,0.7026,0,


# Data Export

## Panel

In [100]:
# test: spot-check the merged rows for one broker, order type and exchange in one quarter
spot_check = (
    (data_merged['Quarter'] == pd.Period('2017Q1'))
    & (data_merged['Broker'] == 'Deutsche')
    & (data_merged['OrderType'] == 'Market')
    & (data_merged['Exchange'] == 'NASDAQ')
)
data_merged[spot_check]

Unnamed: 0,MarketCenter,Quarter,Exchange,OrderCode,CoveredOrders,CoveredShares,CancelledShares,MktCtrExecShares,AwayExecShares,ExecShares_0_9,...,Rel_PrImp_AvgT,MktCtrAvg_PrImp_ExpAmt,Rel_PrImp_ExpAmt,MktCtrAvg_All_AvgT,Rel_All_AvgT,Broker,Rebate,MktShare,Rebate_Dummy,Broker_Size
5822,FBCO,2017Q1,NASDAQ,11,7489,1597372,700,25403,1571269,1595271,...,0.152687,0.012673,-0.008989,0.185031,0.159272,Deutsche,1.0,0.0,1,20450090000.0
13381,UBSS,2017Q1,NASDAQ,11,574854,117944884,25936,70779706,47106367,117861334,...,0.086393,0.012673,0.000702,0.185031,0.315858,Deutsche,1.0,0.0016,1,20450090000.0


## Fixed Effects

In [101]:
# Fixed-effects (within) transformation: subtract from every numeric column
# its mean over the rows sharing the same Broker / MarketCenter / Exchange / OrderType.
fe_keys = ['Broker', 'MarketCenter', 'Exchange', 'OrderType']

# Only numeric columns can be averaged. Selecting them explicitly avoids
# relying on groupby silently skipping string / Period columns, which
# pandas >= 2.0 no longer does (it raises TypeError instead).
fe_numeric_cols = [col for col in data_merged.select_dtypes(include=[np.number]).columns
                   if col not in fe_keys]
fe_group_means = data_merged.groupby(fe_keys)[fe_numeric_cols].transform("mean")

# Columns absent from fe_group_means come out of the subtraction as NaN;
# the identifier columns are restored from data_merged right below.
data_merged_demeaned = data_merged - fe_group_means

data_merged_demeaned[['Broker', 'Exchange', 'MarketCenter', 'OrderType', 'Quarter']] = data_merged[['Broker', 'Exchange', 'MarketCenter', 'OrderType', 'Quarter']]

# The rebate flag is kept in levels, not demeaned
data_merged_demeaned['Rebate_Dummy'] = data_merged['Rebate_Dummy']

## To CSV

In [102]:
# Write every dataset to csv (row index omitted).
# The output folder is created first so a fresh checkout does not fail here.
os.makedirs('../data/processed', exist_ok=True)

csv_exports = [
    # Panel
    (data_merged, '../data/processed/regression_data_levels.csv'),
    # Demeaned
    (data_merged_demeaned, '../data/processed/regression_data_levels_demeaned.csv'),
    # 605 and 606
    (data_605, '../data/processed/605_processed.csv'),
    (data_606, '../data/processed/606_processed.csv'),
    # raw
    (rawdata_605, '../data/rawdata_605.csv'),
    (rawdata_606, '../data/rawdata_606.csv'),
]

for frame, csv_path in csv_exports:
    frame.to_csv(csv_path, index=False)