In [482]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
from scipy.stats import chi2
from scipy.stats import norm

%matplotlib inline

# Functions

In [483]:
def hausman_test(result_a, result_b, r):
    
    cov_a = result_a.cov_HC0[0:r,0:r]
    cov_b = result_b.cov_HC0[0:r,0:r]
    
    beta_a = result_a.params[0:r]
    beta_b = result_b.params[0:r]
    
    wald_stat = np.matrix(beta_a - beta_b) * np.linalg.inv(np.matrix(cov_b - cov_a)) * np.matrix(beta_a - beta_b).T
    pval = 1 - chi2.cdf(wald_stat, r-1)
    
    return wald_stat, pval

def get_sig_stars(p_val, p_value_labels = {0.05: '*', 0.01: '**', 0.001: '***'}, as_latex = False):

    below_ind = np.where([p_val < x for x in p_value_labels.keys()])[0]
    below_vals = [list(p_value_labels.keys())[i] for i in below_ind]
    
    if not below_vals:
        stars = ''
    else:
        min_p_val = np.min(below_vals)
        stars = p_value_labels[min_p_val]
        
    if as_latex:
        if not stars:
            return ''
        else:
            return '$^{' + stars + '}$'
    else:
        return stars

# Params

In [558]:
workbook_file_loc = '../analysis/results/parametric_regressions_all_limitasdf.xlsx'
sample_query = 'OrderType == "Limit" & Exchange != "Otaher"'

# Load Data

In [559]:
# demeaned for fixed effects
data_df = pd.read_csv('../data/processed/regression_data_levels_demeaned.csv')
data_df['PrImp_Pct_Rebate_Dummy'] = data_df['PrImp_Pct'] * data_df['Rebate_Dummy']
data_df['PrImp_AvgAmt_Rebate_Dummy'] = data_df['PrImp_AvgAmt'] * data_df['Rebate_Dummy']
data_df['PrImp_ExpAmt_Rebate_Dummy'] = data_df['PrImp_ExpAmt'] * data_df['Rebate_Dummy']
data_df['PrImp_AvgT_Rebate_Dummy'] = data_df['PrImp_AvgT'] * data_df['Rebate_Dummy']
data_df['All_AvgT_Rebate_Dummy'] = data_df['All_AvgT'] * data_df['Rebate_Dummy']
data_df_dmd = data_df.copy()

# levels for random effects
data_df = pd.read_csv('../data/processed/regression_data_levels.csv')
data_df['PrImp_Pct_Rebate_Dummy'] = data_df['PrImp_Pct'] * data_df['Rebate_Dummy']
data_df['PrImp_AvgAmt_Rebate_Dummy'] = data_df['PrImp_AvgAmt'] * data_df['Rebate_Dummy']
data_df['PrImp_ExpAmt_Rebate_Dummy'] = data_df['PrImp_ExpAmt'] * data_df['Rebate_Dummy']
data_df['PrImp_AvgT_Rebate_Dummy'] = data_df['PrImp_AvgT'] * data_df['Rebate_Dummy']
data_df['All_AvgT_Rebate_Dummy'] = data_df['All_AvgT'] * data_df['Rebate_Dummy']


# Info
print('Market Centers: ', end = '') 
print(data_df['MarketCenter'].unique())

print('Brokers: ', end = '') 
print(data_df['Broker'].unique())


sample_frac = 1 # None => All obs
data_df = data_df.sample(frac = sample_frac).query(sample_query)

print('Samples: %d' % data_df.shape[0])
print('Sparsity: %0.2f%%' % (100*data_df.query('MktShare == 0').shape[0] / data_df.shape[0]))

# Add dummy vars to levels

data_df['obs_id'] = data_df['Broker'].apply(lambda x: x.replace(" ", "")) + data_df['MarketCenter'] + data_df['Exchange'] 
data_df = pd.get_dummies(data_df, columns = ['obs_id'], prefix = 'dummy')

dummy_coeff_cols = [x for x in list(data_df.columns) if x[:5] == 'dummy']
dummy_coeff_formula = ' + '.join(dummy_coeff_cols)

data_df.head()

Market Centers: ['AQUA' 'ARCA' 'BNYC' 'CDRG' 'CITI' 'EDGX' 'FBCO' 'G1ES' 'SGMA' 'UBSS'
 'VRTU' 'WOLV']
Brokers: ['Deutsche' 'Credit Suisse' 'Barclays Capital' 'BMO Capital' 'BTIG'
 'TD Ameritrade' 'Insigneo Securities' 'Bank of the West'
 'Boenning Scattergood' 'Edward Jones' 'Hollencrest Securities' 'AXA'
 'COR Clearing' 'DA Davidson' 'Euro Pacific Capital' 'Florida Atlantic'
 'LPL' 'DST Market Services' 'Aurora Capital' 'Corporate Investment Group'
 'E1 Asset Mgmt' 'Elish Elish' 'Dakota Securities' 'Benjamin Jerold'
 'INTL FCStone' 'Bull Market Securities' 'Fifth Third' 'Freedom Investors'
 'Inlet Securities' 'Cambria Capital' 'JP Morgan' 'Evercore Group'
 'BGC Financial']
Samples: 3418
Sparsity: 43.39%


Unnamed: 0,MarketCenter,Quarter,Exchange,OrderCode,CoveredOrders,CoveredShares,CancelledShares,MktCtrExecShares,AwayExecShares,ExecShares_0_9,...,dummy_TDAmeritradeG1ESNYSE,dummy_TDAmeritradeG1ESOther,dummy_TDAmeritradeSGMANASDAQ,dummy_TDAmeritradeSGMANYSE,dummy_TDAmeritradeSGMAOther,dummy_TDAmeritradeUBSSOther,dummy_TDAmeritradeVRTUNASDAQ,dummy_TDAmeritradeVRTUNYSE,dummy_TDAmeritradeVRTUOther,dummy_TDAmeritradeWOLVNYSE
1303,CDRG,2016Q4,NYSE,12,48438354,19003828482,12077877502,6909125882,5894174,6828573150,...,0,0,0,0,0,0,0,0,0,0
4965,UBSS,2014Q2,NYSE,12,90105708,29413322282,25161210076,2309175286,1182631132,3426069328,...,0,0,0,0,0,0,0,0,0,0
2838,FBCO,2015Q1,Other,12,35306,55140676,3575058,49375430,1861392,48674102,...,0,0,0,0,0,0,0,0,0,0
1818,CDRG,2017Q4,NASDAQ,12,20992422,9303077702,6883179156,2398298526,15581398,2355136718,...,0,0,0,0,0,0,0,0,0,0
2903,FBCO,2015Q3,NASDAQ,12,162752,115728508,18510462,97033814,0,94108482,...,0,0,0,0,0,0,0,0,0,0


# Regressions 

In [560]:
# Fits
# fit1_formula = 'MktShare ~ PrImp_Pct + PrImp_AvgAmt + PrImp_AvgT'
# fit2_formula = 'MktShare ~ PrImp_ExpAmt + PrImp_AvgT'
# fit3_formula = 'MktShare ~ PrImp_Pct + PrImp_AvgAmt + All_AvgT'
# fit4_formula = 'MktShare ~ PrImp_ExpAmt + All_AvgT'

fit1_formula = 'MktShare ~ PrImp_Pct + PrImp_Pct_Rebate_Dummy + PrImp_AvgAmt + PrImp_AvgAmt_Rebate_Dummy + PrImp_AvgT + PrImp_AvgT_Rebate_Dummy'
fit2_formula = 'MktShare ~ PrImp_ExpAmt + PrImp_ExpAmt_Rebate_Dummy + PrImp_AvgT + PrImp_AvgT_Rebate_Dummy'
fit3_formula = 'MktShare ~ PrImp_Pct + PrImp_Pct_Rebate_Dummy + PrImp_AvgAmt + PrImp_AvgAmt_Rebate_Dummy + All_AvgT + All_AvgT_Rebate_Dummy'
fit4_formula = 'MktShare ~ PrImp_ExpAmt + PrImp_ExpAmt_Rebate_Dummy + All_AvgT + All_AvgT_Rebate_Dummy'


formulaCols = lambda x: x.replace(' ', '').replace('~', '+').split('+') 
fit_formulae = [fit1_formula, fit2_formula, fit3_formula, fit4_formula]
fit_formulae = [formulaCols(x) for x in fit_formulae]

# Store results
fit_results_re = [None] * len(fit_formulae)
fit_results_fe = [None] * len(fit_formulae)

## Results

In [561]:
i = 1

## Get results
for i in range(0, len(fit_formulae)):
    
    print('Regressing with fit %d...' % (i + 1) )

    data = data_df_dmd[fit_formulae[i]]
    
    X = data.iloc[:,1:]
    Y = data.iloc[:,0]

    fit_results_fe[i] = sm.OLS(Y, X, missing = 'drop').fit().get_robustcov_results(cov_type='HC0')
    
    # include dummies and drop one 
    data = data_df[fit_formulae[i] + dummy_coeff_cols[:-1]]
    
    X = data.iloc[:,1:]
    Y = data.iloc[:,0]

    fit_results_re[i] = sm.OLS(Y, sm.add_constant(X), missing = 'drop').fit().get_robustcov_results(cov_type='HC0')
    

Regressing with fit 1...
Regressing with fit 2...
Regressing with fit 3...
Regressing with fit 4...


## Run Hausman Tests

In [562]:
fit_results = [None] * len(fit_formulae)
hausman_results = [None] * len(fit_formulae)

for i in range(0, len(fit_formulae)):

    n_var = len(fit_formulae[i]) - 1

    wald_stat, pval = hausman_test(fit_results_re[i], fit_results_fe[i], n_var)
    
    if pval < 0.05:
        fit_results[i] = fit_results_fe[i]
        hausman_results[i] = 'FE'
    else:
        fit_results[i] = fit_results_re[i]
        hausman_results[i] = 'RE'

# Export

In [563]:
from openpyxl import Workbook, load_workbook, utils
from openpyxl.styles import Alignment, Font
import string
import csv

In [564]:
## Open workbook
wb = Workbook()

## Coefficient Results

In [565]:
for i in range(0, len(fit_results)):
    
    ws = wb.create_sheet(title = 'Fit ' + str(i+1) +' Results')
    
    # space out columns
    ws.column_dimensions["A"].width = 35
    
    for col in list('BCDEFG'):
        ws.column_dimensions[col].width = 15
    
    # add data
    reader = (fit_results[i].summary().as_csv()).split('\n')
    
    random_effects_model = True

    for row in reader:
        if row[:5] != 'dummy':
            ws.append(row.split(','))
        else:
            random_effects_model = False
            
    if random_effects_model:
        ws['C1'] = 'Random Effects'
    else:
        ws['C1'] = 'Fixed Effects'

## Save and Close

In [566]:
del wb['Sheet']
wb.save(workbook_file_loc)
wb.close()

# As LaTeX

In [567]:
fit_results_dict = [None] * 4

for i in range(0, len(fit_results)):
    
    coeffs = {}
    stders = {}
    
    # Coefficients
    for j in range(0, len(fit_formulae[i][1:])):
    
        stder = fit_results[i].HC0_se[j]
        coeff = fit_results[i].params[j]
        key = fit_formulae[i][1:][j]
        
        coeffs[key] = coeff
        stders[key] = stder
        
    fit_results_dict[i] = {}
    fit_results_dict[i]['coeffs'] = coeffs.copy()
    fit_results_dict[i]['stders'] = stders.copy()

In [568]:
regressors = ['PrImp_Pct', 'PrImp_Pct_Rebate_Dummy', 'PrImp_AvgAmt', 'PrImp_AvgAmt_Rebate_Dummy','PrImp_ExpAmt', 
     'PrImp_ExpAmt_Rebate_Dummy', 'PrImp_AvgT', 'PrImp_AvgT_Rebate_Dummy', 'All_AvgT', 'All_AvgT_Rebate_Dummy']


for reg in regressors:
    
    line = reg.replace('_Rebate', '$*$Rebate').replace('_', '\\_')
    line2 = ' '
    
    for i in range(0, len(fit_results_dict)):
        
        fit = fit_results_dict[i]
        
        if reg in fit['coeffs']:
            
            coef  = fit['coeffs'][reg]
            stder = fit['stders'][reg]
            tstat = coef / stder
            pval  = 2 * (1- t.cdf(np.abs(tstat), int(fit_results[i].nobs) - len(fit['coeffs']) - 1) )
         
            line = line + ' & ' + str(np.round(coef, decimals = 4)) + get_sig_stars(pval, as_latex = True)
            line2 = line2 + ' & (' + str(np.round(stder, decimals = 4)) + ')'
        
        else:
            
            line = line + ' & '
            line2 = line2 + ' & '
    
    print(line + '\\\\')
    print(line2 + '\\\\ [0.5ex]')

PrImp\_Pct & 0.1432 &  & 0.1778 & \\
  & (0.1097) &  & (0.1025) & \\ [0.5ex]
PrImp\_Pct$*$Rebate\_Dummy & -0.281$^{***}$ &  & -0.2548$^{***}$ & \\
  & (0.0602) &  & (0.06) & \\ [0.5ex]
PrImp\_AvgAmt & 0.067 &  & 0.0073 & \\
  & (0.1306) &  & (0.1247) & \\ [0.5ex]
PrImp\_AvgAmt$*$Rebate\_Dummy & 1.1547 &  & 1.8009 & \\
  & (1.1739) &  & (1.2164) & \\ [0.5ex]
PrImp\_ExpAmt &  & 17.8577$^{***}$ &  & 17.1341$^{***}$\\
  &  & (3.8352) &  & (3.8078)\\ [0.5ex]
PrImp\_ExpAmt$*$Rebate\_Dummy &  & -11.675$^{*}$ &  & -11.962$^{*}$\\
  &  & (4.9257) &  & (4.9188)\\ [0.5ex]
PrImp\_AvgT & 24.0354$^{***}$ & -0.0292$^{*}$ &  & \\
  & (4.7768) & (0.013) &  & \\ [0.5ex]
PrImp\_AvgT$*$Rebate\_Dummy & -0.0019$^{**}$ & 0.0288$^{*}$ &  & \\
  & (0.0006) & (0.0131) &  & \\ [0.5ex]
All\_AvgT &  &  & 20.0124$^{***}$ & -0.0007\\
  &  &  & (4.4945) & (0.0005)\\ [0.5ex]
All\_AvgT$*$Rebate\_Dummy &  &  & -0.0014$^{***}$ & 0.0013\\
  &  &  & (0.0004) & (0.0007)\\ [0.5ex]


In [569]:
obs_line = 'N'
model_line = 'Model'
r_2_line = 'R$^{2}$'
r_2adj_line = 'Adjusted R$^{2}$'
f_stat = 'F Statistic'

for i in range(0, len(fit_results)):

    fit = fit_results[i]
    
    obs_line = obs_line + ' & ' + str(int(fit.nobs))
    model_line = model_line + ' & ' +  hausman_results[i]
    r_2_line = r_2_line + ' & ' +  str(np.round(fit.rsquared, decimals = 3))
    r_2adj_line = r_2adj_line + ' & ' +  str(np.round(fit.rsquared_adj, decimals = 3))
    
    if np.abs(fit.fvalue) > 100:
        fvalue_label = '$>$100$^{***}$'
    else:
        fvalue_label = str(np.round(float(fit.fvalue), decimals = 3)) + get_sig_stars(fit.f_pvalue, as_latex = True) 
        
    f_stat = f_stat + ' & ' + fvalue_label
    

In [570]:
print(' \\\\ \n'.join([model_line, obs_line, r_2_line, r_2adj_line, f_stat]), end = ' \\\\ \n')

Model & RE & FE & RE & FE \\ 
N & 3411 & 2895 & 3411 & 2895 \\ 
R$^{2}$ & 0.504 & 0.01 & 0.507 & 0.01 \\ 
Adjusted R$^{2}$ & 0.429 & 0.008 & 0.433 & 0.008 \\ 
F Statistic & $>$100$^{***}$ & 7.146$^{***}$ & $>$100$^{***}$ & 6.435$^{***}$ \\ 


In [571]:
fit_results[2].f_pvalue

array(0.)

In [572]:
hausman_results

['RE', 'FE', 'RE', 'FE']