In [3]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from scipy.stats import norm
import matplotlib.pyplot as plt

%matplotlib inline

# Functions

In [14]:
# Trimming function
def trim(X, percent):
    """Flag the rows of X whose every column lies strictly inside its central band.

    Parameters
    ----------
    X : 2-D array-like (ndarray or np.matrix), one observation per row.
    percent : float
        Share of each column's distribution to keep, e.g. 0.98 keeps values
        strictly between the 1st and 99th percentiles of that column.

    Returns
    -------
    1-D float ndarray of length n holding 1.0 for rows that are inside the
    band in all columns and 0.0 otherwise.
    """
    values = np.asarray(X)
    tail = (1 - percent)/2
    n_rows, n_cols = values.shape
    inside = np.zeros((n_rows, n_cols))

    for col in range(n_cols):
        column = values[:, col]
        hi = np.percentile(column, (1 - tail)*100)
        lo = np.percentile(column, tail*100)
        # Strict inequalities: observations sitting exactly on a bound are dropped.
        inside[:, col] = (lo < column) & (column < hi)

    # A row survives only if it is inside the band for every column.
    return np.prod(inside, axis = 1)

# Conditional Expectation
def CE_1(Y, X, arg, r):
    """Kernel-weighted estimate of E[Y | X = arg] using a Gaussian product kernel.

    For each evaluation point the estimate is sum_i(Y_i * k_i) / sum_i(k_i),
    where k_i is the product over columns of norm.pdf((arg - X_i)/h)/h.

    Inputs are converted to plain ndarrays first, so the function works for
    both np.matrix and ndarray arguments (the previous ``Y.T*k`` was only a
    dot product when Y happened to be an np.matrix).

    Parameters
    ----------
    Y : array-like with n elements (shape (n,), (n, 1) or an n x 1 matrix).
    X : array-like of shape (n, d); a 1-D input is treated as a single column.
    arg : array-like of shape (n_arg, d), the points to evaluate at; a 1-D
        input is treated as n_arg points.
    r : float
        Bandwidth exponent: h = n**(-r) * (sample std of each X column).

    Returns
    -------
    ndarray of shape (n_arg, 1). An entry is nan when every kernel weight at
    that point is zero (0/0), e.g. far outside the support of X.
    """
    y = np.asarray(Y, dtype = float).reshape(-1)
    x = np.asarray(X, dtype = float)
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    points = np.asarray(arg, dtype = float)
    if points.ndim == 1:
        points = points.reshape(-1, 1)

    n_arg = points.shape[0]
    n = x.shape[0]
    h = (n**(-r)) * np.std(x, axis = 0, ddof = 1)
    e = np.zeros((n_arg, 1))

    # One evaluation point at a time keeps memory at O(n*d) rather than
    # O(n_arg*n*d); callers pass n_arg == n when fitting.
    for j in range(0, n_arg):
        k = np.prod(norm.pdf((points[j] - x) / h) / h, axis = 1)
        e[j] = (np.dot(y, k)/n)/np.mean(k)

    return e

# Semiparametric Least Squares objective
def SLS_1(b, Y, X, X_ind):
    """Semiparametric least squares criterion evaluated at coefficient vector b.

    Builds the single index X*b, estimates E[Y | index] at every observation
    with CE_1 (bandwidth exponent 1/5), and returns -0.5 times the sum of
    squared residuals over the observations selected by X_ind.

    Parameters
    ----------
    b : 1-D sequence of coefficients, one per column of X.
    Y : n x 1 response (np.matrix in this notebook).
    X : n x k regressors (np.matrix in this notebook).
    X_ind : length-n 0/1 trimming indicator, e.g. the output of trim().

    Returns
    -------
    The criterion as a 1 x 1 np.matrix.
    """
    index = X * np.matrix(b).T
    fitted = CE_1(Y, index, index, 1/5)
    squared_error = np.power((Y - fitted), 2)
    weights = np.matrix(X_ind)

    # (1 x n) indicator times (n x 1) squared errors sums only the kept rows.
    return (-0.5 * weights*squared_error)

# Regress Y on X with SLS
def run_semiparametric_regression(Y, X, guess, trim_percent = 0.98):
    """Estimate the index coefficients by minimising -SLS_1 with BFGS.

    Parameters
    ----------
    Y : n x 1 response, passed through to SLS_1.
    X : n x k regressors, passed through to SLS_1 and trim.
    guess : array-like with k elements, the starting coefficients. Any shape
        is accepted (e.g. a 1 x k matrix row); it is flattened to 1-D because
        scipy.optimize.minimize requires a one-dimensional x0.
    trim_percent : float
        Central share of each regressor kept by trim().

    Returns
    -------
    The scipy.optimize.OptimizeResult returned by minimize.
    """
    # The trimming indicator depends only on X, so build it once instead of
    # on every objective evaluation.
    X_ind = trim(X, trim_percent)
    x_start = np.asarray(guess, dtype = float).ravel()

    def obj_f(x_0):
        # SLS_1 returns a 1 x 1 matrix; minimize expects a plain scalar.
        return -np.asarray(SLS_1(x_0, Y, X, X_ind)).item()

    result = minimize(obj_f, x_start, method='BFGS')
    return result

# Hessian to Covariance matrix
def convert_hessian_to_cov(Y, X, results):
    """Scale the optimiser's inverse Hessian by the mean squared linear residual.

    Parameters
    ----------
    Y : n x 1 response.
    X : n x k regressors.
    results : object exposing ``x`` (coefficients) and ``hess_inv``
        (inverse Hessian), such as the result of a BFGS minimize call.

    Returns
    -------
    ``results.hess_inv`` multiplied by mean((Y - X*x)**2).
    """
    coef_column = np.matrix(results.x).T
    linear_residual = Y - X*coef_column
    mean_squared_residual = np.mean(np.power(linear_residual, 2))
    return results.hess_inv * mean_squared_residual

# Marginal effects at a point using CE_1
def compute_marginal_effect(Y, X, ind, point, beta, delta = 0.01):
    """Forward-difference marginal effect of regressor ``ind`` at ``point``.

    Evaluates the CE_1 estimate of E[Y | X*beta] at point*beta and at the
    same point with column ``ind`` increased by ``delta``, and returns the
    difference divided by ``delta``.

    Parameters
    ----------
    Y : n x 1 response.
    X : n x k regressors (np.matrix, so that X*beta is a matrix product).
    ind : int, column of ``point`` to perturb.
    point : 1 x k row (2-D array or matrix) at which to evaluate.
    beta : k x 1 np.matrix of index coefficients.
    delta : float, size of the finite-difference step.

    Returns
    -------
    float
    """
    # Copy as float: an integer-typed point would silently truncate the
    # nudge (e.g. 0 + 0.01 -> 0) and produce a zero effect.
    point_nudge = np.array(point, dtype = float)
    point_nudge[0, ind] = point_nudge[0, ind] + delta
    point_nudge = np.matrix(point_nudge)

    v_hat = X*beta
    v_hat_avg = point*beta
    v_hat_avg_nudge = point_nudge*beta

    difference = CE_1(Y, v_hat, v_hat_avg_nudge, 1/5) - CE_1(Y, v_hat, v_hat_avg, 1/5)

    # np.asscalar was removed from NumPy; .item() is the supported equivalent.
    return np.asarray(difference).item()/delta

# Range of marginal effects 
def calculate_me_range(Y, X, result, ind, point, delta = .001, delta_range = [0.0001, 0.01], parameter_range = [0, 0.014], n_1 = 200, n_2 = 5):
    """Marginal effect of regressor ``ind`` over a grid of values and step sizes.

    Parameters
    ----------
    Y, X : data passed through to compute_marginal_effect.
    result : object exposing ``x``, the fitted coefficient vector.
    ind : int, column of ``point`` that is varied.
    point : 1 x k row giving the values of the other regressors.
    delta : unused. Kept only for backward compatibility; the step sizes
        actually applied are the ``n_2`` values spanning ``delta_range``.
    delta_range : [low, high] bounds of the finite-difference step grid.
        Read only, never modified.
    parameter_range : [low, high] bounds of the grid for regressor ``ind``.
        Read only, never modified.
    n_1 : int, number of grid points for the regressor.
    n_2 : int, number of step sizes.

    Returns
    -------
    ndarray of shape (n_1, 2, n_2): ``out[i, 0, j]`` is the regressor value
    and ``out[i, 1, j]`` the marginal effect computed with step size j.
    """
    beta = np.matrix(result.x).T

    me_results = np.zeros(shape = (n_1, 2, n_2))

    linspace_1 = np.linspace(parameter_range[0], parameter_range[1], num = n_1)
    linspace_2 = np.linspace(delta_range[0], delta_range[1], num = n_2)

    # Float copy so grid values are not truncated when ``point`` is integer-typed.
    base_point = np.array(point, dtype = float)

    for j in range(0, n_2):

        step = linspace_2[j]

        for i in range(0, n_1):

            point_temp = base_point.copy()
            point_temp[0, ind] = linspace_1[i]

            me_value = compute_marginal_effect(Y, X, ind, point_temp, beta, delta = step)

            # np.asscalar was removed from NumPy; .item() handles both a
            # Python float and a one-element array.
            me_results[i,:,j] = np.array([linspace_1[i], np.asarray(me_value).item()])

    return me_results

# T-stats with first coefficient fixed
def find_tstats(Y, X, results):
    """t-statistics for coefficients normalised by the first coefficient.

    Each coefficient is divided by ``results.x[0]`` and then by the square
    root of the matching diagonal entry of convert_hessian_to_cov(Y, X,
    results). The first entry is nan because that coefficient is the
    normalisation and has no t-statistic of its own.

    Returns
    -------
    1-D float ndarray with one entry per coefficient.
    """
    cov = convert_hessian_to_cov(Y, X, results)

    coefs = results.x
    n_coef = np.shape(coefs)[0]
    normalised = coefs/coefs[0]

    # Start from all-nan so position 0 stays nan (first t-stat is unknown).
    t_stats = np.full(n_coef, np.nan)
    std_err = np.sqrt(np.diagonal(np.asarray(cov)))
    t_stats[1:] = normalised[1:] / std_err[1:]

    return t_stats

# Load Data

In [31]:
# Read the processed regression data and keep only rows with no missing values.
data_df = (
    pd.read_csv('../data/processed/regression_data_levels.csv')
    .dropna()
)

# Regressions 

In [44]:
# Fits
# Each specification is written as 'response ~ regressor + regressor + ...'
fit1_formula = 'MktShare ~ PrImp_Pct + PrImp_AvgAmt + PrImp_AvgT'
fit2_formula = 'MktShare ~ PrImp_ExpAmt + PrImp_AvgT'
fit3_formula = 'MktShare ~ PrImp_Pct + PrImp_AvgAmt + All_AvgT'
fit4_formula = 'MktShare ~ PrImp_ExpAmt + All_AvgT'


def formulaCols(x):
    """Split a formula string into column names, response first."""
    without_spaces = x.replace(' ', '')
    return without_spaces.replace('~', '+').split('+')


# One list of column names per specification (response in position 0)
fit_formulae = [
    formulaCols(formula)
    for formula in (fit1_formula, fit2_formula, fit3_formula, fit4_formula)
]

# Store results
fit_results = [None for _ in fit_formulae]

## Results

In [47]:
sample_size = 10 # None => All obs
random_seed = 0  # fixes the subsample so a rerun reproduces the same fits

for i in range(0, len(fit_formulae)):
    
    print('Regressing with fit %d...' % i )
    
    # DataFrame.sample(n=None) returns a single random row, not the whole
    # frame, so the "all observations" case must skip sample() entirely.
    # The same seed is used for every fit, so all specifications are
    # estimated on the same subsample of rows.
    if sample_size is None:
        data = data_df[fit_formulae[i]]
    else:
        data = data_df.sample(n = sample_size, random_state = random_seed)[fit_formulae[i]]
    X = np.matrix(data)[:, 1:]
    Y = np.matrix(data)[:, 0]

    # Start the optimiser from the second observation's regressor values,
    # flattened to 1-D because minimize() requires a one-dimensional x0.
    guess = np.asarray(X[1, :], dtype = float).ravel()
    results = run_semiparametric_regression(Y, X, guess)
    
    fit_results[i] = results
    
print('Complete')

Regressing with fit 0...
Regressing with fit 1...
Regressing with fit 2...
Regressing with fit 3...
Complete


# Export

In [51]:
from openpyxl import Workbook, load_workbook
from openpyxl.utils.dataframe import dataframe_to_rows

In [None]:
# Params

# workbook_file_loc = '../exhibits/welfare/broker_ratings_naive.xlsx'
# quarters_list = [x + 'Q' + str(y) for x in ['2017', '2016'] for y in range(4, 0, -1)]
# exchange_list = ['NASDAQ', 'NYSE']