In [64]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
import matplotlib.pyplot as plt

%matplotlib inline

# Functions

In [296]:
def trim(X, percent):
    
    alpha = (1 - percent)/2
    n, k = np.shape(X)
    t_ind = np.zeros((n, k))
    
    for i in range(0, k):
        upper_bd = np.percentile(X[:,i], (1 - alpha)*100)
        lower_bd = np.percentile(X[:,i], (alpha)*100)
        t_ind[:, i] = [int(lower_bd < x < upper_bd) for x in X[:,i]]
        
    return np.prod(t_ind, axis = 1)


def pdfn(x):
    return np.exp(-np.multiply(x, x)/2)/np.sqrt(2*np.pi)


# Conditional Expecttation
def CE_1(Y, X, arg, r):
    
    n_arg = np.shape(arg)[0]
    n = np.shape(X)[0]
    h = (n**(-r)) * np.std(X, axis = 0, ddof = 1)
    e = np.zeros((n, 1))
    
    for j in range(0, n_arg):
        k = np.divide(pdfn(np.divide((arg[j] - X), h)), h)
        k = np.prod(k, axis = 1)   
        e[j] = (Y.T*k/n)/np.mean(k)
        
    return e



def SLS_1(b, Y, X, X_ind):
    
    v = X * np.matrix(b).T
    EY = CE_1(Y, v, v, 1/5)
    residual2 = np.power((Y - EY), 2)
    
    return (-0.5 * np.matrix(X_ind)*residual2)

def semiparametricRegression(Y, X, guess, tol = 1e-05):
    
    obj_f = lambda x_0: -SLS_1(x_0, Y, X, trim(X, 0.98))
    result = minimize(obj_f, guess, method='BFGS', options={'gtol': tol})
    return result

def convertHessianToCov(Y, X, results):
    v_hat = X*np.matrix(results.x).T
    sigma_2_hat = np.mean(np.power(Y - CE_1(Y, v_hat, v_hat, 1/5), 2))
    return results.hess_inv * sigma_2_hat

# Load Data

In [4]:
data_dmd = pd.read_csv(    '../data/processed/regression_data_levels_demeaned.csv')

In [323]:
fit1_formula = 'MktShare ~ PrImp_Pct + PrImp_AvgAmt + PrImp_AvgT'
fit2_formula = 'MktShare ~ PrImp_ExpAmt + PrImp_AvgT'
fit3_formula = 'MktShare ~ PrImp_Pct + PrImp_AvgAmt + All_AvgT'
fit4_formula = 'MktShare ~ PrImp_ExpAmt + All_AvgT'

formulaCols = lambda x: x.replace(' ', '').replace('~', '+').split('+') 

## Fit 1

In [251]:
data = data_dmd[formulaCols(fit1_formula)].iloc[0:2000,:]
X = np.matrix(data)[:, 1:]
Y = np.matrix(data)[:, 0]

results = semiparametricRegression(Y, X, [875, 868.01, -11.16])
results

      fun: matrix([[10.19682075]])
 hess_inv: array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])
      jac: array([ 5.12599945e-06, -4.88758087e-06,  7.39097595e-06])
  message: 'Optimization terminated successfully.'
     nfev: 5
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([875.  , 868.01, -11.16])

In [252]:
cov_mat = convertHessianToCov(Y, X, results)
t_stats = results.x / np.power(np.diag(cov_mat), 0.5)
print(t_stats)

[84258.77883653 83585.67156331 -1074.66053922]


## Fit 2

In [253]:
data = data_dmd[formulaCols(fit2_formula)].iloc[0:1000,:]
X = np.matrix(data)[:, 1:]
Y = np.matrix(data)[:, 0]

results = semiparametricRegression(Y, X, [8.340, 61.759])
results

      fun: matrix([[7.10855464]])
 hess_inv: array([[1, 0],
       [0, 1]])
      jac: array([ 9.65595245e-06, -1.31130219e-06])
  message: 'Optimization terminated successfully.'
     nfev: 4
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([ 8.34 , 61.759])

In [254]:
cov_mat = convertHessianToCov(Y, X, results)
t_stats = results.x / np.power(np.diag(cov_mat), 0.5)
print(t_stats)

[ 585.62045397 4336.61074538]


## Fit 3

In [303]:
data = data_dmd[formulaCols(fit3_formula)].iloc[0:1000,:]
X = np.matrix(data)[:, 1:]
Y = np.matrix(data)[:, 0]

results = semiparametricRegression(Y, X, guess = [-1048.77063077,  -146.69711946,   1451.18064426], tol = 1e-06)
results

      fun: matrix([[7.10191902]])
 hess_inv: array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])
      jac: array([-2.38418579e-07,  1.19209290e-07, -1.19209290e-07])
  message: 'Optimization terminated successfully.'
     nfev: 5
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([-1048.77063077,  -146.69711946,  1451.18064426])

In [298]:
cov_mat = convertHessianToCov(Y, X, results)
t_stats = results.x / np.power(np.diag(cov_mat), 0.5)
print(t_stats)

[-8737.77777161 -1222.1993942   3758.98798911]


## Fit 4

In [313]:
data = data_dmd[formulaCols(fit4_formula)].iloc[0:1000,:]
X = np.matrix(data)[:, 1:]
Y = np.matrix(data)[:, 0]

results = semiparametricRegression(Y, X, guess = [-110.29263986,   30.718535398])
results

      fun: matrix([[7.10158872]])
 hess_inv: array([[1, 0],
       [0, 1]])
      jac: array([2.02655792e-06, 7.27176666e-06])
  message: 'Optimization terminated successfully.'
     nfev: 4
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([-110.29263986,   30.7185354 ])

In [288]:
cov_mat = convertHessianToCov(Y, X, results)
t_stats = results.x / np.power(np.diag(cov_mat), 0.5)
print(t_stats)

[-918.91776154  255.9355531 ]


# Scratch

In [215]:
data = data_dmd[formulaCols(fit4_formula)].iloc[0:300,:]
X = np.matrix(data)[:, 1:]
Y = np.matrix(data)[:, 0]

results = semiparametricRegression(Y, X, [1,3])
print(results)
cov_mat = convertHessianToCov(Y, X, results)
t_stats = results.x / np.diag(cov_mat)
print(t_stats)

      fun: 1.3041561482902535
 hess_inv: array([[ 59445852.57084731, -19826824.56660157],
       [-19826824.56660158,   6612791.66599176]])
      jac: array([3.23355198e-06, 8.89599323e-06])
  message: 'Optimization terminated successfully.'
     nfev: 72
      nit: 10
     njev: 18
   status: 0
  success: True
        x: array([-106.29263986,   38.78535398])
[-0.00020545  0.00067391]


In [210]:
results = semiparametricRegression(Y, X, guess = [-1048.77063077,  -146.69711946,   451.18064426])
print(results)

      fun: matrix([[1.29988005]])
 hess_inv: array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])
      jac: array([3.50177288e-06, 7.89761543e-07, 8.37445259e-06])
  message: 'Optimization terminated successfully.'
     nfev: 5
      nit: 0
     njev: 1
   status: 0
  success: True
        x: array([-1048.77063077,  -146.69711946,   451.18064426])


In [203]:
v_hat = X*np.matrix(results.x).T
np.mean(np.power(Y - CE_1(Y, v_hat, v_hat, 1/5), 2))

0.008675363893992758

In [308]:
import numdifftools as nd

In [320]:
data = data_dmd[formulaCols(fit4_formula)].iloc[:,:]
X = np.matrix(data)[:, 1:]
Y = np.matrix(data)[:, 0]
obj_f = lambda x_0: -SLS_1(x_0, Y, X, trim(X, 0.98))

In [328]:
np.linalg.inv(nd.Hessian(obj_f)([-100, 50]))

array([[ 9.20680019e+01, -1.40947035e-03],
       [-1.40947035e-03,  2.75886986e-06]])

In [321]:
obj_f = lambda x_0: (Y - X*np.matrix(x_0).T).T*(Y - X*np.matrix(x_0).T)
result = minimize(obj_f, guess, method='BFGS')
result

      fun: 42.34461997111667
 hess_inv: array([[ 9.20644538e+01, -1.40941618e-03],
       [-1.40941618e-03,  2.75886630e-06]])
      jac: array([-9.53674316e-07, -1.90734863e-06])
  message: 'Optimization terminated successfully.'
     nfev: 32
      nit: 4
     njev: 8
   status: 0
  success: True
        x: array([ 5.66262934e+00, -2.96472454e-04])