<div style="text-align:center"><span style="font-size:2em; font-weight: bold;">  Lecture 8—Ensemble Learning</span></div>

# Programming: Bootstrap

In [1]:
import numpy as np
import pandas as pd
from cleands import *

def generate_mvt_normal(n,r,means=None):
    """Simulate ``n`` draws of ``r`` correlated normal variables.

    Each pair of variables shares one latent standard-normal factor. The
    factor enters both variables of the pair with the same loading, drawn
    uniformly from [-1, 2). Every variable additionally receives its own
    unit-variance normal noise centred on ``means``.

    Parameters
    ----------
    n : int
        Number of observations (rows).
    r : int
        Number of variables (columns).
    means : array-like of length ``r``, optional
        Mean of each variable. Defaults to a zero vector.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, r)``.
    """
    # np.zeros takes `shape`, not `size`; the old keyword raised TypeError
    # whenever the default `means=None` was used.
    if means is None: means = np.zeros((r,))
    if r == 1: return np.random.normal(loc=means,size=(n,1))
    # P is the pair-incidence matrix: one row per pair of variables, with
    # ones in the two columns belonging to that pair. It is grown one
    # variable at a time, ending with shape (r*(r-1)/2, r).
    P = np.array([[1,1]])
    for i in range(2,r):
        ones = np.ones([i,1])
        zeros = np.zeros([P.shape[0],1])
        ident = np.eye(i)
        # New first variable paired with each of the i existing variables.
        upper = np.hstack([ones,ident])
        # Existing pairs do not involve the new variable.
        lower = np.hstack([zeros,P])
        P = np.vstack([upper,lower])
    # The order of the random draws is kept so seeded runs that pass
    # `means` explicitly reproduce the same numbers as before.
    covariates = np.random.normal(size=(n,P.shape[0]))
    idiosyncratics = np.random.normal(loc=means,size=(n,P.shape[1]))
    covariate_loadings = np.random.uniform(size=P.shape[0])*3-1
    return covariates@np.diagflat(covariate_loadings)@P+idiosyncratics

# Simulate the regression data set used by the rest of the notebook.
# NOTE(review): no random seed is set, so the numbers change on every run.
n = 1000
# Four correlated regressors; their means are drawn uniformly from [-1, 1).
npx = generate_mvt_normal(n,4,means=np.random.uniform(size=(4,))*2-1)
ones = np.ones((n,1))
# Prepend a column of ones so the first coefficient plays the role of the intercept.
onpx = np.hstack([ones,npx])
# True coefficient vector (intercept + four slopes), drawn uniformly from [0, 1).
bvec = np.random.uniform(size=(5,))
# Response = linear signal + standard-normal noise.
npy = np.random.normal(size=(n,))+onpx@bvec

In [2]:
bvec

array([0.99649208, 0.65759964, 0.67019448, 0.03789703, 0.80894204])

In [3]:
# Fit the least-squares model on the full sample; the estimates `b` and
# residuals `e` are reused by the bootstrap cells below.
model = least_squares_regressor(onpx,npy)
b = model.params
e = model.residuals
b

array([0.99857741, 0.65180893, 0.65601809, 0.06446478, 0.79932951])

In [4]:
## simple bootstrap
# Pairs bootstrap: resample (x, y) rows jointly and refit on each resample.
outp = []
bootstraps = 10000
bnew = b.copy()  # not used in this cell
for i in range(bootstraps):
    sample = np.random.randint(n,size=(n,)) # with replacement
    bsmodel = least_squares_regressor(onpx[sample],npy[sample])
    outp += [bsmodel.params]
outp = np.array(outp)  # one row of coefficient estimates per bootstrap replication

In [5]:
## slightly complicated bootstrap (loses heteroskedasticity information)
# Residual bootstrap: the regressors stay fixed and y is rebuilt from the
# fitted coefficients plus residuals resampled independently of x.
outp = []
bootstraps = 10000
bnew = b.copy()  # coefficients used to regenerate y
#bnew[3] = 0 # imposing the null
for i in range(bootstraps):
    sample = np.random.randint(n,size=(n,)) # with replacement
    newy = onpx@bnew+e[sample]
    bsmodel = least_squares_regressor(onpx,newy)
    outp += [bsmodel.params]
outp = np.array(outp)  # one row of coefficient estimates per bootstrap replication

In [6]:
## complicated bootstrap (preserves heteroskedasticity)
# Regressor rows and residuals are resampled with the same indices, so each
# residual stays attached to its own x row.
outp = []
bootstraps = 10000
bnew = b.copy()  # coefficients used to regenerate y
#bnew[3] = 0 # imposing the null
for i in range(bootstraps):
    sample = np.random.randint(n,size=(n,)) # with replacement
    newy = onpx[sample]@bnew+e[sample]
    bsmodel = least_squares_regressor(onpx[sample],newy)
    outp += [bsmodel.params]
outp = np.array(outp)  # one row of coefficient estimates per bootstrap replication

In [7]:
outp

array([[1.01811611, 0.65260889, 0.65497005, 0.06343176, 0.80429209],
       [0.94628914, 0.63839137, 0.66630725, 0.0951648 , 0.78776684],
       [0.96397524, 0.64098878, 0.6779881 , 0.06170768, 0.80870227],
       ...,
       [1.01063914, 0.65252113, 0.66067962, 0.04273174, 0.79121159],
       [1.00944009, 0.64061703, 0.64602222, 0.06644586, 0.77985319],
       [0.96740274, 0.65084514, 0.68189818, 0.07614046, 0.80783294]])

In [8]:
outp.mean(0)

array([0.99767431, 0.65153005, 0.65614753, 0.0644406 , 0.79906946])

In [9]:
b

array([0.99857741, 0.65180893, 0.65601809, 0.06446478, 0.79932951])

In [10]:
((outp.mean(0)-b)**2).mean()

1.9566366112067044e-07

In [11]:
outp.std(0)

array([0.03670323, 0.01897582, 0.01743723, 0.01500955, 0.01601504])

In [12]:
np.sqrt(np.diag(model.vcov_params))

array([0.03556362, 0.01954777, 0.01739896, 0.01516543, 0.01562584])

In [13]:
((outp.std(0)-np.sqrt(np.diag(model.vcov_params)))**2).mean()

3.6061404855482153e-07

In [14]:
# confidence interval for betahat
# Percentile interval: sort each coefficient's bootstrap draws and read off
# the entries at the 2.5% and 97.5% positions.
# The sort is in place and per column, so afterwards a row of `outp` no
# longer corresponds to a single bootstrap replication.
outp.sort(0)
print(outp[int(outp.shape[0]*0.025)])
print(outp[int(outp.shape[0]*0.975)])

[0.9266341  0.61429582 0.6210589  0.03513587 0.76715632]
[1.07011549 0.68913605 0.68969347 0.09421591 0.83043415]


In [15]:
# imposed null hypothesis
# Bootstrap critical values for the coefficient at index 3, compared with the
# full-sample estimate b[3]. Relies on `outp` already being sorted column-wise
# by the confidence-interval cell.
# NOTE(review): presumably meant to be run after regenerating `outp` with
# `bnew[3] = 0` uncommented in a bootstrap cell — confirm.
lowercv = outp[int(outp.shape[0]*0.025),3]
uppercv = outp[int(outp.shape[0]*0.975),3]
print((lowercv,uppercv))
print(b[3])

(0.03513587424959795, 0.09421591312174485)
0.06446477685607797


In [16]:
def bootstrap(model,dgp,x,e,seed=None,bootstraps:int=1000):
    """Refit ``model`` on resampled data and return the list of fits.

    For every replication, row indices are drawn with replacement, a
    response is built as ``dgp(x[rows], e[rows])``, and
    ``model(x[rows], response)`` is stored.

    Parameters
    ----------
    model : callable
        Called as ``model(x_resampled, y_resampled)``; its return value is
        collected as-is.
    dgp : callable
        Called as ``dgp(x_resampled, e_resampled)`` to produce the response.
    x, e : array-like
        Indexed by an integer index array along the first axis; ``x`` must
        expose ``shape[0]``.
    seed : optional
        When given, passed to ``np.random.seed`` before drawing (this
        reseeds NumPy's global generator).
    bootstraps : int
        Number of replications.

    Returns
    -------
    list
        One ``model(...)`` result per replication, in draw order.
    """
    if seed is not None:
        np.random.seed(seed)
    n_rows = x.shape[0]
    fits = []
    for _ in range(bootstraps):
        rows = np.random.randint(n_rows,size=(n_rows,))
        resampled_y = dgp(x[rows],e[rows])
        fits.append(model(x[rows],resampled_y))
    return fits

In [17]:
# complicated bootstrap (standard error)
# The dgp rebuilds y from the full-sample estimates `b` and the resampled
# residuals, paired with the resampled x rows.
bs = bootstrap(least_squares_regressor,
          lambda x,e: x@b+e,
          onpx,e)
# Bootstrap standard errors: spread of each coefficient across the refits.
np.array([i.params for i in bs]).std(0)


array([0.03794109, 0.01871049, 0.01719187, 0.01499218, 0.01668419])

In [18]:
# simple bootstrap
# Passing `npy` in the `e` slot with a dgp that returns it unchanged makes
# bootstrap() resample (x, y) pairs.
bs = bootstrap(least_squares_regressor,
          lambda x,e: e,
          onpx,npy)
# Bootstrap standard errors: spread of each coefficient across the refits.
np.array([i.params for i in bs]).std(0)

array([0.03623798, 0.01954642, 0.01747303, 0.01502338, 0.01659688])

# Data Science


## Bagging

Average the predictions of multiple models, each fit on a bootstrap sample of the data.

In [19]:
# Fitted values
np.array([i.predict(onpx) for i in bs]).mean(0)

array([ 3.03496211e+00, -4.36386167e+00,  9.34884086e-01,  3.01220174e+00,
        2.36334625e+00, -1.67637179e+00,  8.24981017e+00, -2.35354421e-01,
       -1.63850109e+00, -4.61590871e+00,  1.10679894e+00,  3.06473071e+00,
        1.66823960e+00,  3.06497663e-01,  1.81699740e+00,  4.36583635e-01,
        3.65155948e+00,  1.76196118e-01,  8.97537672e-01, -6.62043022e-01,
       -8.93091335e-01,  2.81569363e+00, -1.09284876e+00,  5.18458630e+00,
        5.94990710e+00,  1.21297269e+00,  3.75063226e+00, -1.01930775e+00,
       -6.35343532e+00,  2.14656254e+00,  4.69293138e+00, -2.99171369e+00,
        2.22796884e+00,  4.51694486e+00, -1.70311579e+00, -2.16851655e+00,
       -2.05250155e+00,  3.31966764e+00,  4.35745593e+00,  2.02901732e-01,
        2.08387547e+00,  3.97719257e+00,  1.43202993e+00,  3.30585828e+00,
        6.63124553e+00, -2.56741863e+00,  1.51377711e+00,  1.48310045e+00,
        7.89072760e-01, -1.41588821e+00, -3.37956690e+00, -1.93815587e+00,
        3.15210299e-02,  

In [20]:
# Comparison between predicting and averaging vs. averaging and then predicting
np.array([i.predict(onpx) for i in bs]).mean(0)-onpx@np.array([i.params for i in bs]).mean(0)

array([ 2.22044605e-15,  8.88178420e-16, -2.66453526e-15, -8.88178420e-16,
       -3.10862447e-15, -4.44089210e-16,  1.77635684e-15, -3.60822483e-16,
        6.66133815e-16,  8.88178420e-16,  2.22044605e-16, -8.88178420e-16,
       -1.99840144e-15,  1.55431223e-15, -2.22044605e-16,  0.00000000e+00,
        0.00000000e+00, -1.85962357e-15, -1.66533454e-15, -9.99200722e-16,
       -1.77635684e-15, -3.10862447e-15,  1.55431223e-15,  1.77635684e-15,
       -5.32907052e-15,  0.00000000e+00, -3.55271368e-15,  8.88178420e-16,
        0.00000000e+00, -2.66453526e-15,  0.00000000e+00,  3.99680289e-15,
        1.33226763e-15, -8.88178420e-16,  4.44089210e-16,  2.22044605e-15,
       -1.33226763e-15,  0.00000000e+00,  0.00000000e+00,  5.55111512e-17,
        8.88178420e-16, -2.22044605e-15, -1.99840144e-15, -2.66453526e-15,
       -7.10542736e-15, -6.21724894e-15,  1.11022302e-15, -1.33226763e-15,
       -1.22124533e-15, -1.55431223e-15,  8.88178420e-16,  1.11022302e-15,
       -1.38777878e-15, -

In [21]:
# Random train/test split over row indices: the first 20% of the shuffled
# indices form the test set, the rest the training set.
deck = np.arange(n)
np.random.shuffle(deck)  # shuffles in place
test = deck[:int(n*0.2)]
train = deck[int(n*0.2):]

In [22]:
# Baseline: a single least-squares fit on the training rows, scored by mean
# squared prediction error (MSPE) on the held-out test rows.
model = least_squares_regressor(onpx[train],npy[train])
err = npy[test]-model.predict(onpx[test])
mspe = (err**2).mean()
mspe

1.0543176113781598

In [23]:
# Bagged least squares: refit on bootstrap resamples of the training (x, y)
# pairs, then average the test-set predictions across the fits.
bs = bootstrap(least_squares_regressor,
          lambda x,e: e,
          onpx[train],npy[train])
# `bs` is rebound here from the list of fits to the averaged prediction vector.
bs = np.array([i.predict(onpx[test]) for i in bs]).mean(0)
err = npy[test]-bs
mspe = (err**2).mean()
mspe

1.0541636840063155

In [30]:
# Single rpart model fit on the training rows (regressors without the
# constant column), scored by MSPE on the test rows.
# NOTE(review): rpart comes from cleands; presumably a regression tree — confirm.
model = rpart(npx[train],npy[train])
err = npy[test] - model.predict(npx[test])
mspe = (err**2).mean()
mspe

2.379576148040935

In [31]:
# Bagged rpart: 100 fits on bootstrap resamples of the training (x, y)
# pairs, with test-set predictions averaged across the fits.
bs = bootstrap(rpart,
          lambda x,y: y,
          npx[train],npy[train],bootstraps=100)
# `bs` is rebound here from the list of fits to the averaged prediction vector.
bs = np.array([i.predict(npx[test]) for i in bs]).mean(0)
err = npy[test]-bs
mspe = (err**2).mean()
mspe

  tstat = model.params/np.sqrt(np.diag(model.vcov_params))
  return self.residuals.var()*(self.n_obs-1)/self.degrees_of_freedom
  return self.residuals.var()*(self.n_obs-1)/self.degrees_of_freedom


1.3878261072463158

## Random Forest

Average the predictions of multiple models, each fit on a bootstrap sample using random subsets of the x variables.

In [33]:
# Same bagging recipe as for plain rpart, but each fit is built with
# random_x=True.
# NOTE(review): presumably random_x makes rpart use random subsets of the
# x variables — confirm against cleands.
bs = bootstrap(lambda x,y: rpart(x,y,random_x=True),
          lambda x,y: y,
          npx[train],npy[train],bootstraps=100)
# `bs` is rebound here from the list of fits to the averaged prediction vector.
bs = np.array([i.predict(npx[test]) for i in bs]).mean(0)
err = npy[test]-bs
mspe = (err**2).mean()
mspe

  tstat = model.params/np.sqrt(np.diag(model.vcov_params))
  return self.residuals.var()*(self.n_obs-1)/self.degrees_of_freedom
  return self.residuals.var()*(self.n_obs-1)/self.degrees_of_freedom


1.4668157519612188

## Boosting

Repeatedly fit trees to the residuals left by the previous fits, then sum the trees' predictions.

In [40]:
# Boosting: repeatedly fit a shallow rpart model to the current residuals and
# keep it only while the test-set MSPE of the residuals does not increase.
# NOTE(review): early stopping is decided on the test rows, so the final test
# MSPE is optimistically biased; a separate validation split would avoid that.
resid = npy.copy()
mspe = (resid[test]**2).mean()
models = []
for i in range(100):
    model = rpart(npx[train],resid[train],max_level=3)
    # Evaluate the candidate on a separate array. Previously `resid` was
    # updated in place before the check, so on early stop it still carried
    # the rejected model's prediction and no longer matched `models`.
    resid_new = resid - model.predict(npx)
    mspe_new = (resid_new[test]**2).mean()
    if mspe_new>mspe: break
    models.append(model)
    resid = resid_new
    mspe = mspe_new
    print(mspe)

3.799839831245157
2.9316926940460646
2.6739954537506145
2.485904893653924
2.3726622101312187


In [41]:
# Calculate the fitted values
# The boosted prediction is the sum of the accepted models' predictions.
np.array([i.predict(npx) for i in models]).sum(0)

array([-1.71644975,  2.45513264,  2.30106018, -2.41326805,  0.17915881,
        0.17915881, -1.55070027,  2.30106018, -2.41326805, -2.41326805,
       -3.24753003,  3.32999939, -1.90916377, -4.44333387,  3.18816296,
       -0.88218777,  3.16362796,  3.32999939, -2.37217319,  1.48732264,
        0.60021986, -0.88218777, -0.88218777,  2.45513264, -1.35192147,
       -2.41326805, -3.24753003, -0.88218777, -4.08399695, -1.71644975,
       -1.35192147, -2.41326805,  0.17915881, -2.60355253,  2.70061854,
       -1.71644975, -2.60355253,  5.60597322,  2.30106018, -0.01961999,
       -2.41326805,  1.84725448,  3.31770041,  2.87619369, -2.41326805,
       -4.08399695, -2.60355253,  0.17915881, -1.90916377,  1.48732264,
        1.48732264,  2.70061854, -4.44333387, -5.92377829,  5.60597322,
        4.57703401,  1.48732264,  1.04172658,  5.60597322,  1.04172658,
       -1.35192147, -1.61952768, -2.41326805,  1.48732264,  3.18816296,
        0.17915881,  0.17915881, -5.92377829,  1.04172658,  5.60

# Programming challenges

## Gradient boosting

Create plots for gradient boosting.

## Insertion sort

Write code that implements the insertion sort algorithm.


## Principal components analysis

Write a PCA class. Add an estimator for the number of principal components to use.