<div style="text-align:center"><span style="font-size:2em; font-weight: bold;">  Lecture 8—Ensemble Learning</span></div>

# Programming: Bootstrap

In [1]:
import numpy as np
import pandas as pd
from cleands import *

def generate_mvt_normal(n,r,means=None):
    if means is None: means = np.zeros(size=(r,))
    if r == 1: return np.random.normal(loc=means,size=(n,1))
    P = np.array([[1,1]])
    for i in range(2,r):
        ones = np.ones([i,1])
        zeros = np.zeros([P.shape[0],1])
        ident = np.eye(i)
        upper = np.hstack([ones,ident])
        lower = np.hstack([zeros,P])
        P = np.vstack([upper,lower])
    covariates = np.random.normal(size=(n,P.shape[0]))
    idiosyncratics = np.random.normal(loc=means,size=(n,P.shape[1]))
    covariate_loadings = np.random.uniform(size=P.shape[0])*3-1
    return covariates@np.diagflat(covariate_loadings)@P+idiosyncratics

n = 1000
npx = generate_mvt_normal(n,4,means=np.random.uniform(size=(4,))*2-1)
ones = np.ones((n,1))
onpx = np.hstack([ones,npx])
bvec = np.random.uniform(size=(5,))
npy = np.random.normal(size=(n,))+onpx@bvec

In [2]:
bvec

array([0.24508132, 0.0685719 , 0.66015188, 0.89693922, 0.9684995 ])

In [3]:
model = least_squares_regressor(onpx,npy)
b = model.params
e = model.residuals
b

array([0.2150529 , 0.04630077, 0.67560823, 0.91110573, 0.97055031])

In [4]:
## simple bootstrap
outp = []
bootstraps = 10000
bnew = b.copy()
for i in range(bootstraps):
    sample = np.random.randint(n,size=(n,)) # with replacement
    bsmodel = least_squares_regressor(onpx[sample],npy[sample])
    outp += [bsmodel.params]
outp = np.array(outp)

In [5]:
## slightly complicated bootstrap (Lose heteroskedasticity information)
outp = []
bootstraps = 10000
bnew = b.copy()
#bnew[3] = 0 # imposing the null
for i in range(bootstraps):
    sample = np.random.randint(n,size=(n,)) # with replacement
    newy = onpx@bnew+e[sample]
    bsmodel = least_squares_regressor(onpx,newy)
    outp += [bsmodel.params]
outp = np.array(outp)

In [6]:
## complicated bootstrap (perserves heteroskedasticity)
outp = []
bootstraps = 10000
bnew = b.copy()
#bnew[3] = 0 # imposing the null
for i in range(bootstraps):
    sample = np.random.randint(n,size=(n,)) # with replacement
    newy = onpx[sample]@bnew+e[sample]
    bsmodel = least_squares_regressor(onpx[sample],newy)
    outp += [bsmodel.params]
outp = np.array(outp)

In [7]:
outp

array([[0.20239339, 0.05341823, 0.67546902, 0.90736315, 0.97383011],
       [0.22596262, 0.03798595, 0.67651464, 0.93129529, 0.97881974],
       [0.20125461, 0.05928881, 0.64092562, 0.91598544, 0.97114969],
       ...,
       [0.17198774, 0.0404448 , 0.68618899, 0.92730007, 0.96037906],
       [0.24018174, 0.05725799, 0.66354868, 0.94823948, 0.97699405],
       [0.18293272, 0.05315837, 0.6682874 , 0.86864825, 0.9773402 ]])

In [8]:
outp.mean(0)

array([0.21483118, 0.0465182 , 0.67550064, 0.91121726, 0.9708548 ])

In [9]:
b

array([0.2150529 , 0.04630077, 0.67560823, 0.91110573, 0.97055031])

In [10]:
((outp.mean(0)-b)**2).mean()

4.2632844068636336e-08

In [11]:
outp.std(0)

array([0.04008639, 0.01742157, 0.01586613, 0.0218905 , 0.01762798])

In [12]:
np.sqrt(np.diag(model.vcov_params))

array([0.03874266, 0.01712074, 0.0156024 , 0.02175068, 0.01734383])

In [13]:
((outp.std(0)-np.sqrt(np.diag(model.vcov_params)))**2).mean()

4.1318669245581535e-07

In [14]:
# confidence interval for betahat
outp.sort(0)
print(outp[int(outp.shape[0]*0.025)])
print(outp[int(outp.shape[0]*0.975)])

[0.13468223 0.01263729 0.64427221 0.86850242 0.93526737]
[0.29304004 0.08079912 0.70630316 0.95364746 1.00460742]


In [15]:
# imposed null hypothesis
outp.sort(0)
lowercv = outp[int(outp.shape[0]*0.05),3]
uppercv = outp[int(outp.shape[0]*0.95),3]
print((0.1,lowercv,uppercv))
lowercv = outp[int(outp.shape[0]*0.025),3]
uppercv = outp[int(outp.shape[0]*0.975),3]
print((0.05,lowercv,uppercv))
lowercv = outp[int(outp.shape[0]*0.005),3]
uppercv = outp[int(outp.shape[0]*0.995),3]
print((0.01,lowercv,uppercv))
lowercv = outp[int(outp.shape[0]*0.0005),3]
uppercv = outp[int(outp.shape[0]*0.9995),3]
print((0.001,lowercv,uppercv))
print(b[3])

(0.1, 0.875313019113189, 0.947501703999533)
(0.05, 0.8685024203685464, 0.9536474629200189)
(0.01, 0.8546658465441083, 0.9675191213688674)
(0.001, 0.8340495887071591, 0.9807121008811507)
0.9111057293236523


In [16]:
def bootstrap(model,dgp,x,e,seed=None,bootstraps:int=1000):
    outp = []
    if seed is not None: np.random.seed(seed)
    for i in range(bootstraps):
        sample = np.random.randint(x.shape[0],size=(x.shape[0],))
        newy = dgp(x[sample],e[sample])
        outp += [model(x[sample],newy)]
    return outp

In [17]:
def bootstrap(model,x,y,seed=None,bootstraps:int=1000):
    outp = []
    if seed is not None: np.random.seed(seed)
    for i in range(bootstraps):
        sample = np.random.randint(x.shape[0],size=(x.shape[0],))
        outp += [model(x[sample],y[sample])]
    return outp

In [18]:
# bootstrap (standard error)
bs = bootstrap(least_squares_regressor,onpx,npy)
np.array([i.params for i in bs]).std(0)

array([0.03983853, 0.01727009, 0.01536547, 0.02220768, 0.01752002])

# Data Science


## Bagging

Average multiple models fit with bootstrap samples

In [19]:
# Fitted values
np.array([i.predict(onpx) for i in bs]).mean(0)

array([ 2.07176562e+00, -3.47627668e+00,  2.22271101e-01, -4.75635453e+00,
        9.00585952e+00, -1.79876466e+00, -1.13335826e+00, -2.77865661e+00,
        1.96153987e+00,  3.95770085e+00,  2.31698541e-02, -6.29257850e+00,
        6.25557879e-01,  3.69714231e-01,  3.02595911e+00, -3.44133912e+00,
       -2.18560352e+00,  2.60425410e+00, -1.04260240e+00,  3.84747649e+00,
       -4.21846904e+00,  1.27792874e+00, -2.25281149e+00,  1.20805962e+00,
       -1.96022912e+00, -1.06792044e+00,  8.16313520e-01, -4.49023174e+00,
       -1.59202161e+00, -2.02048058e+00,  3.84660538e+00, -2.76909689e+00,
        4.26963620e+00,  7.56895133e-01, -6.09143756e+00,  2.55924537e+00,
        2.28518400e-01, -2.30017153e+00, -7.55824752e+00,  1.31032705e+00,
       -2.83189278e+00,  3.89181496e+00, -6.54237796e+00,  6.01383082e-01,
       -3.96427827e+00, -2.46966059e+00, -4.27174541e+00,  9.52442716e-01,
       -3.14353221e+00, -2.97278475e+00, -2.39890186e+00, -2.26407755e+00,
        3.21616995e+00,  

In [20]:
# Comparison between predicting and averaging vs. averaging and then predicting
np.array([i.predict(onpx) for i in bs]).mean(0)-onpx@np.array([i.params for i in bs]).mean(0)

array([ 1.33226763e-15,  2.66453526e-15, -2.05391260e-15,  1.77635684e-15,
        1.77635684e-15, -1.33226763e-15, -4.44089210e-16, -1.33226763e-15,
        1.11022302e-15,  6.66133815e-15, -1.17961196e-15, -5.32907052e-15,
       -2.44249065e-15, -2.10942375e-15, -4.44089210e-16, -5.32907052e-15,
        5.77315973e-15, -3.55271368e-15,  2.22044605e-16, -4.44089210e-16,
        6.21724894e-15,  2.22044605e-16, -8.88178420e-16, -1.33226763e-15,
       -2.44249065e-15, -8.88178420e-16,  5.55111512e-16,  6.21724894e-15,
        3.55271368e-15, -1.77635684e-15, -3.55271368e-15,  2.22044605e-15,
        8.88178420e-16,  4.44089210e-16,  0.00000000e+00, -3.55271368e-15,
       -4.38538095e-15,  4.44089210e-16, -3.55271368e-15, -1.33226763e-15,
        5.77315973e-15,  1.77635684e-15, -2.66453526e-15, -1.11022302e-16,
       -4.88498131e-15, -3.10862447e-15,  0.00000000e+00,  3.10862447e-15,
        3.99680289e-15,  1.77635684e-15,  0.00000000e+00, -4.44089210e-16,
       -8.88178420e-16, -

In [21]:
deck = np.arange(n)
np.random.shuffle(deck)
test = deck[:int(n*0.2)]
train = deck[int(n*0.2):]

In [22]:
model = least_squares_regressor(onpx[train],npy[train])
err = npy[test]-model.predict(onpx[test])
mspe = (err**2).mean()
mspe

0.9915208089420827

In [23]:
bs = bootstrap(least_squares_regressor,onpx[train],npy[train])
bs = np.array([i.predict(onpx[test]) for i in bs]).mean(0)
err = npy[test]-bs
mspe = (err**2).mean()
mspe

0.9912314005146226

In [24]:
model = rpart(npx[train],npy[train])
err = npy[test] - model.predict(npx[test])
mspe = (err**2).mean()
mspe

  return self.residuals.var()*(self.n_obs-1)/self.degrees_of_freedom


2.2929979529092215

In [25]:
bs = bootstrap(rpart,npx[train],npy[train],bootstraps=100)
bs = np.array([i.predict(npx[test]) for i in bs]).mean(0)
err = npy[test]-bs
mspe = (err**2).mean()
mspe

  tstat = model.params/np.sqrt(np.diag(model.vcov_params))
  return self.residuals.var()*(self.n_obs-1)/self.degrees_of_freedom


1.1592248875880202

## Random Forest

Average multiple models fit with subsets of the x variables and bootstrap samples

In [26]:
bs = bootstrap(lambda x,y: rpart(x,y,random_x=True),npx[train],npy[train],bootstraps=100)
bs = np.array([i.predict(npx[test]) for i in bs]).mean(0)
err = npy[test]-bs
mspe = (err**2).mean()
mspe

6.973367528569668

## Boosting

Repeatedly fit trees on the residuals

In [52]:
resid = npy.copy()
mspe = (resid[test]**2).mean()
models = []
for i in range(100):
    model = rpart(npx[train],resid[train],max_level=3)
    resid -= model.predict(npx)
    mspe_new = (resid[test]**2).mean()
    if mspe_new>mspe: break
    models += [model]
    mspe = mspe_new
    print(mspe)

7.2127516248561
3.723445802473301
2.95717334498762
2.457946344892841
2.09724836983554
2.031025335694956
1.823271711374237
1.772529067656814
1.6397536059656828
1.4884559905928496


In [51]:
from itertools import product
for maxl,wmax in product([2,3,4,5],[20,15,10,5]):
    resid = npy.copy()
    mspe = (resid[test]**2).mean()
    models = []
    w = 1/np.linspace(wmax,1,100)
    for i in range(100):
        model = rpart(npx[train],resid[train],max_level=maxl)
        resid -= model.predict(npx)*w[i]
        mspe_new = (resid[test]**2).mean()
        if mspe_new>mspe: break
        models += [model]
        mspe = mspe_new
        #print(mspe)
    print(maxl,wmax,mspe)

2 20 1.1448309929289506
2 15 1.1714738390483175
2 10 1.1645475881320058
2 5 1.1725945326102605
3 20 1.1990418584079559
3 15 1.165872087944011
3 10 1.1711415135806305
3 5 1.170940901924854
4 20 1.1589415832662604
4 15 1.1447114335835409
4 10 1.1469461324357137
4 5 1.2487625072154345


ValueError: zero-size array to reduction operation minimum which has no identity

In [53]:
resid = npy.copy()
mspe = (resid[test]**2).mean()
models = []
w = 1/np.linspace(20,1,100)
for i in range(100):
    model = rpart(npx[train],resid[train],max_level=2)
    resid -= model.predict(npx)*w[i]
    mspe_new = (resid[test]**2).mean()
    if mspe_new>mspe: break
    models += [model]
    mspe = mspe_new
    print(mspe)

14.530999019266496
13.953492410185724
13.427649737505728
12.942780657537458
12.49437920484791
12.069819662872819
11.63759526040826
11.261667789066541
10.863789926268959
10.51998157285752
10.196462314778948
9.865235874603194
9.569385795674812
9.237889570416748
8.964104339049825
8.768433565987788
8.449284515448664
8.178171809577668
7.937310119994015
7.668418209302074
7.442916279231599
7.280357358896083
7.012940084361985
6.796197375043394
6.564762120279828
6.365143577331136
6.169430063471484
5.963810197469659
5.740786656154898
5.56715270069962
5.390675599530389
5.270212417136434
5.1092907397069816
4.931976882783437
4.765801209813772
4.618835793558717
4.4815404821548785
4.3457002141513525
4.213966594387063
4.073505523943272
3.9299515324367915
3.805423767463532
3.693632325800749
3.5762105155341395
3.497998614463407
3.3822560583715657
3.256733467454574
3.1395634946521955
3.051981736598715
2.969476758036562
2.8823831034738316
2.7996920011668913
2.7213560967440755
2.6325172606748106
2.54371890

In [72]:
resid = npy.copy()
mspe = (resid[test]**2).mean()
models = []
w = 1/np.linspace(25,1,100)
w = np.concatenate([w,np.log(np.linspace(np.e,np.e**2,100))])
for i in range(200):
    model = rpart(npx[train],resid[train],max_level=2)
    resid -= model.predict(npx)*w[i]
    mspe_new = (resid[test]**2).mean()
    if mspe_new>mspe: break
    models += [model]
    mspe = mspe_new
    print(mspe)

14.653733382015606
14.180672767414235
13.740314304284011
13.330412364757727
12.943008599740276
12.581379795046358
12.229181209375097
11.873990093993914
11.556447370274572
11.253874646870077
10.924765170003207
10.6467481478926
10.360787164076037
10.099852028630309
9.808054552175413
9.562872886067659
9.319003748198307
9.075003792442132
8.909608165969546
8.639979532056282
8.394851221071136
8.18569258486239
7.98124289838579
7.755532428116893
7.6115875738708905
7.375319990210549
7.160420652213864
6.987883277785852
6.8065101852826855
6.628487411615445
6.4550158461160345
6.250925137126517
6.06990556271422
5.9142310297173575
5.756515120807112
5.644155702747054
5.492094596938825
5.340820277302924
5.166154733042465
5.0120377854755365
4.878433700074649
4.739054271194045
4.610376800703807
4.485932264789586
4.345653477817959
4.213669060706059
4.130237066155638
4.018610594362013
3.8983156151354934
3.771651447818262
3.66824299422443
3.560994257363088
3.4923755458137418
3.404143578410498
3.30921888893

In [73]:
i

100

In [57]:
i

99

In [36]:
# Calculate the fitted values
np.array([i.predict(npx) for i in models]).sum(0)

array([ 3.24833554e+00, -2.16132961e+00,  3.66014666e-02, -5.98635079e+00,
        8.98853131e+00, -2.07265821e+00, -3.68940073e-01, -2.79169448e+00,
        1.06149407e+00,  5.19637545e+00,  1.13651275e-02, -6.51875179e+00,
        3.66014666e-02, -5.89607990e-01,  3.24833554e+00, -2.91686364e+00,
       -2.69595063e+00,  1.61596966e+00, -2.01957266e+00,  4.88242390e+00,
       -5.27946927e+00,  6.01349887e-01, -2.25904892e+00,  6.01349887e-01,
       -2.76606800e+00, -9.64666015e-01, -3.74681255e-01, -4.04717514e+00,
       -2.79941112e+00, -2.00408638e+00,  3.67641398e+00, -3.38936042e+00,
        4.25621444e+00,  6.38938825e-01, -6.76251949e+00,  2.18670176e+00,
       -9.70283422e-01, -4.81480962e+00, -7.49478293e+00,  6.01349887e-01,
       -2.69595063e+00,  3.24833554e+00, -6.51875179e+00,  3.24469718e-01,
       -2.91686364e+00, -3.08836304e+00, -2.91686364e+00,  8.52494978e-01,
       -1.70721316e+00, -1.56563913e+00, -1.31462304e+00, -2.79169448e+00,
        3.24833554e+00,  

# Programming challenges

## Gradient boosting

Create plots for gradient boosting.

## Insertion sort

Put together a code to implement the insertion sort algorithm.


## Principal components analysis

Write a PCA class. Add an estimator for the number of principal components to use.