In [6]:
using Plots

┌ Info: Precompiling Plots [91a5bcdd-55d7-5caf-9e0b-520d859cae80]
└ @ Base loading.jl:1278


In [2]:
"""plot line y = w*x+b"""
function plotline(w,b;
                  xmin=-100,xmax=100,label="")
    xsamples = [xmin, xmax]
    plot!(xsamples, [w*x+b for x in xsamples], color=:black, label=label)
end

plotline

In [3]:
"""plot function y=f(x)"""
function plotfunc(f;
                  xmin=-100,xmax=100,nsamples=100,label="")
    xsamples = linspace(xmin,xmax,nsamples)
    plot!(xsamples, [f(x) for x in xsamples], color=:black, label=label)
end

plotfunc

In [4]:
# Generate and plot data
srand(2)
n = 20
function sample_data(num_points)
    x = rand(num_points)*10
    y = .2 + .2*x + .1*sin.(x) + .03*randn(num_points) - .1*(x/6).^2
    return x,y
end
x,y = sample_data(n)

"""function to plot the above data"""
function plotdata(x=x,y=y; margin=.05)
    scatter(x,y, label="data")
    xlabel!("x")
    ylabel!("y")
    range_y = maximum(y) - minimum(y)
    range_x = maximum(x) - minimum(x)
    ylims!((minimum(y)-margin*range_y,maximum(y)+margin*range_y))
    xlims!((minimum(x)-margin*range_x,maximum(x)+margin*range_x))
end
plotdata()

LoadError: UndefVarError: srand not defined

# Approximating with the mean

In [5]:
# the mean solves a very simple least squares problem:
X = ones(n,1)
w = X\y

1-element Array{Float64,1}:
 1.10694

In [6]:
# check the solution to our least squares problem is the mean
abs(mean(y) - w[1])

2.220446049250313e-16

In [7]:
# plot the fit
plotdata()
plotline(0, w[1], label="mean")

# Approximating with a line

In [8]:
X = [copy(x) ones(length(x))]

20×2 Array{Float64,2}:
 3.66796   1.0
 5.23879   1.0
 2.10256   1.0
 8.19338   1.0
 5.01371   1.0
 5.59355   1.0
 5.23559   1.0
 4.1587    1.0
 5.40152   1.0
 6.89567   1.0
 1.43836   1.0
 6.85137   1.0
 0.271377  1.0
 7.85383   1.0
 6.89278   1.0
 2.42258   1.0
 7.11323   1.0
 4.03459   1.0
 7.51799   1.0
 1.51662   1.0

In [9]:
w = X\y

2-element Array{Float64,1}:
 0.180877
 0.225941

In [29]:
# plot the fit
plotdata()
plotline(w[1], w[2], label="linear fit")

In [30]:
# plot fit on out of sample data
plotdata()
plotline(w[1], w[2])

xtest,ytest = sample_data(20)
scatter!(xtest,ytest,label="test")

# Approximating with a polynomial

In [31]:
# first, construct a Vandermonde matrix
max_order = 10

X = zeros(n, max_order+1)
for k=0:max_order
    X[:,k+1] = x.^k
end
X

20×11 Array{Float64,2}:
 1.0  9.09689    82.7533       …      4.26613e8        3.88085e9  
 1.0  2.79531     7.81375         10420.0          29127.2        
 1.0  8.53498    72.8459              2.40338e8        2.05128e9  
 1.0  6.84568    46.8633              3.30179e7        2.2603e8   
 1.0  5.51435    30.4081              4.71468e6        2.59984e7  
 1.0  3.72213    13.8543       …      1.37128e5        5.10407e5  
 1.0  6.13857    37.682               1.23767e7        7.59752e7  
 1.0  0.612078    0.37464             0.0120577        0.00738025 
 1.0  1.81982     3.31176           218.909          398.376      
 1.0  6.73125    45.3097              2.83701e7        1.90966e8  
 1.0  5.11328    26.1456       …      2.38943e6        1.22178e7  
 1.0  9.63534    92.8398              7.1582e8         6.89717e9  
 1.0  5.35266    28.651               3.60684e6        1.93062e7  
 1.0  0.397406    0.157932            0.000247235      9.82528e-5 
 1.0  7.25806    52.6794              

In [32]:
# solve least squares problem
w = X\y

11-element Array{Float64,1}:
  0.166567   
  0.129117   
  0.452351   
 -0.332393   
  0.0571393  
  0.0253821  
 -0.0144204  
  0.00308755 
 -0.000344862
  1.9891e-5  
 -4.68215e-7 

In [33]:
"""computes our polynomial fit evaluated at x"""
function p(x; order = max_order, w = w)
    y = 0
    for k=0:order
        y += w[k+1]*x^k
    end
    return y
end



p

In [34]:
# plot fit
plotdata()
plotfunc(x -> p(x, order=max_order, w=w), xmin=0, xmax=9)

In [35]:
# plot fit on out of sample data
plotdata()
plotfunc(x -> p(x, order=max_order, w=w), xmin=0, xmax=9)

xtest,ytest = sample_data(20)
scatter!(xtest,ytest,label="test")

# Choosing the best model order

In [36]:
max_model_order = 10
rmse = Array{Float64}(max_model_order+1) # array to store root mean square model errors
xtest,ytest = sample_data(50) # generate test set

for model_order = 0:max_model_order
    # form Vandermonde matrix
    X = zeros(n, model_order+1)
    for k=0:model_order
        X[:,k+1] = x.^k
    end
    
    # solve least squares problem
    w = X\y
    
    # compute test error
    ptest = [p(x, order=model_order, w=w) for x in xtest]
    rmse[model_order+1] = sqrt(mean((ytest - ptest).^2))
end
rmse

11-element Array{Float64,1}:
 0.455137 
 0.0658035
 0.0679645
 0.0676416
 0.035888 
 0.0327207
 0.0304345
 0.0303066
 0.030694 
 0.0324812
 0.0352749

In [37]:
plot(rmse)
xlabel!("model order")
ylabel!("rmse")

# Bootstrap estimators

In [38]:
# sample K data sets of n samples each and compute a model on each
# see how the models vary
n = 20
K = 10

models = zeros(K,2)
for k=1:K
    xk,yk = sample_data(n)
    Xk = [xk ones(n)]
    wk = Xk \ yk
    models[k,:] = wk
end

In [39]:
# histogram of the distribution of the first coefficient
# could use to compute, eg, confidence intervals

histogram(models[:,1])

In [40]:
mean(models,1)

1×2 Array{Float64,2}:
 0.16935  0.279851

In [41]:
var(models,1)

1×2 Array{Float64,2}:
 3.73206e-5  0.000806163

In [42]:
# can sample with replacement using rand
rand(1:15, 5)

# eg,
a = 0:.1:1.5
s = rand(1:15, 5)
[s a[s]]

5×2 Array{Float64,2}:
  9.0  0.8
 15.0  1.4
  4.0  0.3
 12.0  1.1
  1.0  0.0

In [43]:
# resample K bootstrap data sets of n samples each and compute a model on each
# see how the models vary
n = 20
K = 100

x,y = sample_data(n)

models = zeros(K,2)
for k=1:K
    mysample = rand(1:n,n)
    xk,yk = x[mysample], y[mysample]
    Xk = [xk ones(n)]
    wk = Xk \ yk
    models[k,:] = wk
end

In [44]:
histogram(models[:,1])

In [45]:
mean(models,1)

1×2 Array{Float64,2}:
 0.172213  0.27259

In [46]:
# as K increases, mean of the bootstrap models should converge to 
# the model fit on the original data set
X = [x ones(n)]
w = X \ y

2-element Array{Float64,1}:
 0.171614
 0.275612

In [47]:
var(models,1)

1×2 Array{Float64,2}:
 3.52031e-5  0.0014969