In [None]:
using LinearAlgebra, Random, Statistics, Plots, SparseArrays
include("proxgrad.jl")
# Select the PyPlot backend of Plots.jl for all figures below.
pyplot()

In [None]:
"""
    generate_data(n, w)

Draw `n` samples from a noiseless linear model with weight vector `w`.

Returns `(X, y)`, where `X` is an `n × length(w)` matrix of standard-normal entries and
`y = X * w`.
"""
function generate_data(n, w)
    nfeatures = length(w)
    features = randn(n, nfeatures)
    return features, features * w
end

"""
    generate_noisy_data(n, w; σ=.1)

Draw `n` samples from a linear model with weight vector `w` and additive Gaussian noise.

Returns `(X, y)`, where `X` is an `n × length(w)` matrix of standard-normal entries and
`y = X * w + σ * randn(n)`.

# Keywords
- `σ`: standard deviation of the noise added to each response. The default `.1` reproduces
  the previously hard-coded noise level; `σ=0` gives noiseless data.
"""
function generate_noisy_data(n, w; σ=.1)
    X = randn(n, length(w))
    y = X*w + σ*randn(n)
    return X, y
end

# OLS is scaling invariant

In [None]:
# Problem size: d features (columns of X) and n samples (rows of X).
d = 10
n = 20
# Ground-truth weights, and noiseless data generated from them (y = X*w exactly).
w = randn(d)
X, y = generate_data(n, w);

In [None]:
# Least-squares fit via backslash. Note that this overwrites the true weights `w`.
w = X\y
@show X*w

# Rescale the targets by 5 and the features by 3, then refit.
yscale = 5*y
Xscale = 3*X
wscale = Xscale\yscale
# Divide by 5 to undo the target scaling, so the predictions are comparable to X*w.
@show Xscale*wscale / 5;

# Relative difference between the original and the rescaled-then-unscaled predictions.
@show rel_err = norm(X*w - Xscale*wscale/5) / norm(X*w)

# ridge regression is not scaling invariant

In [None]:
# Shape of the regularized Gram matrix; `I` takes on the size of X'*X when added to it.
size(X'*X + I)

In [None]:
# Closed-form ridge fit: solve (X'X + I) w = X'y, i.e. regularization weight 1.
w = (X'*X + I) \ (X'*y)
@show X*w

# Same rescaling as in the OLS experiment (targets by 5, features by 3), then refit.
yscale = 5*y
Xscale = 3*X
wscale = (Xscale'*Xscale + I) \ (Xscale'*yscale)
# Divide by 5 to undo the target scaling before comparing predictions.
@show Xscale * wscale / 5;

# Relative difference between the two sets of predictions.
@show rel_err = norm(X*w - Xscale*wscale/5) / norm(X*w)

In [None]:
# standardize
"""
    standardize(X, y)

Center and scale the columns of `X` and the vector `y`.

Each column of `X` has its mean subtracted and is divided by its sample standard deviation;
`y` is treated the same way. Returns `(X_standard, y_standard)`.

A constant column of `X` (or a constant `y`) has zero standard deviation, so the
corresponding entries of the result are not finite.
"""
function standardize(X,y)
    # Broadcasting rescales each column directly; there is no need to build a dense
    # d×d diagonal matrix and multiply by it.
    X_standard = (X .- mean(X, dims=1)) ./ std(X, dims=1)
    y_standard = (y .- mean(y)) ./ std(y)

    return X_standard, y_standard
end

# Closed-form ridge fit (regularization weight 1) on the standardized data.
Xs, ys = standardize(X,y)
w = (Xs'*Xs + I) \ (Xs'*ys)
@show Xs*w

# Apply a different scale and shift to the targets and to the features.
yscale = 5*y .+ 3000
Xscale = 3*X .+ 200

# Standardize the transformed data and refit; the predictions should match those above.
Xss, yss = standardize(Xscale,yscale)
wscale = (Xss'*Xss + I) \ (Xss'*yss)
@show Xss*wscale;

# Relative difference between the two sets of standardized predictions.
@show rel_err = norm(Xs*w -  Xss*wscale) / norm(Xs*w)

# let's compare different kinds of regularized regression

In [None]:
"""
    ridge_regression(X, y; λ=1)

Fit a linear model to `(X, y)` by calling `proxgrad` with the loss `QuadLoss()` scaled by
one over the number of rows of `X`, the regularizer `QuadReg(λ)`, and `maxiters=1000`.

Returns whatever `proxgrad` returns, which the rest of this notebook uses as a
coefficient vector.

# Keywords
- `λ`: value passed to `QuadReg`.
"""
function ridge_regression(X,y; λ=1)
    # Take the sample count from the data actually passed in, rather than from a global
    # `n` that may describe a different data set.
    nsamples = size(X, 1)
    return proxgrad(1/nsamples*QuadLoss(), QuadReg(λ), X, y, maxiters=1000)
end

In [None]:
"""
    lasso(X, y; λ=1)

Fit a linear model to `(X, y)` by calling `proxgrad` with the loss `QuadLoss()` scaled by
one over the number of rows of `X`, the regularizer `OneReg(λ)`, and `maxiters=1000`.

Returns whatever `proxgrad` returns, which the rest of this notebook uses as a
coefficient vector.

# Keywords
- `λ`: value passed to `OneReg`.
"""
function lasso(X,y; λ=1)
    # Take the sample count from the data actually passed in, rather than from a global
    # `n` that may describe a different data set.
    nsamples = size(X, 1)
    return proxgrad(1/nsamples*QuadLoss(), OneReg(λ), X, y, maxiters=1000)
end

In [None]:
"""
    nnls(X, y)

Fit a linear model to `(X, y)` by calling `proxgrad` with the loss `QuadLoss()` scaled by
one over the number of rows of `X`, the regularizer `NonNegConstraint()`, and
`maxiters=1000`.

Returns whatever `proxgrad` returns, which the rest of this notebook uses as a
coefficient vector.
"""
function nnls(X,y)
    # Take the sample count from the data actually passed in, rather than from a global
    # `n` that may describe a different data set.
    nsamples = size(X, 1)
    return proxgrad(1/nsamples*QuadLoss(), NonNegConstraint(), X, y, maxiters=1000)
end

In [None]:
# generate three kinds of true weight vectors, each of dimension d

d = 30
# dense weights with standard-normal entries
w_randn = randn(d)
# sparse weights: each entry is nonzero with probability .5, nonzeros are standard normal
w_sparse = sprandn(d, .5)
# sparse weights: nonzeros are drawn uniformly from [0, 1), so all entries are nonnegative
w_pos = sprand(d, .5);

In [None]:
# display the sparse weight vector to see which entries are stored
w_sparse

In [None]:
# find best model for each type of data
# true weights for this run; w_sparse and w_pos are the other candidates defined earlier
w = w_randn

# regularization weights passed to ridge_regression and lasso
λridge=.1
λlasso=.1

# 30 noiseless samples from the chosen weights, then one fit per regularizer
X,y = generate_data(30, w)
w_ridge = ridge_regression(X,y; λ=λridge)
w_lasso = lasso(X,y; λ=λlasso)
w_nonneg = nnls(X,y);

In [None]:
# distribution of the fitted ridge coefficients, in bins of width .1 on [-3, 3]
histogram(w_ridge, label="ridge coefficients", bins=-3:.1:3)

In [None]:
# overlay lasso and ridge coefficient histograms (alpha=.7 keeps both visible)
histogram(w_lasso, label="lasso coefficients", bins=-3:.1:3, alpha=.7)
histogram!(w_ridge, label="ridge coefficients", bins=-3:.1:3, alpha=.7)

Which coefficients are more sparse? (More 0 coefficients)
* A) ridge
* B) lasso

In [None]:
# overlay nonnegative-least-squares and ridge coefficient histograms
histogram(w_nonneg, label="nonnegative coefficients", bins=-3:.1:3, alpha=.7)
histogram!(w_ridge, label="ridge coefficients", bins=-3:.1:3, alpha=.7)

Which coefficients are more sparse? (More 0 coefficients)
* A) ridge
* B) nonnegative least squares

In [None]:
# which fits data best?
# 20 fresh noiseless samples from the same true weights w
Xtest,ytest = generate_data(20,w)

# predicted vs. true response for each fitted model
scatter(ytest,Xtest*w_ridge,label="ridge")
scatter!(ytest,Xtest*w_lasso,label="lasso")
scatter!(ytest,Xtest*w_nonneg,label="NNLS")
# reference line y = x: a perfect model would put every point on it
plot!(ytest,ytest,label="true model")
xlabel!("true value")
ylabel!("predicted value")

In [None]:
# sweep lambda and compare error on a held-out test set
# (a single train/test split, not k-fold cross validation)
Random.seed!(1)

# 40 features but only 30 noisy training samples, plus 30 noisy test samples
w = randn(40)
X,y = generate_noisy_data(30, w)
Xtest,ytest = generate_noisy_data(30, w)

# sum of squared test errors, one entry per value of λ
ridge_error = Float64[]
lasso_error = Float64[]
λs = 0:.1:1
for λ in λs
    # note: `w` is reused here for the fitted weights
    w = ridge_regression(X,y; λ=λ)
    push!(ridge_error, sum((ytest - Xtest*w).^2))
    w = lasso(X,y; λ=λ)
    push!(lasso_error, sum((ytest - Xtest*w).^2))
end
plot(λs, ridge_error, label="ridge")
plot!(λs, lasso_error, label="lasso")
ylabel!("test error")
xlabel!("regularization parameter")