In [1]:
using Plots, Random, LinearAlgebra, Statistics, SparseArrays
pyplot()

Plots.PyPlotBackend()

# Prox of nonnegative regularizer

In [None]:
# The prox of the nonnegativity indicator at z is the minimizer of
#   obj(x) = I(x >= 0) + 1/2 (x - z)^2,
# where the indicator is 0 for feasible x and Inf otherwise.

z = -2  # try out different values!
x = -5:.1:5

# Infeasible points (x < 0) get an infinite objective; feasible points pay
# only the quadratic distance to z.
objx = map(x) do xi
    xi < 0 ? Inf : 0 + 1/2*(xi - z)^2
end
plot(x, objx; xlabel="x", ylabel="obj(x)")

In [None]:
"""
    prox_nonneg(x)

Clamp every entry of `x` from below at zero. This is the proximal operator of
the nonnegativity indicator, i.e. the projection onto `x >= 0`. Accepts
scalars as well as arrays.
"""
function prox_nonneg(x)
    return broadcast(max, 0, x)
end

In [None]:
# Draw a random point on the unit circle and project it onto the
# nonnegative orthant with prox_nonneg.
x = randn(2)
x /= norm(x)
px = prox_nonneg(x)

# Show the original point (circle) and its projection (diamond) on fixed
# axes so that repeated runs are directly comparable.
plot([x[1]],[x[2]],shape=:o,label="x")
plot!([px[1]],[px[2]],shape=:d,label="prox_nonneg(x)")
xlims!((-2,2))
ylims!((-2,2))

# Prox of l1 regularizer

In [None]:
# The prox of the l1 norm at z is the minimizer of |x| + 1/2 (x - z)^2.
z = -3
x = -3:.1:3

# Tabulate the objective on the grid and plot it.
objx = map(xi -> abs(xi) + 1/2*(xi - z)^2, x)
plot(x, objx; xlabel="x", ylabel="obj(x)")

In [None]:
"""
    prox_l1(x::Number, alpha=1)

Soft-threshold the scalar `x` by `alpha`: shrink `x` toward zero by `alpha`,
and return zero when `|x| <= alpha`. This is the minimizer over `u` of
`alpha*|u| + 1/2*(u - x)^2`.

The zero branch has the same type as the shrunken branches, so broadcasting
over a `Float64` vector yields a `Vector{Float64}`.
"""
function prox_l1(x::Number, alpha=1)
    if x > alpha
        return x - alpha
    elseif x < -alpha
        return x + alpha
    else
        # zero of the promoted type keeps the return type independent of the branch taken
        return zero(x - alpha)
    end
end

In [None]:
# Draw one standard-normal sample and print it next to its soft-thresholded
# value (default threshold alpha = 1).
@show x = randn()
@show px = prox_l1(x);

In [None]:
# Plot the soft-thresholding map on a grid over [-2, 2].
x = -2:.1:2
px = map(prox_l1, x)
plot(x, px; xlabel="x", ylabel="prox_l1(x)")

In [None]:
# prox_l1 is defined on scalars; broadcasting with `.` applies it to each
# entry of a vector.
@show x = randn(2)
@show px = prox_l1.(x);

# proximal gradient method

$$\nabla \|Xw-y\|^2 = \nabla((Xw-y)^T (Xw-y)) = 2X^T(Xw-y)$$

In [None]:
# proximal gradient method for quadratic loss and nonnegative regularizer
"""
    proxgrad_quad_nonneg(X, y; maxiters=10, stepsize=1, w=zeros(size(X,2)))

Run `maxiters` proximal gradient iterations on `||X*w - y||^2` subject to
`w >= 0`, starting from the iterate `w`.

Returns `(w, objval)`: the final iterate and a `Vector{Float64}` holding the
objective after every iteration (`Inf` if an iterate has a negative entry).
"""
function proxgrad_quad_nonneg(X, y; maxiters = 10, stepsize = 1, w = zeros(size(X,2)))
    objval = Float64[]
    for i=1:maxiters
        # gradient step
        g = 2X'*(X*w-y) # compute quadratic gradient
        z = w - stepsize*g
        # prox step
        w = prox_nonneg(z)
        # record objective value; the ternary must be parenthesized because
        # `?:` binds more loosely than `+`
        infeasible = any(wi -> wi < 0, w)
        push!(objval, norm(X*w-y)^2 + (infeasible ? Inf : 0))
    end
    return w, objval
end

In [None]:
# proximal gradient method for quadratic loss and l1 regularizer
"""
    proxgrad_quad_l1(X, y; maxiters=10, stepsize=1., λ=1., w=zeros(size(X,2)))

Run `maxiters` proximal gradient iterations on `||X*w - y||^2 + λ*||w||_1`,
starting from the iterate `w`.

Returns `(w, objval)`: the final iterate and a `Vector{Float64}` holding the
objective after every iteration.
"""
function proxgrad_quad_l1(X, y; maxiters = 10, stepsize = 1., λ = 1., w = zeros(size(X,2)))
    objval = Float64[]
    # the prox of stepsize*λ*|.| soft-thresholds at stepsize*λ; constant across iterations
    threshold = stepsize*λ
    for i=1:maxiters
        # gradient step
        g = 2X'*(X*w-y) # compute quadratic gradient
        z = w - stepsize*g
        # prox step
        w = prox_l1.(z, threshold)
        # record objective value, weighting the l1 term by λ so it matches
        # the problem the prox step is solving
        push!(objval, norm(X*w-y)^2 + λ*norm(w,1))
    end
    return w, objval
end

Let's solve the problem $$\text{minimize} \quad (y-w)^2 + |w|$$

In [None]:
# Scalar instance of the problem: tabulate (w - y)^2 + |w| on a grid of w.
y = 2
wvec = -3:.1:3
objwvec = map(wi -> (wi - y)^2 + abs(wi), wvec)
plot(wvec, objwvec; label="objective", xlabel="w", ylabel="obj(w)")

In [None]:
# Take 10 proximal gradient steps on the 1-D problem, starting from w = -2.
w = [-2]
X = ones(1, 1)
y = 2*ones(1)
w, obj = proxgrad_quad_l1(X, y; maxiters = 10, stepsize = .2, w = w)

# Redraw the objective curve and mark where the iterates ended up.
plot(wvec, objwvec; label="objective", xlabel="w", ylabel="obj(w)")
plot!([w], [obj[end]], color=:red, shape=:o, label="final iterate")

In [None]:
# display the objective value recorded after each iteration of the run above
obj

# Introduce LowRankModels

In [None]:
using LowRankModels

In [None]:
# loss function
loss = QuadLoss()

In [None]:
# regularizers
lambda = 1  # argument passed to the OneReg and QuadReg constructors below

nonneg = NonNegConstraint()
l1 = OneReg(lambda)
l2 = QuadReg(lambda)

In [None]:
# the quad loss returns the sum of square differences between its first and second argument
evaluate(loss, 2., 3.)

In [None]:
# a loss can be multiplied by a number before it is evaluated
evaluate(3*loss, 2., 3.)

In [None]:
# the same call also accepts vector arguments
evaluate(1/2*loss, [2., 2.], [3., 3.])

In [None]:
# can also evaluate the gradient wrt the first argument
grad(loss, 2., 0.)

In [None]:
# same loss with the two arguments swapped
grad(loss, 0., 2.)

In [None]:
# gradients are available for other losses too, here L1Loss
grad(L1Loss(), 2., 0.)

In [None]:
# can evaluate the proximal operator of the regularizer
prox(nonneg, [-1, 1])

In [None]:
# can evaluate the proximal operator of lambda times the regularizer
λ = .01
prox(l1, 1, λ)

In [None]:
# chain rule: 
# gradient of ||Xw - y||^2 wrt w is X' * <gradient of ||z-y||^2 wrt z>, 
# where z = X*w

In [None]:
import LowRankModels: evaluate, grad

# Loss of the linear model: apply the loss to the predictions z = X*w.
function evaluate(loss::Loss, X::Array{Float64,2}, w, y)
    z = X*w
    return evaluate(loss, z, y)
end

# Chain rule: the gradient wrt w is X' times the gradient of the loss wrt z = X*w.
function grad(loss::Loss, X::Array{Float64,2}, w, y)
    z = X*w
    return X'*grad(loss, z, y)
end

In [None]:
# proximal gradient method
"""
    proxgrad(loss, reg, X, y; maxiters=10, stepsize=1)

Run `maxiters` proximal gradient iterations from `w = 0`: a gradient step on
`loss` followed by the prox of `reg` with parameter `stepsize`.

Returns `(w, objval)`, where `objval` holds the value of
`evaluate(loss, X, w, y) + evaluate(reg, w)` after every iteration.
"""
function proxgrad(loss, reg, X, y; maxiters = 10, stepsize = 1)
    w = zeros(size(X,2))
    objval = Float64[]
    for _ in 1:maxiters
        # forward (gradient) step on the loss, then backward (prox) step on the regularizer
        w = prox(reg, w - stepsize*grad(loss, X, w, y), stepsize)
        # record objective value at the new iterate
        push!(objval, evaluate(loss, X, w, y) + evaluate(reg, w))
    end
    return w, objval
end

In [None]:
# proximal gradient method
"""
    proxgrad(loss::Loss, reg::Regularizer, X, y; maxiters=10, stepsize=1., ch=ConvergenceHistory("proxgrad"))

Run `maxiters` proximal gradient iterations from `w = 0` and return the final
iterate `w`. After every iteration, the time spent on the gradient and prox
steps and the current objective are passed to `update_ch!` on `ch`.
"""
function proxgrad(loss::Loss, reg::Regularizer, X, y;
                  maxiters::Int = 10, stepsize::Number = 1.,
                  ch::ConvergenceHistory = ConvergenceHistory("proxgrad"))
    w = zeros(size(X,2))
    for t in 1:maxiters
        tstart = time()
        # gradient step on the loss followed by the prox step on the regularizer
        w = prox(reg, w - stepsize*grad(loss, X, w, y), stepsize)
        # stop the clock before the objective is evaluated
        elapsed = time() - tstart
        update_ch!(ch, elapsed, obj = evaluate(loss, X, w, y) + evaluate(reg, w))
    end
    return w
end

In [None]:
# fix the seed so the small random problem below is reproducible
Random.seed!(0)
X, y = rand(6,3), rand(6);

In [None]:
# pass our own history object so the per-iteration objective can be read
# back after the solve
ch = ConvergenceHistory("NNLS")
w = proxgrad(QuadLoss(), NonNegConstraint(), X, y; 
             stepsize=.001, maxiters=50,
             ch = ch)

@show ch.objective
plot(ch.objective, label="NNLS")
xlabel!("iteration")
ylabel!("objective")

In [None]:
# objective relative to its last recorded value
plot(ch.objective .- ch.objective[end], label="NNLS") # try semilog
xlabel!("iteration")
ylabel!("objective")

# Let's generate some more data and test out these models

In [None]:
"""
    generate_data(n, w)

Draw an `n × length(w)` design matrix `X` with standard-normal entries and
return `(X, y)` with noiseless responses `y = X*w`.
"""
function generate_data(n, w)
    d = length(w)
    X = randn(n, d)
    return X, X*w
end

"""
    generate_noisy_data(n, w; noise=.1)

Draw an `n × length(w)` design matrix `X` with standard-normal entries and
return `(X, y)` with `y = X*w + noise*randn(n)`.

`noise` is the standard deviation of the additive Gaussian noise. Its default
of `.1` reproduces the previously hard-coded level.
"""
function generate_noisy_data(n, w; noise=.1)
    X = randn(n,length(w))
    # the noise is drawn after X, so the random stream is consumed in the same order as before
    y = X*w + noise*randn(n)
    return X, y
end

# let's repeat what we did in the regularized regression notebook, using our nifty proximal gradient method

Compare different kinds of regularized regression.

In [None]:
"""
    ridge_regression(X, y; λ=1, kwargs...)

Fit `w` by running `proxgrad` on the quadratic loss with the regularizer
`λ*QuadReg()`. All other keyword arguments are forwarded to `proxgrad`.
"""
ridge_regression(X, y; λ=1, kwargs...) =
    proxgrad(QuadLoss(), λ*QuadReg(), X, y; kwargs...)

In [None]:
"""
    nnls(X, y; kwargs...)

Fit `w` by running `proxgrad` on the quadratic loss with a
`NonNegConstraint()` regularizer. Keyword arguments are forwarded to
`proxgrad`.
"""
nnls(X, y; kwargs...) =
    proxgrad(QuadLoss(), NonNegConstraint(), X, y; kwargs...)

In [None]:
"""
    lasso(X, y; λ=1, kwargs...)

Fit `w` by running `proxgrad` on the quadratic loss with the regularizer
`λ*OneReg()`. All other keyword arguments are forwarded to `proxgrad`.
"""
lasso(X, y; λ=1, kwargs...) =
    proxgrad(QuadLoss(), λ*OneReg(), X, y; kwargs...)

In [None]:
# generate data

d = 30
# three candidate true models of dimension d:
w_randn = randn(d)          # dense vector with standard-normal entries
w_sparse = sprandn(d, .5)   # sparse vector, density .5, normally distributed nonzeros
w_pos = sprand(d, .5);      # sparse vector, density .5, nonzeros drawn by sprand's default sampler

# choose which true model generates the data (swap in w_randn or w_pos to compare)
w = w_sparse

X, y = generate_noisy_data(30, w)

In [None]:
# use the same iteration budget and step size for all three fits
maxiters = 10000
# NOTE(review): for a matrix, `norm` is the Frobenius norm in current Julia, not the
# operator norm — confirm this step size is small enough for the intended loss
stepsize = .1/norm(X)

w_ridge = ridge_regression(X,y, maxiters=maxiters, stepsize=stepsize)
w_nonneg = nnls(X,y, maxiters=maxiters, stepsize=stepsize)
w_lasso = lasso(X,y, maxiters=maxiters, stepsize=stepsize);

In [None]:
# distribution of the fitted ridge coefficients, in bins of width .1 on [-3, 3]
histogram(w_ridge, label="ridge coefficients", bins=-3:.1:3, alpha=.7)

In [None]:
# lasso vs ridge coefficients, overlaid on the same bins
histogram(w_lasso, label="lasso coefficients", bins=-3:.1:3, alpha=.7)
histogram!(w_ridge, label="ridge coefficients", bins=-3:.1:3, alpha=.7)

In [None]:
# nonnegative vs ridge coefficients, overlaid on the same bins
histogram(w_nonneg, label="nonnegative coefficients", bins=-3:.1:3, alpha=.7)
histogram!(w_ridge, label="ridge coefficients", bins=-3:.1:3, alpha=.7)

In [None]:
# which fits data best?
# Draw fresh noiseless data from the true model and compare each fit's
# predictions against the true responses.
Xtest, ytest = generate_data(20, w)

scatter(ytest, Xtest*w_ridge; label="ridge")
scatter!(ytest, Xtest*w_lasso; label="lasso")
scatter!(ytest, Xtest*w_nonneg; label="NNLS")
# reference line y = x: a point on it is predicted exactly
plot!(ytest, ytest; label="true model", xlabel="true value", ylabel="predicted value")

In [None]:
# cross validate over lambda
Random.seed!(1)

# true model with more features (40) than training examples (30)
w = randn(40)
X,y = generate_noisy_data(30, w)
Xtest,ytest = generate_noisy_data(30, w)

maxiters = 10000
stepsize = .1/norm(X)

# held-out squared error of each method, one entry per value of λ
ridge_error = Float64[]
lasso_error = Float64[]
λs = 0:.1:2
for λ in λs
    # the fits get their own names so that the loop never overwrites the
    # true model `w` defined above
    w_hat_ridge = ridge_regression(X,y, λ=λ, maxiters=maxiters, stepsize=stepsize)
    push!(ridge_error, sum(abs2, ytest - Xtest*w_hat_ridge))
    w_hat_lasso = lasso(X,y, λ=λ, maxiters=maxiters, stepsize=stepsize)
    push!(lasso_error, sum(abs2, ytest - Xtest*w_hat_lasso))
end
plot(λs, lasso_error, label="lasso")
plot!(λs, ridge_error, label="ridge")