In [7]:
using Plots, Random, LinearAlgebra, Statistics, SparseArrays
plotly()

Plots.PlotlyBackend()

# Prox of nonnegative regularizer

In [13]:
# The prox of the nonnegativity indicator, evaluated at z, is the minimizer of
#     obj(x) = I(x>=0) + 1/2(x-z)^2
# where I(x>=0) is 0 for x >= 0 and Inf otherwise.

z = 1 # try out different values!
x = -5:.1:5
objx = map(x) do xi
    # infeasible points (xi < 0) get infinite objective
    xi < 0 ? Inf : 0 + 1/2*(xi - z)^2
end
plot(x, objx)
xlabel!("x")
ylabel!("obj(x)")

In [14]:
# Prox of the nonnegativity indicator: replace every negative entry by 0
# and keep the nonnegative entries unchanged (works on scalars and arrays).
function prox_nonneg(x)
    return broadcast(max, 0, x)
end

prox_nonneg (generic function with 1 method)

In [15]:
# draw a random point on the unit circle and plot it next to its prox,
# i.e. the point with its negative coordinates replaced by 0
x = randn(2)
x /= norm(x)
px = prox_nonneg(x)

plot([x[1]],[x[2]],shape=:o,label="x")
# legend label fixed: it previously read "prox_nonnneg(x)"
plot!([px[1]],[px[2]],shape=:d,label="prox_nonneg(x)")
xlims!((-1,1.2))
ylims!((-1,1.2))

# Prox of l1 regularizer

In [16]:
# The prox of the l1 regularizer, evaluated at z, is the minimizer of
#     obj(x) = |x| + 1/2(x-z)^2
z = 2
x = -3:.1:3
objx = map(xi -> abs(xi) + 1/2*(xi - z)^2, x)
plot(x, objx)
xlabel!("x")
ylabel!("obj(x)")

In [17]:
"""
    prox_l1(x, alpha=1)

Soft-thresholding operator: the prox of `alpha*|x|`.

For a number `x`, shrink it toward zero by `alpha`, returning zero when
`|x| <= alpha`. For an array `x`, apply the scalar rule to every entry and
return a `Float64` array of the same size.
"""
function prox_l1(x::Number, alpha=1)
    if x > alpha
        return x - alpha
    elseif x < -alpha
        return x + alpha
    else
        # a zero of the same type as the other branches keeps the return type
        # stable (a literal `0` would return an Int for Float64 input)
        return zero(x - alpha)
    end
end
function prox_l1(x::AbstractArray, alpha=1)
    return reshape(Float64[prox_l1(xi, alpha) for xi in x], size(x))
end

prox_l1 (generic function with 4 methods)

In [7]:
# draw one standard-normal sample and print it together with its l1 prox
@show x = randn()
@show px = prox_l1(x);

x = randn() = -0.044544268986984666
px = prox_l1(x) = 0


In [8]:
# plot the soft-thresholding function prox_l1 on [-2, 2]
# (`range` replaces `linspace`, which was removed in Julia 1.0)
x = range(-2, stop=2, length=100)
px = [prox_l1(xi) for xi in x]
plot(x, px)
xlabel!("x")
ylabel!("prox_l1(x)")

In [9]:
# same as above for a vector: prox_l1 is applied to each entry
@show x = randn(2)
@show px = prox_l1(x);

x = randn(2) = [-0.787789, -0.110904]
px = prox_l1(x) = [0.0, 0.0]


# proximal gradient method

$$\nabla \|Xw-y\|^2 = \nabla((Xw-y)^T (Xw-y)) = 2X^T(Xw-y)$$

In [10]:
# proximal gradient method for quadratic loss and nonnegative regularizer
#
# Approximately minimizes ||X*w - y||^2 subject to w >= 0.
#
# Keyword arguments:
#   maxiters - number of iterations to run
#   stepsize - step length used in the gradient step
#   w        - initial iterate (defaults to the zero vector)
#
# Returns the final iterate and the vector of objective values recorded
# after every iteration.
function proxgrad_quad_nonneg(X, y; maxiters = 10, stepsize = 1, w = zeros(size(X,2)))
    # note: the keyword `w` is used as the starting point; it is not reset here
    objval = Float64[]
    for i=1:maxiters
        # gradient step
        g = 2X'*(X*w-y) # compute quadratic gradient
        z = w - stepsize*g
        # prox step
        w = prox_nonneg(z)
        # record objective value; the parentheses matter, since `?:` binds
        # more loosely than `+`
        push!(objval, norm(X*w-y)^2 + (any(w .< 0) ? Inf : 0))
    end
    return w, objval
end

proxgrad_quad_nonneg (generic function with 1 method)

In [11]:
# proximal gradient method for quadratic loss and l1 regularizer
#
# Approximately minimizes ||X*w - y||^2 + λ*||w||_1.
#
# Keyword arguments:
#   maxiters - number of iterations to run
#   stepsize - step length used in the gradient step
#   λ        - weight of the l1 regularizer
#   w        - initial iterate (defaults to the zero vector); a scalar is
#              accepted when X has a single column
#
# Returns the final iterate and the vector of objective values recorded
# after every iteration.
function proxgrad_quad_l1(X, y; maxiters = 10, stepsize = 1, λ = 1, w = zeros(size(X,2)))
    objval = Float64[]
    for i=1:maxiters
        # gradient step
        g = 2X'*(X*w-y) # compute quadratic gradient
        # broadcast so that a scalar starting point also works
        z = w .- stepsize .* g
        # prox step: prox of stepsize*λ*|.| is soft-thresholding at stepsize*λ
        w = prox_l1(z, stepsize*λ)
        # record objective value (the regularizer is weighted by λ)
        push!(objval, norm(X*w-y)^2 + λ*norm(w,1))
    end
    return w, objval
end

proxgrad_quad_l1 (generic function with 1 method)

Let's solve the problem $$\text{minimize} \quad (y-w)^2 + |w|$$ with the proximal gradient method.

In [12]:
# objective (w - y)^2 + |w| sampled on a grid of w values
# (`range` replaces `linspace`, which was removed in Julia 1.0)
y = 2
wvec = range(-3, stop=3, length=100)
objwvec = [(wi-y)^2 + abs(wi) for wi in wvec]
plot(wvec, objwvec, label="objective")
xlabel!("w")
ylabel!("obj(w)")

In [13]:
# run 20 proximal gradient iterations starting from w = -2 on the
# one-dimensional problem (X = 1, y = 2), then mark the final iterate
# on the objective curve
w = -2
w, obj = proxgrad_quad_l1(fill(1.0, 1, 1), fill(2.0, 1);
                          w = w, stepsize = .1, maxiters = 20)

plot(wvec, objwvec; label="objective")
xlabel!("w")
ylabel!("obj(w)")
plot!([w], [obj[end]]; color=:red, shape=:o, label="final iterate")

In [14]:
# objective value recorded after each of the iterations above
obj

20-element Array{Float64,1}:
 10.71   
  6.0444 
  4.0    
  3.19   
  2.6716 
  2.33982
  2.12749
  1.99159
  1.90462
  1.84896
  1.81333
  1.79053
  1.77594
  1.7666 
  1.76063
  1.7568 
  1.75435
  1.75279
  1.75178
  1.75114

# Introduce LowRankModels

In [15]:
using LowRankModels



In [16]:
# loss function: a quadratic loss built with the constructor's defaults
loss = QuadLoss()

LowRankModels.QuadLoss(1.0, LowRankModels.RealDomain())

In [17]:
# regularizers
# lambda is the scale passed to the l1 and quadratic regularizers below
lambda = 1

# nonnegativity constraint, l1 regularizer, and quadratic (ridge) regularizer
nonneg = NonNegConstraint()
l1 = OneReg(lambda)
l2 = QuadReg(lambda)

LowRankModels.QuadReg(1.0)

In [18]:
# the quad loss returns the sum of squared differences between its first and second argument
evaluate(loss, 2., 3.)

1.0

In [19]:
# a loss can be multiplied by a number; here the value is scaled by 3
evaluate(3*loss, 2., 3.)

3.0

In [20]:
# vector arguments: the scaled squared differences of all entries are summed
evaluate(1/2*loss, [2., 2.], [3., 3.])

1.0

In [21]:
# can also evaluate the gradient wrt the first argument
# (for the quad loss (u - a)^2 at u = 2, a = 0 this is 2(u - a) = 4)
grad(loss, 2., 0.)

4.0

In [22]:
# swapping the two arguments flips the sign of the gradient
grad(loss, 0., 2.)

-4.0

In [23]:
# gradient of the L1 loss wrt its first argument; here it equals 1.0,
# the sign of the residual 2 - 0
grad(L1Loss(), 2., 0.)

1.0

In [24]:
# can evaluate the proximal operator of the regularizer
# (for the nonnegativity constraint the negative entry is mapped to 0)
prox(nonneg, [-1, 1])

2-element Array{Int64,1}:
 0
 1

In [25]:
# can evaluate the proximal operator of lambda times the regularizer
# (the third argument is the multiplier applied to the regularizer)
λ = .01
prox(l1, 1, λ)

0.99

In [26]:
# chain rule: 
# gradient of ||Xw - y||^2 wrt w is X' * <gradient of ||z-y||^2 wrt z>, 
# where z = X*w

In [27]:
import LowRankModels: evaluate, grad

# Loss of the linear model's predictions X*w measured against y.
function evaluate(loss::Loss, X::Array{Float64,2}, w, y)
    predictions = X*w
    return evaluate(loss, predictions, y)
end

# Gradient of that loss with respect to w, by the chain rule:
# X' times the gradient of the loss with respect to the predictions.
function grad(loss::Loss, X::Array{Float64,2}, w, y)
    predictions = X*w
    return X'*grad(loss, predictions, y)
end

grad (generic function with 19 methods)

In [28]:
# proximal gradient method
#
# Starting from w = 0, alternate a gradient step on the loss with a prox
# step on the regularizer. Returns the final iterate together with the
# objective value (loss + regularizer) recorded after every iteration.
function proxgrad(loss, reg, X, y; maxiters = 10, stepsize = 1)
    w = zeros(size(X,2))
    history = Float64[]
    for _ in 1:maxiters
        # gradient step followed by the prox step
        stepped = w - stepsize*grad(loss, X, w, y)
        w = prox(reg, stepped, stepsize)
        # objective at the new iterate
        current = evaluate(loss, X, w, y) + evaluate(reg, w)
        push!(history, current)
    end
    return w, history
end

proxgrad (generic function with 1 method)

In [29]:
# proximal gradient method
#
# Same iteration as the untyped version, but progress is logged into the
# ConvergenceHistory `ch` (elapsed time of each iteration and the objective
# value after it) and only the final iterate is returned.
function proxgrad(loss::Loss, reg::Regularizer, X, y;
                  maxiters::Int = 10, stepsize::Number = 1., 
                  ch::ConvergenceHistory = ConvergenceHistory("proxgrad"))
    w = zeros(size(X,2))
    for t in 1:maxiters
        started = time()
        # gradient step, then prox step
        w = prox(reg, w - stepsize*grad(loss, X, w, y), stepsize)
        # the elapsed time is taken before the objective is evaluated
        elapsed = time() - started
        objective = evaluate(loss, X, w, y) + evaluate(reg, w)
        update_ch!(ch, elapsed, obj = objective)
    end
    return w
end

proxgrad (generic function with 2 methods)

In [30]:
# seed the global RNG so the example is reproducible
# (`Random.seed!` replaces `srand`, which was removed in Julia 1.0)
Random.seed!(0)
X, y = rand(6,3), rand(6);

In [31]:
# solve a nonnegative least squares problem and log progress in `ch`
ch = ConvergenceHistory("NNLS")
w = proxgrad(QuadLoss(), NonNegConstraint(), X, y; ch = ch, maxiters=50, stepsize=.1)

# objective value after each iteration
@show ch.objective
plot(ch.objective)
xlabel!("iteration")
ylabel!("objective")

ch.objective = [0.655586, 0.546513, 0.487293, 0.451002, 0.428631, 0.414751, 0.407283, 0.402936, 0.39954, 0.39679, 0.394556, 0.39274, 0.391265, 0.390065, 0.38909, 0.388298, 0.387653, 0.38713, 0.386704, 0.386359, 0.386077, 0.385849, 0.385663, 0.385512, 0.38539, 0.385287, 0.385199, 0.385121, 0.385051, 0.384988, 0.38493, 0.384876, 0.384827, 0.384781, 0.384739, 0.3847, 0.384663, 0.38463, 0.384598, 0.384569, 0.384541, 0.384516, 0.384493, 0.384471, 0.38445, 0.384431, 0.384414, 0.384397, 0.384382, 0.384368]


In [32]:
# gap between each objective value and the last one; subtracting a scalar
# from a vector requires broadcasting (`.-`) in Julia >= 1.0
plot(ch.objective .- ch.objective[end]) # try semilog
xlabel!("iteration")
ylabel!("objective")

# Let's generate some more data and test out these models

In [33]:
# Draw n samples with standard-normal features and noiseless
# responses y = X*w. Returns the pair (X, y).
function generate_data(n, w)
    features = randn(n, length(w))
    return features, features*w
end

# Draw n samples with standard-normal features and responses
# y = X*w + noise*randn(n). Returns the pair (X, y).
#
# Keyword arguments:
#   noise - standard deviation of the additive Gaussian noise (default .1,
#           the value that used to be hard-coded)
function generate_noisy_data(n, w; noise = .1)
    X = randn(n,length(w))
    y = X*w + noise*randn(n)
    return X, y
end

generate_noisy_data (generic function with 1 method)

# let's repeat what we did in the regularized regression notebook, using our nifty proximal gradient method

Compare different kinds of regularized regression: ridge regression, nonnegative least squares, and lasso.

In [34]:
# Ridge regression: quadratic loss plus λ times the quadratic regularizer,
# fit with proxgrad. Extra keyword arguments are forwarded to proxgrad.
ridge_regression(X, y; λ=1, kwargs...) =
    proxgrad(QuadLoss(), λ*QuadReg(), X, y; kwargs...)

ridge_regression (generic function with 1 method)

In [35]:
# Nonnegative least squares: quadratic loss with a nonnegativity constraint,
# fit with proxgrad. Keyword arguments are forwarded to proxgrad.
nnls(X, y; kwargs...) =
    proxgrad(QuadLoss(), NonNegConstraint(), X, y; kwargs...)

nnls (generic function with 1 method)

In [36]:
# Lasso: quadratic loss plus λ times the l1 regularizer, fit with proxgrad.
# Extra keyword arguments are forwarded to proxgrad.
lasso(X, y; λ=1, kwargs...) =
    proxgrad(QuadLoss(), λ*OneReg(), X, y; kwargs...)

lasso (generic function with 1 method)

In [37]:
# generate data

# three candidate ground-truth weight vectors of dimension d:
# dense Gaussian, sparse Gaussian, and sparse with uniform [0,1) entries
# (each sparse entry is nonzero with probability .5)
d = 30
w_randn = randn(d)
w_sparse = sprandn(d, .5)
w_pos = sprand(d, .5);

# the model used below; swap in w_randn or w_sparse to compare
w = w_pos

# 30 noisy training samples from that model
X, y = generate_noisy_data(30, w)

([0.147604 -0.315522 … -1.13485 0.0200871; 0.403393 1.31209 … 0.890369 -0.15671; … ; 0.186545 0.907637 … -0.36707 -0.946525; 0.195274 0.720839 … -0.620002 1.00196], [-2.77543, -0.167133, 0.733351, -0.463421, -1.57937, 2.46772, -4.12558, -4.07051, -3.25865, -3.47735  …  1.76497, -3.62625, 3.22828, 3.86394, 4.80149, -3.58891, 0.0357586, 1.00687, -2.88054, 3.99267])

In [38]:
maxiters = 100
# scale the step by the operator (spectral) norm of X; since Julia 0.7,
# `norm(X)` on a matrix is the Frobenius norm and `opnorm(X)` is the operator norm
stepsize = .01/opnorm(X)

# fit the three regularized models with the same iteration budget and step size
w_ridge = ridge_regression(X,y, maxiters=maxiters, stepsize=stepsize)
w_nonneg = nnls(X,y, maxiters=maxiters, stepsize=stepsize)
w_lasso = lasso(X,y, maxiters=maxiters, stepsize=stepsize);

In [39]:
# distribution of the fitted ridge coefficients
histogram(w_ridge)

In [40]:
# distribution of the fitted lasso coefficients
histogram(w_lasso, bins=50)

In [41]:
# distribution of the fitted nonnegative least squares coefficients
histogram(w_nonneg)

In [42]:
# which fits data best?
# draw 20 fresh noiseless samples from the same model w
Xtest,ytest = generate_data(20,w)

# predicted vs. true response for each fitted model
scatter(ytest,Xtest*w_ridge,label="ridge")
scatter!(ytest,Xtest*w_lasso,label="lasso")
scatter!(ytest,Xtest*w_nonneg,label="NNLS")
# reference line y = x: a perfect prediction lies on it
plot!(ytest,ytest,label="true model")
xlabel!("true value")
ylabel!("predicted value")

In [43]:
# cross validate over lambda

# ground-truth model plus independent training and test sets drawn from it
w = .5*randn(40)
X,y = generate_noisy_data(30, w)
Xtest,ytest = generate_noisy_data(30, w)

# Fit ridge regression for each λ and record the squared error on the test set.
# The fitted weights get their own name so the ground truth `w` is not
# overwritten, and the results are not stored in a variable called `error`,
# which would shadow `Base.error`.
λs = 0:.1:10
testerror = map(λs) do λ
    w_fit = ridge_regression(X,y; λ=λ, maxiters=maxiters, stepsize=stepsize)
    sum(abs2, ytest - Xtest*w_fit)
end
plot(λs, testerror)
xlabel!("lambda")
ylabel!("error")