In [1]:
using LinearAlgebra, Distributions, Random
using CSV, DataFrames, DelimitedFiles

In [2]:
data = CSV.read("../data/lr.csv", DataFrame);
data = Matrix(data);

In [3]:
X = data[:, 1:end - 1];
y = data[:, end];

In [4]:
n, p = size(X)

(100000, 10)

In [5]:
function σ(x)
    return one(x) / (one(x) + exp(-x))
end

σ (generic function with 1 method)

In [6]:
# return expected gradient with respect to current β.
function grad(X, y, β)
    n_obs = size(X)[1]
    prob = σ.(X * β)
    g  = X' * (prob .- y)
    return g
end

grad (generic function with 1 method)

In [7]:
function batch_shuffle(X, y, mb_size)
    n = size(X)[1]
    perm = randperm(n)[1:mb_size]
    X_mini = @view X[perm, :]
    y_mini = @view y[perm, :]
    return X_mini, y_mini
end

batch_shuffle (generic function with 1 method)

In [8]:
function step(βₖ, αₖ, ∇βₖ)
    βₖ₊₁ = @. βₖ - αₖ * ∇βₖ
    return βₖ₊₁
end

step (generic function with 1 method)

In [9]:
function loss(X, y, β)
    ŷ = σ.(X * β)
    loss_vector = @. y * log(ŷ) + (1 - y) * log(1 - ŷ)
    l = -sum(loss_vector)
    
    return l
end

loss (generic function with 1 method)

# SGD 

In [36]:
function sgd(X, y, β₀, α, eps, mb, max_iter = 10000)
    βᵢ = β₀
    ∇βᵢ = ones(p)
    niter = 0
    norm_∇βᵢ = 100
    
    while(norm_∇βᵢ > eps && niter < max_iter)
        
        X_mini, y_mini = batch_shuffle(X, y, mb)
        ∇βᵢ = grad(X_mini, y_mini, βᵢ)
        βᵢ₊₁ = step(βᵢ, α, ∇βᵢ)
        βᵢ = βᵢ₊₁
        
        #iteration wise metrics
        niter+=1
        norm_∇βᵢ = norm(∇βᵢ / mb)
        
        if (niter % 10) == 0
            @show niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)
        end
    end
    
    return βᵢ, niter
end      

sgd (generic function with 2 methods)

In [55]:
mb_sgd = 2500
βₛ, Nₛ = sgd(X, y, zeros(p), 1e-3, 0.02, mb_sgd)

(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (10, 0.030867645975344853, 1055.4561435987707)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (20, 0.019722285071791735, 1019.5602041328088)


([0.5833312226405079; 1.3927746532067395; … ; 1.0908295898693345; 0.1938477536407033;;], 20)

In [56]:
#number of gradient calculations
Nₛ * mb_sgd

50000

In [57]:
# check to make sure our algorithm is working.
grad(X, y, βₛ) / n
loss(X, y, βₛ) / n

0.4215091333652087

In [58]:
mb_size = 1000
X_mini, y_mini = batch_shuffle(X, y, mb_size)
grad(X_mini, y_mini, βₛ) / mb_size
loss(X_mini, y_mini, βₛ) / mb_size

0.4407288286841396

# Retrospective Approximation

In [59]:
function mb_update(mbₖ, Cₖ)
    return mbₖ * Cₖ
end

mb_update (generic function with 1 method)

In [60]:
eps = [1, 0.2, 0.16, 0.11, 0.09, 0.05, 0.02]
mbs = [2, 20, 50, 100, 200, 500, 1000];

In [61]:
function ra(mbs, eps, β₀)
    
    @assert(size(mbs) == size(eps))
    nₒ = size(mbs)[1]
    niter_total = 0
    ngrad_total = 0
    βᵢ = β₀
    @show nₒ
    
    # outer iterations
    for i in 1:nₒ
        X_inner, y_inner = batch_shuffle(X, y, mbs[i])
        @show i eps[i] mbs[i]
        βᵢ₊₁, niter_inner = sgd(X_inner, y_inner, βᵢ, 1e-3, eps[i], mbs[i])
        @show niter_inner
        
        niter_total += niter_inner
        ngrad_total += niter_inner * mbs[i]
        βᵢ = βᵢ₊₁
    end
    return βᵢ, niter_total, ngrad_total
end

ra (generic function with 1 method)

In [65]:
βᵣ, Nᵣ, ngradᵣ =  ra(mbs, eps, zeros(p))

nₒ = 7
i = 1
eps[i] = 1.0
mbs[i] = 2
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (10, 1.1542024020829027, 1.3316210393387382)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (20, 1.1202370005048166, 1.2801272333722535)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (30, 1.0875357944753503, 1.2316063997144542)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (40, 1.0560890986037725, 1.1858624541239222)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (50, 1.0258769305761173, 1.1427102906145923)
niter_inner = 59
i = 2
eps[i] = 0.2
mbs[i] = 20
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (10, 0.4751788087272859, 12.535911668236547)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (20, 0.4307801717633318, 11.728502147915194)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (30, 0.3928346112477327, 11.060637333738086)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (40, 0.3605394153764849, 10.50126128539027)
(niter, norm_∇βᵢ, loss(X_mini, y_mini, βᵢ)) = (50, 0.33304784348837113, 10.026631974364667)
(nite

([0.45860256165698593; 1.284638509845886; … ; 1.0678178671220218; 0.13161502098696476;;], 317, 28708)

In [63]:
norm(grad(X, y, βᵣ) / n)

0.03250632237619804

In [64]:
prob = σ.(X * β) 
yᵢ = Bern(probᵢ)

LoadError: UndefVarError: β not defined

# SGD with stepsize decay

In [308]:
function sgd_decay(X, y, β₀, α, ϵ, mb; lag = 50, max_iter = 1000)
    βᵢ = β₀
    ∇βᵢ = ones(p)
    niter = 0
    norm_∇βᵢ = 100
    min_norm_∇βᵢ = 1000
    no_imp_iter = 0
    
    while(norm_∇βᵢ > ϵ && niter < max_iter)
        
        #take steps.
        X_mini, y_mini = batch_shuffle(X, y, mb)
        ∇βᵢ = grad(X_mini, y_mini, βᵢ)
        βᵢ₊₁ = step(βᵢ, α, ∇βᵢ)
        βᵢ = βᵢ₊₁
        
        #iteration wise metrics
        niter+=1
        norm_∇βᵢ = norm(∇βᵢ / mb)
        l = loss(X_mini, y_mini, βᵢ) / mb
        
        # count iterations for which there has been no improvement in loss.
        if norm_∇βᵢ <= min_norm_∇βᵢ
            no_imp_iter = 0
            min_norm_∇βᵢ = norm_∇βᵢ
        else
            no_imp_iter+=1
        end
        
        #display metrics
        if (niter % 500) == 0
            @show norm_∇βᵢ, α, min_norm_∇βᵢ
        end
        
        # if no improvement in norm for "lag" iterations, reduce learning rate.
        if no_imp_iter >= lag
            α = α * 0.2
            @show norm_∇βᵢ, α, min_norm_∇βᵢ
            no_imp_iter = 0
        end
    end
    
    return βᵢ, niter, α
end

sgd_decay (generic function with 3 methods)

In [310]:
βₑ, Nₑ, αₑ =  sgd_decay(X, y, zeros(p), 1e-3, 0.01, 1500, lag = 50, max_iter = 10000)

(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.038812752684404206, 0.0002, 0.016594675830095298)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.016592623567614713, 4.0e-5, 0.013492138944955517)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.040160345715200774, 8.000000000000001e-6, 0.013492138944955517)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.024751539085444776, 1.6000000000000004e-6, 0.012156606619395879)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.03693136039578351, 3.200000000000001e-7, 0.012156606619395879)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.03515042679099938, 3.200000000000001e-7, 0.011468708848815548)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.03766135892320959, 6.400000000000003e-8, 0.011468708848815548)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.03424072340098075, 1.2800000000000007e-8, 0.011468708848815548)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.021590494146212256, 2.5600000000000015e-9, 0.011468708848815548)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.038182438913260675, 5.120000000000003e-10, 0.011468708848815548)
(norm_∇βᵢ, α, min_norm_∇βᵢ) = (0.03369033169014424, 1.0240

([0.5810305491296903; 1.4207680030632701; … ; 1.109164227389375; 0.1712325585170614;;], 1646, 2.6843545600000043e-23)

In [311]:
βₑ

10×1 Matrix{Float64}:
  0.5810305491296903
  1.4207680030632701
  0.4295348045020326
 -0.15885682282788663
  1.0069205917132373
 -0.32529142699927577
 -0.6220489067412067
  0.40485511669316526
  1.109164227389375
  0.1712325585170614

In [312]:
norm(grad(X, y, βₑ) / n)

0.0027623269936933293

# RA with decay SGD

In [313]:
function ra_decay(mbs, eps, β₀)
    
    @assert(size(mbs) == size(eps))
    nₒ = size(mbs)[1]
    niter_total = 0
    ngrad_total = 0
    βᵢ = β₀
    @show nₒ
    
    # outer iterations
    for i in 1:nₒ
        X_inner, y_inner = batch_shuffle(X, y, mbs[i])
        @show i eps[i] mbs[i]
        βᵢ₊₁, niter_inner = sgd_decay(X_inner, y_inner, βᵢ, 1e-3, eps[i], mbs[i])
        @show niter_inner
        
        niter_total += niter_inner
        ngrad_total += niter_inner * mbs[i]
        βᵢ = βᵢ₊₁
    end
    return βᵢ, niter_total, ngrad_total
end

ra_decay (generic function with 1 method)

In [314]:
ra_decay(mbs, eps, zeros(p))

nₒ = 7
i = 1
eps[i] = 1.0
mbs[i] = 2
niter_inner = 1
i = 2
eps[i] = 0.2
mbs[i] = 20
niter_inner = 169
i = 3
eps[i] = 0.16
mbs[i] = 50
niter_inner = 58
i = 4
eps[i] = 0.11
mbs[i] = 100
niter_inner = 20
i = 5
eps[i] = 0.09
mbs[i] = 200
niter_inner = 10
i = 6
eps[i] = 0.05
mbs[i] = 500
niter_inner = 12
i = 7
eps[i] = 0.02
mbs[i] = 1000
niter_inner = 15


([0.4843171301404079; 1.3706241374411625; … ; 0.9144653622641016; 0.19949178583093152;;], 285, 31282)