# Parameterization Experiments

In this notebook, we illustrate how to learn a *parameterized* form for $\Omega$ from a data set whose true labels are known.

In [13]:
using StatsBase
using Combinatorics
using Plots
using Optim

include("jl/omega.jl")
include("jl/HSBM.jl")
include("jl/read_data.jl")
include("jl/inference.jl")
include("jl/objectives.jl")
include("jl/hypergraph_louvain.jl");

In [7]:
"""
    initializeExperiment(dataset, kmax_)

Read a data set, form Ω using a currently hard-coded functional form, and return
the data needed by `optimizeParameters` and the plotting routines.

# Arguments
- `dataset`: forwarded unchanged to `read_hypergraph_data`.
- `kmax_`: forwarded unchanged to `read_hypergraph_data`
  (presumably the largest edge size to keep; confirm against that function).

# Returns
The tuple `(H, Z, Ω, α0, kmin, kmax)`, where
- `H, Z` are the two values returned by `read_hypergraph_data`,
- `kmin`, `kmax` are the smallest and largest keys of `H.E`,
- `α0` is a random starting vector of length `2kmax`: the first `kmax` entries are
  drawn from `rand`, the last `kmax` entries are `1 + rand`,
- `Ω` is `buildΩ(ω, α0, kmax)` for the ω described below.

# Functional form
With `k = sum(p)` and `n = length(H.D)`,

    ω(p, α) = (k / sum(p .* (1:length(p)) .^ α[k])) / n^(α[kmax + k] * k)
"""
function initializeExperiment(dataset, kmax_)
    H, Z = read_hypergraph_data(dataset, kmax_)

    # Range of keys present in H.E (presumably the observed edge sizes; TODO confirm).
    sizes = keys(H.E)
    kmax = maximum(sizes)
    kmin = minimum(sizes)

    # Draw the two halves of the starting point in this order so that the sequence
    # of random numbers consumed matches a single `vcat(rand(...), 1 .+ rand(...))`.
    first_half = rand(kmax)
    second_half = 1.0 .+ rand(kmax)
    α0 = vcat(first_half, second_half)

    n = length(H.D)

    # Hard-coded parameterization: α[k] enters the denominator sum as an exponent,
    # α[kmax + k] scales the power of n.
    function ω(p, α)
        k = sum(p)
        weighted = sum(p .* (1:length(p)) .^ α[k])
        return (k / weighted) / n^(α[kmax + k] * k)
    end

    Ω = buildΩ(ω, α0, kmax)

    return H, Z, Ω, α0, kmin, kmax
end

"""
    optimizeParameters(objective, α0; n_outer = 50, bounds_γ = (-3, 3), bounds_β = (0, 100))

Custom coordinate-wise optimizer, specific to the functional form used in
`initializeExperiment`. It assumes `α = (β, γ)`: the first half of the vector (β)
controls homogeneity and the second half (γ) controls size density.

Each outer sweep minimizes `objective` over one coordinate at a time with a bounded
univariate `Optim.optimize` call (no gradient information, so this is slow): first
every γ coordinate within `bounds_γ`, then every β coordinate within `bounds_β`.

# Arguments
- `objective`: function of the full parameter vector, to be minimized.
- `α0`: starting vector of length `2kmax`. It is not modified.

# Keywords
- `n_outer`: number of full sweeps over all coordinates.
- `bounds_γ`, `bounds_β`: `(lower, upper)` search intervals for the γ and β coordinates.

# Returns
`(α, ll)`, where `α` is the optimized copy of `α0` and `ll` is the negated objective
value reported by the last univariate solve. If no solve ran (for example
`n_outer = 0`), `ll` is `-objective(α)` at the starting point.
"""
function optimizeParameters(
    objective, α0; n_outer = 50, bounds_γ = (-3, 3), bounds_β = (0, 100)
)
    # Work on a copy: the coordinate updates below write into α, and the caller's
    # initial guess must survive the call.
    α = copy(α0)
    kmax = length(α) ÷ 2

    # Result of the most recent univariate solve; stays `nothing` if none is run.
    res = nothing

    # Objective as a function of coordinate k alone, all other entries held fixed.
    function inner_obj(α, a, k)
        α_ = copy(α)
        α_[k] = first(a)
        return objective(α_)
    end

    for _ in 1:n_outer
        # optimization in γ (second half of α)
        for k in (kmax + 1):(2 * kmax)
            res = optimize(a -> inner_obj(α, a, k), bounds_γ[1], bounds_γ[2])
            α[k] = first(Optim.minimizer(res))
        end
        # optimization in β (first half of α)
        for k in 1:kmax
            res = optimize(a -> inner_obj(α, a, k), bounds_β[1], bounds_β[2])
            α[k] = first(Optim.minimizer(res))
        end
    end

    # With no solver result there is nothing to query, so evaluate the objective
    # directly at the (unchanged) starting point.
    ll = res === nothing ? -objective(α) : -Optim.minimum(res)
    return α, ll
end

optimizeParameters (generic function with 1 method)

In [9]:
# Load the "contact-primary-school" data set (passing 5 as `kmax_`) and build the
# initial Ω together with the random starting parameter vector α0.
H, Z, Ω, α0, kmin, kmax = initializeExperiment("contact-primary-school", 5);
# Length of the label vector Z; shown as this cell's output.
n = length(Z)

242

In [20]:
# Label vector 1, …, n (every index gets its own label).
# NOTE(review): Ẑ is not used in this cell and is reassigned by the Louvain cell — confirm it is still needed.
Ẑ = collect(1:n)

obj = formObjective(H, Z, Ω)  # best estimates from real partition
# Coordinate-wise fit starting from α0; returns the fitted parameter vector and `ll`.
α, ll = optimizeParameters(obj, α0)

([38.19660112501051, 1.2353680159713913, 4.919163488304201, 5.72356055553026, 4.352308670823825, 2.9999986890149417, 0.8684992133442138, 1.0704318769313845, 1.2965946988262813, 1.5240034391357193], -187328.22817845453)

In [23]:
@time Ẑ = SuperNodeLouvain(H,kmax,Ω;α=α) # NOTE(review): a bug was noted here (UndefVarError: changed not defined); the recorded run below completed, so confirm whether it is fixed

# There is also a more general problem: if no nodes move between clusters even after
# we have made a prima facie reasonable estimate of the parameters, we are going to have significant issues.

SuperNode Louvain: Phase 1

Louvain Iteration 1
Louvain Iteration 2
Louvain Iteration 3
Louvain Iteration 4
Louvain Iteration 5
Louvain Iteration 6
SuperNode Louvain: Phase 2

Louvain Iteration 1
Louvain Iteration 2
SuperNode Louvain: Phase 3

Louvain Iteration 1
617.877323 seconds (5.40 G allocations: 415.556 GiB, 13.54% gc time)


242-element Array{Int64,1}:
  1
  1
  2
  3
  3
  4
  5
  6
  7
  8
  4
  5
  9
  ⋮
 41
 27
  4
 12
 19
 19
  4
 35
 48
  3
 24
 20

In [24]:
# Distinct labels appearing in Ẑ, the vector returned by SuperNodeLouvain above.
unique(Ẑ)

53-element Array{Int64,1}:
  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
  ⋮
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53