In [39]:
using Pkg; Pkg.activate(".")
using HypergraphModularity

using StatsBase

[32m[1m Activating[22m[39m environment at `~/hypergraph_modularities_code/Project.toml`


In [40]:
# Data set under study. Other data sets that have been swapped in here:
#   "contact-primary-school-classes", "walmart-trips", "TrivagoClickout"
dataset = "email-Enron-full"

# Second positional argument of the loader; presumably the largest hyperedge
# size to keep — TODO confirm against read_hypergraph_data.
kmax_ = 38

H = read_hypergraph_data(dataset, kmax_, false)

# Parameter vector of length 2*kmax_, all zeros. The intensity function ω
# defined below reads it at indices k and k + kmax.
α = zeros(2kmax_);

# Smallest and largest key present in H.E.
kmin, kmax = extrema(keys(H.E))

# Number of entries in H.D (presumably one per node — confirm).
n = length(H.D)

# All-or-nothing intensity function.
#
# `p` is read at two positions: δ = p[1] and k = p[2]. The value returned is
#     ((1 + (1 - δ)) * n)^α[k] / n^α[k + kmax]
# so `α` must be indexable at both k and k + kmax. `n` and `kmax` are notebook
# globals and are looked up at call time.
function ω(p, α)
    δ, k = p[1], p[2]
    scaled_n = (1 + (1 - δ)) * n
    return scaled_n^α[k] / n^α[k + kmax]
end

# Build the intensity-function object Ω from ω through the package constructor,
# passing kmax as its second argument. Ω is what learnParameters and modularity
# receive in the cells below.
Ω = allOrNothingIntensityFunction(ω, kmax);

In [41]:
# DYADIC experiment: ten rounds of clique-expansion clustering. After every
# round the resolution parameter γ, the dyadic rates (ω_in, ω_out) and the
# parameter vector α̂ are re-estimated from the new partition, and one table row
# is printed.
α̂ = α
verbose = false
maxits = 100
randflag = true
weighted = false

# Zwarm = collect(1:n)  # no warm start

println()

println("DYADIC")
for (title, width) in (("iteration", 20), ("L_D", 15), ("Q", 15),
                       ("NMI", 10), ("groups", 10), ("time (s)", 10))
    print(rpad(title, width))
end
println()
println("-"^80)

# Start from the partition with labels 1:n; Ẑ₁ always holds the previous
# round's partition so the NMI column compares consecutive rounds.
Ẑ = collect(1:n)
Ẑ₁ = Ẑ

γ = 5.0
for i in 1:10

    # Only the clustering call itself is timed.
    tic = time()
    Ẑ = CliqueExpansionModularity(H, γ, false, true)
    toc = time() - tic

    # Re-estimate everything that depends on the partition.
    γ = computeDyadicResolutionParameter(H, Ẑ)
    ω_in, ω_out = computeDyadicResolutionParameter(H, Ẑ; mode = "ω", weighted = weighted)
    α̂ = learnParameters(H, Ẑ, Ω, α̂; n_iters = 10, amin = -50, amax = 50)
    Q = modularity(H, Ẑ, Ω; α = α̂)

    L_D = dyadicLogLikelihood(H, Ẑ, ω_in, ω_out)

    # One table row, same column widths as the header.
    print(rpad(string(i), 20))
    print(rpad(string(round(L_D)), 15))
    print(rpad(string(round(Q)), 15))
    print(rpad(string(round(mutualInformation(Ẑ₁, Ẑ, true); digits = 2)), 10))
    print(rpad(string(length(unique(Ẑ))), 10))
    println(rpad(string(round(toc; digits = 3)), 10))

    Ẑ₁ = Ẑ
end


DYADIC
iteration           L_D            Q              NMI       groups    time (s)  
--------------------------------------------------------------------------------
1                   5612.0         -34687.0       0.68      18        0.03      
2                   3221.0         -35385.0       0.77      11        0.011     
3                   2726.0         -35621.0       0.77      11        0.01      
4                   1694.0         -35929.0       0.8       10        0.012     
5                   598.0          -35870.0       0.67      9         0.009     
6                   -652.0         -36067.0       0.82      9         0.008     
7                   -1016.0        -36029.0       0.91      8         0.055     
8                   -1115.0        -35905.0       0.94      8         0.011     
9                   -1135.0        -35794.0       0.97      8         0.009     
10                  -1115.0        -35648.0       0.97      8         0.009     


In [42]:
# Tally how often each label occurs in Ẑ₁, then list (label, count) for every
# label that occurs more than once.
D = countmap(Ẑ₁)
[(entry.first, entry.second) for entry in D if entry.second > 1]

3-element Array{Tuple{Int64,Int64},1}:
 (4, 85)
 (5, 43)
 (1, 15)

In [43]:
# POLYADIC experiment. Each round:
#   1. recompute the dyadic resolution parameter γ from the previous partition Ẑ,
#   2. build a warm start Z_ with CliqueExpansionModularity at that γ,
#   3. refine the warm start with SuperNode_PPLouvain (the only step that is timed),
#   4. re-estimate Ω empirically from the refined partition Ẑ₂.
α̂ = α
verbose = false
maxits = 100
randflag = true

# Zwarm = collect(1:n)  # no warm start
Z_ = CliqueExpansionModularity(H, 10.0);

Ẑ = Z_
Ẑ₂ = Z_
# The aggregator maps a vector p to [length(p) == 1, sum(p)].
Ω = estimateΩEmpirically(H, Ẑ₂; aggregator = p -> [length(p) == 1, sum(p)])
Q = modularity(H, Z_, Ω; α = α)
# Count distinct labels instead of taking the largest label: the two only agree
# when the labels are exactly 1..K, and the "groups" column below already
# counts distinct labels.
println("Warmstart partition has $(length(unique(Z_))) clusters and modularity $(round(Q)).")
println("")

println("POLYADIC")
print(rpad("iteration", 20))
print(rpad("Q", 15))
print(rpad("NMI", 10))
print(rpad("groups", 10))
println(rpad("time (s)", 10))
println(rpad("",  65, "-"))

for i = 1:10

    # Warm start at the resolution implied by the previous round's partition.
    γ = computeDyadicResolutionParameter(H, Ẑ)
    Z_ = CliqueExpansionModularity(H, γ);

    # Honour the cell-level `verbose` flag rather than a second hard-coded value.
    tic = time()
    Ẑ₂ = SuperNode_PPLouvain(H, Ω; α = α, verbose = verbose, Z0 = Z_)
    toc = time()-tic

    Ω = estimateΩEmpirically(H, Ẑ₂; aggregator = p -> [length(p) == 1, sum(p)])
    Q = modularity(H, Ẑ₂, Ω; α = α̂)

    # NMI is measured against the previous round's partition Ẑ.
    print(rpad("$i", 20))
    print(rpad("$(round(Q))", 15))
    print(rpad("$(round(mutualInformation(Ẑ₂,Ẑ,true), digits = 2))", 10))
    print(rpad("$(length(unique(Ẑ₂)))", 10))
    println(rpad("$(round(toc; digits=3))", 10))

    Ẑ = Ẑ₂
end

Warmstart partition has 32 clusters and modularity -30439.0.

POLYADIC
iteration           Q              NMI       groups    time (s)  
-----------------------------------------------------------------
1                   -30108.0       0.67      13        0.033     
2                   -30102.0       0.58      11        0.089     
3                   -30010.0       0.65      11        0.021     
4                   -30058.0       0.85      11        0.02      
5                   -30189.0       0.75      11        0.076     
6                   -30091.0       0.77      11        0.021     
7                   -30138.0       0.86      11        0.021     
8                   -30031.0       0.86      11        0.021     
9                   -30231.0       0.79      11        0.021     
10                  -30180.0       0.79      11        0.018     


In [45]:
# Tally how often each label occurs in Ẑ₂, then list (label, count) for every
# label that occurs more than once.
D = countmap(Ẑ₂)
[(entry.first, entry.second) for entry in D if entry.second > 1]

6-element Array{Tuple{Int64,Int64},1}:
 (2, 36)
 (7, 24)
 (6, 26)
 (3, 23)
 (5, 17)
 (1, 17)

In [48]:
# Spot-check: evaluate the function stored in Ω.ω at p = [0, 37] with parameters α.
# NOTE(review): which Ω is live here depends on the order the cells were run in —
# confirm before interpreting the value.
Ω.ω([0, 37], α)

4.087174066115985e-133

In [31]:
# Agreement between the partition kept in Ẑ₁ and the one kept in Ẑ₂. The third
# argument `true` is the same flag used for the NMI columns in the tables above
# (presumably "normalized" — confirm against mutualInformation).
mutualInformation(Ẑ₁, Ẑ₂, true)

0.5089291990917897

So it looks like we can see meaningfully different partition structure, but primarily when kmax is relatively low. When kmax is higher, there is probably enough signal that both algorithms can find essentially the same partition quite reliably. That is a bit of a bummer, as there's not really a justifiable reason for restricting kmax to be so low. Maybe we need a different data set to demonstrate our algorithm showing qualitatively different performance in a context that matters?

Alternative: use the slow version with a different parameterization, and show that it does something interesting.

In [18]:
# Partition-based intensity function.
#
# With k = sum(p), the entries of p are weighted by (1:length(p)) .^ α[k]; the
# result is k divided by that weighted sum, divided again by n^(α[kmax + k] * k).
# `n` and `kmax` are notebook globals and are looked up at call time.
function ω(p, α)
    k = sum(p)
    position_weights = (1:length(p)) .^ α[k]
    weighted_total = sum(p .* position_weights)
    return (k / weighted_total) / n^(α[kmax + k] * k)
end

# Slow polyadic experiment: wrap ω with partitionIntensityFunction, fit α̂ once
# on a clique-expansion partition, then alternate SuperNodeLouvain clustering
# with re-fitting α̂ for ten rounds, printing one table row per round.
Ω = partitionIntensityFunction(ω, kmax);
α̂ = α
verbose = false
maxits = 100
randflag = true
weighted = false

println()

println("polyadic")
for (title, width) in (("iteration", 20), ("Q", 15), ("NMI", 10),
                       ("groups", 10), ("time (s)", 10))
    print(rpad(title, width))
end
println()
println("-"^65)

# Ẑ₂ holds the partition of the previous round; before the first round it is
# the partition with labels 1:n.
Ẑ = collect(1:n)
Ẑ₂ = Ẑ

# Initial parameter fit on a dyadic partition at γ = 1.
γ = 1.0
Ẑ = CliqueExpansionModularity(H, γ);
α̂ = learnParameters(H, Ẑ, Ω, α̂; n_iters = 500, amin = -50, amax = 50)

for i in 1:10

    # Only the clustering call itself is timed.
    tic = time()
    Ẑ = SuperNodeLouvain(H, kmax, Ω; α = α̂, verbose = false, scan_order = "random")
    toc = time() - tic

    α̂ = learnParameters(H, Ẑ, Ω, α̂; n_iters = 500, amin = -50, amax = 50)
    Q = modularity(H, Ẑ, Ω; α = α̂)

    # One table row, same column widths as the header.
    print(rpad(string(i), 20))
    print(rpad(string(round(Q)), 15))
    print(rpad(string(round(mutualInformation(Ẑ₂, Ẑ, true); digits = 2)), 10))
    print(rpad(string(length(unique(Ẑ))), 10))
    println(rpad(string(round(toc; digits = 3)), 10))

    Ẑ₂ = Ẑ
end


polyadic
iteration           Q              NMI       groups    time (s)  
-----------------------------------------------------------------


InterruptException: InterruptException:

In [243]:
# Tally how often each label occurs in Ẑ₂, then list (label, count) for every
# label that occurs more than ten times.
D = countmap(Ẑ₂)
[(entry.first, entry.second) for entry in D if entry.second > 10]

2-element Array{Tuple{Int64,Int64},1}:
 (2, 296)
 (1, 1419)