Data from this paper: 

https://linqs.soe.ucsc.edu/data

In [7]:
using Pkg; Pkg.activate(".")
using HypergraphModularity

using DelimitedFiles
using Clustering

[32m[1m Activating[22m[39m environment at `~/hypergraph_modularities_code/Project.toml`


In [2]:
function read_cora(kmax = 50)
    
    recode = Dict(
         "Neural_Networks"        => 1,
         "Rule_Learning"          => 2,
         "Reinforcement_Learning" => 3,
         "Probabilistic_Methods"  => 4,
         "Theory"                 => 5,
         "Genetic_Algorithms"     => 6,
         "Case_Based"             => 7,
    )
    
    Z = []
    E = Dict{Int64, Dict{Array{Int64}, Int64}}()
    H = nothing
    
    E_L = [[] for i in 1:1433]
    
    i = 1
    open("data/cora/cora.content") do file
        for ln in eachline(file)
            S = split(ln, "\t")
            push!(Z, S[end]) # add category to Z

            # eliminate first id 
            # and label
            S = S[2:(end-1)]

            # get words 
            words = findall(x -> (x ≈ 1.0), parse.(Int64, S))
            
            for w in words
                push!(E_L[w], i)
            end
            
            i += 1
        end
    
        for e in E_L
            if length(e) > 0
                sort!(e)
                k = length(e)
                if !(k ∈ keys(E))
                    E[k] = Dict(e => 1)
                else
                    E[k][e] = get(E[k], e, 0) + 1
                end
            end
        end
        
        KMAX = maximum(keys(E))
        n = maximum([maximum(key) for k in keys(E) for key in keys(E[k])])
        for k ∈ 2:KMAX
            if !(k ∈ keys(E))
                E[k] = Dict()
            end
        end
        
        
        for k in keys(E)
            if k > kmax
                delete!(E, k)
            end
        end
        
        N = 1:n
        H = HypergraphModularity.hypergraph(N, E, zero(N))
        HypergraphModularity.computeDegrees!(H);
        
    end
    Z = [recode[z] for z in Z]
    
    return H, Z
#     return E
end

read_cora (generic function with 2 methods)

In [29]:
kmax = 10
H, Z = read_cora(kmax);
n = length(H.D)

2708

In [4]:
# all or nothing
Ω = estimateΩEmpirically(H, Z; aggregator = p -> [length(p) == 1, sum(p)], bigNums = true);
Q_H = modularity(H, Z, Ω; α = nothing);
println(round(Q_H))

# # number of groups
# Ω = estimateΩEmpirically(H, Z; aggregator = p -> [length(p) == 1, sum(p)], bigNums = true);
# Q_H = modularity(H, Z, Ω; α = nothing);
# println(round(Q_H))


# smoothed parameterization
# α0 = vcat(repeat([0.0], kmax), 1:kmax)

# function ω(p, α)
#     k = sum(p)
#     return sum(p)/sum((p .* (1:length(p)).^α[k])) / n^(α[kmax+k]*k)
# end
# Ω = partitionIntensityFunction(ω, kmax);

# # warmstart
# α = learnParameters(H, Z, Ω, α0; n_iters = 10, amin = -10, amax = 20)

# Q_H = modularity(H, Z, Ω; α = α)

# println(round(Q_H))

-27531.0


In [42]:
H, Z = kcore(H, Z, 2)

(hypergraph
  N: Array{Int64}((0,)) Int64[]
  E: Dict{Int64,Dict}
  D: Array{Int64}((0,)) Int64[]
, Int64[])

In [40]:
function evalPartition(H, Ẑ, Z; weighted = false)
    """
    evaluate the dyadic modularity, dyadic log likelihood, 
    and polyadic modularity for 
    """
    # dyadic parameters
    γ = computeDyadicResolutionParameter(H, Ẑ)
    ωᵢ, ωₒ = computeDyadicResolutionParameter(H, Ẑ; mode="ω", weighted=weighted)
    
    # dyadic measures
    Q_D = dyadicModularity(H, Ẑ, γ; weighted=weighted)
    L_D = dyadicLogLikelihood(H, Ẑ, ωᵢ, ωₒ; constants = true)
    
    # polyadic measures
    Ω = estimateΩEmpirically(H, Ẑ; aggregator = p -> [length(p) == 1, sum(p)])
    Q_H = modularity(H, Ẑ, Ω; α = nothing)
    L_H = logLikelihood(H, Ẑ, Ω; α = nothing)
    return Dict("Q_D" => Q_D, "L_D" => L_D, "Q_H" => Q_H, "ARI" => randindex(Z, Ẑ)[1])
end

function printMetrics((D, Z)::Tuple{Dict, Vector{Int64}}, label)
    print(rpad("   " * label, 25))
    print(rpad("$(round(D["Q_D"],digits = 2))", 15))
    print(rpad("$(round(D["L_D"],digits = 2))", 15))
    print(rpad("$(round(D["Q_H"]))", 15))
    print(rpad("$(length(unique(Z)))", 10))
    print(rpad("$(D["ARI"])", 10))
    println("")
end

function printHeader()
    print(rpad("data", 25))
    print(rpad("Q_D", 15))
    print(rpad("L_D", 15))
    print(rpad("Q_H", 15))
    print(rpad("k", 10))
    print(rpad("ARI", 10))
    println("")
    print(rpad("", 100, "-"))
    println("")
end

function polyadicExperiment(H, n_rounds = 10)
    
    γ = 0.01
    S = -Inf
    
    n = length(H.D)
    best_Z = collect(1:n)
    best_D = nothing
    
    α = nothing
    
    # warmstart
    Ẑ = CliqueExpansionModularity(H, γ)
    Ω = estimateΩEmpirically(H, Ẑ; aggregator = p -> [length(p) == 1, sum(p)], bigNums = true)
    
    for i ∈ 1:n_rounds
        Ẑ = SuperNode_PPLouvain(H, Ω; α = α, verbose = false, Z0 = collect(1:n), clusterpenalty = n)
        Ω = estimateΩEmpirically(H, Ẑ; aggregator = p -> [length(p) == 1, sum(p)], bigNums = true)

        D = evalPartition(H, Ẑ, Z)

        if D["Q_H"] > S
            best_D = D
            best_Z = Ẑ
        end
    end
    
    return best_D, best_Z
end

polyadicExperiment (generic function with 2 methods)

In [41]:
D = polyadicExperiment(H, 20)
printHeader()
printMetrics(D,"test")

data                     Q_D            L_D            Q_H            k         ARI       
----------------------------------------------------------------------------------------------------
   test                  1.0            NaN            -27644.0       6         -0.0005204524814337093


In [47]:
kmax = maximum(keys(H.E))
n = length(H.D)
function ω(p, α)
    k = sum(p)
    return sum(p)/sum((p .* (1:length(p)).^α[k])) / n^(α[kmax+k]*k)
end

Ω = partitionIntensityFunction(ω, kmax);

In [55]:
α = vcat(repeat([0.0], kmax), 1:kmax)

γ = 50.0
Z = CliqueExpansionModularity(H, γ, false, true)

for i in 1:10
    α = learnParameters(H, Z, Ω, α; n_iters = 10, amin = -10, amax = 10)
    Z = SuperNodeLouvain(H,kmax,Ω;α=α, verbose=false, scan_order ="random")
    Q_H = modularity(H, Z, Ω; α = α)
    print(rpad("$(round(Q_H, digits = 0))", 15))
    println(length(unique(Z)))
end

-31731.0       695
-29938.0       689
-29953.0       689
-29944.0       685
-29933.0       688
-29958.0       688
-29938.0       686
-29971.0       689
-29939.0       689
-29921.0       687


In [None]:
length(unique(Z))