# Data Comparisons

A comparative study of dyadic and polyadic methods on a selection of data sets, always using all-or-nothing cuts. 

Consider replacing the modularity with the full polyadic likelihood function. Should also check to make sure that this function is still right -- unit test. 

In [2]:
using Pkg; Pkg.activate(".")
using HypergraphModularity

using StatsBase
using SparseArrays
using SpecialFunctions

[32m[1m Activating[22m[39m environment at `~/hypergraph_modularities_code/Project.toml`


In [15]:
function evalPartition(H, Z; weighted = false)
    """
    evaluate the dyadic modularity, dyadic log likelihood, 
    and polyadic modularity for 
    """
    # dyadic parameters
    γ = computeDyadicResolutionParameter(H, Z)
    ωᵢ, ωₒ = computeDyadicResolutionParameter(H, Z; mode="ω", weighted=weighted)
    
    # dyadic measures
    Q_D = dyadicModularity(H, Z, γ; weighted=weighted)
    L_D = Float64(dyadicLogLikelihoodDirect(H, Z, ωᵢ, ωₒ))
    
    # polyadic measures
    Ω = estimateΩEmpirically(H, Z; aggregator = p -> [length(p) == 1, sum(p)])
    Q_H = modularity(H, Z, Ω; α = nothing)
    L_H = logLikelihood(H, Z, Ω; α = nothing)
    return Dict("Q_D" => Q_D, "L_D" => L_D, "Q_H" => Q_H)
end

evalPartition (generic function with 1 method)

In [16]:
function printMetrics((D, Z, NMI)::Tuple{Dict, Vector{Int64}, Float64}, label)
    print(rpad("   " * label, 25))
    print(rpad("$(round(D["Q_D"],digits = 2))", 15))
    print(rpad("$(round(D["L_D"],digits = 2))", 15))
    print(rpad("$(round(D["Q_H"]))", 15))
    print(rpad("$(length(unique(Z)))", 10))
    print(rpad("$(round(NMI,digits = 3))", 10))
    println("")
end

function printHeader()
    print(rpad("data", 25))
    print(rpad("Q_D", 15))
    print(rpad("L_D", 15))
    print(rpad("Q_H", 15))
    print(rpad("k", 10))
    print(rpad("NMI (true)", 10))
    println("")
    print(rpad("", 90, "-"))
    println("")
end

printHeader (generic function with 1 method)

In [17]:
function dyadicExperiment(H, score = "Q_D", n_rounds = 10, Z̄ = nothing)
    
    S = -Inf
    
    n = length(H.D)
    best_Z = collect(1:n)
    best_D = nothing
    for γ₀ ∈ [1.0]
        Z = CliqueExpansionModularity(H, γ₀, false, true);
        γ = computeDyadicResolutionParameter(H, Z)
        for i ∈ 1:n_rounds
            Z = CliqueExpansionModularity(H, γ, false, true);
            γ = computeDyadicResolutionParameter(H, Z)

            D = evalPartition(H, Z)

            if D[score] > S
                best_D = D
                best_Z = Z
            end
        end
    end
    
    NMI = isnothing(Z̄) ? NaN : mutualInformation(best_Z, Z̄, true)
    
    return best_D, best_Z, NMI
end

dyadicExperiment (generic function with 4 methods)

In [18]:
function polyadicExperiment(H, n_rounds = 10, Z̄ = nothing)
    
    S = -Inf
    
    n = length(H.D)
    best_Z = collect(1:n)
    best_D = nothing
    
    α = nothing
    
    for γ ∈ [1.0, 5.0, 10.0, 50.0]
    
        # warmstart
        Z_ = CliqueExpansionModularity(H, γ)
        Ω = estimateΩEmpirically(H, Z_; aggregator = p -> [length(p) == 1, sum(p)], bigNums = true)

        for i ∈ 1:n_rounds
            Z = SuperNode_PPLouvain(H, Ω; α = α, verbose = false, Z0 = collect(1:n))
            Ω = estimateΩEmpirically(H, Z; aggregator = p -> [length(p) == 1, sum(p)], bigNums = true)

            D = evalPartition(H, Z)

            if D["Q_H"] > S
                best_D = D
                best_Z = Z
            end
        end
        
        NMI = isnothing(Z̄) ? NaN : mutualInformation(best_Z, Z̄, true)

        return best_D, best_Z, NMI
    end
end

polyadicExperiment (generic function with 3 methods)

In [19]:
function experiment(H, n_rounds = 10, Z̄ = nothing)    
#     println("  Dyadic (Modularity)")
    printMetrics(dyadicExperiment(H, "Q_D", n_rounds, Z̄), "dyadic (Q)")
#     println("  Dyadic (Likelihood)")
    printMetrics(dyadicExperiment(H, "L_D", n_rounds, Z̄), "dyadic (L)")
#     println("  Polyadic (Modularity)")
    printMetrics(polyadicExperiment(H, n_rounds, Z̄), "polyadic (L)")
end

experiment (generic function with 3 methods)

In [21]:
datasets = Dict(
    "email-Enron-full"               => 20,
    "contact-high-school-classes"    => 6,
    "contact-primary-school-classes" => 6,
    "congress-bills"                 => 20,
#     "walmart-trips"                  => 4,
#     "stats-coauth" => 10
)

printHeader()

for f in keys(datasets)
    kmax = datasets[f]
    if f == "stats-coauth"
        H = read_stats_data(kmax)
        Z̄ = nothing
    elseif f == "email-Enron-full"
        H = read_hypergraph_data(f, kmax, 2, false)
        Z̄ = nothing
    else
        H, Z̄ = read_hypergraph_data(f, kmax, 2, true)
    end
    println(f)
    experiment(H, 10, Z̄)
end

data                     Q_D            L_D            Q_H            k         NMI (true)
------------------------------------------------------------------------------------------
congress-bills
   dyadic (Q)            0.62           Inf            -6.413828e+06  2         0.0       
   dyadic (L)            0.62           Inf            -6.413793e+06  2         0.0       
   polyadic (L)          0.62           Inf            -6.413556e+06  2         0.0       
contact-high-school-classes
   dyadic (Q)            0.62           -17306.47      -112598.0      5         0.809     
   dyadic (L)            0.62           -17306.47      -112598.0      5         0.809     
   polyadic (L)          0.55           -16922.87      -109920.0      9         1.0       
email-Enron-full
   dyadic (Q)            0.44           -7316.73       -29173.0       9         NaN       
   dyadic (L)            0.44           -7358.38       -29218.0       9         NaN       
   polyadic (L)          0.33 

1. Consider cross-initializing to check whether partitions are local optima of other methods. 
2. Fix data set and vary $\Omega$, show that doing so leads to different likelihoods/clusters. Could be part of the case study.

Performance story -- what's the right way to tell this story?