In [1]:
include("jl/inference.jl")
include("jl/objectives.jl")
using Printf

In [2]:
"""
    read_hypergraph_data(dataname::String, maxsize::Int64=25)

Load a labeled hypergraph from the `data/<dataname>/` directory.

Two text files are read:

- `node-labels-<dataname>.txt`: one integer label per line.
- `hyperedges-<dataname>.txt`: one hyperedge per line, written as
  comma-separated integer node ids.

Hyperedges with more than `maxsize` nodes are skipped. The remaining edges are
stored, sorted, in a dictionary keyed by edge size; each edge maps to the value
`1`, so a repeated edge is stored only once.

Returns the tuple `(hypergraph(N, E, D), labels)`, where `N = 1:n` for `n` the
number of labels, `E` is the size-stratified edge dictionary, and `D` is a
length-`n` integer vector built from the stored edges.
"""
function read_hypergraph_data(dataname::String, maxsize::Int64=25)
    label_path = "data/$dataname/node-labels-$dataname.txt"
    edge_path = "data/$dataname/hyperedges-$dataname.txt"

    labels = open(label_path) do io
        Int64[parse(Int64, ln) for ln in eachline(io)]
    end
    n = length(labels)

    # Edge size => (sorted node list => 1).
    E = Dict{Integer, Dict}()
    open(edge_path) do io
        for ln in eachline(io)
            members = parse.(Int64, split(ln, ','))
            k = length(members)
            k > maxsize && continue
            sort!(members)
            bucket = get!(() -> Dict(), E, k)
            bucket[members] = 1
        end
    end

    # Add one to the entry of every node appearing in each stored edge.
    D = zeros(Int64, n)
    for edges in values(E), e in keys(edges)
        D[e] .+= 1
    end

    return hypergraph(1:n, E, D), labels
end
;

In [3]:
# A bunch of move-based aggregation functions,
# all stratified by hyperedge size

"""
    identity(p::Vector{Int64})

Trivial aggregator: return the partition vector `p` itself, unchanged.
"""
function identity(p::Vector{Int64})
    return p
end

"""
    discount_cut(p::Vector{Int64}, α=1.0)

Aggregate the partition vector `p` into `(sum(p), discount)`, where `discount`
is the sum of the entries of `p` raised to the power `α`, minus the largest
entry of `p` raised to the power `α`.
"""
function discount_cut(p::Vector{Int64}, α=1.0)
    total = sum(p)
    powered = p .^ α
    # Remove the contribution of the largest part.
    remainder = sum(powered) - maximum(p)^α
    return (total, remainder)
end

"""
    sum_of_ext_degs(p::Vector{Int64})

Aggregate the partition vector `p` into `(sum(p), length(p) - 1)`, i.e. the
total of the entries paired with one less than the number of entries.
"""
sum_of_ext_degs(p::Vector{Int64}) = (sum(p), length(p) - 1)

"""
    all_or_nothing(p::Vector{Int64})

Aggregate the partition vector `p` into `(sum(p), flag)`, where `flag` is
`true` exactly when `p` has a single entry.
"""
all_or_nothing(p::Vector{Int64}) = (sum(p), isone(length(p)))

"""
    rainbow(p::Vector{Int64})

Aggregate the partition vector `p` into `(sum(p), flag)`, where `flag` is
`true` exactly when `p` has more than one entry and its number of entries
equals the sum of its entries.
"""
function rainbow(p::Vector{Int64})
    nparts = length(p)
    total = sum(p)
    return (total, nparts > 1 && nparts == total)
end


"""
    estimate_all(H, labels)

Call `estimateΩEmpirically(H, labels; min_val=0, aggregator=agg)` once for each
of the aggregators `identity`, `discount_cut`, `sum_of_ext_degs`,
`all_or_nothing` and `rainbow`, and return the results as a vector in that
order.
"""
function estimate_all(H, labels)
    fit(agg) = estimateΩEmpirically(H, labels; min_val=0, aggregator=agg)
    agg_fns = [identity, discount_cut, sum_of_ext_degs, all_or_nothing, rainbow]
    return [fit(agg) for agg in agg_fns]
end

estimate_all (generic function with 1 method)

In [4]:
"""
    show_estimates(H, labels, maxk)

Print, for every partition `p` of every integer `k` in `1:maxk`, the value each
estimator returned by `estimate_all(H, labels)` assigns to `p` (called with
`α=1, mode="partition"`). Each partition is printed on its own line, followed
by a tab-indented, comma-separated list of the estimates in `%.3e` format.
"""
function show_estimates(H, labels, maxk)
    estimators = estimate_all(H, labels)
    for k in 1:maxk, part in partitions(k)
        vals = [est(part; α=1, mode="partition") for est in estimators]
        formatted = join(map(v -> @sprintf("%.3e", v), vals), ", ")
        println("$part\n\t$formatted\n")
    end
end

show_estimates (generic function with 1 method)

In [5]:
"""
    comparisons(H, labels, krange)

Fit an estimator with `estimateΩEmpirically` using the `discount_cut`
aggregator (timing the fit with `@time`), then for each `k` in `krange` print
two ratios of estimates:

- the estimate at `[k]` over the estimate at `[k - 1, 1]`;
- the estimate at `[k]` over the estimate at
  `[ceil(k / 2), floor(k / 2)]`.

Ratios are rounded to four digits. A `0 / 0` ratio is printed as `NaN`.
"""
function comparisons(H, labels, krange)
    @time Ω̂ = estimateΩEmpirically(H, labels; min_val=0,
                                    aggregator=discount_cut)
    for k in krange
        # Candidate partitions: a single block, one node split off, even split.
        whole = [k]
        peeled = [k - 1, 1]
        half = k / 2
        halved = [ceil(Int64, half), floor(Int64, half)]

        est_whole = Ω̂(whole; α=1, mode="partition")
        est_peeled = Ω̂(peeled; α=1, mode="partition")
        est_halved = Ω̂(halved; α=1, mode="partition")

        ratio_peeled = round(est_whole / est_peeled, digits=4)
        ratio_halved = round(est_whole / est_halved, digits=4)
        println("\t $whole / $peeled $ratio_peeled   $whole / $halved $ratio_halved")
    end
end

comparisons (generic function with 1 method)

In [8]:
# Run the size-by-size comparison on each dataset; the range paired with a
# dataset name lists the hyperedge sizes k to report for it.
for (dataset, krange) in [
    "contact-primary-school" => 2:4,
    "walmart-trips" => 2:10,
    "TrivagoClickout" => 2:12,
    "congress-bills" => 2:12,
]
    println("$dataset...")
    H, labels = read_hypergraph_data(dataset)
    comparisons(H, labels, krange)
end

contact-primary-school...
  0.032847 seconds (242.09 k allocations: 20.447 MiB)
	 [2] / [1, 1] 1.6772   [2] / [1, 1] 1.6772
	 [3] / [2, 1] 13.3441   [3] / [2, 1] 13.3441
	 [4] / [3, 1] 23.2015   [4] / [2, 2] 87.1206
walmart-trips...
  1.247617 seconds (2.54 M allocations: 751.053 MiB, 20.34% gc time)
	 [2] / [1, 1] 2.7346   [2] / [1, 1] 2.7346
	 [3] / [2, 1] 2.7092   [3] / [2, 1] 2.7092
	 [4] / [3, 1] 3.6565   [4] / [2, 2] 4.9379
	 [5] / [4, 1] 4.6371   [5] / [3, 2] 6.1936
	 [6] / [5, 1] 5.456   [6] / [3, 3] 10.179
	 [7] / [6, 1] 7.8185   [7] / [4, 3] 17.587
	 [8] / [7, 1] 8.1443   [8] / [4, 4] 28.8033
	 [9] / [8, 1] 8.6439   [9] / [5, 4] 52.4397
	 [10] / [9, 1] 11.0399   [10] / [5, 5] 103.4761
TrivagoClickout...
  2.635525 seconds (5.66 M allocations: 995.895 MiB, 36.10% gc time)
	 [2] / [1, 1] NaN   [2] / [1, 1] NaN
	 [3] / [2, 1] 1005.4393   [3] / [2, 1] 1005.4393
	 [4] / [3, 1] 1178.5841   [4] / [2, 2] 23684.2647
	 [5] / [4, 1] 1656.1087   [5] / [3, 2] 20785.7698
	 [6] / [5, 1] 155

In [None]:
# Display name => aggregation function, for the aggregators compared in
# `likelihoods`. NOTE: a `Dict` does not iterate in insertion order, so the
# order in which `likelihoods` prints its rows need not match the order below.
aggregators = Dict(
    "Identity"                => identity,
    "Discount Cut"            => discount_cut,
    "Sum of Exterior Degrees" => sum_of_ext_degs,
    "All or Nothing"          => all_or_nothing,
    "Rainbow"                 => rainbow
    )

"""
    likelihoods(dataset)

Load `dataset` with `read_hypergraph_data(dataset, 10)` (hyperedges of size at
most 10) and, for every entry of the global `aggregators` dictionary, fit an
estimator with `estimateΩEmpirically` (`min_val=1e-30`), sum the terms returned
by `L(H, labels, Ω̂; α=0, bigInt=true)`, and print the aggregator name next to
that sum rounded to one digit.
"""
function likelihoods(dataset)
    H, labels = read_hypergraph_data(dataset, 10)
    for (name, agg) in aggregators
        Ω̂ = estimateΩEmpirically(H, labels; min_val=1e-30, aggregator=agg)

        total = sum(L(H, labels, Ω̂; α=0, bigInt=true))
        # Convert to Float64 rounding down before rounding for display
        # (presumably `total` is a BigFloat given `bigInt=true` -- confirm).
        shown = round(Float64(total, RoundDown), digits=1)

        println(rpad("   $name ", 30, " "), "L = $shown")
    end
end

# for dataset ∈ ["walmart-trips"]
# Print the likelihood table for each dataset in turn.
for dataset in [
    "contact-primary-school",
    "walmart-trips",
    "TrivagoClickout",
    "congress-bills",
]
    println(dataset)
    likelihoods(dataset)
end

contact-primary-school
   Identity                   L = -87116.4
   Discount Cut               L = -87537.3
   Sum of Exterior Degrees    L = -87124.0
   Rainbow                    L = -104843.8
   All or Nothing             L = -100816.5
walmart-trips
   Identity                   L = -1.7457772e6
   Discount Cut               L = -1.7532442e6
   Sum of Exterior Degrees    L = -1.7462504e6
   Rainbow                    L = -1.7772907e6
   All or Nothing             

A few things are interesting about these results. First, the `Identity` aggregator always achieves the highest likelihood; this is expected, since it has the largest number of parameters. The other aggregators vary in how well they fit the data. While `Rainbow` never appears to be competitive, `Sum of Exterior Degrees` often performs nearly as well as `Identity`. It is possible that `Sum of Exterior Degrees` implicitly contains a large number of parameters, which would explain this behavior. One interesting approach would be to penalize the complexity associated with having many parameters -- an information criterion or an explicit Bayesian prior would each be a way to do this. 

Note that we have restricted the data to edges of size 8 and below in this case. 
We get somewhat odd results on Trivago when using larger hyperedges (e.g. up to size 15). I think this is because there are several partitions 
