In [3]:
using Revise
using Pkg; Pkg.activate(".")
using HypergraphModularity
using StatsBase
using Random 
using SpecialFunctions

using CSV
using DataFrames

using RCall

Random.seed!(1234);

[32m[1m Activating[22m[39m environment at `~/hypergraph_modularities_code/Project.toml`
┌ Info: Precompiling HypergraphModularity [0c934d27-dd44-49d7-950f-bd4be7819e54]
└ @ Base loading.jl:1260
│ - If you have HypergraphModularity checked out for development and have
│   added DelimitedFiles as a dependency but haven't updated your primary
│   environment's manifest file, try `Pkg.resolve()`.
│ - Otherwise you may need to report an issue with HypergraphModularity
  ** incremental compilation may be fatally broken for this module **

  ** incremental compilation may be fatally broken for this module **



In [4]:
"""
    readCommittees(path; max_size = 85)

Read the committee-membership CSV at `path` and build a hypergraph from it.

Every `(session, committee)` group in the file becomes one hyperedge whose
members are the `new_id` values of the rows in that group. Repeated member
lists are counted, so `E[k][e]` is the number of times the size-`k` member
list `e` occurs.

# Keywords
- `max_size`: edge sizes strictly greater than this value are dropped from
  the hypergraph before degrees are recomputed. Defaults to `85`, the value
  that used to be hard-coded.

# Returns
A tuple `(H, Z)` where `H` is the hypergraph and `Z` holds the `party` entry
of one row per distinct `new_id`.
"""
function readCommittees(path; max_size = 85)
    DF = DataFrame!(CSV.File(path));

    # One row per distinct member. All `new_id` values inside a group are
    # equal, so `argmax` returns the first index and the first row is kept.
    members = combine(groupby(DF, :new_id)) do sdf
        sdf[argmax(sdf.new_id), :]
    end

    # NOTE(review): Z follows the group order produced by `groupby`; this is
    # only aligned with node indices if that order matches the numbering of
    # `new_id` -- confirm against the data files.
    Z = members[:, "party"];

    # E[k] maps a member list of length k to its multiplicity.
    E = Dict()
    for committee ∈ groupby(DF, [:session, :committee])
        e = committee[!, :new_id]
        edges_of_size = get!(() -> Dict(), E, length(e))
        edges_of_size[e] = get(edges_of_size, e, 0) + 1
    end

    # Give every size from 1 up to the largest observed size an entry, even
    # if no edge of that size exists.
    for k in 1:maximum(keys(E))
        get!(() -> Dict(), E, k)
    end

    N = unique(DF[!, :new_id])

    H = hypergraph(N = N, E = E)

    # Drop oversized edges. `filter!` is used instead of calling `pop!` inside
    # a loop over `keys(H.E)`, which would mutate the dictionary while it is
    # being iterated.
    filter!(kv -> kv.first <= max_size, H.E)

    # Degrees must reflect the edges that remain after the size cut.
    HypergraphModularity.computeDegrees!(H);

    return H, Z
end

readCommittees (generic function with 1 method)

In [5]:
# Load the House committee data: H is the hypergraph and Z the per-member
# party labels returned by readCommittees.
H, Z = readCommittees("data/congress-committees/house_committees.csv")
# S = evalSums(Z,H, maximum(Z))[3]

Dict{Array{Int64,1},BigInt} with 1763 entries:
  [51, 20] => 58877546076017568655633167974877046870687337086587979923212567477…
  [38, 5]  => 65423780105149189061429096608163632956083872515095221019074174552…
  [11, 6]  => 33729999678297595666583641880508899515246086384705174857107768800…
  [56, 11] => 47370194731308042477735666029724880075040550776669727678342104931…
  [44, 3]  => 18433903871714429530631500149534119820036550860291578281868210346…
  [68, 2]  => 44199524979883212411484781127786168860026998447026822873522466037…
  [52, 18] => 11460667844374974105964585391505172577727550591055770129946779266…
  [21, 14] => 50914188919191350508460984974254936174823658119738967307420393800…
  [64, 7]  => 98642344789082251061148318554145147003426829679257048694612324812…
  [50, 16] => 34499702831617373344697356415267124089444240564756332558602827729…
  [57, 23] => 17728854762593277755915702312652515671962639962768201128122772638…
  [39, 33] => 275790576174673235258070366776634336568700200880

In [6]:
# A = Vector{Float64}()
# O = Vector{String}()
# D = Vector{String}()
# K = Vector{Int64}()

"""
    compare_Ω!(H, Z, data, O, Q, AICs, BICs; data_label = "", two_groups = false)

Fit several empirical affinity functions to the hypergraph `H` with labels `Z`
and record a score for each.

For every affinity type (`"AON"`, `"numGroups"`, `"committee"`, `"rainbow"`,
`"cutThresh"`, in that order) the function

1. estimates `Ω̂` with `estimateΩEmpirically` using the matching aggregator,
2. evaluates `q = modularity(H, Z, Ω̂; α = α)`,
3. appends `data_label`, the affinity name, `q`, `2q - 2p` and
   `2q - p*log(N)` to `data`, `O`, `Q`, `AICs` and `BICs` respectively, where
   `p` is the parameter count used for that affinity, and
4. prints one table row with the affinity name and the two criteria rounded
   to two digits.

`N = Σ_k n^k / k!` over the edge sizes `kmin:kmax`, with `n = length(H.D)`.

# Arguments
- `H`, `Z`: hypergraph and label vector passed through to
  `estimateΩEmpirically` and `modularity`.
- `data`, `O`, `Q`, `AICs`, `BICs`: vectors that are extended in place, one
  entry per affinity type.

# Keywords
- `data_label`: value pushed onto `data` for every row.
- `two_groups`: forwarded unchanged to `estimateΩEmpirically`.
"""
function compare_Ω!(H, Z, data, O, Q, AICs, BICs; data_label = "", two_groups = false)

    kmax = maximum(keys(H.E))
    kmin = maximum([minimum(keys(H.E)), 2])
    n = length(H.D)

    # Number of observations used in the BIC penalty (very large, hence the
    # big arithmetic). It does not depend on the affinity, so compute it once.
    N = sum(big(n)^k/factorial(big(k)) for k in kmin:kmax)
    log_N = log(N)

    # Parameter counts. Two-valued aggregators have two parameters per edge
    # size; "numGroups" has one per attainable number of groups per edge size.
    two_level = 2*(kmax - kmin + 1)
    k_poss = minimum([kmax, length(unique(Z))])
    per_group_count = sum(minimum([k_poss, k]) for k in kmin:kmax)

    # Aggregators map a partition vector p to the feature vector on which Ω̂
    # is tabulated; the second entry is always the edge size sum(p).
    # NOTE(review): "committee" reads p[1] - p[2], which presumably relies on
    # p being sorted in decreasing order -- confirm in estimateΩEmpirically.
    cut_fraction = function (p)
        total = sum(p)
        return [(sum(p*p') - p'*p) / (total*(total-1)) > 0.25, total]
    end
    near_balanced = function (p)
        if length(p) == 1 return [true, sum(p)] end
        return [(p[1] - p[2])/sum(p) < 0.25, sum(p)]
    end

    affinities = [
        ("AON",       p -> [length(p) == 1, sum(p)],      two_level),
        ("numGroups", p -> [length(p), sum(p)],           per_group_count),
        ("committee", near_balanced,                      two_level),
        ("rainbow",   p -> [length(p) != sum(p), sum(p)], two_level),
        ("cutThresh", cut_fraction,                       two_level),
    ]

    for (aff, aggregator, num_parameters) in affinities
        # Label column: 4 + 16 = 20 characters, matching the 20-character
        # header column printed by the caller. The label is printed before
        # the (slow) estimation so progress is visible.
        print("    ")
        print(rpad(aff, 16))

        α = vcat(repeat([1.0], kmax), 1:kmax);
        Ω̂ = estimateΩEmpirically(H, Z; aggregator = aggregator, bigNums = true, two_groups = two_groups)
        q = modularity(H, Z, Ω̂; α = α)

        push!(data, data_label)
        push!(O, aff)
        push!(Q, q)

        AIC = 2*q - 2*num_parameters
        push!(AICs, AIC)
        print(rpad(round(Float64(AIC), digits = 2), 30))

        BIC = 2*q - num_parameters*log_N
        push!(BICs, BIC)
        println(rpad(round(Float64(BIC), digits = 2), 30))
    end
end

compare_Ω! (generic function with 1 method)

In [7]:
# Data sets to process, in the order in which they are reported.
datasets = [
    "house-committees",
    "senate-committees",
    "SN-congress-bills",
    "HR-congress-bills",
    "contact-primary-school-classes",
    "contact-high-school-classes",
    "walmart-trips",
    "TrivagoClickout",
]

# Result columns; compare_Ω! appends one entry per (data set, affinity) pair.
data = String[]
O    = String[]
Q    = Float64[]
AICs = Float64[]
BICs = Float64[]

for d in datasets

    println(d)
    println("")

    # The two committee data sets are read from CSV and flagged as two-group
    # problems; everything else goes through read_hypergraph_data.
    if d == "house-committees"
        two_groups = true
        H, Z = readCommittees("data/congress-committees/house_committees.csv")
    elseif d == "senate-committees"
        two_groups = true
        H, Z = readCommittees("data/congress-committees/senate_committees.csv")
    else
        two_groups = false
        H, Z = read_hypergraph_data(d, 25)
    end

    # Summary statistics.
    println("    Number of nodes: ", length(H.D))
    println("    Number of groups: ", length(unique(Z)))
    println("    Mean degree: ", round(mean(H.D), digits = 2))

    # Mean edge size, weighting each size k by the number of edges of that
    # size; sizes with no edges are skipped.
    num = sum(k * sum(collect(values(edges))) for (k, edges) in H.E if !isempty(edges))
    denom = sum(sum(collect(values(edges))) for (k, edges) in H.E if !isempty(edges))
    mean_k = num/denom

    println("    Mean edge size: ", round(mean_k, digits = 2))
    println("")

    # Table header and rule; compare_Ω! prints the rows.
    println(rpad("    Ω", 20), rpad("AIC", 30), rpad("BIC", 30))
    println(rpad("    ", 60, "-"))
    compare_Ω!(H, Z, data, O, Q, AICs, BICs; data_label = d, two_groups = two_groups)
    println("")
end

house-committees

    Number of nodes: 1290
    Number of groups: 2
    Mean degree: 9.18
    Mean edge size: 34.79

    Ω               AIC                           BIC                           
    --------------------------------------------------------
    AON            -222204.24                    -271275.32                    
    numGroups      -222204.24                    -271275.32                    
    committee      -222119.25                    -271190.33                    
    rainbow        -222212.12                    -271283.2                     
    cutThresh      -222202.18                    -271273.26                    

senate-committees

    Number of nodes: 282
    Number of groups: 2
    Mean degree: 19.0
    Mean edge size: 17.24

    Ω               AIC                           BIC                           
    --------------------------------------------------------
    AON            -92239.49                     -97934.86                     
 

In [18]:
# Hand the accumulated result vectors (data, O, Q, AICs, BICs) to R via RCall
# interpolation, assemble them into a tibble and write it to CSV.
# NOTE(review): presumably the directory fig/omega_throughput/ must already
# exist -- confirm before running.
R"""
library(tidyverse)

tibble(
    data = $(data),
    omega = $O, 
    Q = $Q, 
    AIC = $(AICs), 
    BIC = $(BICs)
) %>% 
    write_csv("fig/omega_throughput/omega_comparisons.csv")
"""

RObject{VecSxp}
# A tibble: 0 x 5
# … with 5 variables: data <chr>, omega <chr>, Q <dbl>, AIC <dbl>, BIC <dbl>


In [26]:
# Read the saved comparison table back in R and show only the walmart-trips
# rows, excluding any row whose omega is "majorization".
R"""
read_csv("fig/omega_throughput/omega_comparisons.csv") %>% 
    filter(data == "walmart-trips") %>% 
    filter(omega != "majorization")
"""

│ cols(
│   data = col_character(),
│   omega = col_character(),
│   Q = col_double(),
│   AIC = col_double(),
│   BIC = col_double()
│ )
└ @ RCall /home/phil/.julia/packages/RCall/Qzssx/src/io.jl:160


RObject{VecSxp}
# A tibble: 4 x 5
  data          omega             Q        AIC        BIC
  <chr>         <chr>         <dbl>      <dbl>      <dbl>
1 walmart-trips AON       -5376125. -10752345. -10763139.
2 walmart-trips numGroups -5351973. -10704387. -10753858.
3 walmart-trips dyadic    -5365952. -10732000. -10742793.
4 walmart-trips committee -5378264. -10756625. -10767418.
