# Data Descriptives

In [14]:
using Revise
using Pkg; Pkg.activate(".")
using HypergraphModularity

using CSV
using StatsBase
using DataFrames
using Statistics

[32m[1m Activating[22m[39m environment at `~/hypergraph_modularities_code/Project.toml`


In [7]:
"""
    majority_size(e, Z)

Return the first entry of `partitionize(Z[e])` divided by the sum of all its entries.

NOTE(review): presumably `partitionize` returns the group counts of the labels `Z[e]`
in decreasing order, which would make this the share of the nodes of hyperedge `e`
that belong to its largest group — confirm against `HypergraphModularity`.
"""
function majority_size(e, Z)
    group_counts = partitionize(Z[e])
    return group_counts[1] / sum(group_counts)
end

majority_size (generic function with 1 method)

In [11]:
"""
    readCommittees(path)

Read a committee-membership CSV file and build a hypergraph from it.

The file at `path` must provide the columns `new_id`, `party`, `session` and
`committee`. Every `(session, committee)` group of rows becomes one hyperedge made of
the `new_id`s of its rows; identical hyperedges are counted with multiplicity.

Returns the tuple `(H, Z)`:
- `H`: the `hypergraph` built from the node ids and hyperedges, with degrees filled in
  by `HypergraphModularity.computeDegrees!`.
- `Z`: the `party` entry of the first row of every distinct `new_id`, ordered by
  increasing `new_id`.

No upper bound on the committee size is applied.
"""
function readCommittees(path)
    # `DataFrame!` was deprecated and later removed from DataFrames.jl; the plain
    # constructor is available in old and new releases alike.
    DF = DataFrame(CSV.File(path))

    # One row per node (all rows of a group share the same `new_id`, so the first row
    # is as good as any).
    members = combine(groupby(DF, :new_id)) do sdf
        sdf[1, :]
    end
    # Callers index `Z` with node ids (`Z[e]`), so the labels must be ordered by id and
    # not by whatever order `groupby` happens to produce.
    # NOTE(review): this assumes the ids are the consecutive integers 1:n — confirm.
    sort!(members, :new_id)
    Z = members[:, "party"]

    # E[k] maps every hyperedge of size k to the number of times it occurs.
    E = Dict{Int64,Dict}()
    for committee in groupby(DF, [:session, :committee])
        # `[:, col]` copies, so the dictionary key does not alias the data frame column.
        e = committee[:, :new_id]
        edges_of_size = get!(() -> Dict(), E, length(e))
        edges_of_size[e] = get(edges_of_size, e, 0) + 1
    end

    # Make sure every size from 1 up to the largest one has an (possibly empty) entry.
    for k in 1:maximum(keys(E))
        get!(() -> Dict(), E, k)
    end

    N = unique(DF[!, :new_id])
    H = hypergraph(N = N, E = E)
    HypergraphModularity.computeDegrees!(H)

    return H, Z
end

readCommittees (generic function with 1 method)

In [44]:
"""
    print_descriptives(d)

Print one row of the descriptives table for the data set named `d` to standard output.

`"senate-committees"` and `"house-committees"` are loaded with `readCommittees` from
their CSV files; any other name is passed to `read_hypergraph_data(d, 25)`.

The row starts with a newline and contains, in fixed-width columns: the name `d`, the
number of nodes, the number of hyperedges, mean (standard deviation) of the node
degrees, mean (standard deviation) of the hyperedge sizes, and the largest hyperedge
size. Returns `nothing`.
"""
function print_descriptives(d)
    # Only the hypergraph is needed here; the node labels are discarded.
    if d == "senate-committees"
        H, _ = readCommittees("data/congress-committees/senate_committees.csv")
    elseif d == "house-committees"
        H, _ = readCommittees("data/congress-committees/house_committees.csv")
    else
        H, _ = read_hypergraph_data(d, 25)
    end

    # Drop all hyperedges of size 1 before computing any statistic.
    H.E[1] = Dict()

    n = length(H.D)
    # Sizes without hyperedges are skipped because summing an empty collection of
    # values fails.
    m = sum(sum(values(H.E[k])) for k in keys(H.E) if length(H.E[k]) > 0)
    c̄ = mean(H.D)
    sc = std(H.D)

    # NOTE(review): `m` adds up the stored multiplicities, whereas `ks` has one entry
    # per distinct hyperedge — confirm that the size statistics are meant to be
    # unweighted.
    ks = [length(e) for k in keys(H.E) for e in keys(H.E[k])]
    k̄ = mean(ks)
    sk = std(ks)
    maxk = maximum(ks)

    columns = [
        (d, 35),
        ("$n", 10),
        ("$m", 10),
        ("$(round(c̄, digits = 2)) ($(round(sc, digits = 2)))", 20),
        ("$(round(k̄, digits = 2)) ($(round(sk, digits = 2)))", 20),
        ("$(maxk)", 10),
    ]

    println("")
    for (text, width) in columns
        print(rpad(text, width, " "))
    end

    return nothing
end

print_descriptives (generic function with 1 method)

In [45]:
# Load the Senate bills data and tabulate the total hyperedge count per size
# (sizes without any hyperedge are skipped).
H, Z = read_hypergraph_data("SN-congress-bills")
edges_per_size = Dict(k => sum(values(H.E[k])) for k in keys(H.E) if length(H.E[k]) > 0)

# `ks` only exists as a local variable inside `print_descriptives`, so it has to be
# built here as well; size-1 hyperedges are left out, as they are in that function.
ks = [length(e) for k in keys(H.E) for e in keys(H.E[k]) if k > 1]

# Standard deviation of the hyperedge sizes.
std(ks)




5.523521625889569

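The table below shows, for each data set, the number of nodes (`n`), the number of edges (`m`), the mean degree (`c̄`) and the mean edge size (`k̄`), each with its standard deviation in parentheses, and the largest edge size (`maxk`). The data sets loaded with `read_hypergraph_data` have been restricted to contain only edges of size no larger than 25; the committee data sets are not restricted.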

In [46]:
# Header of the descriptives table: (label, column width) pairs matching the columns
# printed by `print_descriptives`.
header = [("data", 35), ("n", 10), ("m", 10), ("c̄", 20), ("k̄", 20), ("maxk", 10)]

# "c̄" and "k̄" contain a zero-width combining macron. `rpad` counted code points
# before Julia 1.7 and counts display width since, so padding by `textwidth` directly
# keeps the header aligned with the rows on every version.
for (label, width) in header
    print(label, " "^max(width - textwidth(label), 0))
end
println("")
print("-"^sum(last, header))

# One row per data set; every row starts with its own newline.
datasets = [
    "SN-congress-bills",
    "HR-congress-bills",
    "contact-primary-school-classes",
    "TrivagoClickout",
    "walmart-trips",
    "contact-high-school-classes",
    "senate-committees",
    "house-committees",
]
foreach(print_descriptives, datasets)

data                               n         m         c̄                   k̄                   maxk      
---------------------------------------------------------------------------------------------------------
SN-congress-bills                  294       20006     493.38 (406.3)      7.25 (5.52)         25        
HR-congress-bills                  1494      43047     273.98 (282.68)     9.51 (7.21)         25        
contact-primary-school-classes     242       12704     126.98 (55.26)      2.42 (0.55)         5         
TrivagoClickout                    172738    220758    4.02 (6.96)         4.15 (2.01)         25        
walmart-trips                      88860     65979     5.09 (26.71)        6.86 (5.33)         25        
contact-high-school-classes        327       7818      55.63 (27.1)        2.33 (0.53)         5         
senate-committees                  282       315       19.0 (14.64)        17.45 (6.64)        31        
house-committees                   1290     

In [36]:
# Reload the House bills data for the per-size breakdown in the next cell.
# NOTE(review): the second argument (30) is presumably the largest hyperedge size to
# keep — confirm against `read_hypergraph_data`.
H, Z = read_hypergraph_data("HR-congress-bills", 30)

(hypergraph
  N: Array{Int64}((1494,)) [1, 2, 3, 4, 5, 6, 7, 8, 9, 10  …  1485, 1486, 1487, 1488, 1489, 1490, 1491, 1492, 1493, 1494]
  E: Dict{Int64,Dict}
  D: Array{Int64}((1494,)) [528, 864, 621, 298, 836, 886, 382, 125, 223, 314  …  38, 65, 43, 50, 31, 28, 32, 31, 3, 18]
, [1, 1, 1, 1, 2, 1, 1, 1, 1, 2  …  2, 2, 2, 1, 1, 1, 1, 2, 1, 1])

In [37]:
# For every hyperedge size k = 2, …, 30: the fraction of size-k hyperedges `e` for which
# `partitionize(Z[e])` has exactly one entry (presumably: all nodes of `e` carry the
# same label — confirm against `partitionize`).
map(2:30) do k
    mean(length(partitionize(Z[e])) == 1 for e in keys(H.E[k]))
end

29-element Array{Float64,1}:
 0.6423719958202717
 0.503089951934081
 0.4164179104477612
 0.36363636363636365
 0.32040164308534913
 0.2909732728189612
 0.2711768050028425
 0.23157248157248156
 0.21226415094339623
 0.22489082969432314
 0.20325833979829325
 0.19083969465648856
 0.20430965682362331
 ⋮
 0.1381936887921654
 0.11858190709046455
 0.13005050505050506
 0.12483912483912483
 0.11877394636015326
 0.09099350046425256
 0.07537091988130563
 0.14672686230248308
 0.10465116279069768
 0.09970674486803519
 0.10542168674698796
 0.12195121951219512