# Advanced Visualizations

## Set-Up

In [1]:
using DrWatson
@quickactivate "CategoricalDataScience"

In [2]:
using CairoMakie
using CSV
using GeoMakie
using Polylabel
using DataFrames
using GeoDataFrames
using Distributions
using StatsBase
using Statistics
using SwarmMakie
using ZipFile
using Shapefile
using IPUMS

In [4]:
# Build `country`: one row per (state, tradition) with the summed adherent and
# congregation counts taken from every per-state ARDA file in `arda_dir`.
arda_dir = joinpath("..", "..", "data", "exp_raw", "arda")
state_frames = DataFrame[]
for filename in readdir(arda_dir)
    # The state name is parsed from the text after "state_"; skip any entry without
    # that marker, since the `[2]` lookup below would throw a BoundsError on it.
    occursin("state_", filename) || continue
    state_name = titlecase(splitext(split(filename, "state_")[2])[1])
    println("Processing $state_name")
    df = CSV.read(
        joinpath(arda_dir, filename),
        DataFrame,
        types=Dict(
            "Adherents" => Union{Missing, Int32},
            "Congregations" => Union{Missing, Int32},
            "Religious Bodies" => String),
        silencewarnings=true)
    # keep only rows that have a religious body, a tradition and a family
    df = df[completecases(df, ["Religious Bodies", "Tradition", "Family"]), :]
    # drop rows whose religious body is blank after trimming whitespace
    df = df[(strip.(df[:, "Religious Bodies"])) .|> x -> !isempty(x), :]
    # every remaining missing value becomes 0 so the sums below are defined
    df = coalesce.(df, 0)
    df = groupby(df, [:Tradition]) # groupby the tradition
    df = combine(
        df,
        [:Adherents, :Congregations] =>
        ((a, c) -> (Adherents=sum(a), Congregations=sum(c))) =>
        AsTable) # add adherents and congregation when we do the groupby
    df.State .= state_name # add a feature that says which state the data is from
    push!(state_frames, df)
end
# Concatenate once at the end instead of re-copying the growing table on every pass.
country = isempty(state_frames) ? DataFrame() : reduce(vcat, state_frames)

Processing Alabama
Processing Alaska
Processing Arizona
Processing Arkansas
Processing California
Processing Colorado
Processing Connecticut
Processing Delaware
Processing District Of Columbia
Processing Florida
Processing Georgia
Processing Hawaii
Processing Idaho
Processing Illinois
Processing Indiana
Processing Iowa
Processing Kansas
Processing Kentucky
Processing Louisiana
Processing Maine
Processing Maryland
Processing Massachusetts
Processing Michigan
Processing Minnesota
Processing Mississippi
Processing Missouri
Processing Montana
Processing Nebraska
Processing Nevada
Processing New Hampshire
Processing New Jersey
Processing New Mexico
Processing New York
Processing North Carolina
Processing North Dakota
Processing Ohio
Processing Oklahoma
Processing Oregon
Processing Pennsylvania
Processing Rhode Island
Processing South Carolina
Processing South Dakota
Processing Tennessee
Processing Texas
Processing Utah
Processing Vermont
Processing Virginia
Processing Washington
Processing 

In [5]:
country

Row,Tradition,Adherents,Congregations,State
Unnamed: 0_level_1,String31,Int64,Int64,String
1,Evangelical Protestant,4077,2732,Alabama
2,Mainline Protestant,212,340,Alabama
3,Catholic,0,162,Alabama
4,Black Protestant,0,1471,Alabama
5,Latter-day Saints,0,77,Alabama
6,Jehovah's Witnesses,0,157,Alabama
7,Islam,0,37,Alabama
8,Hinduism,0,7,Alabama
9,Buddhism,585,11,Alabama
10,Judaism,375,9,Alabama


In [None]:
# Reshape `country` from long to wide: one row per state, with a
# "<tradition> Adherents" and a "<tradition> Congregations" column per tradition.
traditions = sort(unique(country.Tradition))
arda = DataFrame()
for state_name in unique(country.State)
    # exclude district of columbia
    state_name == "District Of Columbia" && continue
    in_state = country.State .== state_name
    row = DataFrame(State=state_name)
    for tradition in traditions
        selected = (country.Tradition .== tradition) .& in_state
        # a tradition with no rows for this state sums to 0
        insertcols!(
            row,
            "$tradition Adherents" => sum(country.Adherents[selected]),
            "$tradition Congregations" => sum(country.Congregations[selected]))
    end
    arda = vcat(arda, row)
end
arda

Row,State,Black Protestant Adherents,Black Protestant Congregations,Buddhism Adherents,Buddhism Congregations,Catholic Adherents,Catholic Congregations,Evangelical Protestant Adherents,Evangelical Protestant Congregations,Hinduism Adherents,Hinduism Congregations,Islam Adherents,Islam Congregations,Jehovah's Witnesses Adherents,Jehovah's Witnesses Congregations,Judaism Adherents,Judaism Congregations,Latter-day Saints Adherents,Latter-day Saints Congregations,Mainline Protestant Adherents,Mainline Protestant Congregations,Orthodox Adherents,Orthodox Congregations,Other Adherents,Other Congregations,Other Christians Adherents,Other Christians Congregations
Unnamed: 0_level_1,String,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,Alabama,0,1471,585,11,0,162,4077,2732,0,7,0,37,0,157,375,9,0,77,212,340,828,18,0,12,0,25
2,Alaska,1179,21,713,12,0,89,4116,709,160,1,400,2,0,31,464,1,0,81,1481,170,808,92,690,14,498,9
3,Arizona,2142,119,0,33,0,269,7116,2852,0,8,0,35,0,306,0,29,0,935,681,436,2442,39,0,44,34,61
4,Arkansas,1482,877,100,12,0,129,6105,3706,200,1,0,12,0,134,1086,4,0,71,430,840,1413,14,0,8,600,32
5,California,368,1259,0,499,320,3,9566,6453,0,50,0,308,0,0,0,200,0,8,446,2790,3744,326,0,317,572,279
6,Colorado,1504,99,0,52,0,269,7553,2455,0,2,0,23,0,211,0,19,0,316,524,678,1225,37,0,36,34,58
7,Connecticut,1544,142,455,17,0,361,6416,1162,0,4,0,49,0,133,243,61,0,36,1062,737,4340,59,0,22,0,41
8,Delaware,3847,83,567,4,0,45,6605,483,0,3,0,9,0,38,543,5,0,12,1115,249,1073,7,293,7,40,11
9,Florida,563,1499,0,90,0,561,6923,4928,0,32,0,157,0,0,297,146,0,278,641,1810,3863,172,0,88,274,203
10,Georgia,172,1525,430,30,0,194,7328,2838,0,18,0,99,0,455,910,45,0,167,604,589,3316,59,0,56,161,57


In [None]:
# Load the 2024 census state shapefile as a DataFrame ordered by state name.
census_zip = joinpath("..", "..", "data", "exp_raw", "census", "tl_2024_us_state.zip")
state_census_data = DataFrame(Shapefile.Table(census_zip))
sort!(state_census_data, :NAME)
# drop the columns with only one unique value
select!(state_census_data, Not(["LSAD", "MTFCC", "FUNCSTAT"]))
# remove the rows for the territories and the District of Columbia
excluded_names = (
    "Commonwealth of the Northern Mariana Islands",
    "Guam",
    "American Samoa",
    "Puerto Rico",
    "District of Columbia",
    "United States Virgin Islands",
)
keep_row = .!in.(state_census_data.NAME, Ref(excluded_names))
state_census_data = state_census_data[keep_row, :]
state_census_data

Row,geometry,REGION,DIVISION,STATEFP,STATENS,GEOID,GEOIDFQ,STUSPS,NAME,ALAND,AWATER,INTPTLAT,INTPTLON
Unnamed: 0_level_1,Polygon?,String?,String?,String?,String?,String?,String?,String?,String?,Int64?,Int64?,String?,String?
1,Polygon(13638 Points),3,6,01,01779775,01,0400000US01,AL,Alabama,131185561946,4581813708,+32.7395785,-086.8434469
2,Polygon(15329 Points),4,9,02,01785533,02,0400000US02,AK,Alaska,1479508971743,244710526650,+63.3473560,-152.8397334
3,Polygon(10816 Points),4,8,04,01779777,04,0400000US04,AZ,Arizona,294366118294,853991999,+34.2039362,-111.6063449
4,Polygon(29782 Points),3,7,05,00068085,05,0400000US05,AR,Arkansas,134658517854,3122715710,+34.8955256,-092.4446262
5,Polygon(12103 Points),4,9,06,01779778,06,0400000US06,CA,California,403673433805,20291632828,+37.1551773,-119.5434183
6,Polygon(7513 Points),4,8,08,01779779,08,0400000US08,CO,Colorado,268418973518,1185541418,+38.9937669,-105.5087122
7,Polygon(3204 Points),1,1,09,01779780,09,0400000US09,CT,Connecticut,12541999507,1816115183,+41.5798637,-072.7466572
8,Polygon(5345 Points),3,5,10,01779781,10,0400000US10,DE,Delaware,5046692239,1399219008,+38.9985661,-075.4416440
9,Polygon(20603 Points),3,5,12,00294478,12,0400000US12,FL,Florida,138965379385,45968913048,+28.3989775,-082.5143005
10,Polygon(30758 Points),3,5,13,01705317,13,0400000US13,GA,Georgia,149485762701,4419221858,+32.6295789,-083.4235109


In [None]:
# Read the IPUMS CPS extract: parse the DDI codebook, then load the matching
# data file with it. Both files sit in the same folder under the data directory.
ipums_dir = datadir("exp_raw", "ipums_cps")
ddi = parse_ddi(joinpath(ipums_dir, "cps_00001.xml"))
ipums_data = load_ipums_extract(ddi, joinpath(ipums_dir, "cps_00001.dat"))

In [11]:
# Columns of the IPUMS extract that are not used in the state-level summary.
unused_columns = [
    :SERIAL, :HWTFINL, :ASECFLAG, :CPSID, :HFLAG, :ASECWTH, :STAMPMO, :PERNUM,
    :WTFINL, :CPSIDV, :CPSIDP, :ASECWT, :HOUSSUB, :SCHLLUNCH, :UCUNION,
    :CESOCCONTCT, :CESOCIALIZE, :FREVER, :OFFPOV, :TCIGDAY, :TAGESMK, :TAMSMK,
    :FOODSTMP, :MONTH, :YEAR,
]
simplified = ipums_data[:, Not(unused_columns)]
# lookup built from the "category labels" metadata stored on STATECENSUS
state_map = Dict(colmetadata(simplified, :STATECENSUS, "category labels"))
dropmissing!(simplified)
# replace the STATECENSUS code with its label in a new STATE column
simplified.STATE .= map(code -> state_map[code], simplified.STATECENSUS)
select!(simplified, Not(:STATECENSUS))
# plain (unweighted) per-state means of household income, property tax and food stamps
simplified_avgs = combine(groupby(simplified, :STATE)) do state_rows
    (HHINCOME_AVG = mean(state_rows.HHINCOME),
     PROPTAX_AVG = mean(state_rows.PROPTAX),
     FOODSTAMP_AVG = mean(state_rows.FOODSTAMP))
end
sort!(simplified_avgs, :STATE)
# remove district of columbia
simplified_avgs = filter(:STATE => !=("District of Columbia"), simplified_avgs)
simplified_avgs

Row,STATE,HHINCOME_AVG,PROPTAX_AVG,FOODSTAMP_AVG
Unnamed: 0_level_1,String,Float64,Float64,Float64
1,Alabama,73663.7,648.106,462.29
2,Alaska,94280.4,2254.97,359.597
3,Arizona,75101.4,1015.51,489.473
4,Arkansas,69274.7,733.436,419.968
5,California,92490.5,2144.59,317.446
6,Colorado,93898.9,1468.85,263.7
7,Connecticut,1.1211e5,4506.04,275.09
8,Delaware,83628.7,2354.49,353.252
9,Florida,76914.8,2396.99,359.767
10,Georgia,77260.5,1246.93,463.951


In [None]:
# Combine the census, IPUMS and ARDA tables column-wise into one row per state.
# `hcat` pairs rows purely by position, so first confirm that all three tables list
# the same states in the same order; otherwise values would be silently attached to
# the wrong state.
if !(isequal(state_census_data.NAME, simplified_avgs.STATE) &&
     isequal(simplified_avgs.STATE, arda.State))
    throw(ArgumentError(
        "state_census_data.NAME, simplified_avgs.STATE and arda.State must list " *
        "the same states in the same order before they can be combined with hcat"))
end
final_data = hcat(state_census_data, simplified_avgs, arda)
# NAME and STATE repeat the state label that arda's State column already carries
select!(final_data, Not(["NAME", "STATE"]))

Row,geometry,REGION,DIVISION,STATEFP,STATENS,GEOID,GEOIDFQ,STUSPS,ALAND,AWATER,INTPTLAT,INTPTLON,HHINCOME_AVG,PROPTAX_AVG,FOODSTAMP_AVG,State,Black Protestant Adherents,Black Protestant Congregations,Buddhism Adherents,Buddhism Congregations,Catholic Adherents,Catholic Congregations,Evangelical Protestant Adherents,Evangelical Protestant Congregations,Hinduism Adherents,Hinduism Congregations,Islam Adherents,Islam Congregations,Jehovah's Witnesses Adherents,Jehovah's Witnesses Congregations,Judaism Adherents,Judaism Congregations,Latter-day Saints Adherents,Latter-day Saints Congregations,Mainline Protestant Adherents,Mainline Protestant Congregations,Orthodox Adherents,Orthodox Congregations,Other Adherents,Other Congregations,Other Christians Adherents,Other Christians Congregations
Unnamed: 0_level_1,Polygon?,String?,String?,String?,String?,String?,String?,String?,Int64?,Int64?,String?,String?,Float64,Float64,Float64,String,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64,Int64
1,Polygon(13638 Points),3,6,01,01779775,01,0400000US01,AL,131185561946,4581813708,+32.7395785,-086.8434469,73663.7,648.106,462.29,Alabama,0,1471,585,11,0,162,4077,2732,0,7,0,37,0,157,375,9,0,77,212,340,828,18,0,12,0,25
2,Polygon(15329 Points),4,9,02,01785533,02,0400000US02,AK,1479508971743,244710526650,+63.3473560,-152.8397334,94280.4,2254.97,359.597,Alaska,1179,21,713,12,0,89,4116,709,160,1,400,2,0,31,464,1,0,81,1481,170,808,92,690,14,498,9
3,Polygon(10816 Points),4,8,04,01779777,04,0400000US04,AZ,294366118294,853991999,+34.2039362,-111.6063449,75101.4,1015.51,489.473,Arizona,2142,119,0,33,0,269,7116,2852,0,8,0,35,0,306,0,29,0,935,681,436,2442,39,0,44,34,61
4,Polygon(29782 Points),3,7,05,00068085,05,0400000US05,AR,134658517854,3122715710,+34.8955256,-092.4446262,69274.7,733.436,419.968,Arkansas,1482,877,100,12,0,129,6105,3706,200,1,0,12,0,134,1086,4,0,71,430,840,1413,14,0,8,600,32
5,Polygon(12103 Points),4,9,06,01779778,06,0400000US06,CA,403673433805,20291632828,+37.1551773,-119.5434183,92490.5,2144.59,317.446,California,368,1259,0,499,320,3,9566,6453,0,50,0,308,0,0,0,200,0,8,446,2790,3744,326,0,317,572,279
6,Polygon(7513 Points),4,8,08,01779779,08,0400000US08,CO,268418973518,1185541418,+38.9937669,-105.5087122,93898.9,1468.85,263.7,Colorado,1504,99,0,52,0,269,7553,2455,0,2,0,23,0,211,0,19,0,316,524,678,1225,37,0,36,34,58
7,Polygon(3204 Points),1,1,09,01779780,09,0400000US09,CT,12541999507,1816115183,+41.5798637,-072.7466572,1.1211e5,4506.04,275.09,Connecticut,1544,142,455,17,0,361,6416,1162,0,4,0,49,0,133,243,61,0,36,1062,737,4340,59,0,22,0,41
8,Polygon(5345 Points),3,5,10,01779781,10,0400000US10,DE,5046692239,1399219008,+38.9985661,-075.4416440,83628.7,2354.49,353.252,Delaware,3847,83,567,4,0,45,6605,483,0,3,0,9,0,38,543,5,0,12,1115,249,1073,7,293,7,40,11
9,Polygon(20603 Points),3,5,12,00294478,12,0400000US12,FL,138965379385,45968913048,+28.3989775,-082.5143005,76914.8,2396.99,359.767,Florida,563,1499,0,90,0,561,6923,4928,0,32,0,157,0,0,297,146,0,278,641,1810,3863,172,0,88,274,203
10,Polygon(30758 Points),3,5,13,01705317,13,0400000US13,GA,149485762701,4419221858,+32.6295789,-083.4235109,77260.5,1246.93,463.951,Georgia,172,1525,430,30,0,194,7328,2838,0,18,0,99,0,455,910,45,0,167,604,589,3316,59,0,56,161,57


In [None]:
# save this dataframe to a csv file, creating the output folder first so the write
# does not fail when data/exp_new does not exist yet
# NOTE(review): the geometry column is presumably written as its printed text, not
# as coordinates — confirm whether the shapes need to be recoverable from this file
output_path = joinpath("..", "..", "data", "exp_new", "arda_ipums_census.csv")
mkpath(dirname(output_path))
CSV.write(output_path, final_data)

"../../data/exp_new/arda_ipums_census.csv"

In [None]:
#f, a, p = poly(state_census_data.geometry; axis = (; type = GeoAxis))
#tp = Makie.text!(a, Polylabel.polylabel.(state_census_data.geometry); text = string.(state_census_data.STUSPS), align = (:center, :center))
#tp.fontsize = 5
#f
# save f to a file
#save("census_map.png", f)