In [1]:
from pathlib import Path
import pandas as pd

# ------------------------------------------------------------------
# 0   Paths and helpers
# ------------------------------------------------------------------
# Directory that is globbed for "weekly_county2county_*.csv" when the weekly
# flow files are loaded.
# NOTE(review): absolute, user-specific path — the notebook only runs where this mount exists.
DATA_DIR   = Path("/net/dali/home/mscbio/rul98/GeoPred/COVID19USFlows-WeeklyFlows/weekly_flows/county2county")
# Disabled constant: the crosswalk-loading cell passes this same path to
# pd.read_csv as a string literal instead of using a shared name.
# XWALK_FILE = Path("/net/dali/home/mscbio/rul98/TrendDetection/geocorr2014.csv")

def read_weekly(f):
    """Load one weekly flow file, keeping only the origin, destination and flow columns.

    Parameters
    ----------
    f : path or file-like
        Anything ``pd.read_csv`` accepts; must contain the columns
        ``geoid_o``, ``geoid_d`` and ``pop_flows``.

    Returns
    -------
    pd.DataFrame
        The three columns above. The two geoid columns use the pandas
        ``string`` dtype (so codes are never parsed as numbers) and
        ``pop_flows`` is float.
    """
    # Single source of truth: the keys select the columns, the values type them.
    column_types = {
        "geoid_o": "string",
        "geoid_d": "string",
        "pop_flows": "float",
    }
    wanted = list(column_types)
    weekly = pd.read_csv(f, usecols=wanted, dtype=column_types)
    return weekly

# ------------------------------------------------------------------
# 1   Sum pop_flows over *all* weekly files 2019-2021
# ------------------------------------------------------------------
# Sort the matches: glob order depends on the file system, and the order in
# which rows are concatenated decides the order floats are added in below.
weekly_files = sorted(DATA_DIR.glob("weekly_county2county_*.csv"))
if not weekly_files:
    # Without this guard pd.concat fails with an unhelpful
    # "No objects to concatenate"; keep ValueError but say what is missing.
    raise ValueError(f"No weekly_county2county_*.csv files found in {DATA_DIR}")

# Stack every weekly file into one long table (one row per county pair per file).
all_weeks = pd.concat(
    [read_weekly(f) for f in weekly_files],
    ignore_index=True
)

# Collapse to one row per (origin county, destination county), summing the
# flow over every file that was read; the summed column is named pop_total.
county2county = (
    all_weeks
      .groupby(["geoid_o", "geoid_d"], as_index=False)["pop_flows"]
      .sum()
      .rename(columns={"pop_flows": "pop_total"})
)


In [2]:
# Display the county-pair totals (pandas truncates the rendering of long frames).
county2county

Unnamed: 0,geoid_o,geoid_d,pop_total
0,01001,01001,35750772.0
1,01001,01003,763410.0
2,01001,01005,22953.0
3,01001,01007,31913.0
4,01001,01009,11652.0
...,...,...,...
5082277,72153,72145,2809.0
5082278,72153,72147,58.0
5082279,72153,72149,6396.0
5082280,72153,72151,749.0


In [3]:
# ------------------------------------------------------------------
# 2   Load county → HRR cross-walk (with allocation factors)
# ------------------------------------------------------------------
# Every column is read as text so county / HRR codes are kept verbatim;
# only afact is converted to a number afterwards.
crosswalk_csv = "/net/dali/home/mscbio/rul98/TrendDetection/geocorr2014.csv"

xwalk = (
    pd.read_csv(crosswalk_csv, dtype=str)
      .loc[:, ["county", "hrr", "afact"]]
      # Unparseable afact values become NaN ...
      .assign(afact=lambda frame: pd.to_numeric(frame["afact"], errors="coerce"))
      # ... and NaN fails "afact > 0", so this removes both zero-share
      # fragments and rows whose afact was not numeric.
      .query("afact > 0")
      # Drop rows still missing a county or HRR code.
      .dropna()
      # Rescale so the remaining shares of each county sum to 1.
      .assign(
          afact=lambda frame: frame.groupby("county")["afact"]
                                   .transform(lambda share: share / share.sum())
      )
)
xwalk

Unnamed: 0,county,hrr,afact
1,01001,001,0.053
2,01001,007,0.947
3,01003,006,0.960
4,01003,134,0.040
5,01005,002,1.000
...,...,...,...
4666,56039,274,0.004
4667,56039,423,0.996
4668,56041,423,1.000
4669,56043,274,1.000


In [4]:
# ------------------------------------------------------------------
# 3   Attach xwalk twice → allocate flow to every HRR pair
# ------------------------------------------------------------------
def _xwalk_for(side):
    """Return ``xwalk`` with its columns renamed for one end of a flow.

    ``side`` is "o" (origin) or "d" (destination). The result has the columns
    ``geoid_<side>``, ``<side>_hrr`` and ``<side>_afact``, so it can be merged
    onto ``county2county`` by the matching geoid column.
    """
    return xwalk.rename(columns={"county": f"geoid_{side}",
                                 "hrr":    f"{side}_hrr",
                                 "afact":  f"{side}_afact"})


# Each county-pair row fans out to one row per (origin HRR, destination HRR)
# combination its two counties belong to.
flows = (
    county2county
      # Inner joins keep exactly the rows a left join followed by dropna()
      # would keep, without first materialising county pairs that have no
      # crosswalk entry.
      # ----- origin side -----
      .merge(_xwalk_for("o"), on="geoid_o", how="inner")
      # ----- destination side -----
      .merge(_xwalk_for("d"), on="geoid_d", how="inner")
      # Guards retained so results match the left-join formulation even if a
      # missing value or an exact duplicate row is present.
      .dropna()
      .drop_duplicates()
      # Allocate the flow to each HRR pair: the county-pair total is split by
      # the origin share times the destination share.
      .assign(
          flow_hrr2hrr=lambda d: d["pop_total"] * d["o_afact"] * d["d_afact"]
      )
)

# ------------------------------------------------------------------
# 4   Aggregate to HRR × HRR (directional) -------------------------
# ------------------------------------------------------------------
# Sum the allocated pieces per ordered (o_hrr, d_hrr) pair; the result column
# is renamed back to pop_total.
hrr2hrr = (
    flows.groupby(["o_hrr", "d_hrr"], as_index=False)["flow_hrr2hrr"]
         .sum()
         .rename(columns={"flow_hrr2hrr": "pop_total"})
)


In [5]:
# Display the long-format HRR-to-HRR totals (one row per ordered o_hrr/d_hrr pair).
hrr2hrr

Unnamed: 0,o_hrr,d_hrr,pop_total
0,001,001,2.604481e+09
1,001,002,4.926061e+06
2,001,005,1.418571e+08
3,001,006,3.447031e+07
4,001,007,6.377620e+07
...,...,...,...
93631,457,450,1.262712e+04
93632,457,451,2.612800e+04
93633,457,452,2.837768e+03
93634,457,456,4.308198e+03


In [6]:
# Reshape the long table into a wide origin-by-destination matrix:
# rows are o_hrr, columns are d_hrr, and pairs absent from hrr2hrr become 0.
flow_by_pair = hrr2hrr.set_index(["o_hrr", "d_hrr"])["pop_total"]
hrr_matrix = flow_by_pair.unstack("d_hrr").fillna(0)
hrr_matrix

d_hrr,001,002,005,006,007,009,010,011,012,014,...,445,446,447,448,449,450,451,452,456,457
o_hrr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
001,2.604481e+09,4.926061e+06,1.418571e+08,3.447031e+07,6.377620e+07,4.617061e+07,174823.765000,2.238076e+05,8.782702e+05,50889.307102,...,71058.244067,2.476860e+04,5.251045e+04,3.724211e+04,1.409120e+05,3.163158e+04,3.060834e+05,1.634575e+04,1.677687e+04,6.675270e+04
002,6.370944e+06,4.050937e+08,5.813454e+05,2.652643e+06,1.239875e+07,6.735670e+05,43942.007000,3.559908e+04,1.373213e+05,7111.923145,...,11710.100802,3.175656e+03,7.943292e+03,7.803976e+03,2.293590e+04,5.833479e+03,4.503798e+04,3.260922e+03,2.639683e+03,9.483919e+03
005,1.025113e+08,6.471039e+05,6.929948e+08,4.959324e+06,2.978249e+06,2.143078e+06,94008.910843,1.037647e+05,3.802801e+05,23843.305591,...,23235.027532,7.665935e+03,1.400877e+04,1.422384e+04,1.118434e+05,1.416125e+04,1.091423e+05,6.370223e+03,5.544553e+03,2.324623e+04
006,1.641768e+07,1.680784e+06,1.368648e+06,9.266913e+08,8.355176e+06,3.087418e+06,72236.534000,8.766528e+04,3.280230e+05,19602.285370,...,24973.082954,1.751515e+04,3.802616e+04,1.844541e+04,7.241539e+04,2.156097e+04,1.282921e+05,9.777662e+03,1.165070e+04,3.341934e+04
007,5.048689e+07,9.174904e+06,1.626010e+06,9.088123e+06,5.243224e+08,1.934169e+06,40609.603000,4.482187e+04,1.622797e+05,10273.738240,...,15362.428469,5.302566e+03,9.200757e+03,6.634208e+03,2.441165e+04,4.742873e+03,5.420658e+04,3.408000e+03,3.258147e+03,1.237120e+04
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450,6.540832e+04,1.439607e+04,2.799400e+04,1.222680e+05,2.694542e+04,7.793758e+03,105758.146442,1.557116e+05,4.147761e+05,34490.027784,...,11415.316822,7.103845e+06,2.940749e+06,6.887094e+06,9.866634e+06,2.042411e+08,4.370980e+06,4.410744e+06,4.352025e+07,5.478848e+04
451,5.677989e+05,6.698382e+04,2.090154e+05,7.086975e+05,1.834784e+05,1.695993e+05,251535.703013,1.718226e+06,4.118984e+06,407282.096149,...,104745.268417,1.115331e+07,2.463741e+07,4.440374e+06,8.012425e+07,6.574176e+06,2.313901e+09,2.819382e+07,4.724092e+06,1.212573e+05
452,4.153286e+04,9.734659e+03,1.561994e+04,7.934683e+04,1.388872e+04,6.829379e+03,35524.757041,9.753747e+04,2.646793e+05,22502.015996,...,7097.030740,4.315769e+07,7.921185e+06,7.191706e+05,1.526874e+07,5.053981e+06,2.749226e+07,1.278158e+08,1.077548e+06,1.878893e+04
456,4.237675e+04,8.273559e+03,1.685851e+04,9.900703e+04,1.292342e+04,5.946569e+03,41334.696835,7.526409e+04,2.105687e+05,17043.942563,...,4433.783245,7.633479e+06,4.106871e+06,6.726649e+05,3.211667e+06,3.892641e+07,2.712437e+06,1.005129e+06,1.145583e+08,2.019696e+04


In [7]:
# Write the wide matrix to a CSV in the current working directory.
# NOTE(review): the file name says "HHS" but the matrix is indexed by HRR —
# confirm the name is intentional before relying on it downstream.
output_csv = 'HHS_COVID19USWeeklyFlowsSum.csv'
hrr_matrix.to_csv(output_csv)