In [1]:
%reload_ext autoreload
%autoreload 2

from pathlib import Path

import polars as pl

In [2]:
# Jurisdiction lookup table: one row per jurisdiction with its long name,
# type, reporting group and an `in_sandag` flag.
juris = pl.read_excel(
    source="../../config/hpms.xlsx", sheet_name="Table 6", table_name="juris"
)
juris

juris,long_name,juris_type,juris_group,in_sandag
str,str,str,str,bool
"""Carlsbad""","""City of Carlsbad""","""Local""","""Carlsbad""",true
"""Chula Vista""","""City of Chula Vista""","""Local""","""Chula Vista""",true
"""Coronado""","""City of Coronado""","""Local""","""Coronado""",true
"""Del Mar""","""City of Del Mar""","""Local""","""Del Mar""",true
"""El Cajon""","""City of El Cajon""","""Local""","""El Cajon""",true
…,…,…,…,…
"""U.S. Marine Corps""","""U.S. Marine Corps""","""Federal""","""U.S. Military""",false
"""U.S. Navy""","""U.S. Navy""","""Federal""","""U.S. Military""",false
"""U.S. Department of Defence""","""U.S. Department of Defence""","""Federal""","""U.S. Military""",false
"""San Diego Unified Port Authori…","""San Diego Unified Port Authori…","""Other""","""San Diego Unified Port Distric…",false


In [3]:
# Per-year label map: for each year, the worksheet cell (column letter + row
# number) holding a jurisdiction label, the label text as it appears there
# (`value`), and the canonical jurisdiction name it is recoded to (`recode`).
juris_config = pl.read_excel(
    source="../../config/hpms.xlsx", sheet_name="Table 6", table_name="juris_config"
)
juris_config

year,column,row,value,recode
i64,str,i64,str,str
1996,"""C""",842,"""CARLSBAD""","""Carlsbad"""
1996,"""C""",843,"""CHULA VISTA""","""Chula Vista"""
1996,"""C""",844,"""CORONADO""","""Coronado"""
1996,"""C""",845,"""DEL MAR""","""Del Mar"""
1996,"""C""",846,"""EL CAJON""","""El Cajon"""
…,…,…,…,…
2023,"""A""",25,"""STATE HIGHWAYS""","""State Highways"""
2023,"""A""",26,"""STATE PARK SERVICE""","""State Park Service"""
2023,"""A""",27,"""U.S. BUREAU OF LAND MANAGEMENT""","""U.S. Bureau of Land Management"""
2023,"""A""",28,"""U.S. FISH AND WILDLIFE""","""U.S. Bureau of Fish & Wildlife"""


In [4]:
# Per-year workbook layout: source filename, sheet name, column/row shifts and
# the column letters holding the rural/urban maintained-miles and DVMT figures.
config = pl.read_excel(
    source="../../config/hpms.xlsx", sheet_name="Table 6", table_name="table_6_config"
)
config

year,filename,sheet_name,colshift,rowshift,maintained_miles_rural_column,maintained_miles_urban_column,dvmt_1000_rural_column,dvmt_1000_urban_column
i64,str,str,i64,i64,str,str,str,str
1996,"""1996PRD.xls""","""Sheet1""",0,-2,"""E""","""G""","""M""","""O"""
1997,"""1997PRD.xls""","""Sheet1""",-2,-2,"""G""","""I""","""M""","""O"""
1998,"""1998PRD.xls""","""Table 2-1-6""",-1,-2,"""G""","""I""","""O""","""Q"""
1999,"""1999PRD.xls""","""Table 2-1-6""",-2,-2,"""G""","""I""","""O""","""Q"""
2000,"""2000PRD.xls""","""Table 2-1-6""",-2,-2,"""G""","""I""","""O""","""Q"""
…,…,…,…,…,…,…,…,…
2019,"""2019_PRD.xlsx""","""Table 6""",0,-2,"""C""","""D""","""G""","""H"""
2020,"""2020_PRD.xlsx""","""Table 6""",0,-2,"""C""","""D""","""G""","""H"""
2021,"""2021 HPMS Extract.xlsx""","""Jurisdiction""",0,-1,"""B""","""C""","""E""","""F"""
2022,"""2022 HPMS Extract.xlsx""","""Jurisdiction""",0,-1,"""B""","""C""","""E""","""F"""


In [5]:
alphabet = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
# Excel column labels in sheet order: "A".."Z" followed by "AA".."ZZ".
EXCEL_COLS = alphabet + [x + y for x in alphabet for y in alphabet]


def colshift(input_col: str, distance: int) -> str:
    """Return the Excel column label `distance` columns away from `input_col`.

    Args:
        input_col: Column label such as "C" or "AB" (must be one of
            `EXCEL_COLS` unless `distance` is 0).
        distance: Number of columns to move; negative values move left.

    Returns:
        The shifted column label. When `distance` is 0 the input is returned
        unchanged, without validation.

    Raises:
        ValueError: If `input_col` is not in `EXCEL_COLS` (and `distance` != 0).
        IndexError: If the shifted position falls before "A" or after "ZZ".
    """
    if distance == 0:
        return input_col

    target_index = EXCEL_COLS.index(input_col) + distance
    # A negative index would silently wrap around to the end of the list
    # (e.g. "A" shifted by -1 would become "ZZ"), so reject it explicitly.
    if not 0 <= target_index < len(EXCEL_COLS):
        raise IndexError(
            f"cannot shift column {input_col!r} by {distance}: "
            f"result is outside the range A..ZZ"
        )
    return EXCEL_COLS[target_index]


In [6]:
def extract_table_6(
    year: int,
    dir_: str | Path,
    config: pl.DataFrame,
    juris: pl.DataFrame,
    juris_config: pl.DataFrame,
) -> pl.DataFrame:
    """Extract one year of per-jurisdiction mileage and DVMT from a raw workbook.

    The workbook location and layout for `year` are looked up in `config`; the
    cells holding jurisdiction labels are looked up in `juris_config`. For each
    label column listed there, that column plus the four measure columns are
    read (all shifted by the year's `colshift`), and the rows listed in
    `juris_config` (offset by the year's `rowshift`) are kept.

    Args:
        year: Year to extract; matched against the `year` column of `config`
            and `juris_config`.
        dir_: Directory containing the raw workbook named in `config.filename`.
        config: Per-year layout table with columns `year`, `filename`,
            `sheet_name`, `colshift`, `rowshift` and the four `*_column`
            column-letter fields.
        juris: Jurisdiction lookup with columns `juris`, `long_name`,
            `juris_type`, `juris_group`, `in_sandag`.
        juris_config: Per-year label map with columns `year`, `column`, `row`,
            `value`, `recode`.

    Returns:
        One row per extracted label that has a non-null `recode`, with columns
        `timestamp` (January 1st of `year`), `juris`, `juris_full`,
        `juris_type`, `juris_group`, `in_sandag`, and the four measures
        `maintained_miles_rural`, `maintained_miles_urban`, `dvmt_1000_rural`,
        `dvmt_1000_urban` cast to Float64 with nulls replaced by 0.0.

    Raises:
        KeyError: If `config` has no row for `year`.
    """
    year_config = config.filter(pl.col("year") == year)
    if year_config.height == 0:
        raise KeyError(f"no Table 6 configuration row for year {year}")
    # First matching row as a plain {column name: value} mapping.
    config_dict = year_config.row(0, named=True)

    col_offset = int(config_dict["colshift"])
    # NOTE(review): rowshift presumably converts the spreadsheet row numbers in
    # `juris_config.row` to 0-based positions in the header-less frame — confirm.
    row_offset = int(config_dict["rowshift"])

    juris_config = juris_config.filter(pl.col("year") == year)

    # Comma-separated, shifted column letters for the four measures.
    use_cols = ",".join(
        colshift(config_dict[key], distance=col_offset)
        for key in (
            "maintained_miles_rural_column",
            "maintained_miles_urban_column",
            "dvmt_1000_rural_column",
            "dvmt_1000_urban_column",
        )
    )

    # Seed with an empty, all-string frame so the result has a defined schema
    # even when `juris_config` lists no label columns for this year.
    frames = [
        pl.DataFrame(
            schema={
                "column_1": pl.String,
                "column_2": pl.String,
                "column_3": pl.String,
                "column_4": pl.String,
                "column_5": pl.String,
            }
        )
    ]
    # maintain_order=True keeps the output row order deterministic across runs.
    for column in juris_config["column"].unique(maintain_order=True):
        target_rows = (
            juris_config.filter(pl.col("column") == column)["row"].cast(pl.Int64)
            + row_offset
        ).to_list()
        sheet = pl.read_excel(
            source=Path(dir_) / config_dict["filename"],
            sheet_name=config_dict["sheet_name"],
            has_header=False,
            read_options={
                "use_columns": f"{colshift(column, distance=col_offset)},{use_cols}",
            },
            drop_empty_rows=False,
            drop_empty_cols=False,
        )
        frames.append(sheet[target_rows])
    # Concatenate once instead of re-copying the accumulated frame per column.
    df = pl.concat(frames)

    return (
        df.rename(
            {
                "column_1": "juris",
                "column_2": "maintained_miles_rural",
                "column_3": "maintained_miles_urban",
                "column_4": "dvmt_1000_rural",
                "column_5": "dvmt_1000_urban",
            }
        )
        # Raw label -> canonical name, then canonical name -> jurisdiction attributes.
        .join(juris_config, left_on="juris", right_on="value", how="left")
        .join(juris, left_on="recode", right_on="juris", how="left")
        # Drop rows whose raw label cell was empty.
        .drop_nulls(subset="juris")
        .select(
            pl.date(year, 1, 1).alias("timestamp"),
            pl.col("recode").alias("juris"),
            pl.col("long_name").alias("juris_full"),
            pl.col("juris_type"),
            pl.col("juris_group"),
            pl.col("in_sandag"),
            pl.col("maintained_miles_rural").cast(pl.Float64).fill_null(0.0),
            pl.col("maintained_miles_urban").cast(pl.Float64).fill_null(0.0),
            pl.col("dvmt_1000_rural").cast(pl.Float64).fill_null(0.0),
            pl.col("dvmt_1000_urban").cast(pl.Float64).fill_null(0.0),
        )
        # `juris` is now the recoded name: drop labels with no recode.
        .drop_nulls(subset="juris")
    )

In [7]:
# Distinct jurisdiction keys available in the lookup table.
juris.get_column("juris").unique()

juris
str
"""Other State Agencies"""
"""Unincorporated"""
"""U.S. Navy/Marines"""
"""El Cajon"""
"""Santee"""
…
"""Encinitas"""
"""Coronado"""
"""Indian Tribal Nation"""
"""U.S. Marine Corps"""


In [8]:
# Extract every year, stack the results, and store the jurisdiction fields as
# enums whose categories come from the lookup table.
df = pl.concat(
    [
        extract_table_6(
            year=year,
            dir_="../../data/raw/hpms/",
            config=config,
            juris=juris,
            juris_config=juris_config,
        )
        for year in range(1996, 2024)
    ]
).with_columns(
    # maintain_order=True keeps the enum category order (and hence the written
    # schema and sort order) identical from run to run.
    pl.col("juris").cast(pl.Enum(juris["juris"].unique(maintain_order=True))),
    pl.col("juris_type").cast(pl.Enum(juris["juris_type"].unique(maintain_order=True))),
    pl.col("juris_group").cast(
        pl.Enum(juris["juris_group"].unique(maintain_order=True))
    ),
)

clean_dir = Path("../../data/clean/hpms/")
# exist_ok avoids the separate existence check (and its race).
clean_dir.mkdir(parents=True, exist_ok=True)
df.write_parquet(clean_dir / "table_6.parquet")

# Preview last so the cell actually displays it.
df.head(2)

Could not determine dtype for column 3, falling back to string
Could not determine dtype for column 38, falling back to string
Could not determine dtype for column 47, falling back to string
Could not determine dtype for column 71, falling back to string
Could not determine dtype for column 33, falling back to string
Could not determine dtype for column 43, falling back to string
Could not determine dtype for column 61, falling back to string
Could not determine dtype for column 69, falling back to string


In [17]:
# Local (city/county) jurisdictions for the 2020 reporting year.
df.filter(
    pl.col("timestamp").dt.year() == 2020,
    pl.col("juris_type") == "Local",
)

timestamp,juris,juris_full,juris_type,juris_group,in_sandag,maintained_miles_rural,maintained_miles_urban,dvmt_1000_rural,dvmt_1000_urban
date,enum,str,enum,enum,bool,f64,f64,f64,f64
2020-01-01,"""Carlsbad""","""City of Carlsbad""","""Local""","""Carlsbad""",true,0.0,429.169,0.0,1857.256
2020-01-01,"""Chula Vista""","""City of Chula Vista""","""Local""","""Chula Vista""",true,32.792,534.144,126.924,1769.898
2020-01-01,"""Coronado""","""City of Coronado""","""Local""","""Coronado""",true,0.0,45.566,0.0,85.696
2020-01-01,"""Del Mar""","""City of Del Mar""","""Local""","""Del Mar""",true,0.0,24.835,0.0,78.511
2020-01-01,"""El Cajon""","""City of El Cajon""","""Local""","""El Cajon""",true,0.0,205.499,0.0,903.49
…,…,…,…,…,…,…,…,…,…
2020-01-01,"""San Diego""","""City of San Diego""","""Local""","""San Diego""",true,20.444,2813.072,16.446,11816.702
2020-01-01,"""San Marcos""","""City of San Marcos""","""Local""","""San Marcos""",true,1.437,185.478,0.647,667.189
2020-01-01,"""Santee""","""City of Santee""","""Local""","""Santee""",true,0.0,112.316,0.0,444.288
2020-01-01,"""Solana Beach""","""City of Solana Beach""","""Local""","""Solana Beach""",true,0.0,43.206,0.0,111.359


In [10]:
# Total urban DVMT (thousands) across all jurisdictions, by year.
(
    df.select(
        "timestamp",
        "maintained_miles_rural",
        "maintained_miles_urban",
        "dvmt_1000_rural",
        "dvmt_1000_urban",
    )
    .group_by(["timestamp"])
    .sum()
    # group_by does not guarantee output order; sort so the line is drawn
    # chronologically regardless of the plotting backend.
    .sort("timestamp")
    .plot.line(x="timestamp", y="dvmt_1000_urban")
)