- Read files (selecting only needed columns)
- Append all files
- Clean columns (rename, enforce dtypes, fill nulls)
- Add column specifying Fulton as the data source
- Create new columns for site address and owner address
- Export to csv and parquet

In [1]:
import polars as pl
import os

# Directory that holds the Fulton parcel workbooks read further down.
FILES_PATH = "../data/fulton/parcels/"

# Column specification with (at least) the columns old_name, new_name and
# dtype. It is parsed once and both lookup tables are derived from it.
_FULTON_COLS = pl.read_csv("fulton_cols.csv")

# old_name -> new_name: the keys select the workbook columns to read and the
# mapping is used to rename them afterwards.
COL_MAP = dict(_FULTON_COLS[["old_name", "new_name"]].iter_rows())
# new_name -> dtype label; every label must be a key of CONVERT_DTYPES.
DTYPE_MAP = dict(_FULTON_COLS[["new_name", "dtype"]].iter_rows())

# dtype label (as written in fulton_cols.csv) -> polars dtype.
CONVERT_DTYPES = {
    "Int16": pl.Int16,
    "Int32": pl.Int32,
    "Int64": pl.Int64,
    # The "Float16" label is mapped to the 32-bit float type.
    "Float16": pl.Float32,
    "Float32": pl.Float32,
    "Float64": pl.Float64,
    "String": pl.Utf8,
}


def fill_null_with_zero(df, col):
    """Return ``df`` with the null entries of column ``col`` replaced by 0.

    Args:
        df: Frame to transform; a new frame is returned.
        col: Name of the column whose nulls are filled.
    """
    target = pl.col(col)
    zero_filled = pl.when(target.is_null()).then(pl.lit(0)).otherwise(target)
    # keep_name() restores the column's own name on the when/then result.
    return df.with_columns(zero_filled.keep_name())


def fill_null_with_empty(df, col):
    """Return ``df`` with missing markers in column ``col`` replaced by "".

    A value counts as missing when it is null or when its upper-cased text
    equals "NULL" or "NAN".

    Args:
        df: Frame to transform; a new frame is returned.
        col: Name of the string column to clean.
    """
    value = pl.col(col)
    upper = value.str.to_uppercase()
    is_missing = value.is_null() | (upper == "NULL") | (upper == "NAN")
    blanked = pl.when(is_missing).then(pl.lit("")).otherwise(value)
    # keep_name() restores the column's own name on the when/then result.
    return df.with_columns(blanked.keep_name())


def list_to_str(lst):
    """Concatenate the string form of every item in ``lst`` with no separator."""
    return "".join(str(item) for item in lst)

In [2]:
# Read every parcel workbook, keeping only the columns listed in COL_MAP, and
# tag each row with the file it came from.
dfs = []

# Sorted so the row order of the combined frame is reproducible; os.listdir
# returns entries in arbitrary order.
for file in sorted(os.listdir(FILES_PATH)):
    # Skip anything that is not a workbook, plus Excel "~$" lock files.
    if not file.lower().endswith(".xlsx") or file.startswith("~$"):
        continue
    print(f"Reading {file}")
    dfs.append(
        pl.read_excel(
            source=os.path.join(FILES_PATH, file),
            read_csv_options={
                # 0 disables schema inference, so every column is read as text.
                "infer_schema_length": 0,
                "columns": list(COL_MAP.keys()),
            },
            xlsx2csv_options={"ignore_formats": ["float"]},
        )
        # Drop the rows whose Parid contains "COUNT" (presumably summary rows
        # in the workbooks -- confirm). This has to be a filter applied to the
        # frame that is stored in dfs: the earlier post-hoc loop only rebound
        # its loop variable and used select(), so no row was ever removed.
        # Rows with a null Parid are kept.
        .filter(~pl.col("Parid").str.contains("COUNT").fill_null(False))
        .with_columns(pl.lit(file).alias("source_file"))
    )

# Stack all files and switch to the standardized column names.
df = pl.concat(dfs)
df = df.rename(COL_MAP)

Reading 14th_2010-2011.xlsx
Reading 14th_2012-2013.xlsx
Reading 14th_2014.xlsx
Reading 14th_2015-2016.xlsx
Reading 14th_2017.xlsx
Reading 14th_2018.xlsx
Reading 14th_2019.xlsx
Reading 14th_2020.xlsx
Reading 14th_2021.xlsx
Reading 14th_2022.xlsx
Reading 17th_2010-2011.xlsx
Reading 17th_2012-2013.xlsx
Reading 17th_2014.xlsx
Reading 17th_2015.xlsx
Reading 17th_2016.xlsx
Reading 17th_2017.xlsx
Reading 17th_2018.xlsx
Reading 17th_2019.xlsx
Reading 17th_2020.xlsx
Reading 17th_2021.xlsx
Reading 17th_2022.xlsx
Reading atl_2011-2012.xlsx
Reading atl_2013-2014.xlsx
Reading atl_2015.xlsx
Reading atl_2016.xlsx
Reading atl_2017.xlsx
Reading atl_2018.xlsx
Reading atl_2019.xlsx
Reading atl_2020.xlsx
Reading atl_2021.xlsx
Reading atl_2022.xlsx
Reading nf_2010-2011.xlsx
Reading nf_2012-2013.xlsx
Reading nf_2014.xlsx
Reading nf_2015.xlsx
Reading nf_2016.xlsx
Reading nf_2017.xlsx
Reading nf_2018.xlsx
Reading nf_2019.xlsx
Reading nf_2020.xlsx
Reading nf_2021.xlsx
Reading nf_2022.xlsx
Reading sf_2010-2011.

In [3]:
# Translate the dtype labels in DTYPE_MAP into polars dtypes, in place.
# Entries that are no longer strings were translated by a previous run of this
# cell and are left alone, so re-running the cell does not raise a KeyError.
for key, val in DTYPE_MAP.items():
    if isinstance(val, str):
        DTYPE_MAP[key] = CONVERT_DTYPES[val]

# Enforcing numbers
for column, dtype in DTYPE_MAP.items():
    if dtype in pl.NUMERIC_DTYPES and df[column].dtype == pl.Utf8:
        # Keep only the digit characters before casting. Done with a native
        # string expression instead of a per-row Python callback.
        # NOTE(review): this also removes decimal points and minus signs, so
        # "12.5" becomes "125" -- confirm that is acceptable for the float
        # columns in fulton_cols.csv.
        df = df.with_columns(pl.col(column).str.replace_all(r"[^0-9]", ""))
    # select() puts the cast column first; kept as-is so the column order of
    # the exported files does not change.
    df = df.select(pl.col(column).cast(dtype), pl.all().exclude(column))

# Replace missing values: 0 for numeric columns, "" for everything else.
for column in df.columns:
    if df[column].dtype in pl.NUMERIC_DTYPES:
        df = fill_null_with_zero(df, column)
    else:
        df = fill_null_with_empty(df, column)

def _address_expr(parts, name):
    """Build an expression joining the ``parts`` columns into one address.

    The columns are concatenated with a single space between them, runs of
    whitespace are collapsed to one space, the ends are trimmed, and the
    result is named ``name``.
    """
    pieces = []
    for part in parts:
        pieces.extend([pl.col(part), pl.lit(" ")])
    pieces.pop()  # no separator after the last component
    return (
        pl.concat_str(pieces)
        .str.replace_all(r"\s+", " ")
        .str.strip()
        .alias(name)
    )


# Site (property) address assembled from its component columns.
df = df.with_columns(
    _address_expr(
        [
            "site_addrno",
            "site_addrdir",
            "site_addrstr",
            "site_addrsuf",
            "site_addrsuf2",
        ],
        "street_addr",
    )
)

# Owner mailing address assembled from its component columns.
df = df.with_columns(
    _address_expr(
        [
            "owner_addrno",
            "owner_addradd",
            "owner_addrdir",
            "owner_addrstr",
            "owner_addrsuf",
            "owner_addrsuf2",
            "owner_unitno",
        ],
        "owner_addr",
    )
)

# Tag every row with the county this data set comes from.
county_tag = pl.lit("FULTON").alias("source_county")
df = df.with_columns(county_tag)

# Write the combined frame in both output formats.
df.write_csv("all_parcels_fulton.csv")
df.write_parquet("all_parcels_fulton.parquet")

### Check Output

In [22]:
# Reload the exported parquet file to inspect what was actually written.
result = pl.read_parquet("all_parcels_fulton.parquet")

In [24]:
# Site address components followed by the combined street_addr column.
S_ADDR_VARS = [
    "site_addrno",
    "site_addrdir",
    "site_addrstr",
    "site_addrsuf",
    "site_addrsuf2",
    "street_addr",
]
# Owner address components followed by the combined owner_addr column.
O_ADDR_VARS = [
    "owner_addrno",
    "owner_addradd",
    "owner_addrdir",
    "owner_addrstr",
    "owner_addrsuf",
    "owner_addrsuf2",
    "owner_unitno",
    "owner_addr",
]

In [25]:
# Spot-check five sampled rows of the site address columns (no seed is given).
result[S_ADDR_VARS].sample(5)

site_addrno,site_addrdir,site_addrstr,site_addrsuf,site_addrsuf2,street_addr
str,str,str,str,str,str
"""100""","""""","""BROWNTOWN""","""RD""","""NW""","""100 BROWNTOWN …"
"""0""","""""","""KENNETH""","""ST""","""SE""","""0 KENNETH ST S…"
"""1826""","""""","""LIBERTY""","""LN""","""""","""1826 LIBERTY L…"
"""1195""","""""","""LONGCOURTE""","""""","""NW""","""1195 LONGCOURT…"
"""300""","""""","""WEST PEACHTREE…","""ST""","""""","""300 WEST PEACH…"


In [26]:
# Spot-check five sampled rows of the owner address columns (no seed is given).
result[O_ADDR_VARS].sample(5)

owner_addrno,owner_addradd,owner_addrdir,owner_addrstr,owner_addrsuf,owner_addrsuf2,owner_unitno,owner_addr
str,str,str,str,str,str,str,str
"""8607""","""""","""""","""SUNDALE""","""DR""","""""","""""","""8607 SUNDALE D…"
"""""","""""","""""","""""","""""","""""","""324""","""324"""
"""250""","""""","""""","""PHARR""","""RD""","""NE""","""405""","""250 PHARR RD N…"
"""5596""","""""","""""","""DENDY""","""TRCE""","""""","""""","""5596 DENDY TRC…"
"""9497""","""""","""""","""THORNTON""","""BLVD""","""""","""""","""9497 THORNTON …"


In [27]:
# Spot-check five sampled owner_zip values (no seed is given).
result["owner_zip"].sample(5)

owner_zip
str
"""30338"""
"""30331"""
"""30310"""
"""30315"""
"""30327"""


In [20]:
# Trim leading and trailing whitespace from both combined address columns.
trimmed_addrs = [
    pl.col(addr_col).str.strip().keep_name()
    for addr_col in ("street_addr", "owner_addr")
]
result = result.with_columns(trimmed_addrs)

In [21]:
# Overwrite both exported files with the checked frame.
result.write_csv("all_parcels_fulton.csv")
result.write_parquet("all_parcels_fulton.parquet")