In [1]:
import pandas as pd
import os
import requests
from api import CENSUS_API_KEY
import ssl
from util.convertfile import *

In [2]:
# Input workbook and the CSV file the conversion should produce.
xlsx_path, csv_path = (
    "~/Documents/UH/hon4350/water_quality/data/TRACT_ZIP_122024.xlsx",
    "~/Documents/UH/hon4350/water_quality/data/TRACT_ZIP_122024.csv",
)

# xslx_to_csv is not defined in this notebook; presumably it comes from the
# `util.convertfile` star import -- confirm.
xslx_to_csv(xlsx_path, csv_path)

Conversion completed! CSV file saved as: /Users/minhnguyen/Documents/UH/hon4350/water_quality/data/TRACT_ZIP_122024.csv


In [3]:
# Load the converted crosswalk. dtype=str makes pandas read every column as
# text instead of inferring numeric types.
crosswalk = pd.read_csv(
    filepath_or_buffer="~/Documents/UH/hon4350/water_quality/data/TRACT_ZIP_122024.csv",
    dtype=str,
)

In [4]:
# The state code is taken from the first two characters of TRACT.
crosswalk["STATE"] = crosswalk["TRACT"].str.slice(0, 2)

# Keep the rows whose state code is "48" and expose TRACT under the name GEOID.
crosswalk_tx = (
    crosswalk.loc[crosswalk["STATE"].eq("48")]
    .copy()
    .assign(GEOID=lambda frame: frame["TRACT"])
)

# Exploratory view only: tract/ZIP pairs whose RES_RATIO exceeds 0.5.
# The dataset actually used for the merge is built in a later cell.
crosswalk_tx_primary = crosswalk_tx.loc[
    crosswalk_tx["RES_RATIO"].astype(float).gt(0.5)
]

crosswalk_tx_primary


Unnamed: 0,TRACT,ZIP,USPS_ZIP_PREF_CITY,USPS_ZIP_PREF_STATE,RES_RATIO,BUS_RATIO,OTH_RATIO,TOT_RATIO,STATE,GEOID
156097,48001950100,75763,FRANKSTON,TX,0.6626348479895391,0.6388888888888888,0.8823529411764706,0.663560411311054,48,48001950100
156099,48001950401,75803,PALESTINE,TX,1.0,0.7692307692307693,1.0,0.976,48,48001950401
156102,48001950402,75861,TENNESSEE COLONY,TX,1.0,0.0,0.0,0.8333333333333334,48,48001950402
156105,48001950500,75802,PALESTINE,TX,0.5857030015797788,0.5520282186948854,0.02247191011235955,0.5736013986013986,48,48001950500
156108,48001950600,75803,PALESTINE,TX,0.7495378927911276,0.20851063829787234,0.7142857142857143,0.7130261660978384,48,48001950600
...,...,...,...,...,...,...,...,...,...,...
169903,48505950402,78076,ZAPATA,TX,0.8442622950819673,0.7272727272727273,0.7368421052631579,0.8413120567375887,48,48505950402
169910,48507950100,78829,BATESVILLE,TX,0.8831168831168831,1.0,1.0,0.8941176470588236,48,48507950100
169912,48507950200,78872,LA PRYOR,TX,1.0,1.0,0.0,1.0,48,48507950200
169915,48507950301,78839,CRYSTAL CITY,TX,0.9945652173913043,0.9777777777777777,1.0,0.9939148073022313,48,48507950301


In [5]:
# Download B01003_001E for every tract in one state from the ACS 5-year
# endpoint, one request per year, and stack the yearly tables into `acs_df`.
years = list(range(2016, 2024))  # 2016–2023
STATE_FIPS = "48"  # Texas
REQUEST_TIMEOUT_S = 60  # without a timeout a stalled connection hangs the cell forever

# Store data per year
all_data = []
# Years that produced no data, reported once the loop is done.
failed_years = []

for year in years:
    print(f"Fetching data for {year}...")
    url = f"https://api.census.gov/data/{year}/acs/acs5"

    params = {
        "get": "NAME,B01003_001E",
        "for": "tract:*",
        "in": f"state:{STATE_FIPS}",
        "key": CENSUS_API_KEY
    }

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT_S)
    except requests.RequestException as exc:
        # Only the exception type is printed: the exception text can contain
        # the full request URL, which includes the API key.
        print(f"Failed for {year}: {type(exc).__name__}")
        failed_years.append(year)
        continue

    if response.ok:
        data = response.json()
        # The first row of the payload holds the column names.
        df = pd.DataFrame(data[1:], columns=data[0])
        df["year"] = year
        all_data.append(df)
    else:
        print(f"Failed for {year}: {response.status_code}")
        failed_years.append(year)

if failed_years:
    # Make a partial download obvious instead of leaving it buried in the log.
    print(f"WARNING: no data retrieved for years: {failed_years}")

if not all_data:
    # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
    raise RuntimeError(
        "No ACS data was retrieved for any requested year; "
        "check CENSUS_API_KEY and network access."
    )

# Combine all years
acs_df = pd.concat(all_data, ignore_index=True)

# Rename columns
acs_df = acs_df.rename(columns={
    "B01003_001E": "total_population",
    "NAME": "tract_name"
})

acs_df

Fetching data for 2016...
Fetching data for 2017...
Fetching data for 2018...
Fetching data for 2019...
Fetching data for 2020...
Fetching data for 2021...
Fetching data for 2022...
Fetching data for 2023...


Unnamed: 0,tract_name,total_population,state,county,tract,year
0,"Census Tract 3503, Harris County, Texas",6580,48,201,350300,2016
1,"Census Tract 4102, Harris County, Texas",5458,48,201,410200,2016
2,"Census Tract 4113, Harris County, Texas",3396,48,201,411300,2016
3,"Census Tract 4119, Harris County, Texas",3458,48,201,411900,2016
4,"Census Tract 4202, Harris County, Texas",2605,48,201,420200,2016
...,...,...,...,...,...,...
48639,Census Tract 9504.02; Zapata County; Texas,2141,48,505,950402,2023
48640,Census Tract 9501; Zavala County; Texas,1033,48,507,950100,2023
48641,Census Tract 9502; Zavala County; Texas,1166,48,507,950200,2023
48642,Census Tract 9503.01; Zavala County; Texas,1917,48,507,950301,2023


In [6]:
# Build an 11-character GEOID on the ACS rows, pick one ZIP per Texas tract
# from the crosswalk (the one with the highest RES_RATIO), and attach that ZIP
# to every ACS row.

# Ensure strings and correct padding
acs_df["state"] = acs_df["state"].astype(str).str.zfill(2)
acs_df["county"] = acs_df["county"].astype(str).str.zfill(3)
acs_df["tract"] = acs_df["tract"].astype(str).str.zfill(6)

# Combine into GEOID
acs_df["GEOID"] = acs_df["state"] + acs_df["county"] + acs_df["tract"]

# Ensure Strings and correct padding for cross walk
crosswalk["ZIP"] = crosswalk["ZIP"].str.zfill(5)
crosswalk["TRACT"] = crosswalk["TRACT"].str.zfill(11)
crosswalk["RES_RATIO"] = crosswalk["RES_RATIO"].astype(float)

# Filter for Texas only (FIPS = '48') if needed
crosswalk["STATE"] = crosswalk["TRACT"].str[:2]
crosswalk_tx = crosswalk[crosswalk["STATE"] == "48"].copy()

# Keep only ZIP with highest RES_RATIO for each tract
crosswalk_best = crosswalk_tx.sort_values("RES_RATIO", ascending=False).drop_duplicates("TRACT")

# Add GEOID for merging
crosswalk_best["GEOID"] = crosswalk_best["TRACT"]

# Merge ACS data with ZIP data using GEOID.
# validate="many_to_one" makes pandas raise if a GEOID appears more than once
# on the crosswalk side, which would otherwise duplicate ACS rows and inflate
# any population total computed from the result.
merged_df = pd.merge(
    acs_df,
    crosswalk_best[["GEOID", "ZIP"]],
    on="GEOID",
    how="left",
    validate="many_to_one",
)

# A left merge keeps ACS rows that have no crosswalk match and leaves their ZIP
# empty. Report them so the gap is visible before anything is aggregated.
# NOTE(review): one crosswalk file is joined to every ACS year; if tract
# definitions differ between years (presumably 2010- vs 2020-vintage tracts --
# confirm), some years will match poorly.
missing_zip = merged_df["ZIP"].isna()
if missing_zip.any():
    print(
        f"{int(missing_zip.sum())} of {len(merged_df)} tract-year rows "
        "have no ZIP after the merge. Rows without a ZIP per year:"
    )
    print(missing_zip.groupby(merged_df["year"]).sum().to_string())


In [7]:
# Inspect the merge result. The merge is a left join on GEOID, so ACS rows
# without a matching crosswalk entry are kept and show an empty ZIP.
merged_df

Unnamed: 0,tract_name,total_population,state,county,tract,year,GEOID,ZIP
0,"Census Tract 3503, Harris County, Texas",6580,48,201,350300,2016,48201350300,77089
1,"Census Tract 4102, Harris County, Texas",5458,48,201,410200,2016,48201410200,
2,"Census Tract 4113, Harris County, Texas",3396,48,201,411300,2016,48201411300,
3,"Census Tract 4119, Harris County, Texas",3458,48,201,411900,2016,48201411900,
4,"Census Tract 4202, Harris County, Texas",2605,48,201,420200,2016,48201420200,77025
...,...,...,...,...,...,...,...,...
48639,Census Tract 9504.02; Zapata County; Texas,2141,48,505,950402,2023,48505950402,78076
48640,Census Tract 9501; Zavala County; Texas,1033,48,507,950100,2023,48507950100,78829
48641,Census Tract 9502; Zavala County; Texas,1166,48,507,950200,2023,48507950200,78872
48642,Census Tract 9503.01; Zavala County; Texas,1917,48,507,950301,2023,48507950301,78839


In [15]:
# Sum tract populations into one row per (ZIP, year).
merged_df["total_population"] = merged_df["total_population"].astype(int)

# groupby discards rows whose key is NaN, so tracts without a ZIP would vanish
# from the totals without any notice. Exclude them explicitly and report how
# much population is left out; the resulting zip_df is the same as before.
unmatched = merged_df["ZIP"].isna()
if unmatched.any():
    print(
        f"Excluding {int(unmatched.sum())} tract-year rows without a ZIP "
        f"(total_population sum: {int(merged_df.loc[unmatched, 'total_population'].sum())})"
    )

zip_df = (
    merged_df.loc[~unmatched]
    .groupby(["ZIP", "year"], as_index=False)
    .agg({"total_population": "sum"})
)

zip_df

Unnamed: 0,ZIP,year,total_population
0,75001,2016,9738
1,75001,2017,9386
2,75001,2018,9445
3,75001,2019,9609
4,75001,2020,16449
...,...,...,...
10783,79996,2019,5126
10784,79996,2020,5303
10785,79996,2021,5480
10786,79996,2022,5419


In [16]:
# Write the ZIP/year population table to disk without the index column.
zip_df.to_csv(
    path_or_buf='~/Documents/UH/hon4350/water_quality/data/main/filtered_census_acs_5yr.csv',
    index=False,
)