In [1]:
import pandas as pd
import os
import requests
from api import CENSUS_API_KEY
import ssl
from util.convertfile import *

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
years = list(range(2016, 2021))  # 2016–2023
STATE_FIPS = 48

# Store data for each year
all_data = []

# Variables to fetch
acs_variables = ",".join([
    "NAME", "B01003_001E", "B19013_001E", "B19083_001E",
    "B03002_003E", "B03002_004E", "B03002_006E", "B03002_012E",
    "B25034_001E", "B25034_007E", "B25034_008E", "B25034_009E",
    "B25034_010E", "B25034_011E"
])

for year in years:
    print(f"Fetching ZCTA data for {year}...")
    url = f"https://api.census.gov/data/{year}/acs/acs5"
    params = {
        "get": acs_variables,
        "for": "zip code tabulation area:*",
        "in": f"state:{STATE_FIPS}",
        "key": CENSUS_API_KEY  
    }

    response = requests.get(url, params=params)
    if response.ok:
        data = response.json()
        df = pd.DataFrame(data[1:], columns=data[0])
        df["year"] = year
        all_data.append(df)
    else:
        print(f"Failed for {year}: {response.status_code}")

# Combine all years
acs_df = pd.concat(all_data, ignore_index=True)

# Rename columns
acs_df = acs_df.rename(columns={
    "NAME": "zcta_name",
    "zip code tabulation area": "zcta",
    "B01003_001E": "total_population",
    "B19013_001E": "median_household_income",
    "B19083_001E": "gini_index",
    "B03002_003E": "white_alone",
    "B03002_004E": "black_alone",
    "B03002_006E": "asian_alone",
    "B03002_012E": "hispanic_alone",
    "B25034_001E": "total_housing_units",
    "B25034_007E": "built_1970_1979",
    "B25034_008E": "built_1960_1969",
    "B25034_009E": "built_1950_1959",
    "B25034_010E": "built_1940_1949",
    "B25034_011E": "built_before_1940"
})

# Convert columns to numeric
cols_to_convert = [
    "total_population", "median_household_income", "gini_index",
    "white_alone", "black_alone", "asian_alone", "hispanic_alone",
    "total_housing_units", "built_1970_1979", "built_1960_1969",
    "built_1950_1959", "built_1940_1949", "built_before_1940"
]
acs_df[cols_to_convert] = acs_df[cols_to_convert].apply(pd.to_numeric, errors="coerce")

# Avoid division by zero
acs_df["total_population"] = acs_df["total_population"].replace(0, pd.NA)
acs_df["total_housing_units"] = acs_df["total_housing_units"].replace(0, pd.NA)

# Calculate race/ethnicity proportions
acs_df["pct_white"] = acs_df["white_alone"] / acs_df["total_population"]
acs_df["pct_black"] = acs_df["black_alone"] / acs_df["total_population"]
acs_df["pct_asian"] = acs_df["asian_alone"] / acs_df["total_population"]
acs_df["pct_hispanic"] = acs_df["hispanic_alone"] / acs_df["total_population"]

# Calculate % built before 1980
acs_df["housing_pre1980"] = (
    acs_df["built_1970_1979"] + acs_df["built_1960_1969"] +
    acs_df["built_1950_1959"] + acs_df["built_1940_1949"] +
    acs_df["built_before_1940"]
)
acs_df["pct_pre1980_housing"] = acs_df["housing_pre1980"] / acs_df["total_housing_units"]

# Round proportions
acs_df[[
    "pct_white", "pct_black", "pct_asian", "pct_hispanic", "pct_pre1980_housing"
]] = acs_df[[
    "pct_white", "pct_black", "pct_asian", "pct_hispanic", "pct_pre1980_housing"
]].round(3)


Fetching ZCTA data for 2016...
Fetching ZCTA data for 2017...
Fetching ZCTA data for 2018...
Fetching ZCTA data for 2019...
Fetching ZCTA data for 2020...
Failed for 2020: 400


In [4]:
acs_df

Unnamed: 0,zcta_name,total_population,median_household_income,gini_index,white_alone,black_alone,asian_alone,hispanic_alone,total_housing_units,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_before_1940,state,zcta,year,pct_white,pct_black,pct_asian,pct_hispanic,housing_pre1980,pct_pre1980_housing
0,ZCTA5 76088,11298,74688,0.3828,10168,38,23,940,4456,448,269,167,131,150,48,76088,2016,0.899982,0.003363,0.002036,0.083201,1165,0.261445
1,ZCTA5 76103,13685,36523,0.4665,3744,3046,377,6345,5660,683,1004,1407,861,1134,48,76103,2016,0.273584,0.222579,0.027548,0.463646,5089,0.899117
2,ZCTA5 76119,47704,32318,0.4268,5595,21265,1936,17963,15509,1958,3017,3565,942,619,48,76119,2016,0.117286,0.44577,0.040584,0.376551,10101,0.651299
3,ZCTA5 76127,1974,63594,0.1034,853,450,9,626,35,13,4,13,0,0,48,76127,2016,0.432118,0.227964,0.004559,0.317123,30,0.857143
4,ZCTA5 76357,663,45000,0.5423,641,0,13,3,278,51,20,46,22,60,48,76357,2016,0.966817,0.0,0.019608,0.004525,199,0.715827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,ZCTA5 76825,6908,48017,0.4296,3984,87,60,2358,3389,525,476,219,411,765,48,76825,2019,0.576723,0.012594,0.008686,0.341343,2396,0.706993
7736,ZCTA5 78219,17239,37374,0.4315,2654,5170,298,8662,5938,1582,1350,644,69,43,48,78219,2019,0.153953,0.299901,0.017286,0.502465,3688,0.621085
7737,ZCTA5 78223,56055,42762,0.4482,8501,2285,413,44431,19857,2615,2572,4359,1666,702,48,78223,2019,0.151655,0.040764,0.007368,0.792632,11914,0.59999
7738,ZCTA5 78225,14472,35431,0.4456,331,1,31,14074,4674,308,324,1132,1767,822,48,78225,2019,0.022872,0.000069,0.002142,0.972499,4353,0.931322


In [5]:
acs_df[acs_df["total_population"].isna()].head()

Unnamed: 0,zcta_name,total_population,median_household_income,gini_index,white_alone,black_alone,asian_alone,hispanic_alone,total_housing_units,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_before_1940,state,zcta,year,pct_white,pct_black,pct_asian,pct_hispanic,housing_pre1980,pct_pre1980_housing
50,ZCTA5 76523,,-666666666,-666666666.0,0,0,0,0,,0,0,0,0,0,48,76523,2016,,,,,0,
548,ZCTA5 75390,,-666666666,-666666666.0,0,0,0,0,,0,0,0,0,0,48,75390,2016,,,,,0,
582,ZCTA5 77440,,-666666666,-666666666.0,0,0,0,0,,0,0,0,0,0,48,77440,2016,,,,,0,
633,ZCTA5 78951,,-666666666,-666666666.0,0,0,0,0,,0,0,0,0,0,48,78951,2016,,,,,0,
682,ZCTA5 77428,,-666666666,-666666666.0,0,0,0,0,,0,0,0,0,0,48,77428,2016,,,,,0,


In [None]:
# Remove rows with invalid ACS placeholder codes
invalid_values = [-666666666, -666666666.0, -222222222, -222222222.0]

merged_df = acs_df[
    ~acs_df["median_household_income"].isin(invalid_values) &
    ~acs_df["gini_index"].isin(invalid_values)
]

In [8]:
merged_df

Unnamed: 0,zcta_name,total_population,median_household_income,gini_index,white_alone,black_alone,asian_alone,hispanic_alone,total_housing_units,built_1970_1979,built_1960_1969,built_1950_1959,built_1940_1949,built_before_1940,state,zcta,year,pct_white,pct_black,pct_asian,pct_hispanic,housing_pre1980,pct_pre1980_housing
0,ZCTA5 76088,11298,74688,0.3828,10168,38,23,940,4456,448,269,167,131,150,48,76088,2016,0.899982,0.003363,0.002036,0.083201,1165,0.261445
1,ZCTA5 76103,13685,36523,0.4665,3744,3046,377,6345,5660,683,1004,1407,861,1134,48,76103,2016,0.273584,0.222579,0.027548,0.463646,5089,0.899117
2,ZCTA5 76119,47704,32318,0.4268,5595,21265,1936,17963,15509,1958,3017,3565,942,619,48,76119,2016,0.117286,0.44577,0.040584,0.376551,10101,0.651299
3,ZCTA5 76127,1974,63594,0.1034,853,450,9,626,35,13,4,13,0,0,48,76127,2016,0.432118,0.227964,0.004559,0.317123,30,0.857143
4,ZCTA5 76357,663,45000,0.5423,641,0,13,3,278,51,20,46,22,60,48,76357,2016,0.966817,0.0,0.019608,0.004525,199,0.715827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7735,ZCTA5 76825,6908,48017,0.4296,3984,87,60,2358,3389,525,476,219,411,765,48,76825,2019,0.576723,0.012594,0.008686,0.341343,2396,0.706993
7736,ZCTA5 78219,17239,37374,0.4315,2654,5170,298,8662,5938,1582,1350,644,69,43,48,78219,2019,0.153953,0.299901,0.017286,0.502465,3688,0.621085
7737,ZCTA5 78223,56055,42762,0.4482,8501,2285,413,44431,19857,2615,2572,4359,1666,702,48,78223,2019,0.151655,0.040764,0.007368,0.792632,11914,0.59999
7738,ZCTA5 78225,14472,35431,0.4456,331,1,31,14074,4674,308,324,1132,1767,822,48,78225,2019,0.022872,0.000069,0.002142,0.972499,4353,0.931322


In [10]:
merged_df["total_population"].isnull().any()

False

In [11]:
merged_df.to_csv('~/Documents/UH/hon4350/water_quality/data/main/filtered_census_acs_5yr_zcta.csv', index=False)