In [None]:
import os
import time
import urllib.parse
from datetime import datetime

import numpy as np
import pandas as pd
import requests
from census import Census
from us import states

In [None]:
# Read in dataset with zip codes for each parish
tract_codes = pd.read_excel('parish_tracts_zips.xlsx')


def _split_zip_codes(raw_value):
    """Turn one comma-separated zip-code cell into a sorted list of clean codes.

    Missing cells give an empty list instead of the literal string 'nan',
    whitespace around each code is stripped, duplicates are removed, and the
    result is sorted so repeated runs produce the same order.
    """
    if pd.isna(raw_value):
        return []
    # A purely numeric column with gaps is read as float (98101.0); drop the ".0"
    if isinstance(raw_value, float) and raw_value.is_integer():
        raw_value = int(raw_value)
    codes = {code.strip() for code in str(raw_value).split(',')}
    codes.discard('')
    return sorted(codes)


# Explode to create an array of all zips (one row per parish/zip pair)
tract_codes['zip_codes'] = tract_codes['zip_codes'].apply(_split_zip_codes)

zips_exploded = tract_codes.explode('zip_codes').reset_index(drop=True)
# Rows whose zip cell was empty explode to NaN; keep them out of the API list
zips_list = zips_exploded['zip_codes'].dropna().unique()

# Read in variable labels and keys for each year
api_variables = pd.read_csv('API_variables.csv')

# Column labels are compared against str(year) later, so force them to strings
api_variables.columns = api_variables.columns.astype(str)

# Every column after the first is treated as a year column
api_years = list(api_variables.columns[1:])


In [None]:
# Parameters
# The Census API key is read from the environment so the secret is not stored
# in the notebook. A key that was previously committed here should be treated
# as exposed and rotated.
api_key = os.environ.get('CENSUS_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable before running this notebook"
    )

# Year bounds are used as half-open ranges: range(start_year, last_year) and
# range(start_year_2, current_year), so the upper bound itself is not pulled.
start_year = 2015
last_year = 2020
start_year_2 = 2020
current_year = datetime.now().year - 1

# Batching the zips and variables to prevent errors
zip_batch_size = 5
variable_batch_size = 4

# Pull age and population data from the ACS 5-year Demographic and Housing Estimates profile (DP05)

In [None]:
rows = []

# The zip batches do not depend on the year, so build them once up front
zip_batches = [
    zips_list[i:i + zip_batch_size]
    for i in range(0, len(zips_list), zip_batch_size)
]

# Loop through each year in [start_year, last_year)
for year in range(start_year, last_year):
    year = str(year)

    # Skip years that have no column in the api_variables csv
    if year not in api_variables.columns:
        print(f"Skipping year {year}")
        continue

    # Keep only the DP05 keys listed for this year
    year_vars = api_variables[year].dropna().tolist()
    dp_variables = [v for v in year_vars if v.startswith('DP05')]

    if not dp_variables:
        continue

    # split into batches
    variable_batches = [
        dp_variables[i:i + variable_batch_size]
        for i in range(0, len(dp_variables), variable_batch_size)
    ]

    # The api link
    url = f"https://api.census.gov/data/{year}/acs/acs5/profile"

    # Requests for years before 2020 are qualified with state:53; their rows
    # then carry two trailing geography columns instead of one.
    needs_state = int(year) < 2020
    geo_columns = 2 if needs_state else 1

    # Loop through variable batches
    for var_batch in variable_batches:
        # join once per batch
        var_str = ",".join(var_batch)

        # Loop through zips
        for zip_batch in zip_batches:
            zip_str = ",".join(str(z) for z in zip_batch)
            params = {
                'get': var_str,
                'for': f"zip code tabulation area:{zip_str}",
                'key': api_key
            }
            if needs_state:
                params['in'] = 'state:53'

            # A timeout keeps one stalled connection from hanging the whole pull
            try:
                response = requests.get(url, params=params, timeout=60)
            except requests.RequestException as exc:
                print(f"Request error for {year} vars {var_str} / zips {zip_str}: {exc}")
                continue

            # Skip this batch instead of trying to parse a failed or empty body
            if response.status_code != 200 or not response.text.strip():
                print(f"Failed or empty for vars {var_str} / zips {zip_str} (status {response.status_code})")
                continue

            try:
                data = response.json()
            except ValueError:
                print(f"Unparseable response for {year} vars {var_str} / zips {zip_str}")
                continue

            print(f"Success for {year} vars {var_str} / zips {zip_str} (status {response.status_code})")

            # First element is the header row; each later row holds the
            # estimates followed by the geography column(s), zip code last
            headers = data[0]
            for row in data[1:]:
                ztca = row[-1]
                for name, est in zip(headers, row[:-geo_columns]):
                    rows.append({
                        'variable': name,
                        'estimate': est,
                        'zip': ztca,
                        'year': year
                    })

# build DataFrame once at the end
all_data_15_19 = pd.DataFrame(rows)


In [None]:
rows = []

# The zip batches do not depend on the year, so build them once up front
zip_batches = [
    zips_list[i:i + zip_batch_size]
    for i in range(0, len(zips_list), zip_batch_size)
]

# Loop through each year in [start_year_2, current_year)
for year in range(start_year_2, current_year):
    year = str(year)

    # Skip years that have no column in the api_variables csv
    if year not in api_variables.columns:
        print(f"Skipping year {year}")
        continue

    # Keep only the DP05 keys listed for this year
    year_vars = api_variables[year].dropna().tolist()
    dp_variables = [v for v in year_vars if v.startswith('DP05')]

    if not dp_variables:
        continue

    # split into batches
    variable_batches = [
        dp_variables[i:i + variable_batch_size]
        for i in range(0, len(dp_variables), variable_batch_size)
    ]

    # The api link
    url = f"https://api.census.gov/data/{year}/acs/acs5/profile"

    # Requests for years before 2020 are qualified with state:53; their rows
    # then carry two trailing geography columns instead of one.
    needs_state = int(year) < 2020
    geo_columns = 2 if needs_state else 1

    # Loop through variable batches
    for var_batch in variable_batches:
        # join once per batch
        var_str = ",".join(var_batch)

        # Loop through zips
        for zip_batch in zip_batches:
            zip_str = ",".join(str(z) for z in zip_batch)
            params = {
                'get': var_str,
                'for': f"zip code tabulation area:{zip_str}",
                'key': api_key
            }
            if needs_state:
                params['in'] = 'state:53'

            # A timeout keeps one stalled connection from hanging the whole pull
            try:
                response = requests.get(url, params=params, timeout=60)
            except requests.RequestException as exc:
                print(f"Request error for {year} vars {var_str} / zips {zip_str}: {exc}")
                continue

            # Skip this batch instead of trying to parse a failed or empty body
            if response.status_code != 200 or not response.text.strip():
                print(f"Failed or empty for vars {var_str} / zips {zip_str} (status {response.status_code})")
                continue

            try:
                data = response.json()
            except ValueError:
                print(f"Unparseable response for {year} vars {var_str} / zips {zip_str}")
                continue

            print(f"Success for {year} vars {var_str} / zips {zip_str} (status {response.status_code})")

            # First element is the header row; each later row holds the
            # estimates followed by the geography column(s), zip code last
            headers = data[0]
            for row in data[1:]:
                ztca = row[-1]
                for name, est in zip(headers, row[:-geo_columns]):
                    rows.append({
                        'variable': name,
                        'estimate': est,
                        'zip': ztca,
                        'year': year
                    })

# build DataFrame once at the end
all_data_20_now = pd.DataFrame(rows)

In [None]:
# Stack the 2015-2019 pull on top of the 2020-onward pull with a fresh index
all_data = pd.concat(
    objs=[all_data_15_19, all_data_20_now],
    ignore_index=True,
)

In [None]:
# Display the combined DP05 table (the notebook renders the cell's last expression)
all_data

In [None]:
# Write the raw DP05 pull to disk as an intermediate file.
# The default index is written too, as an unnamed first column.
all_data.to_csv(path_or_buf='dp_census_raw.csv')

# Pull median income data from the ACS 5-year subject tables (S1903)

In [None]:
rows = []

# Loop through each year in [start_year, current_year)
for year in range(start_year, current_year):
    year = str(year)

    # Skip years that have no column in the api_variables csv
    if year not in api_variables.columns:
        print(f"Skipping year {year}")
        continue

    year_variables = api_variables[year].dropna().tolist()

    # Keep only the S1903 keys; each one is requested on its own
    sc_variables = [var for var in year_variables if var.startswith('S1903')]
    print(f"For {year} variables: {sc_variables} length {len(sc_variables)}")

    # The endpoint only depends on the year, so build it once per year
    url = f'https://api.census.gov/data/{year}/acs/acs5/subject'

    # Requests for years before 2020 are qualified with state:53; their rows
    # then carry two trailing geography columns instead of one.
    needs_state = int(year) < 2020
    geo_columns = 2 if needs_state else 1

    # One request per (zip, variable) pair, in zips_list order
    for zip_code in zips_list:
        for variable in sc_variables:
            params = {
                'get': variable,
                'for': f"zip code tabulation area:{zip_code}",
                'key': api_key
            }
            if needs_state:
                params['in'] = 'state:53'

            # A timeout keeps one stalled connection from hanging the whole pull
            try:
                response = requests.get(url, params=params, timeout=60)
            except requests.RequestException as exc:
                print(f"Request error for {zip_code} in {year} for variables {variable}: {exc}")
                continue

            # Report failures instead of dropping them silently
            if response.status_code != 200:
                print(f"Failed for {zip_code} in {year} for variables {variable} (status {response.status_code})")
                continue

            if not response.text.strip():
                print(f"Empty for {zip_code} in {year} for variables {variable}")
                continue

            try:
                data = response.json()
            except ValueError:
                print(f"Unparseable response for {zip_code} in {year} for variables {variable}")
                continue

            # First element is the header row; each later row holds the
            # estimate followed by the geography column(s), zip code last
            headers = data[0]
            for row in data[1:]:
                ztca = row[-1]
                for name, est in zip(headers, row[:-geo_columns]):
                    rows.append({
                        'variable': name,
                        'estimate': est,
                        'zip': ztca,
                        'year': year
                    })


# build DataFrame once at the end
all_inc_data = pd.DataFrame(rows)

In [None]:
# Write the raw S1903 pull to disk as an intermediate file.
# The default index is written too, as an unnamed first column.
all_inc_data.to_csv(path_or_buf='med_census_raw.csv')

# Reshape and merge school/parish ID 

In [None]:
# Unpivot the per-year key columns into (variable_name, year, variable_key) rows
api_variables_long = api_variables.melt(
    id_vars=['variable_name'],
    value_vars=api_years,
    var_name='year',
    value_name='variable_key',
)

In [None]:
# Build all_df here so the cell works on a fresh kernel and can be re-run:
# no earlier cell defines it. NOTE(review): assumes all_df is meant to be the
# DP05 pull plus the S1903 pull stacked together - confirm.
all_df = pd.concat([all_data, all_inc_data], ignore_index=True)
all_df['year'] = all_df['year'].astype(str)

# Attach the readable variable_name by matching each API key within its year,
# then drop the duplicate key column that the merge brings in
all_df = (
    all_df
    .merge(
        api_variables_long,
        left_on=['variable', 'year'],
        right_on=['variable_key', 'year'],
        how='left',
    )
    .drop(columns='variable_key')
)

In [None]:
# Display the census table (the notebook renders the cell's last expression)
all_df

In [None]:
# Count rows per variable_name
all_df['variable_name'].value_counts()

In [None]:
# Change zip type so both merge keys are strings (casts are safe to re-run)
all_df['zip'] = all_df['zip'].astype(str)
zips_exploded['zip_codes'] = zips_exploded['zip_codes'].astype(str)

# Merge census data with zips
merged_df = zips_exploded.merge(all_df, left_on='zip_codes', right_on='zip', how='left')

# Drop redundant columns
merged_df = merged_df.drop(columns=['zip_codes', 'census_tracts'])

# Estimates parsed from the API's JSON are strings; convert them before
# averaging. Anything that is not a number becomes NaN and is ignored by mean.
merged_df['estimate'] = pd.to_numeric(merged_df['estimate'], errors='coerce')

# The Census API reports unavailable estimates as large negative codes.
# Blank them out so they do not drag the per-school mean down.
# NOTE(review): code list taken from the Census notes on ACS estimate and
# annotation values - confirm it is complete for these tables.
census_missing_codes = [
    -999999999, -888888888, -666666666,
    -555555555, -333333333, -222222222,
]
merged_df['estimate'] = merged_df['estimate'].mask(
    merged_df['estimate'].isin(census_missing_codes)
)

# Aggregate data: mean estimate per school, year and variable
census_agg = merged_df.groupby(['school_id', 'year', 'variable_name']).agg({
    'estimate': 'mean'
}).reset_index()

# Change Year: build an academic-year label such as 2015 -> "2015_16"
first_year = census_agg['year'].astype(int)
census_agg['academic_year'] = (
    first_year.astype(str) + "_" + (first_year + 1).astype(str).str[-2:]
)

In [None]:
# Write the per-school aggregate to disk as an intermediate file.
# The default index is written too, as an unnamed first column.
census_agg.to_csv(path_or_buf='school_aggregate_census.csv')

# Public School API 

In [None]:
# Enrollment data endpoint on data.wa.gov.
# Author's note: responses are capped at 1,000 rows.
url = "https://data.wa.gov/resource/rxjk-6ieq.json"

In [None]:
# API endpoints for assessment data, one dataset per span of academic years.
# Has 1,000 row limit
# Keeping all three in a dict keeps every endpoint reachable; assigning each
# one to `url` in turn left only the last one usable.
assessment_urls = {
    '2014_15 to 2021_22': 'https://data.wa.gov/resource/292v-tb9r.json',
    '2022_23': 'https://data.wa.gov/resource/xh7m-utwp.json',
    '2023_24': 'https://data.wa.gov/resource/x73g-mrqp.json',
}

# `url` keeps the value it previously ended up with (the 2023_24 endpoint)
url = assessment_urls['2023_24']
