# Affordable Housing Data - ETL

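This notebook contains the extract–transform–load (ETL) pipeline for the Affordable Housing data: it reads the raw indicator CSV files from `./data/`, reshapes and merges them into one table keyed by `geoid`, derives a compact set of rate/proportion features, and writes the result to `./data/MasterTable.csv`.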

In [18]:
import numpy as np
import pandas as pd

## Extract

In [19]:
# Read the three raw indicator extracts that are reused later in the notebook.
census_housing = pd.read_csv("./data/JAX-CensusHousing.csv")
census_incomes = pd.read_csv("./data/JAX-CensusHouseholdIncome.csv")
nhpd = pd.read_csv("./data/JAX-NHPD.csv")

# Report how many distinct geoid values each extract contains.
for source_label, source_frame in (
    ("Census Housing Data Samples: ", census_housing),
    ("Census Household Income Data Samples: ", census_incomes),
    ("NHPD Data Samples: ", nhpd),
):
    print(source_label, len(source_frame["geoid"].unique()))

Census Housing Data Samples:  208
Census Household Income Data Samples:  208
NHPD Data Samples:  208


## Transform: Data Restructuring

In [20]:
# Metadata columns carried by the raw extracts that are not needed once the
# data is pivoted to one row per geoid.
FILLER_COLUMNS: list[str] = [
    "feature id",
    "feature label",
    "shid",
    "indicator time",
    "indicator unit",
    "indicator format",
    "indicator source",
]


def ReshapeData(df: pd.DataFrame) -> pd.DataFrame:
    """Pivot a long-format indicator extract into one row per geoid.

    Args:
        df: Long-format table with at least the columns ``geoid``,
            ``indicator name`` and ``value``. Any of the ``FILLER_COLUMNS``
            that are present are discarded; ones that are absent are ignored.

    Returns:
        A new DataFrame with a ``geoid`` column followed by one column per
        distinct ``indicator name``. The input frame is not modified.

    Raises:
        KeyError: If ``geoid``, ``indicator name`` or ``value`` is missing.

    Notes:
        ``pivot_table`` uses its default aggregation (mean), so repeated
        (geoid, indicator name) pairs are averaged rather than rejected.
    """
    # Drop the metadata columns. errors="ignore" lets extracts that lack some
    # of them still be reshaped, since the pivot only needs the three columns
    # named above.
    df_cleaned: pd.DataFrame = df.drop(columns=FILLER_COLUMNS, errors="ignore")

    # Pivot the table on geoid to have features as columns
    df_pivoted: pd.DataFrame = df_cleaned.pivot_table(
        index="geoid", columns="indicator name", values="value"
    ).reset_index()

    return df_pivoted


def MergeMasterTable(tables: list[pd.DataFrame], on: str = "geoid") -> pd.DataFrame:
    base_table: pd.DataFrame = tables[0]
    for table in tables[1:]:
        base_table = base_table.merge(table, on=on, how="inner")

    print("Merged Master Table Shape (before cleaning): ", base_table.shape)
    # base_table = base_table.dropna(axis=1, thresh=int(0.9 * base_table.shape[0]))
    # base_table = base_table.dropna(axis=0)

    print("Merged Master Table Shape (after cleaning): ", base_table.shape)
    return base_table


# Reshape every source extract to one row per geoid and inner-join them all.
# The first three frames were loaded in the Extract step; the remaining
# sources are read from disk here, in the order listed.
master_table = MergeMasterTable(
    [ReshapeData(loaded) for loaded in (census_housing, census_incomes, nhpd)]
    + [
        ReshapeData(pd.read_csv(csv_path))
        for csv_path in (
            "./data/JAX-CHAS-RO.csv",
            "./data/JAX-CHAS-CB.csv",
            "./data/JAX-Demo.csv",
            "./data/JAX-CensusTransport.csv",
            "./data/JAX-Edu.csv",
            "./data/JAX-Family.csv",
        )
    ]
)

# List every merged column so the names used for feature engineering can be checked.
print(master_table.columns.tolist())

Merged Master Table Shape (before cleaning):  (207, 215)
Merged Master Table Shape (after cleaning):  (207, 215)
['geoid', 'HUD Subsidized Housing Units', 'Housing Insecurity Among Adults', 'Housing Units (USDA)', 'Housing Units with No Vehicles (USDA)', 'Housing Units with SNAP Benefits (USDA)', 'LIHTC Qualified Census Tract', 'Median Home Costs as a Percentage of Income with a Mortgage', 'Median Home Costs as a Percentage of Income without a Mortgage', 'Median Home Rent', 'Median Home Value', 'Overcrowded Housing Units', 'Percent HUD Assisted Housing with Minority Householder', 'Potential for Lead Paint (Housing Built 1979 and Earlier)', 'Total Housing Units', 'Total Occupied Housing Units', 'Total Owner Occupied Housing Units', 'Total Renter Occupied Housing Units', 'Total Vacant Housing Units', 'Walkability Employment-Housing Mix Ranking', '80/20 Household Income Disparity Ratio', 'Average Household Income in HUD Assisted Housing Units', 'Home Owner Excessive Housing Costs', 'House

In [21]:
# Feature engineering: condense the wide merged table into a compact set of
# per-geoid ratios. Columns are added in a fixed order because that order is
# what ends up in the exported table.
master_table2 = pd.DataFrame({"Geo ID": master_table["geoid"]})


def _SafeRatio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    """Divide element-wise, yielding NaN instead of +/-inf for zero denominators.

    A plain pandas division returns inf when a non-zero value is divided by
    zero, and inf is not treated as missing by ``dropna``. Masking zero
    denominators to NaN makes such rows show up as missing instead.
    """
    return numerator / denominator.where(denominator != 0)


def _SumColumns(table: pd.DataFrame, names: list[str]) -> pd.Series:
    """Add the named columns left to right; a NaN in any of them gives NaN."""
    total = table[names[0]]
    for name in names[1:]:
        total = total + table[name]
    return total


# Denominator shared by most of the features below.
# NOTE(review): the commute and education numerators look like person counts
# while this denominator counts housing units, so those features are
# "people per occupied unit" and may exceed 1 - confirm this is intended.
occupied_units = master_table["Total Occupied Housing Units"]

# Transportation Feature Engineering
COMMUTE_FEATURES: dict[str, list[str]] = {
    "Public Transport Commute Rate": [
        "People Commuting to Work by Public Transit",
    ],
    "Private Vehicle Commute Rate": [
        "People Commuting to Work by Drive Alone",
        "People Commuting to Work by Drive Carpool",
    ],
    "Walking and Cycling Commute Rate": [
        "People Commuting to Work by Walk",
        "People Commuting to Work by Bicycle",
    ],
    "Other Mobility Commute Rate": [
        "People Commuting to Work by Taxicab, Motorcycle, or Other Means",
    ],
}
for feature_name, source_columns in COMMUTE_FEATURES.items():
    master_table2[feature_name] = _SafeRatio(
        _SumColumns(master_table, source_columns), occupied_units
    )

# Housing Burden Features
# Each source column is divided by 100 and mapped 1:1 to a proportion, for
# every combination of cost-burden band, group and tenure.
BURDEN_BANDS: tuple[tuple[str, str], ...] = (
    # (wording used in the feature name, wording used in the source column)
    ("Not Cost Burdened", "less than or equal 30%"),
    ("That Are Cost Burdened", "greater than 30% less than or equal to 50%"),
    ("That Are Severely Cost Burdened", "greater than 50%"),
)
BURDEN_GROUPS: tuple[tuple[str, str], ...] = (
    # (suffix used in the feature name, wording used in the source column)
    ("W", "White alone, Non-Hispanic"),
    ("B", "Black or African-American alone, non-Hispanic"),
    ("H", "Hispanic, any race"),
)
for band_label, band_source in BURDEN_BANDS:
    for group_code, group_source in BURDEN_GROUPS:
        for tenure in ("Owner", "Renter"):
            master_table2[
                f"Proportion of Home {tenure}s {band_label} ({group_code})"
            ] = (
                master_table[
                    f"{tenure} Housing Cost Burdens - {band_source} for {group_source}"
                ]
                / 100
            )

# Education Features, Individual Person Counts
EDUCATION_FEATURES: dict[str, list[str]] = {
    "Education Rate - No High School Diploma": [
        "Education 9th to 12th Grade, No Diploma",
        "Education Less than 9th Grade",
    ],
    "Education Rate - High School Graduate": [
        "Education High School Degree",
    ],
    "Education Rate - Some College or Associate's Degree": [
        "Education Some College No Degree",
        "Education Associate Degree",
    ],
    "Education Rate - Bachelor's Degree or Higher": [
        "Education Bachelor's Degree",
        "Education Graduate Degree",
    ],
}
for feature_name, source_columns in EDUCATION_FEATURES.items():
    master_table2[feature_name] = _SafeRatio(
        _SumColumns(master_table, source_columns), occupied_units
    )

# Income Level Features, Household Counts
INCOME_FEATURES: dict[str, list[str]] = {
    "Proportion of Households at Income Level - Low": [
        "Income Less than $25,000",
    ],
    "Proportion of Households at Income Level - Middle": [
        "Income $25,000 to $49,999",
    ],
    "Proportion of Households at Income Level - High": [
        "Income $50,000 to $74,999",
        "Income $75,000 to $99,999",
    ],
    "Proportion of Households at Income Level - Very High": [
        "Income $100,000 to $124,999",
        "Income $125,000 to $149,999",
        "Income $150,000 to $199,999",
        "Income $200,000 or More",
    ],
}
for feature_name, source_columns in INCOME_FEATURES.items():
    master_table2[feature_name] = _SafeRatio(
        _SumColumns(master_table, source_columns), occupied_units
    )

# Family Makeup Features
# NOTE(review): the "_y" suffix is presumably added by the merge when two
# source tables share an indicator name - confirm which source these
# population columns come from.
CHILD_AGE_COLUMNS: list[str] = [
    "Population Age Under 5_y",
    "Population Age 5 to 9_y",
    "Population Age 10 to 14_y",
    "Population Age 15 to 19_y",
]
ADULT_AGE_COLUMNS: list[str] = [
    "Population Age 20 to 24_y",
    "Population Age 25 to 34_y",
    "Population Age 35 to 44_y",
    "Population Age 45 to 54_y",
    "Population Age 55 to 59_y",
    "Population Age 60 to 64_y",
    "Population Age 65 to 74_y",
    "Population Age 75 to 84_y",
    "Population Age Over 85_y",
]
master_table2["Children to Adult Ratio"] = _SafeRatio(
    _SumColumns(master_table, CHILD_AGE_COLUMNS),
    _SumColumns(master_table, ADULT_AGE_COLUMNS),
)

# Subsidized Housing Features
master_table2["HUD Subsidization Rate"] = _SafeRatio(
    master_table["HUD Subsidized Housing Units"], occupied_units
)
# Percent of households receiving SNAP benefits, rounded to 2 decimals.
# NOTE(review): this is the only feature on a 0-100 scale; the others are
# 0-1 proportions. Kept as-is so the exported values do not change.
master_table2["SNAP Subsidization Rate"] = (
    _SafeRatio(master_table["Households Receiving SNAP"], occupied_units)
    .mul(100)
    .round(2)
)

# General Housing Features
master_table2["Occupancy Rate"] = _SafeRatio(
    occupied_units, master_table["Total Housing Units"]
)
master_table2["Overcrowding Rate"] = _SafeRatio(
    master_table["Overcrowded Housing Units"], occupied_units
)


print("Old Master Table Shape: ", master_table.shape)
print("New Master Table Shape: ", master_table2.shape)
master_table2.head(2)

Old Master Table Shape:  (207, 215)
New Master Table Shape:  (207, 36)


Unnamed: 0,Geo ID,Public Transport Commute Rate,Private Vehicle Commute Rate,Walking and Cycling Commute Rate,Other Mobility Commute Rate,Proportion of Home Owners Not Cost Burdened (W),Proportion of Home Renters Not Cost Burdened (W),Proportion of Home Owners Not Cost Burdened (B),Proportion of Home Renters Not Cost Burdened (B),Proportion of Home Owners Not Cost Burdened (H),...,Education Rate - Bachelor's Degree or Higher,Proportion of Households at Income Level - Low,Proportion of Households at Income Level - Middle,Proportion of Households at Income Level - High,Proportion of Households at Income Level - Very High,Children to Adult Ratio,HUD Subsidization Rate,SNAP Subsidization Rate,Occupancy Rate,Overcrowding Rate
0,12031000101,0.007529,0.674196,0.041068,0.033539,0.842466,0.0,0.3,0.341772,0.5,...,0.328542,0.175907,0.406571,0.166324,0.251198,0.294971,0.041752,17.25,0.794022,0.001369
1,12031000102,0.011542,0.610882,0.0,0.0,1.0,0.510638,0.636364,0.134921,0.5,...,0.155812,0.549876,0.165705,0.178895,0.105523,0.246436,0.108821,27.12,0.79541,0.0


## Load

In [22]:
# Save the processed data to CSV files
# Destination of the processed feature table.
MASTER_TABLE_CSV = "./data/MasterTable.csv"

# Discard, in place, every row that has a missing value in any feature, then
# write the remaining rows without the index and summarise what was saved.
master_table2.dropna(axis=0, how="any", inplace=True)
master_table2.to_csv(MASTER_TABLE_CSV, index=False)
master_table2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 124 entries, 1 to 205
Data columns (total 36 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   Geo ID                                                          124 non-null    int64  
 1   Public Transport Commute Rate                                   124 non-null    float64
 2   Private Vehicle Commute Rate                                    124 non-null    float64
 3   Walking and Cycling Commute Rate                                124 non-null    float64
 4   Other Mobility Commute Rate                                     124 non-null    float64
 5   Proportion of Home Owners Not Cost Burdened (W)                 124 non-null    float64
 6   Proportion of Home Renters Not Cost Burdened (W)                124 non-null    float64
 7   Proportion of Home Owners Not Cost Burdened (B)           