# 2022-48 Census Building Permit ETL QC
Test Plan: https://sandag.sharepoint.com/:o:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7Bc2c3f693-42fd-4ef1-be35-d81805feb277%7D&action=editnew

## Library Imports

In [1]:
import pandas as pd
import numpy as np
import pyodbc

from pathlib import Path

# ignore warning relating to pandas and pyodbc (just ignore all warnings)
import warnings
warnings.filterwarnings("ignore")

## Downloading Data

In [2]:
def download_data_year(year):
    """
    This function downloads raw and staging data of Census Building Permits for the input year

    :param year:        The year of data to download. Must be convertible to int
    :returns:           Tuple containing raw data (1 df) and staging data (2 dfs) for the input
                        year, in the order (raw, county_annual_dbo, county_geo_id_dbo)
    :raises ValueError: If no raw data file for the input year exists in the raw data folder
    """
    # Raw data is stored in the following folder
    raw_folder = Path(r"R:\DPOE\Census\Census Building Permits\Source\raw")

    # Get raw data files for the input year (multiple files can exist). Sorted so that the row
    # order of the combined df does not depend on the order the file system lists the files in
    raw_data_files = sorted(raw_folder.glob(f"co{year}*"))

    # Fail early with a useful message, before a SQL connection is opened. Without this check
    # pd.concat would raise a much less helpful ValueError on the empty list
    if not raw_data_files:
        raise ValueError(f"No raw data files matching 'co{year}*' found in {raw_folder}")

    # There's an error in the raw data where columns get kind of messed up, easiest way to fix is
    # just to define custom column names
    column_names = ["survey_yr", "state_fips", "county_fips", "region", "division", "area_name",
                    "bldgs_U1_est", "units_U1_est", "value_U1_est", "bldgs_U2_est", "units_U2_est",
                    "value_U2_est", "bldgs_U3_4_est", "units_U3_4_est", "value_U3_4_est",
                    "bldgs_U5plus_est", "units_U5plus_est", "value_U5plus_est", "bldgs_U1_rep",
                    "units_U1_rep", "value_U1_rep", "bldgs_U2_rep", "units_U2_rep", "value_U2_rep",
                    "bldgs_U3_4_rep", "units_U3_4_rep", "value_U3_4_rep", "bldgs_U5plus_rep",
                    "units_U5plus_rep", "value_U5plus_rep"]

    # Get the raw data for the input year. The first two rows of each file are skipped (the
    # custom column names replace them) and the FIPS codes are read as strings so that leading
    # zeros are kept
    raw_data = pd.concat(
        [pd.read_csv(file, skiprows=2, names=column_names,
                     dtype={"state_fips": str, "county_fips": str})
         for file in raw_data_files],
        axis=0, ignore_index=True)

    # Get the staging data for the input year
    # Note staging data has two tables, county_annual_dbo and county_geo_id_dbo
    # The year is passed as a bound parameter ("?") rather than formatted into the SQL text
    annual_dbo_query = "SELECT * "\
        "FROM dpoe_stage.census_building_permits.county_annual_dbo WHERE yr = ?;"
    geo_id_dbo_query = "SELECT * "\
        "FROM dpoe_stage.census_building_permits.county_geo_id_dbo WHERE yr = ?;"
    query_params = [int(year)]

    # Create a connection to the SQL server
    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
        'Server=DDAMWSQL16.sandag.org;'
        'Database=dpoe_stage;'
        'Trusted_Connection=yes;')
    try:
        staging_data_annual = pd.read_sql_query(annual_dbo_query, conn, params=query_params)
        staging_data_geo_id = pd.read_sql_query(geo_id_dbo_query, conn, params=query_params)
    finally:
        # Always release the connection, even if one of the queries fails
        conn.close()

    # As per Lisbeth's email, we are not looking at production data, so only the raw and staging
    # data are returned
    return (raw_data, staging_data_annual, staging_data_geo_id)

In [3]:
def download_data(first_year, last_year):
    """
    Download raw and staging data for an inclusive range of years and stack the years together

    :param first_year:  The first year of data to download inclusive
    :param last_year:   The last year of data to download inclusive. Must not be earlier than
                        first_year
    :returns:           Tuple of three dfs, each holding every requested year: the raw data, the
                        annual staging data and the geo id staging data
    :raises ValueError: If first_year is greater than last_year
    """
    # Reject a reversed range before anything is downloaded
    if first_year > last_year:
        raise ValueError("first_year must be less than or equal to last_year")

    # One (raw, staging annual, staging geo id) tuple per requested year
    per_year = [download_data_year(year) for year in range(first_year, last_year + 1)]

    # Regroup the per-year tuples by table, then stack each table's years into a single df with
    # a fresh 0..n-1 index
    # NOTE: dtypes are left exactly as downloaded; no casting is done here
    raw_data, staging_data_annual, staging_data_geo_id = (
        pd.concat(list(frames)).reset_index(drop=True) for frames in zip(*per_year))

    return (raw_data, staging_data_annual, staging_data_geo_id)

## Transforming Data

In [4]:
def transform_data(raw_data, staging_data_annual, staging_data_geo_id):
    """
    This function transforms raw data into the (modified) format of the staging data. Key
    differences are that the case_id column is not needed and that the two staging tables are
    combined into one table. None of the input dfs are modified

    :param raw_data:            Raw data from DPOE, in the wide layout with one bldgs_/units_/
                                value_ column per units-in-building category and estimate type.
                                state_fips and county_fips must be strings
    :param staging_data_annual: Staging data from DDAMWSQL16.dpoe_stage...county_annual_dbo
    :param staging_data_geo_id: Staging data from DDAMWSQL16.dpoe_stage...county_geo_id_dbo
    :returns:                   A tuple containing raw data that has been transformed to the format
                                of the staging data and staging data that has had its format
                                updated
    :raises ValueError:         If a combined state + county FIPS code is not a 4 or 5 digit number
    """
    # Cast the "county_fips" columns in both staging tables to int to allow for joining
    staging_data_annual = staging_data_annual.astype({"county_fips": int})
    staging_data_geo_id = staging_data_geo_id.astype({"county_fips": int})

    # drop the duplicated yr column from staging_data_geo_id
    staging_data_geo_id = staging_data_geo_id.drop("yr", axis=1)

    # Combine staging data into one table
    staging_data = staging_data_annual.join(staging_data_geo_id.set_index("county_fips"),
        on="county_fips", how="left", lsuffix="l").drop_duplicates()

    # Due to a quirk of the raw data, need to drop duplicate index rows as well
    # The join above is on county_fips only, so one annual row can match several geo id rows and
    # then appears several times under the same index label; only the first match is kept
    # NOTE(review): presumably geo id holds one row per county per year, in which case joining on
    # yr as well would avoid the duplicates -- confirm before changing
    staging_data = staging_data.reset_index().drop_duplicates(subset="index",
        keep="first").set_index("index")

    # drop the case_id column
    staging_data = staging_data.drop("case_id", axis=1)

    # Transform raw data into the new staging data format
    # The steps to do so have been summarized right here:
    # 1. Combine state_fips and county_fips, make sure they are all 4 or 5 digit numbers
    # 2. Drop extraneous columns
    # 3. Reshape data wide to long
    # 4. Create the units_in_building column
    # 5. Create the estimate_type column
    # 6. Clean up the units and values columns
    # 7. Take only the columns we want
    # 8. Rename columns to fit the staging data

    # Staging labels for the two parts of a raw column name such as "bldgs_U3_4_rep"
    unit_labels = {"U1": "1", "U2": "2", "U3_4": "3-4", "U5plus": "5+"}
    estimate_labels = {"rep": "Reported Only", "est": "With Imputation"}

    # 1. Combine state_fips and county_fips, make sure they are all 4 or 5 digit numbers
    # assign() builds a new df, so the df passed in by the caller is left untouched
    wide_data = raw_data.assign(
        county_fips=(raw_data["state_fips"] + raw_data["county_fips"]).astype(int))
    if wide_data["county_fips"].min() < 1000 or wide_data["county_fips"].max() >= 100000:
        raise ValueError("County FIPS out of range")

    # 2. Drop extraneous columns
    wide_data = wide_data.drop(["state_fips"], axis=1)

    # 3. Reshape data wide to long
    # Only the bldgs_* columns are melted, giving one row per (input row, bldgs_* column) with the
    # column name in "variable" and its value in "buildings". Every other column, including all
    # units_* and value_* columns, is carried along unchanged for step 6
    building_columns = [col for col in wide_data.columns if col.startswith("bldgs_")]
    id_columns = [col for col in wide_data.columns if col not in building_columns]
    long_data = pd.melt(wide_data, id_vars=id_columns, value_vars=building_columns,
        var_name="variable", value_name="buildings")

    # Split "bldgs_<unit code>_<rep|est>" into its unit code and estimate type parts
    name_parts = long_data["variable"].str.extract(r"^bldgs_(?P<unit>.+)_(?P<kind>rep|est)$")

    # 4. Create the units_in_building column
    long_data["units_in_building"] = name_parts["unit"].map(unit_labels)

    # 5. Create the estimate_type column
    long_data["estimate_type"] = name_parts["kind"].map(estimate_labels)

    # 6. Clean up the units and values columns
    # Each long row takes units/valuation from the units_*/value_* column that has the same unit
    # code and estimate type as the bldgs_* column the row came from
    long_data["units"] = None
    long_data["valuation"] = None
    for unit_code in unit_labels:
        for kind in estimate_labels:
            suffix = f"{unit_code}_{kind}"
            rows = long_data["variable"] == f"bldgs_{suffix}"
            long_data.loc[rows, "units"] = long_data[f"units_{suffix}"]
            long_data.loc[rows, "valuation"] = long_data[f"value_{suffix}"]

    # 7. Take only the columns we want
    long_data = long_data[["survey_yr", "county_fips", "estimate_type", "units_in_building",
        "buildings", "units", "valuation", "area_name", "region", "division"]]

    # 8. Rename columns to fit the staging data
    long_data = long_data.rename(columns={"area_name": "County_Name", "region": "Region_Code",
        "division": "Division_Code", "survey_yr": "yr"})

    return long_data, staging_data

## Initialize Data

In [5]:
# the years (inclusive) of data to check
start_year = 2017
end_year = 2021

# download the raw data and both staging tables for every year from start_year to end_year
raw_data, staging_data_annual, staging_data_geo_id = download_data(start_year, end_year)

# transform the data: raw_data is rebound to its staging-format version and the two staging
# tables are combined into the single staging_data df used by the tests below
raw_data, staging_data = transform_data(raw_data, staging_data_annual, staging_data_geo_id)

In [6]:
# Preview the first rows of the transformed raw data
raw_data.head()

Unnamed: 0,yr,county_fips,estimate_type,units_in_building,buildings,units,valuation,County_Name,Region_Code,Division_Code
0,2017,1001,With Imputation,1,188,188,51711467,Autauga County,3,6
1,2017,1003,With Imputation,1,2207,2207,449821615,Baldwin County,3,6
2,2017,1005,With Imputation,1,3,3,901000,Barbour County,3,6
3,2017,1007,With Imputation,1,10,10,2551290,Bibb County,3,6
4,2017,1009,With Imputation,1,18,18,4265412,Blount County,3,6


In [7]:
# Preview the first rows of the combined staging data
staging_data.head()

Unnamed: 0_level_0,yr,county_fips,estimate_type,units_in_building,buildings,units,valuation,County_Name,Region_Code,Division_Code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,2017,1001,With Imputation,1,188,188,51711467,Autauga County,3,6
1,2017,1003,With Imputation,1,2207,2207,449821615,Baldwin County,3,6
2,2017,1005,With Imputation,1,3,3,901000,Barbour County,3,6
3,2017,1007,With Imputation,1,10,10,2551290,Bibb County,3,6
4,2017,1009,With Imputation,1,18,18,4265412,Blount County,3,6


## Run Tests

Tests are as follows:
1. Check that the transformed raw data and the staging (SQL) data have the same shape, i.e. the same number of rows and columns
2. Check that the total number of buildings for each units-in-building category (1, 2, 3-4, 5+) is the same in both datasets
3. Check that the column names are the same, and in the same order, in both datasets
4. Check that the number of rows for each year in raw data and staging data are the same

 



In [8]:
# Test 1: Check that the shapes are the same
# shape is a (rows, columns) tuple, so this evaluates to a single True/False
raw_data.shape == staging_data.shape

True

In [9]:
# Test 2: Check that there are the same number of each kind of building
# The "buildings" column is selected before summing so that only that column is aggregated;
# summing the whole df first would also aggregate the text columns, which is wasteful
r = raw_data.groupby("units_in_building")["buildings"].sum()

# The staging "buildings" column is converted to a numeric dtype before it is summed
staging_data["buildings"] = pd.to_numeric(staging_data["buildings"])
s = staging_data.groupby("units_in_building")["buildings"].sum()

# One True/False per units_in_building category
r == s


units_in_building
1      True
2      True
3-4    True
5+     True
Name: buildings, dtype: bool

In [10]:
# Test 3: Check columns are the same
# The comparison is element-wise and by position, so the result is one boolean per column
# NOTE(review): if the two dfs had a different number of columns this would presumably raise
# rather than evaluate to False -- confirm
raw_data.columns == staging_data.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [11]:
# Test 4: Check that the number of rows for each year in raw data and staging data are the same
# Compare year -> row count mappings. Comparing only the lists of counts would drop the year
# labels, so two datasets whose counts were swapped between years would wrongly pass
# yr is cast to int on both sides so that a dtype difference alone cannot cause a mismatch
r = raw_data["yr"].astype(int).value_counts().to_dict()
s = staging_data["yr"].astype(int).value_counts().to_dict()

r == s

True