# 2022-48 Census Building Permit ETL QC
Test Plan: https://sandag.sharepoint.com/:o:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7Bc2c3f693-42fd-4ef1-be35-d81805feb277%7D&action=editnew

## Library Imports

In [4]:
import pandas as pd
import numpy as np
import pyodbc

from pathlib import Path

# ignore warning relating to pandas and pyodbc (just ignore all warnings)
import warnings
warnings.filterwarnings("ignore")

## Downloading Data

In [113]:
def download_data_year(year):
    """
    This function downloads raw and staging data of Census Building Permits for the input year

    :param year:    The year of data to download
    :returns:       Tuple containing raw data (1 df) and staging data (2 dfs) for the input year
    """
    # Raw data is stored in the following folder
    raw_folder = Path(r"R:\DPOE\Census\Census Building Permits\Source\raw")
    
    # Get raw data files for the input year (multiple files can exist)
    raw_data_files = raw_folder.glob(f"co{year}*")

    # There's an error in the raw data where columns get kind of messed up, easiest way to fix is 
    # just to define custom column names
    column_names = ["survey_yr", "state_fips", "county_fips", "region", "division", "area_name",
                    "bldgs_U1_est", "units_U1_est", "value_U1_est", "bldgs_U2_est", "units_U2_est", 
                    "value_U2_est", "bldgs_U3_4_est", "units_U3_4_est", "value_U3_4_est",
                    "bldgs_U5plus_est", "units_U5plus_est", "value_U5plus_est", "bldgs_U1_rep",
                    "units_U1_rep", "value_U1_rep", "bldgs_U2_rep", "units_U2_rep", "value_U2_rep", 
                    "bldgs_U3_4_rep", "units_U3_4_rep", "value_U3_4_rep", "bldgs_U5plus_rep",
                    "units_U5plus_rep", "value_U5plus_rep"]

    # Get the raw data for the input year
    df_list = []
    for file in raw_data_files:
        df_list.append(pd.read_csv(file, skiprows=2, names=column_names, dtype={"state_fips": str,
            "county_fips": str}))
    raw_data = pd.concat(df_list, axis=0, ignore_index=True)

    # Create a connection to the SQL server
    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
        'Server=DDAMWSQL16.sandag.org;'
        'Database=dpoe_stage;'
        'Trusted_Connection=yes;')

    # Get the staging data for the input year
    # Note staging data has two tables, county_annual_dbo and county_geo_id_dbo
    annual_dbo_query = "SELECT * "\
        f"FROM dpoe_stage.census_building_permits.county_annual_dbo WHERE yr={year};"
    geo_id_dbo_query = "SELECT * "\
        f"FROM dpoe_stage.census_building_permits.county_geo_id_dbo WHERE yr={year};"
    staging_data = [pd.read_sql_query(annual_dbo_query, conn), 
        pd.read_sql_query(geo_id_dbo_query, conn)]

    # As per Lisbeth's email, we are not looking at production data
    # # Get the production data for the input year
    # # Note production data has two tables, county_annual and county_geo_id
    # annual_query = "SELECT * "\
    #     f"FROM socioec_data.census_building_permits.county_annual WHERE yr={year};"
    # geo_id_query = "SELECT * "\
    #     f"FROM socioec_data.census_building_permits.county_geo_id WHERE yr={year};"
    # production_data = [
    #     pd.read_sql_query(annual_query, conn), 
    #     pd.read_sql_query(geo_id_query, conn)
    #     ]

    # return the raw, staging, and production data
    # return (raw_data, staging_data[0], staging_data[1], production_data[0], production_data[1])
    return (raw_data, staging_data[0], staging_data[1])

In [125]:
def download_data(first_year, last_year):
    """
    This function collates the logic needed to download every year of raw and staging data

    :param first_year:  The first year of data to download inclusive
    :param last_year:   The last year of data to download inclusive. first_year must be less than
                        or equal to last_year
    :returns:           Tuple containing combined raw data (1 df) and combined staging data (2 dfs)
                        for every requested year 
    """
    # Make sure the years are ok
    if(first_year > last_year):
        raise ValueError("first_year must be less than or equal to last_year")

    # Store raw and staging data here
    raw_data_dfs = []
    staging_data_dfs = [[], []]

    # Loop over every year requested
    for year in range(first_year, last_year+1):
        # Get the raw and staging data for that year
        raw_data, staging_data_annual, staging_data_geo_id = download_data_year(year)

        # Add the data to the storage lists
        raw_data_dfs.append(raw_data)
        staging_data_dfs[0].append(staging_data_annual)
        staging_data_dfs[1].append(staging_data_geo_id)

    # Combine the various years of raw data into one df
    raw_data = pd.concat(raw_data_dfs).reset_index(drop=True)

    # combine the various years of staging data into one df each
    staging_data_annual = pd.concat(staging_data_dfs[0]).reset_index(drop=True)
    staging_data_geo_id = pd.concat(staging_data_dfs[1]).reset_index(drop=True)

    return (raw_data, staging_data_annual, staging_data_geo_id)

## Transforming Data

In [181]:
def transform_data(raw_data, staging_data_annual, staging_data_geo_id):
    """
    This function transforms raw data into the (modified) format of the staging data. Key
    differences include the assumption that only San Diego data is needed and that staging data
    will be transformed to only be one table

    :param raw_data:            Raw data from DPOE
    :param staging_data_annual: Staging data from DDAMWSQL16.dpoe_stage...county_annual_dbo
    :param staging_data_geo_id: Staging data from DDAMWSQL16.dpoe_stage...county_geo_id_dbo
    :returns:                   A tuple containing raw data that has been transformed to the format 
                                of the staging data and staging data that has had its format 
                                updated
    """
    # Cast the "county_fips" columns in both staging tables to int to allow for joining
    staging_data_annual = staging_data_annual.astype({"county_fips": int})
    staging_data_geo_id = staging_data_geo_id.astype({"county_fips": int})

    # drop the duplicated yr column from staging_data_geo_id
    staging_data_geo_id = staging_data_geo_id.drop("yr", axis=1)

    # Combine staging data into one table
    staging_data = staging_data_annual.join(staging_data_geo_id.set_index("county_fips"), 
        on="county_fips", how="left", lsuffix="l").drop_duplicates()

    # Due to a quirk of the raw data, need to drop duplicate index rows as well
    staging_data = staging_data.reset_index().drop_duplicates(subset="index", 
        keep="first").set_index("index")

    # Transform raw data into the new staging data format
    # The steps to do so have been summarized right here:
    # 1. Combine state_fips and county_fips, make sure they are all 4 or 5 digit numbers
    # 2. Drop extraneous columns
    # 3. Reshape data wide to long
    # 4. Create the units_in_building column
    # 5. Create the estimate_type column
    # 6. Clean up the units and values columns
    # etc.
    
    # 1. Combine state_fips and county_fips, make sure they are all 4 or 5 digit numbers
    raw_data["county_fips"] = raw_data["state_fips"] + raw_data["county_fips"]
    raw_data = raw_data.astype({"county_fips": int})
    if(raw_data["county_fips"].min() < 1000 or raw_data["county_fips"].max() >= 100000):
        raise ValueError("County FIPS out of range")

    # 2. Drop extraneous columns
    raw_data = raw_data.drop(["state_fips"], axis=1)
    
    # 3. Reshape data wide to long
    # I don't really know how to use melt all that well tbh, so this is kinda hacky
    raw_data = pd.melt(raw_data, id_vars=["survey_yr", "county_fips", "area_name", "region", 
        "division"] + list(filter(lambda x: ("units" in x) or ("value" in x), raw_data.columns)), 
        var_name="variable", value_name="counts")

    # 4. Create the units_in_building column
    # Surely there is a better way to do this, but oh well
    units_list = ["1", "2", "3-4", "5+"]
    raw_data["units_in_building"] = raw_data["variable"].replace({
        ".*1.*": units_list[0],
        ".*2.*": units_list[1],
        ".*3.*": units_list[2],
        ".*5.*": units_list[3]
    }, regex=True)

    # 5. Create the estimate_type column
    raw_data["estimate_type"] = raw_data["variable"].replace({
        ".*_rep": "Reported Only",
        ".*_est": "With Imputation"
    }, regex=True)
    
    # 6. Clean up the units and values columns
    raw_data["units"] = None
    raw_data["valuation"] = None
    for unit_count in ["U1", "U2", "U3_4", "U5plus"]:
        for estimate_type in ["_rep", "_est"]:
            pass

    return raw_data, staging_data

# the years (inclusive) of data to check
start_year = 2017
end_year = 2021

# download the data
raw_data, staging_data_annual, staging_data_geo_id = download_data(start_year, end_year)

print(raw_data.columns)

# # transform the data
# raw_data, staging_data = transform_data(raw_data, staging_data_annual, staging_data_geo_id)

a, b = transform_data(raw_data, staging_data_annual, staging_data_geo_id)

print(b.columns)

a

Index(['survey_yr', 'state_fips', 'county_fips', 'region', 'division',
       'area_name', 'bldgs_U1_est', 'units_U1_est', 'value_U1_est',
       'bldgs_U2_est', 'units_U2_est', 'value_U2_est', 'bldgs_U3_4_est',
       'units_U3_4_est', 'value_U3_4_est', 'bldgs_U5plus_est',
       'units_U5plus_est', 'value_U5plus_est', 'bldgs_U1_rep', 'units_U1_rep',
       'value_U1_rep', 'bldgs_U2_rep', 'units_U2_rep', 'value_U2_rep',
       'bldgs_U3_4_rep', 'units_U3_4_rep', 'value_U3_4_rep',
       'bldgs_U5plus_rep', 'units_U5plus_rep', 'value_U5plus_rep'],
      dtype='object')
Index(['yr', 'county_fips', 'estimate_type', 'units_in_building', 'case_id',
       'buildings', 'units', 'valuation', 'County_Name', 'Region_Code',
       'Division_Code'],
      dtype='object')


Unnamed: 0,survey_yr,county_fips,area_name,region,division,units_U1_est,value_U1_est,units_U2_est,value_U2_est,units_U3_4_est,...,units_U3_4_rep,value_U3_4_rep,units_U5plus_rep,value_U5plus_rep,variable,counts,units_in_building,estimate_type,units,valuation
0,2017,1001,Autauga County,3,6,188,51711467,0,0,0,...,0,0,0,0,bldgs_U1_est,188,1,With Imputation,,
1,2017,1003,Baldwin County,3,6,2207,449821615,92,4240286,98,...,98,4034508,20,2091690,bldgs_U1_est,2207,1,With Imputation,,
2,2017,1005,Barbour County,3,6,3,901000,0,0,0,...,0,0,0,0,bldgs_U1_est,3,1,With Imputation,,
3,2017,1007,Bibb County,3,6,10,2551290,0,0,0,...,0,0,0,0,bldgs_U1_est,10,1,With Imputation,,
4,2017,1009,Blount County,3,6,18,4265412,0,0,0,...,0,0,0,0,bldgs_U1_est,18,1,With Imputation,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121427,2021,56037,Sweetwater County,4,8,49,15089171,0,0,0,...,0,0,0,0,bldgs_U5plus_rep,0,5+,Reported Only,,
121428,2021,56039,Teton County,4,8,177,465810864,22,9483190,15,...,0,0,0,0,bldgs_U5plus_rep,0,5+,Reported Only,,
121429,2021,56041,Uinta County,4,8,36,10474129,0,0,0,...,0,0,0,0,bldgs_U5plus_rep,0,5+,Reported Only,,
121430,2021,56043,Washakie County,4,8,1,170000,0,0,0,...,0,0,0,0,bldgs_U5plus_rep,0,5+,Reported Only,,


In [166]:
a["variable"].value_counts()

bldgs_U1_est        15179
units_U1_est        15179
units_U5plus_rep    15179
bldgs_U5plus_rep    15179
value_U3_4_rep      15179
units_U3_4_rep      15179
bldgs_U3_4_rep      15179
value_U2_rep        15179
units_U2_rep        15179
bldgs_U2_rep        15179
value_U1_rep        15179
units_U1_rep        15179
bldgs_U1_rep        15179
value_U5plus_est    15179
units_U5plus_est    15179
bldgs_U5plus_est    15179
value_U3_4_est      15179
units_U3_4_est      15179
bldgs_U3_4_est      15179
value_U2_est        15179
units_U2_est        15179
bldgs_U2_est        15179
value_U1_est        15179
value_U5plus_rep    15179
Name: variable, dtype: int64

In [149]:
b

Unnamed: 0_level_0,yr,county_fips,estimate_type,units_in_building,case_id,buildings,units,valuation,County_Name,Region_Code,Division_Code
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,2017,1001,With Imputation,1,5,188,188,51711467,Autauga County,3,6
1,2017,1003,With Imputation,1,13,2207,2207,449821615,Baldwin County,3,6
2,2017,1005,With Imputation,1,21,3,3,901000,Barbour County,3,6
3,2017,1007,With Imputation,1,29,10,10,2551290,Bibb County,3,6
4,2017,1009,With Imputation,1,37,18,18,4265412,Blount County,3,6
...,...,...,...,...,...,...,...,...,...,...,...
121427,2021,56037,Reported Only,5+,121396,0,0,0,Sweetwater County,4,8
121428,2021,56039,Reported Only,5+,121404,0,0,0,Teton County,4,8
121429,2021,56041,Reported Only,5+,121412,0,0,0,Uinta County,4,8
121430,2021,56043,Reported Only,5+,121420,0,0,0,Washakie County,4,8


## Initialize Data

In [147]:
# the years (inclusive) of data to check
start_year = 2017
end_year = 2021

# download the data
raw_data, staging_data_annual, staging_data_geo_id = download_data(start_year, end_year)

# # transform the data
# raw_data, staging_data = transform_data(raw_data, staging_data_annual, staging_data_geo_id)

## Run Tests

Tests are as follows:
1. etc...

In [None]:
# TODO