# 2022-48 Census Building Permit ETL QC
Test Plan: https://sandag.sharepoint.com/:o:/r/qaqc/_layouts/15/Doc.aspx?sourcedoc=%7Bc2c3f693-42fd-4ef1-be35-d81805feb277%7D&action=editnew

## Library Imports

In [33]:
import pandas as pd
import numpy as np
import pyodbc

from pathlib import Path

## Downloading Data

In [32]:
def download_data(year):
    """
    This function downloads source, staging, and production data of Census Building Permits for the
    input year

    :param year:    The year of data to download. Used verbatim in the raw file name pattern
                    ("co{year}*") and, converted to an integer, as the "yr" filter in the SQL
                    queries
    :returns:       Tuple containing raw data (1 df), staging data (2 dfs), and production data
                    (2 dfs) for the input year, in the order (raw, staging annual, staging geo id,
                    production annual, production geo id)
    :raises ValueError: If the input year cannot be converted to an integer, or if no raw data
                    file exists for the input year
    """
    # the year is bound into the SQL queries as a parameter rather than pasted into the query
    # text, so coerce it to a plain int up front (also lets callers pass "2020" or a numpy integer)
    try:
        sql_year = int(year)
    except (TypeError, ValueError) as err:
        raise ValueError(f"year must be convertible to an integer, got {year!r}") from err

    # raw data is stored in the following folder
    raw_folder = Path(r"R:\DPOE\Census\Census Building Permits\Source\raw")

    # get raw data files for the input year (multiple files can exist)
    # sorted so the row order of the combined raw data does not depend on file system ordering
    raw_data_files = sorted(raw_folder.glob(f"co{year}*"))

    # fail with a clear message instead of the "No objects to concatenate" error that pd.concat
    # raises when it is handed an empty list
    if not raw_data_files:
        raise ValueError(f"No raw data files matching 'co{year}*' found in {raw_folder}")

    # get the raw data for the input year
    raw_data = pd.concat(
        [pd.read_csv(file, index_col=None, header=0) for file in raw_data_files],
        axis=0,
        ignore_index=True,
    )

    # the four tables to pull, in the order they are returned
    # note staging data has two tables, county_annual_dbo and county_geo_id_dbo
    # note production data has two tables, county_annual and county_geo_id
    tables = [
        "dpoe_stage.census_building_permits.county_annual_dbo",
        "dpoe_stage.census_building_permits.county_geo_id_dbo",
        "socioec_data.census_building_permits.county_annual",
        "socioec_data.census_building_permits.county_geo_id",
    ]

    # create a connection to the SQL server
    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
        'Server=DDAMWSQL16.sandag.org;'
        'Database=dpoe_stage;'
        'Trusted_Connection=yes;')

    # a pyodbc connection used in a "with" statement is not closed on exit, so close it
    # explicitly, even if one of the queries fails
    try:
        # table names are fixed strings from the list above; only the year is a bound parameter
        sql_data = [
            pd.read_sql_query(f"SELECT * FROM {table} WHERE yr=?;", conn, params=[sql_year])
            for table in tables
        ]
    finally:
        conn.close()

    # return the raw, staging, and production data
    return (raw_data, sql_data[0], sql_data[1], sql_data[2], sql_data[3])