# 2022-53 ODP Data Surfer Extract QC (Estimates)

Test Plan: https://sandag.sharepoint.com/qaqc/_layouts/15/Doc.aspx?sourcedoc={6411f490-19d0-49bd-9bf0-ab52890d61f9}&action=edit&wd=target%28Untitled%20Section.one%7C19ee188a-2490-42fc-97e2-ab6875fa748e%2FTest%20Plan%7C3e4ff64a-bad9-46e4-9a24-6dc76f5d34bf%2F%29


In [1]:
import pandas as pd
import sqlalchemy as sql

from pathlib import Path

# SQLAlchemy engine pointing at the DDAMWSQL16 server through the pymssql driver. It is passed to
# download_SQL_data as the connection for every query.
ddam = sql.create_engine('mssql+pymssql://DDAMWSQL16')

## Download Data

In [2]:
def download_raw_data(user, files):
    """
    Read the raw csv data files. Note that copies of these files were put into SharePoint.

    :param user:    The user name whose ``C:/Users/<user>`` folder holds the SharePoint copy of the
                    data. This is mostly here so that it is easy for anyone to run the code
    :param files:   A list of the files to read, each relative to the estimates ETL data folder
    :returns:       Tuple containing one dataframe per entry of ``files``, in the order they
                    appear
    """
    # The folder where raw data is stored
    data_dir = Path(f"C:/Users/{user}/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-53 ODP Data Surfer Extract QC/ETL Data/Estimates/")

    # Read every requested file, keeping the caller's order so the results can be matched
    # positionally with other per-file sequences
    return tuple(pd.read_csv(data_dir / file) for file in files)

In [3]:
def download_SQL_data(connection):
    """
    Query the demographic warehouse for the six aggregated tables (datasource_id = 40)

    Every query joins a fact table to dim.mgra, keeps a single geotype and sums the measure
    column(s) per geozone, year and category.

    :param connection:  sqlalchemy connection to DDAMWSQL16
    :returns:           Tuple of six dataframes in a fixed order: age by college, age/sex/ethnicity
                        by sdcouncil, age/sex/ethnicity by zip, housing by sra, population by
                        sdcouncil, sex by sdcouncil
    """
    # Age by college
    age_college_query = """
        SELECT geozone as 'college', yr_id, age_group.name, SUM(population) as population 
        FROM [demographic_warehouse].[fact].[age] as tbl
        INNER JOIN [demographic_warehouse].[dim].[mgra] AS mgra ON 
            mgra.mgra_id = tbl.mgra_id
        INNER JOIN [demographic_warehouse].[dim].[age_group] ON 
            age_group.age_group_id = tbl.age_group_id
        WHERE 
            tbl.datasource_id = 40 AND 
            mgra.geotype = 'college'
        GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
        ORDER BY mgra.geozone, yr_id, tbl.age_group_id
        """

    # Age, sex and ethnicity by San Diego council district
    age_sex_ethnicity_sd_council_query = """
    SELECT 
        geozone as 'sdcouncil', 
        yr_id, 
        age_group.name as age_group, 
        sex.sex as sex, 
        ethnicity.long_name as ethnicity, 
        SUM(population) as population 
    FROM [demographic_warehouse].[fact].[age_sex_ethnicity] as tbl
    INNER JOIN [demographic_warehouse].[dim].[mgra] AS mgra ON 
        mgra.mgra_id = tbl.mgra_id
    INNER JOIN [demographic_warehouse].[dim].[age_group] ON 
        age_group.age_group_id = tbl.age_group_id
    INNER JOIN [demographic_warehouse].[dim].[sex] ON 
        sex.sex_id = tbl.sex_id
    INNER JOIN [demographic_warehouse].[dim].[ethnicity] ON 
        ethnicity.ethnicity_id = tbl.ethnicity_id
    WHERE 
        tbl.datasource_id = 40 AND 
        mgra.geotype = 'sdcouncil'
    GROUP BY 
        mgra.geozone, 
        yr_id, 
        tbl.age_group_id, 
        tbl.sex_id, 
        tbl.ethnicity_id, 
        age_group.name, 
        sex.sex, 
        ethnicity.long_name
    ORDER BY mgra.geozone, yr_id, tbl.age_group_id, tbl.sex_id, tbl.ethnicity_id
    """

    # Age, sex and ethnicity by zip code
    age_sex_ethnicity_zip_query = """
    SELECT 
        geozone as 'zip', 
        yr_id, 
        age_group.name as age_group, 
        sex.sex as sex, 
        ethnicity.long_name as ethnicity, 
        SUM(population) as population 
    FROM [demographic_warehouse].[fact].[age_sex_ethnicity] as tbl
    INNER JOIN [demographic_warehouse].[dim].[mgra] AS mgra ON 
        mgra.mgra_id = tbl.mgra_id
    INNER JOIN [demographic_warehouse].[dim].[age_group] ON 
        age_group.age_group_id = tbl.age_group_id
    INNER JOIN [demographic_warehouse].[dim].[sex] ON 
        sex.sex_id = tbl.sex_id
    INNER JOIN [demographic_warehouse].[dim].[ethnicity] ON 
        ethnicity.ethnicity_id = tbl.ethnicity_id
    WHERE 
        tbl.datasource_id = 40 AND 
        mgra.geotype = 'zip'
    GROUP BY 
        mgra.geozone, 
        yr_id, 
        tbl.age_group_id, 
        tbl.sex_id, 
        tbl.ethnicity_id, 
        age_group.name, 
        sex.sex, 
        ethnicity.long_name
    ORDER BY mgra.geozone, yr_id, tbl.age_group_id, tbl.sex_id, tbl.ethnicity_id
    """

    # Housing by sra
    housing_sra_query = """
    SELECT 
        geozone as 'sra', 
        yr_id, 
        structure_type.long_name, 
        SUM(units) as units, 
        SUM(unoccupiable) as unoccupiable, 
        SUM(occupied) as occupied, 
        SUM(vacancy) as vacancy
    FROM [demographic_warehouse].[fact].[housing] as tbl
    INNER JOIN [demographic_warehouse].[dim].[mgra] AS mgra ON 
        mgra.mgra_id = tbl.mgra_id
    INNER JOIN [demographic_warehouse].[dim].[structure_type] ON 
        structure_type.structure_type_id = tbl.structure_type_id
    WHERE 
        tbl.datasource_id = 40 AND 
        mgra.geotype = 'sra'
    GROUP BY mgra.geozone, yr_id, tbl.structure_type_id, structure_type.long_name
    ORDER BY mgra.geozone, yr_id, tbl.structure_type_id
    """

    # Population (by housing type) by San Diego council district
    population_sdcouncil_query = """
    SELECT geozone as 'sdcouncil', yr_id, housing_type.long_name, SUM(population) as population
    FROM [demographic_warehouse].[fact].[population] as tbl
    INNER JOIN [demographic_warehouse].[dim].[mgra] AS mgra ON 
        mgra.mgra_id = tbl.mgra_id
    INNER JOIN [demographic_warehouse].[dim].[housing_type] ON 
        housing_type.housing_type_id = tbl.housing_type_id
    WHERE 
        tbl.datasource_id = 40 AND 
        mgra.geotype = 'sdcouncil'
    GROUP BY mgra.geozone, yr_id, tbl.housing_type_id, housing_type.long_name
    ORDER BY mgra.geozone, yr_id, tbl.housing_type_id
    """

    # Sex by San Diego council district
    sex_sdcouncil_query = """
    SELECT geozone as 'sdcouncil', yr_id, sex.sex, SUM(population) as population
    FROM [demographic_warehouse].[fact].[sex] as tbl
    INNER JOIN [demographic_warehouse].[dim].[mgra] AS mgra ON 
        mgra.mgra_id = tbl.mgra_id
    INNER JOIN [demographic_warehouse].[dim].[sex] ON 
        sex.sex_id = tbl.sex_id
    WHERE 
        tbl.datasource_id = 40 AND 
        mgra.geotype = 'sdcouncil'
    GROUP BY mgra.geozone, yr_id, tbl.sex_id, sex.sex
    ORDER BY mgra.geozone, yr_id, tbl.sex_id
    """

    # The order here is the order of the returned tuple
    ordered_queries = (
        age_college_query,
        age_sex_ethnicity_sd_council_query,
        age_sex_ethnicity_zip_query,
        housing_sra_query,
        population_sdcouncil_query,
        sex_sdcouncil_query,
    )

    # All six results are run one after another and held in memory together
    return tuple(pd.read_sql_query(query, con=connection) for query in ordered_queries)

In [4]:
# File names of the raw csv extracts. The order matters: download_SQL_data returns its tables in
# this same order, and the two results are compared position by position.
RAW_FILES = [
    Path(file_name)
    for file_name in (
        "est_ds40_fact_age_college.csv",
        "est_ds40_fact_age_sex_ethnicity_sdcouncil.csv",
        "est_ds40_fact_age_sex_ethnicity_zip.csv",
        "est_ds40_fact_housing_sra.csv",
        "est_ds40_fact_population_sdcouncil.csv",
        "est_ds40_fact_sex_sdcouncil.csv",
    )
]

# Load the csv extracts first, then the matching SQL tables.
# NOTE: the name `sql` below rebinds the alias used for the sqlalchemy import, so the import cell
# has to be re-run before another engine can be created with `sql.create_engine`.
csv = download_raw_data("eli", RAW_FILES)
sql = download_SQL_data(ddam)

## Run Tests
1. Check shape and column names
2. Check values (column sums)

In [5]:
def test(name, csv_df, sql_df, sum_cols):
    """
    Run the tests (see test plan at the top of the notebook)

    :param name:        The name of the file 
    :param csv_df:      The data as it appears in staging
    :param sql_df:      The data as it appears in SQL
    :param sum_cols:    The column(s) in list form for which to sum over to check total values
    :returns:           None, but prints out test results
    """
    # Print the file we are testing
    print("Tests for \"" + str(name) + "\"")

    # Check shape
    test_1_shape = (csv_df.shape == sql_df.shape)
    print(f"{'Shape test:' : <24}", test_1_shape)
    if(not test_1_shape):
        print(f"{'': <4}", f"{'csv shape:': <12}", csv_df.shape)
        print(f"{'': <4}", f"{'sql shape:': <12}", sql_df.shape)

    # Check columns
    test_1_col_names = (csv_df.columns == sql_df.columns)
    print(f"{'Column names test:' : <24}", test_1_col_names)
    if(not (test_1_col_names.sum() == len(test_1_col_names))):
        print(f"{'': <4}", f"{'csv columns:': <12}", csv_df.columns)
        print(f"{'': <4}", f"{'sql columns:': <12}", sql_df.columns)

    # Check values
    for sum_col in sum_cols:
        test_2_sum = (csv_df[sum_col].sum() == sql_df[sum_col].sum())
        print(f"{sum_col + ' sum test:' : <24}", test_2_sum)
        if(not test_2_sum):
            print(f"{'': <4}", f"{'csv sum:': <12}", csv_df[sum_col].sum())
            print(f"{'': <4}", f"{'sql sum:': <12}", sql_df[sum_col].sum())

    # New line at the end to space out the tests of different files
    print()

In [6]:
# For each raw file, the columns whose totals are compared between csv and SQL. Entries line up
# position by position with RAW_FILES.
SUM_COL = [
    ["population"],                                     # age by college
    ["population"],                                     # age/sex/ethnicity by sdcouncil
    ["population"],                                     # age/sex/ethnicity by zip
    ["units", "unoccupiable", "occupied", "vacancy"],   # housing by sra
    ["population"],                                     # population by sdcouncil
    ["population"],                                     # sex by sdcouncil
]

# Compare every csv extract with the SQL table at the same position
for i, raw_file in enumerate(RAW_FILES):
    test(raw_file, csv[i], sql[i], SUM_COL[i])

Tests for "est_ds40_fact_age_college.csv"
Shape test:              True
Column names test:       [ True  True  True  True]
population sum test:     True

Tests for "est_ds40_fact_age_sex_ethnicity_sdcouncil.csv"
Shape test:              True
Column names test:       [ True  True  True  True  True  True]
population sum test:     True

Tests for "est_ds40_fact_age_sex_ethnicity_zip.csv"
Shape test:              True
Column names test:       [ True  True  True  True  True  True]
population sum test:     True

Tests for "est_ds40_fact_housing_sra.csv"
Shape test:              True
Column names test:       [ True  True  True  True  True  True  True]
units sum test:          True
unoccupiable sum test:   True
occupied sum test:       True
vacancy sum test:        True

Tests for "est_ds40_fact_population_sdcouncil.csv"
Shape test:              True
Column names test:       [ True  True  True  True]
population sum test:     True

Tests for "est_ds40_fact_sex_sdcouncil.csv"
Shape test:        