# CRA Script

In [1]:
import pandas as pd
import numpy as np 
import sqlalchemy as sql
import os
import urllib.parse
import pyodbc
import warnings
warnings.filterwarnings('ignore') 

## SQL Query Setup

In [2]:
# All of the different geozones
zones = ['college', 'cpa', 'elementary', 'jurisdiction',
         'msa', 'region', 'sdcouncil', 'secondary', 'sra',
         'supervisorial', 'tract', 'transit', 'unified', 'zip']

# All of the relevant fact tables. Not included are fact.land_use, fact.price_index_sd,
# fact.syn_households, and fact.syn_persons.
# Each name needs its own comma: adjacent string literals are silently concatenated,
# which previously fused 'fact.age' with the next entry. The age/sex/ethnicity table is
# spelled with underscores to match the name used in the SQL (fact.age_sex_ethnicity).
fact_tables = ['fact.age', 'fact.age_sex_ethnicity', 'fact.ethnicity',
               'fact.household_income', 'fact.housing', 'fact.jobs',
               'fact.population', 'fact.sex']

# SQL query templates, one per relevant fact table, keyed by the table name written
# with underscores (e.g. 'fact_age' for fact.age). Each template joins dim.mgra to get
# the geozone, joins the matching dim table(s) to get readable category labels instead
# of ids, and sums the measure column(s) grouped by geozone, year and category.
# Rows are filtered to a single datasource_id and a single geotype.
# Placeholders: {ds} is the datasource_id; {zone} is the geotype and is also used as
# the output alias of the geozone column.
# NOTE(review): the code that fills the placeholders is not in this cell - presumably
# str.format(zone=..., ds=...); confirm against the caller.
fact_dict = {'fact_age': """
                         SELECT geozone as {zone}, yr_id, age_group.name, SUM(population) as population 
                         FROM fact.age as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.age_group
                         ON age_group.age_group_id = tbl.age_group_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
                         ORDER BY mgra.geozone, yr_id, tbl.age_group_id
                         """,
             'fact_age_sex_ethnicity': """
                         SELECT geozone as {zone}, yr_id, age_group.name as age_group, sex.sex as sex, 
						 ethnicity.long_name as ethnicity, SUM(population) as population 
                         FROM fact.age_sex_ethnicity as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.age_group
                         ON age_group.age_group_id = tbl.age_group_id
						 INNER JOIN dim.sex
                         ON sex.sex_id = tbl.sex_id
						 INNER JOIN dim.ethnicity
                         ON ethnicity.ethnicity_id = tbl.ethnicity_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.age_group_id, 
						 tbl.sex_id, tbl.ethnicity_id, 
						 age_group.name, sex.sex, ethnicity.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.age_group_id,
						 tbl.sex_id, tbl.ethnicity_id
                         """,
             'fact_ethnicity': """
                         SELECT geozone as {zone}, yr_id, ethnicity.long_name, SUM(population) as population 
                         FROM fact.ethnicity as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.ethnicity
                         ON ethnicity.ethnicity_id = tbl.ethnicity_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.ethnicity_id, ethnicity.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.ethnicity_id
                         """,
             'fact_household_income': """
                         SELECT geozone as {zone}, yr_id, income_group.name, SUM(households) as households 
                         FROM fact.household_income as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.income_group
                         ON income_group.income_group_id = tbl.income_group_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.income_group_id, income_group.name
                         ORDER BY mgra.geozone, yr_id, tbl.income_group_id
                         """,
            'fact_housing': """
                         SELECT geozone as {zone}, yr_id, structure_type.long_name, SUM(units) as units, 
                         SUM(unoccupiable) as unoccupiable, SUM(occupied) as occupied, SUM(vacancy) as vacancy
                         FROM fact.housing as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.structure_type
                         ON structure_type.structure_type_id = tbl.structure_type_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.structure_type_id, structure_type.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.structure_type_id
                         """,
             'fact_jobs': """
                         SELECT geozone as {zone}, yr_id, employment_type.full_name, SUM(jobs) as jobs
                         FROM fact.jobs as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.employment_type
                         ON employment_type.employment_type_id = tbl.employment_type_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.employment_type_id, employment_type.full_name
                         ORDER BY mgra.geozone, yr_id, tbl.employment_type_id
                         """,
             'fact_population': """
                         SELECT geozone as {zone}, yr_id, housing_type.long_name, SUM(population) as population
                         FROM fact.population as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.housing_type
                         ON housing_type.housing_type_id = tbl.housing_type_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.housing_type_id, housing_type.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.housing_type_id
                         """,
             'fact_sex': """
                         SELECT geozone as {zone}, yr_id, sex.sex, SUM(population) as population
                         FROM fact.sex as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.sex
                         ON sex.sex_id = tbl.sex_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.sex_id, sex.sex
                         ORDER BY mgra.geozone, yr_id, tbl.sex_id
                         """}

# Per-table settings, keyed like fact_dict:
#   'cols'          - category column(s) returned by that table's query
#   'vals'          - summed measure column(s) returned by that table's query
#   'friendly_name' - human-readable label for the table
fact_pivot = {
    'fact_age': {
        'cols': ['name'],
        'vals': ['population'],
        'friendly_name': 'Age',
    },
    'fact_age_sex_ethnicity': {
        'cols': ['age_group', 'sex', 'ethnicity'],
        'vals': ['population'],
        'friendly_name': 'Age, Sex, Ethnicity',
    },
    'fact_ethnicity': {
        'cols': ['long_name'],
        'vals': ['population'],
        'friendly_name': 'Ethnicity',
    },
    'fact_household_income': {
        'cols': ['name'],
        'vals': ['households'],
        'friendly_name': 'Household Income',
    },
    'fact_housing': {
        'cols': ['long_name'],
        'vals': ['units', 'unoccupiable', 'occupied', 'vacancy'],
        'friendly_name': 'Housing',
    },
    'fact_jobs': {
        'cols': ['full_name'],
        'vals': ['jobs'],
        'friendly_name': 'Jobs',
    },
    'fact_population': {
        'cols': ['long_name'],
        'vals': ['population'],
        'friendly_name': 'Population',
    },
    'fact_sex': {
        'cols': ['sex'],
        'vals': ['population'],
        'friendly_name': 'Sex',
    },
}

## Census Checks

In [4]:
def is_number(n):
    """Return True when ``float(n)`` succeeds, False otherwise.

    Args:
        n: Any value; typically a string or a number.

    Returns:
        bool: True if ``n`` can be converted with ``float``. Note that
        ``float`` also accepts strings such as 'nan' and 'inf', so those
        count as numbers here.
    """
    try:
        float(n)
    except (TypeError, ValueError, OverflowError):
        # ValueError: a string that is not a valid float
        # TypeError: a value float() cannot take at all (None, list, ...)
        # OverflowError: an int too large to be represented as a float
        return False
    return True

In [6]:
# Load the Census extract that is being checked
census_data = pd.read_csv(
    r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-66 Data Surfer Extract QC\Data\Census.csv'
)

# Cleaning
# Force every geography name to text first so all rows are handled the same way
census_data['geom_name'] = census_data['geom_name'].map(str)
# Numeric-looking names become floats; all other names stay as text
census_data['name'] = census_data['geom_name'].map(
    lambda geom: float(geom) if is_number(geom) else geom
)

## Age Check
# presumably datasource_id -> census year (the age query uses datasource_id 12
# together with year-2000 extract rows) - TODO confirm the 5 -> 2010 pairing
years_dict = {
    12: "2000",
    5: "2010",
}

In [7]:
# age
# Population summed per geozone, year and age group for datasource_id 12,
# covering every geotype in a single query
query ='''
SELECT geozone, yr_id, age_group.name, SUM(population) as population 
FROM fact.age as tbl
INNER JOIN dim.mgra AS mgra
ON mgra.mgra_id = tbl.mgra_id
INNER JOIN dim.age_group
ON age_group.age_group_id = tbl.age_group_id
WHERE tbl.datasource_id = 12 AND mgra.geotype IN ('college', 'cpa', 'elementary', 'jurisdiction', 
         'msa', 'region', 'sdcouncil', 'secondary', 'sra', 
         'supervisorial', 'tract', 'transit', 'unified', 'zip')
GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
ORDER BY mgra.geozone, yr_id, tbl.age_group_id
'''

# Trusted (integrated-security) ODBC connection to the demographic_warehouse database
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=demographic_warehouse;'
                    'Trusted_Connection=yes;')

# Run the query and keep the result as a DataFrame
sql_data = pd.read_sql_query(sql=query, con=conn)

In [8]:
# Build a 'name' column from geozone: numeric-looking geozones become floats,
# all other geozones are kept unchanged
sql_data['name'] = sql_data['geozone'].map(
    lambda geozone: float(geozone) if is_number(geozone) else geozone
)

In [11]:
# Keep only the year-2000 'Age' rows of the extract, and only the columns
# needed for the comparison
excel_age_data = census_data.loc[
    (census_data['category'] == 'Age') & (census_data['year'] == 2000),
    ['name', 'year', 'variable_main', 'value_main', 'geography_type'],
]

In [12]:
# Check that both sources contain the same geography names:
# first the count of names only in the extract, then the count of names only in SQL
print(len(set(excel_age_data['name']).difference(sql_data['name'])))
print(len(set(sql_data['name']).difference(excel_age_data['name'])))

0
0


In [13]:
# Display of why the counts will be different
#excel_age_data[excel_age_data['name'] == 1]
#sql_data[sql_data['name'] == 1]

In [14]:
def age_sum_check(excel_age_data, sql_age_data):
    """Compare per-geography totals between the Excel extract and the SQL data.

    Args:
        excel_age_data: DataFrame with a 'name' column (geography name) and a
            'value_main' column holding the values to total.
        sql_age_data: DataFrame with a 'name' column and a 'population' column.

    Returns:
        DataFrame indexed by each geography name found in ``excel_age_data``,
        with a single column 'SQL/Excel Match' holding 'Pass' when the two
        totals are exactly equal and 'Fail' otherwise. A name missing from
        ``sql_age_data`` is compared against a SQL total of 0. Rows whose
        'name' is missing (NaN) are ignored. An empty ``excel_age_data``
        gives an empty result rather than raising.
    """
    # One grouped sum per source instead of re-filtering both frames for every name.
    # sort=False because 'name' can mix floats and strings, which cannot be ordered.
    excel_totals = excel_age_data.groupby('name', sort=False)['value_main'].sum()
    sql_totals = sql_age_data.groupby('name', sort=False)['population'].sum()

    # Line the SQL totals up with the Excel names; absent names total 0
    sql_aligned = sql_totals.reindex(excel_totals.index, fill_value=0)
    matches = excel_totals.to_numpy() == sql_aligned.to_numpy()

    # Build the result in one step; drop the index label so the output has
    # an unnamed index of geography names
    final_df = pd.DataFrame(
        {'SQL/Excel Match': ['Pass' if ok else 'Fail' for ok in matches]},
        index=excel_totals.index.rename(None),
    )
    return final_df

In [15]:
# Census 2000 Checks 
age_sum_check(excel_age_data, sql_data)

Unnamed: 0,SQL/Excel Match
1.0,Pass
2.0,Pass
3.0,Pass
4.0,Pass
5.0,Pass
...,...
92145.0,Pass
Southwestern,Pass
170.42,Pass
92154.0,Pass
