# CRA Script

In [1]:
import pandas as pd
import numpy as np 
import sqlalchemy as sql
import os
import urllib.parse
import pyodbc
import warnings
warnings.filterwarnings('ignore') 

# Census Checks

## Census SQL Query Setup

In [13]:
# Geography types (dim.mgra.geotype values) that are checked.
zones = ['college', 'cpa', 'elementary', 'jurisdiction', 
         'msa', 'region', 'sdcouncil', 'secondary', 'sra', 
         'supervisorial', 'tract', 'transit', 'unified', 'zip']

# One row per geography type; 'SQL' is filled here, 'Extract' is filled later.
final_total = pd.DataFrame(columns=['SQL', 'Extract'], index=zones)

# A single connection serves every query and is always closed afterwards,
# instead of opening (and leaking) a new connection for each query.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')
try:
    for particular_zone in zones:
        zone = particular_zone  # name interpolated into the SQL text below

        # The eight grand-total queries for this geography type, restricted to
        # datasource_id 12 and 5. Named `queries` (not `list`) so the builtin
        # `list` stays usable in later cells.
        queries = [f"""with t AS

        (
        SELECT geozone, yr_id, age_group.name, SUM(population) as population 
                                FROM fact.age as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.age_group
                                ON age_group.age_group_id = tbl.age_group_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
                                --ORDER BY mgra.geozone, yr_id, tbl.age_group_id
        )

        SELECT sum(population) AS total
        from t
                                """,
            # fact.age_sex_ethnicity: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, age_group.name as age_group, sex.sex as sex, 
                                ethnicity.long_name as ethnicity, SUM(population) as population 
                                FROM fact.age_sex_ethnicity as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.age_group
                                ON age_group.age_group_id = tbl.age_group_id
                                INNER JOIN dim.sex
                                ON sex.sex_id = tbl.sex_id
                                INNER JOIN dim.ethnicity
                                ON ethnicity.ethnicity_id = tbl.ethnicity_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.age_group_id, 
                                tbl.sex_id, tbl.ethnicity_id, 
                                age_group.name, sex.sex, ethnicity.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.age_group_id,
                                --tbl.sex_id, tbl.ethnicity_id
        )

        SELECT sum(population) AS total
        from t

                                """,
            # fact.ethnicity: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, ethnicity.long_name, SUM(population) as population 
                                FROM fact.ethnicity as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.ethnicity
                                ON ethnicity.ethnicity_id = tbl.ethnicity_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.ethnicity_id, ethnicity.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.ethnicity_id
        )

        SELECT sum(population) AS total
        from t
                                """,
            # fact.household_income: households
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, income_group.name, SUM(households) as households 
                                FROM fact.household_income as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.income_group
                                ON income_group.income_group_id = tbl.income_group_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.income_group_id, income_group.name
                                --ORDER BY mgra.geozone, yr_id, tbl.income_group_id
        )

        SELECT sum(households) AS total
        from t

                                """,
            # fact.housing: units + unoccupiable + occupied + vacancy
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, structure_type.long_name, SUM(units) as units, 
                                SUM(unoccupiable) as unoccupiable, SUM(occupied) as occupied, SUM(vacancy) as vacancy
                                FROM fact.housing as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.structure_type
                                ON structure_type.structure_type_id = tbl.structure_type_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.structure_type_id, structure_type.long_name
                                -- ORDER BY mgra.geozone, yr_id, tbl.structure_type_id
        )

        SELECT sum(units) + sum(unoccupiable) + sum(occupied) + sum(vacancy) AS total
        from t
                                """,
            # fact.jobs: jobs
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, employment_type.full_name, SUM(jobs) as jobs
                                FROM fact.jobs as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.employment_type
                                ON employment_type.employment_type_id = tbl.employment_type_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.employment_type_id, employment_type.full_name
                                --ORDER BY mgra.geozone, yr_id, tbl.employment_type_id
        )

        SELECT sum(jobs) AS total
        from t

                                """,
            # fact.population: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, housing_type.long_name, SUM(population) as population
                                FROM fact.population as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.housing_type
                                ON housing_type.housing_type_id = tbl.housing_type_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.housing_type_id, housing_type.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.housing_type_id
        )

        SELECT sum(population) AS total
        from t

                                """,
            # fact.sex: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, sex.sex, SUM(population) as population
                                FROM fact.sex as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.sex
                                ON sex.sex_id = tbl.sex_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.sex_id, sex.sex
                                --ORDER BY mgra.geozone, yr_id, tbl.sex_id
        )

        SELECT sum(population) AS total
        from t

                                """]

        # Add the single 'total' value returned by each query.
        temp_total = 0
        for query in queries:
            sql_data = pd.read_sql_query(query, conn)
            # Extract the scalar explicitly; float(Series) is deprecated.
            temp_total = temp_total + float(sql_data['total'].iloc[0])

        print(particular_zone, temp_total)
        # .loc writes into final_total itself; chained indexing
        # (final_total['SQL'][zone] = ...) may only modify a temporary copy.
        final_total.loc[particular_zone, 'SQL'] = temp_total
finally:
    conn.close()

college 35910689.88033365
cpa 28953642.81429647
elementary 25863534.301965833
jurisdiction 35910689.88033365
msa 35910689.88033365
region 35910689.88033365
sdcouncil 26298621.865699213
secondary 25863534.301965833
sra 35910689.88033365
supervisorial 35910689.88033365
tract 35910689.88033365
transit 18809757.856982708
unified 28856913.435350522
zip 35910689.88033365


## Census - Excel output data

In [4]:
def is_number(n):
    """Return True if ``n`` can be converted by ``float()``, otherwise False.

    ``float()`` raises ``ValueError`` for non-numeric strings and
    ``TypeError`` for unsupported types such as ``None``; both are
    treated as "not a number" so callers can pass raw column values.
    """
    try:
        float(n)
    except (TypeError, ValueError):
        return False
    return True

In [5]:
# Load the census extract CSV from the shared QA/QC folder.
census_extract_path = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-66 Data Surfer Extract QC\Data\Census.csv'
census_data = pd.read_csv(census_extract_path)

# Cleaning: store every geography name as a string first...
census_data['geom_name'] = census_data['geom_name'].map(str)
# ...then derive 'name', holding a float where the name is numeric and the
# original string otherwise.
census_data['name'] = census_data['geom_name'].map(
    lambda geom_label: float(geom_label) if is_number(geom_label) else geom_label
)

In [16]:
# Sum 'value_main' over the extract rows of each geography type and store the
# result in the 'Extract' column of final_total.
for particular_zone in zones:
    temp_df = census_data[census_data['geography_type'] == particular_zone]
    temp_total = temp_df['value_main'].sum()
    print(particular_zone, temp_total)
    # .loc writes into final_total itself; chained indexing
    # (final_total['Extract'][zone] = ...) may only modify a temporary copy.
    final_total.loc[particular_zone, 'Extract'] = temp_total

college 35910689.88033365
cpa 28953642.81429647
elementary 25863534.301965833
jurisdiction 35910689.88033365
msa 35910689.88033365
region 35910689.88033365
sdcouncil 26298621.865699213
secondary 25863534.301965833
sra 35910689.88033365
supervisorial 35910689.88033365
tract 35910689.88033365
transit 18809757.856982708
unified 28856913.435350522
zip 35910689.88033365


In [19]:
# Flag each geography type whose SQL total equals its extract total exactly.
totals_match = final_total['SQL'].eq(final_total['Extract'])
final_total['Identical'] = totals_match

In [20]:
# Display the SQL vs. extract comparison table.
final_total

Unnamed: 0,SQL,Extract,Identical
college,35910689.880334,35910689.880334,True
cpa,28953642.814296,28953642.814296,True
elementary,25863534.301966,25863534.301966,True
jurisdiction,35910689.880334,35910689.880334,True
msa,35910689.880334,35910689.880334,True
region,35910689.880334,35910689.880334,True
sdcouncil,26298621.865699,26298621.865699,True
secondary,25863534.301966,25863534.301966,True
sra,35910689.880334,35910689.880334,True
supervisorial,35910689.880334,35910689.880334,True


## Checking Identical GEOM Names (Checking Age)
- Use data sources 12 and 5, which map to years as follows:
years_dict = {12:"2000", 5:"2010"}

In [7]:
# Age population per geozone, year and age group for datasource_id 12,
# across all geography types checked in this notebook.
query ='''
SELECT geozone, yr_id, age_group.name, SUM(population) as population 
FROM fact.age as tbl
INNER JOIN dim.mgra AS mgra
ON mgra.mgra_id = tbl.mgra_id
INNER JOIN dim.age_group
ON age_group.age_group_id = tbl.age_group_id
WHERE tbl.datasource_id = 12 AND mgra.geotype IN ('college', 'cpa', 'elementary', 'jurisdiction', 
         'msa', 'region', 'sdcouncil', 'secondary', 'sra', 
         'supervisorial', 'tract', 'transit', 'unified', 'zip')
GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
ORDER BY mgra.geozone, yr_id, tbl.age_group_id
'''

conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=demographic_warehouse;'
                    'Trusted_Connection=yes;')
try:
    sql_data =  pd.read_sql_query(query, conn)
finally:
    # Always release the database connection, even if the query fails.
    conn.close()

In [8]:
# Derive 'name' from 'geozone': a float where the geozone is numeric, the
# original value otherwise.
sql_data['name'] = sql_data['geozone'].map(
    lambda geozone_label: float(geozone_label) if is_number(geozone_label) else geozone_label
)

In [11]:
# Keep only the year-2000 'Age' rows of the extract, restricted to the
# columns needed for the comparison.
age_2000_rows = census_data['year'].eq(2000) & census_data['category'].eq('Age')
comparison_columns = ['name', 'year', 'variable_main', 'value_main', 'geography_type']
excel_age_data = census_data.loc[age_2000_rows, comparison_columns]

In [12]:
# Check that the extract and the SQL result contain the same set of names.
# Each printed count is the number of names present on one side only, so
# both should be 0. No list() wrapper is used: len() works directly on a
# set, and the builtin `list` may have been rebound by another cell.
excel_names = set(excel_age_data['name'])
sql_names = set(sql_data['name'])
print(len(excel_names - sql_names))
print(len(sql_names - excel_names))

0
0


# Forecast Check 

In [None]:
# Load the extract data used by this section.
# NOTE(review): this reads Census.csv, the same file as the census section,
# although the heading is "Forecast Check" - confirm the intended file.
extract_csv_path = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-66 Data Surfer Extract QC\Data\Census.csv'
census_data = pd.read_csv(extract_csv_path)

In [23]:
# Geography types (dim.mgra.geotype values) that are checked.
zones = ['college', 'cpa', 'elementary', 'jurisdiction', 
         'msa', 'region', 'sdcouncil', 'secondary', 'sra', 
         'supervisorial', 'tract', 'transit', 'unified', 'zip']

# One row per geography type; only the 'SQL' column is filled by this cell.
final_total = pd.DataFrame(columns=['SQL', 'Extract'], index=zones)

# A single connection serves every query and is always closed afterwards,
# instead of opening (and leaking) a new connection for each query.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')
try:
    for particular_zone in zones:
        zone = particular_zone  # name interpolated into the SQL text below

        # The eight grand-total queries for this geography type, restricted to
        # datasource_id 6, 13 and 35. Named `queries` (not `list`) so the
        # builtin `list` stays usable in other cells.
        queries = [f"""with t AS

        (
        SELECT geozone, yr_id, age_group.name, SUM(population) as population 
                                FROM fact.age as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.age_group
                                ON age_group.age_group_id = tbl.age_group_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
                                --ORDER BY mgra.geozone, yr_id, tbl.age_group_id
        )

        SELECT sum(population) AS total
        from t
                                """,
            # fact.age_sex_ethnicity: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, age_group.name as age_group, sex.sex as sex, 
                                ethnicity.long_name as ethnicity, SUM(population) as population 
                                FROM fact.age_sex_ethnicity as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.age_group
                                ON age_group.age_group_id = tbl.age_group_id
                                INNER JOIN dim.sex
                                ON sex.sex_id = tbl.sex_id
                                INNER JOIN dim.ethnicity
                                ON ethnicity.ethnicity_id = tbl.ethnicity_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.age_group_id, 
                                tbl.sex_id, tbl.ethnicity_id, 
                                age_group.name, sex.sex, ethnicity.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.age_group_id,
                                --tbl.sex_id, tbl.ethnicity_id
        )

        SELECT sum(population) AS total
        from t

                                """,
            # fact.ethnicity: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, ethnicity.long_name, SUM(population) as population 
                                FROM fact.ethnicity as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.ethnicity
                                ON ethnicity.ethnicity_id = tbl.ethnicity_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.ethnicity_id, ethnicity.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.ethnicity_id
        )

        SELECT sum(population) AS total
        from t
                                """,
            # fact.household_income: households
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, income_group.name, SUM(households) as households 
                                FROM fact.household_income as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.income_group
                                ON income_group.income_group_id = tbl.income_group_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.income_group_id, income_group.name
                                --ORDER BY mgra.geozone, yr_id, tbl.income_group_id
        )

        SELECT sum(households) AS total
        from t

                                """,
            # fact.housing: units + unoccupiable + occupied + vacancy
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, structure_type.long_name, SUM(units) as units, 
                                SUM(unoccupiable) as unoccupiable, SUM(occupied) as occupied, SUM(vacancy) as vacancy
                                FROM fact.housing as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.structure_type
                                ON structure_type.structure_type_id = tbl.structure_type_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.structure_type_id, structure_type.long_name
                                -- ORDER BY mgra.geozone, yr_id, tbl.structure_type_id
        )

        SELECT sum(units) + sum(unoccupiable) + sum(occupied) + sum(vacancy) AS total
        from t
                                """,
            # fact.jobs: jobs
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, employment_type.full_name, SUM(jobs) as jobs
                                FROM fact.jobs as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.employment_type
                                ON employment_type.employment_type_id = tbl.employment_type_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.employment_type_id, employment_type.full_name
                                --ORDER BY mgra.geozone, yr_id, tbl.employment_type_id
        )

        SELECT sum(jobs) AS total
        from t

                                """,
            # fact.population: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, housing_type.long_name, SUM(population) as population
                                FROM fact.population as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.housing_type
                                ON housing_type.housing_type_id = tbl.housing_type_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.housing_type_id, housing_type.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.housing_type_id
        )

        SELECT sum(population) AS total
        from t

                                """,
            # fact.sex: population
            f"""
                                with t AS

        (
        SELECT geozone, yr_id, sex.sex, SUM(population) as population
                                FROM fact.sex as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.sex
                                ON sex.sex_id = tbl.sex_id
                                WHERE tbl.datasource_id IN (6,13,35) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.sex_id, sex.sex
                                --ORDER BY mgra.geozone, yr_id, tbl.sex_id
        )

        SELECT sum(population) AS total
        from t

                                """]

        # Add the single 'total' value returned by each query.
        temp_total = 0
        for query in queries:
            sql_data = pd.read_sql_query(query, conn)
            # Extract the scalar explicitly; float(Series) is deprecated.
            temp_total = temp_total + float(sql_data['total'].iloc[0])

        print(particular_zone, temp_total)
        # .loc writes into final_total itself; chained indexing
        # (final_total['SQL'][zone] = ...) may only modify a temporary copy.
        final_total.loc[particular_zone, 'SQL'] = temp_total
finally:
    conn.close()

college 630587122.639488
cpa 540837178.2935761
elementary 498271793.4839254
jurisdiction 630587122.639488
msa 630587122.639488
region 630587122.639488
sdcouncil 506999670.7392449
secondary 498271793.4839254
sra 630587122.639488
supervisorial 630587122.639488
tract 630587122.639488
transit 407073446.9657651
unified 539388776.1213276
zip 630587122.639488


In [None]:
# Display the forecast totals table built by the cell above.
final_total