# CRA Script

In [8]:
import pandas as pd
import numpy as np 
import sqlalchemy as sql
import os
import urllib.parse


## SQL Query Setup

In [9]:
# Every geozone type that is handled, one entry per line
zones = [
    'college',
    'cpa',
    'elementary',
    'jurisdiction',
    'msa',
    'region',
    'sdcouncil',
    'secondary',
    'sra',
    'supervisorial',
    'tract',
    'transit',
    'unified',
    'zip',
]

# All of the relevant fact tables. Not included are fact.land_use, fact.price_index_sd,
# fact.syn_households, and fact.syn_persons.
# Each name is its own list element; a missing comma between two adjacent string
# literals would silently concatenate them into a single bogus table name.
fact_tables = [
    'fact.age',
    'fact.age_sex_ethnicity',  # underscores, matching the table queried in fact_dict
    'fact.ethnicity',
    'fact.household_income',
    'fact.housing',
    'fact.jobs',
    'fact.population',
    'fact.sex',
]

# SQL query templates for each relevant fact table, keyed by the table name with the '.'
# replaced by '_' (e.g. fact.age -> 'fact_age'). Each query joins dim.mgra to get the
# geozone, joins the other dim table(s) to get readable category names instead of ids,
# then sums the measure column(s) grouped by geozone, year and category. The queries
# also filter for the correct datasource_id and geotype.
# Placeholders: {zone} is used both as the alias of the geozone column and as the
# mgra.geotype value to filter on; {ds} is the datasource_id to filter on.
# NOTE(review): the placeholders are presumably filled with str.format by later code
# that is not visible here -- confirm.
fact_dict = {'fact_age': """
                         SELECT geozone as {zone}, yr_id, age_group.name, SUM(population) as population 
                         FROM fact.age as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.age_group
                         ON age_group.age_group_id = tbl.age_group_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
                         ORDER BY mgra.geozone, yr_id, tbl.age_group_id
                         """,
             'fact_age_sex_ethnicity': """
                         SELECT geozone as {zone}, yr_id, age_group.name as age_group, sex.sex as sex, 
						 ethnicity.long_name as ethnicity, SUM(population) as population 
                         FROM fact.age_sex_ethnicity as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.age_group
                         ON age_group.age_group_id = tbl.age_group_id
						 INNER JOIN dim.sex
                         ON sex.sex_id = tbl.sex_id
						 INNER JOIN dim.ethnicity
                         ON ethnicity.ethnicity_id = tbl.ethnicity_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.age_group_id, 
						 tbl.sex_id, tbl.ethnicity_id, 
						 age_group.name, sex.sex, ethnicity.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.age_group_id,
						 tbl.sex_id, tbl.ethnicity_id
                         """,
             'fact_ethnicity': """
                         SELECT geozone as {zone}, yr_id, ethnicity.long_name, SUM(population) as population 
                         FROM fact.ethnicity as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.ethnicity
                         ON ethnicity.ethnicity_id = tbl.ethnicity_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.ethnicity_id, ethnicity.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.ethnicity_id
                         """,
             'fact_household_income': """
                         SELECT geozone as {zone}, yr_id, income_group.name, SUM(households) as households 
                         FROM fact.household_income as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.income_group
                         ON income_group.income_group_id = tbl.income_group_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.income_group_id, income_group.name
                         ORDER BY mgra.geozone, yr_id, tbl.income_group_id
                         """,
            'fact_housing': """
                         SELECT geozone as {zone}, yr_id, structure_type.long_name, SUM(units) as units, 
                         SUM(unoccupiable) as unoccupiable, SUM(occupied) as occupied, SUM(vacancy) as vacancy
                         FROM fact.housing as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.structure_type
                         ON structure_type.structure_type_id = tbl.structure_type_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.structure_type_id, structure_type.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.structure_type_id
                         """,
             'fact_jobs': """
                         SELECT geozone as {zone}, yr_id, employment_type.full_name, SUM(jobs) as jobs
                         FROM fact.jobs as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.employment_type
                         ON employment_type.employment_type_id = tbl.employment_type_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.employment_type_id, employment_type.full_name
                         ORDER BY mgra.geozone, yr_id, tbl.employment_type_id
                         """,
             'fact_population': """
                         SELECT geozone as {zone}, yr_id, housing_type.long_name, SUM(population) as population
                         FROM fact.population as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.housing_type
                         ON housing_type.housing_type_id = tbl.housing_type_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.housing_type_id, housing_type.long_name
                         ORDER BY mgra.geozone, yr_id, tbl.housing_type_id
                         """,
             'fact_sex': """
                         SELECT geozone as {zone}, yr_id, sex.sex, SUM(population) as population
                         FROM fact.sex as tbl
                         INNER JOIN dim.mgra AS mgra
                         ON mgra.mgra_id = tbl.mgra_id
                         INNER JOIN dim.sex
                         ON sex.sex_id = tbl.sex_id
                         WHERE tbl.datasource_id = {ds} AND mgra.geotype = '{zone}'
                         GROUP BY mgra.geozone, yr_id, tbl.sex_id, sex.sex
                         ORDER BY mgra.geozone, yr_id, tbl.sex_id
                         """}

# Per-table settings, keyed the same way as fact_dict:
#   'cols'          -> category column name(s) as returned by that table's query
#   'vals'          -> summed measure column name(s) as returned by that table's query
#   'friendly_name' -> human-readable label for the table
# NOTE(review): presumably consumed by a later pivot step that is not visible here.
fact_pivot = {
    'fact_age': {
        'cols': ['name'],
        'vals': ['population'],
        'friendly_name': 'Age',
    },
    'fact_age_sex_ethnicity': {
        'cols': ['age_group', 'sex', 'ethnicity'],
        'vals': ['population'],
        'friendly_name': 'Age, Sex, Ethnicity',
    },
    'fact_ethnicity': {
        'cols': ['long_name'],
        'vals': ['population'],
        'friendly_name': 'Ethnicity',
    },
    'fact_household_income': {
        'cols': ['name'],
        'vals': ['households'],
        'friendly_name': 'Household Income',
    },
    'fact_housing': {
        'cols': ['long_name'],
        'vals': ['units', 'unoccupiable', 'occupied', 'vacancy'],
        'friendly_name': 'Housing',
    },
    'fact_jobs': {
        'cols': ['full_name'],
        'vals': ['jobs'],
        'friendly_name': 'Jobs',
    },
    'fact_population': {
        'cols': ['long_name'],
        'vals': ['population'],
        'friendly_name': 'Population',
    },
    'fact_sex': {
        'cols': ['sex'],
        'vals': ['population'],
        'friendly_name': 'Sex',
    },
}

## Set up SQL connection

In [None]:
# Database credentials are read from the environment so that no secrets are typed
# into the notebook (and nothing has to be cleared before publishing).
# Set DDAM_USERNAME and DDAM_PASSWORD before running this cell; a variable that is
# not set falls back to an empty string.
creds = {'un': os.environ.get('DDAM_USERNAME', ''),
         'pw': os.environ.get('DDAM_PASSWORD', '')}

In [None]:
# Engine for the demographic_warehouse database on DDAMWSQL16 (pymssql driver).
# The username and password are percent-encoded before being placed in the URL:
# characters such as '@', ':', '/' or '\' would otherwise be read as URL
# delimiters and break the connection string. safe='' makes quote() encode '/'
# as well, and spaces become %20 rather than '+'.
ddam = sql.create_engine(
    'mssql+pymssql://%s:%s@DDAMWSQL16/demographic_warehouse' % (
        urllib.parse.quote(creds['un'], safe=''),
        urllib.parse.quote(creds['pw'], safe='')))

## Census Checks

In [2]:
# Load the Census extract (Census.csv) from disk into a DataFrame.
# NOTE(review): the recorded output of this cell echoes the read_csv line, which
# looks like the tail of a pandas warning; if it is a DtypeWarning, consider
# passing dtype= or low_memory=False -- confirm against a fresh run.
census_data = pd.read_csv(
    filepath_or_buffer=r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-66 Data Surfer Extract QC\Data\Census.csv'
)

  census_data = pd.read_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Service Requests\2022\2022-66 Data Surfer Extract QC\Data\Census.csv')


In [6]:
# Preview the first five rows of the Census extract
census_data.head(n=5)

Unnamed: 0,geom_name,year,variable_main,value_main,geography_type,series_year,geom,pdf_url,category,value_percent,variable_sub,variable_tertiary
0,Grossmont-Cuyamaca,2000,Under 5,31612.0,college,Census 2000,,census/Census2000/college/Grossmont-Cuyamaca/e...,Age,,,
1,Grossmont-Cuyamaca,2000,5 to 9,34860.0,college,Census 2000,,census/Census2000/college/Grossmont-Cuyamaca/e...,Age,,,
2,Grossmont-Cuyamaca,2000,10 to 14,34758.0,college,Census 2000,,census/Census2000/college/Grossmont-Cuyamaca/e...,Age,,,
3,Grossmont-Cuyamaca,2000,15 to 17,19901.0,college,Census 2000,,census/Census2000/college/Grossmont-Cuyamaca/e...,Age,,,
4,Grossmont-Cuyamaca,2000,18 and 19,12066.0,college,Census 2000,,census/Census2000/college/Grossmont-Cuyamaca/e...,Age,,,
