# MGRA Series 13 Specific Outputs

In [1]:
import textwrap
import pathlib

import pandas as pd
import sqlalchemy as sql

In [4]:
# SQLAlchemy engine for the DDAMWSQL16 SQL Server instance, using the pymssql
# driver. The URL names no database, which is why the queries run through this
# engine spell out [database].[schema].[table] in full.
DDAM = sql.create_engine('mssql+pymssql://DDAMWSQL16/')

# Where to save outputs and the file format to save as
SAVE_FOLDER = pathlib.Path("./mgra_13_outputs/")
FILE_TEMPLATE = "forecast_{dsid}_{table}.csv"

# Geography columns of dim.mgra_denormalize (aliased "mgra" in the fact
# queries) that are carried into every output file. Every column is qualified
# with the alias: mgra_id also exists on the fact tables, and qualifying the
# rest keeps the SELECT list unambiguous if another shared column name is ever
# added to this list.
GEOGRAPHIES = ["mgra_id", "mgra", "cpa", "jurisdiction", "region"]
FORMATTED_GEOS = ", ".join(f"mgra.{geo}" for geo in GEOGRAPHIES)

# MGRA series used to filter dim.mgra_denormalize, and the datasource_id values
# to export (one CSV per datasource and table).
# NOTE(review): the notebook title and SAVE_FOLDER both say series 13 while
# this is 14 -- confirm which one is intended.
MGRA_SERIES = 14
DATASOURCE_IDS = [41, 42]

# For each output table, the query that lists the category labels of its
# dimension. The first column of each result becomes the PIVOT "IN (...)" list
# of the matching fact query, so the row order here is the column order of the
# exported CSV. Every query is therefore ordered explicitly: without ORDER BY
# the column layout could differ between runs and between datasources.
category_queries = {
    "age": textwrap.dedent("""\
        SELECT DISTINCT name
        FROM [demographic_warehouse].[dim].[age_group]
        ORDER BY name
        """),
    "ethnicity": textwrap.dedent("""\
        SELECT DISTINCT long_name
        FROM [demographic_warehouse].[dim].[ethnicity]
        ORDER BY long_name
        """),
    "household_income": textwrap.dedent("""\
        SELECT DISTINCT name
        FROM [demographic_warehouse].[dim].[income_group]
        WHERE categorization = 10
            AND constant_dollars_year = 2010
        ORDER BY name
        """),
    # Housing is not exported yet (there is no matching fact query).
    # "housing": textwrap.dedent("""\
    #     SELECT DISTINCT long_name
    #     FROM [demographic_warehouse].[dim].[housing_type]
    #     ORDER BY long_name
    #     """),
    "jobs": textwrap.dedent("""\
        SELECT DISTINCT full_name
        FROM [demographic_warehouse].[dim].[employment_type]
        ORDER BY full_name
        """),
    "population": textwrap.dedent("""\
        SELECT DISTINCT long_name
        FROM [demographic_warehouse].[dim].[housing_type]
        ORDER BY long_name
        """),
    "sex": textwrap.dedent("""\
        SELECT DISTINCT sex
        FROM [demographic_warehouse].[dim].[sex]
        ORDER BY sex
        """),
}

def _pivot_query(fact, dim, label, measure):
    """Return the pivot-query template for one fact table.

    fact is the table under [demographic_warehouse].[fact]; dim is the
    dimension table joined on "<dim>_id"; label is the dimension column whose
    values become the pivoted columns; measure is the fact column that is
    summed. The returned text still contains the {geos}, {mgra_series},
    {dsid} and {categories} placeholders, to be filled in with str.format.
    """
    lines = [
        "SELECT * FROM (",
        f"    SELECT {{geos}}, yr_id, {dim}.[{label}], [{measure}] ",
        f"    FROM [demographic_warehouse].[fact].[{fact}] as tbl",
        "    INNER JOIN [demographic_warehouse].[dim].[mgra_denormalize] AS mgra",
        "        ON mgra.mgra_id = tbl.mgra_id",
        "        AND mgra.series = {mgra_series}",
        f"    INNER JOIN [demographic_warehouse].[dim].[{dim}] as {dim}",
        f"        ON {dim}.{dim}_id = tbl.{dim}_id",
        "    WHERE tbl.datasource_id = {dsid}) as p",
        "PIVOT (",
        f"    SUM([{measure}])",
        f"    FOR [{label}] IN (",
        "        {categories}",
        "    )",
        ") as pivot_table",
        "",  # the template ends with a newline
    ]
    return "\n".join(lines)


# One pivot query per output table, keyed by the fact table name. Each query
# restricts the fact table to one datasource, joins the MGRA geography of the
# requested series and the table's dimension, and pivots the summed measure so
# that every category becomes its own column.
# Housing has no fact query yet.
fact_queries = {
    fact_table: _pivot_query(fact_table, dim_table, label_column, measure_column)
    for fact_table, dim_table, label_column, measure_column in (
        ("age", "age_group", "name", "population"),
        ("ethnicity", "ethnicity", "long_name", "population"),
        ("household_income", "income_group", "name", "households"),
        ("jobs", "employment_type", "full_name", "jobs"),
        ("population", "housing_type", "long_name", "population"),
        ("sex", "sex", "sex", "population"),
    )
}

In [5]:
def _quote_identifier(name):
    """Wrap name in T-SQL brackets, doubling any closing bracket inside it."""
    return "[" + str(name).replace("]", "]]") + "]"


# The category list of a table does not depend on the datasource, so it is
# fetched at most once per table and reused for every datasource.
category_cache = {}

# DataFrame.to_csv does not create missing directories.
SAVE_FOLDER.mkdir(parents=True, exist_ok=True)

# Export one CSV per (datasource, table); files that already exist are kept.
for datasource_id in DATASOURCE_IDS:
    for table_name, query in fact_queries.items():
        print(f"Getting datasource_id={datasource_id}, table={table_name}")

        # Skip the file if it exists already
        file_name = SAVE_FOLDER / FILE_TEMPLATE.format(dsid=datasource_id, table=table_name)
        if file_name.is_file():
            print("File already exists, skipping...")
            print()
            continue

        # The file does not exist, so download and save it
        print("Getting table from DDAMWSQL16")

        # Get the list of categorical variables: the first column of the
        # category query, bracket-quoted for use in the PIVOT "IN (...)" list
        if table_name not in category_cache:
            categories = pd.read_sql_query(category_queries[table_name], con=DDAM).iloc[:, 0]
            if categories.empty:
                # An empty IN () list would only surface as a SQL syntax error
                raise ValueError(f"No categories returned for table {table_name!r}")
            category_cache[table_name] = ", ".join(_quote_identifier(c) for c in categories)

        # Format the query
        formatted_query = query.format(
            geos=FORMATTED_GEOS,
            mgra_series=MGRA_SERIES,
            dsid=datasource_id,
            categories=category_cache[table_name],
        )

        # Actually get and save the file
        table = pd.read_sql_query(formatted_query, con=DDAM)
        table.to_csv(file_name, index=False)

        print("Completed")
        print()

Getting datasource_id=41, table=age
File already exists, skipping...

Getting datasource_id=41, table=ethnicity
File already exists, skipping...

Getting datasource_id=41, table=household_income
File already exists, skipping...

Getting datasource_id=41, table=jobs
File already exists, skipping...

Getting datasource_id=41, table=population
Getting table from DDAMWSQL16
Completed

Getting datasource_id=41, table=sex
Getting table from DDAMWSQL16
Completed

Getting datasource_id=42, table=age
Getting table from DDAMWSQL16
Completed

Getting datasource_id=42, table=ethnicity
Getting table from DDAMWSQL16
Completed

Getting datasource_id=42, table=household_income
Getting table from DDAMWSQL16
Completed

Getting datasource_id=42, table=jobs
Getting table from DDAMWSQL16
Completed

Getting datasource_id=42, table=population
Getting table from DDAMWSQL16
Completed

Getting datasource_id=42, table=sex
Getting table from DDAMWSQL16
Completed



## Emanual's Old Code Below

In [None]:
# pyodbc is needed for the connection below but is not imported in the
# notebook's import cell.
import pyodbc

# dim.mgra geotype values to check.
zones = ['college', 'cpa', 'elementary', 'jurisdiction',
         'msa', 'region', 'sdcouncil', 'secondary', 'sra',
         'supervisorial', 'tract', 'transit', 'unified', 'zip']

# One row per geotype. Only the 'SQL' column is filled in by this cell.
final_total = pd.DataFrame(columns=['SQL', 'Extract'], index=zones)


def _zone_total_queries(zone):
    """Return the eight grand-total queries for one dim.mgra geotype.

    Each query aggregates one fact table (age, age_sex_ethnicity, ethnicity,
    household_income, housing, jobs, population, sex) for datasource_id 12
    and 5, restricted to mgra.geotype = zone, and returns a single row with
    a single column named "total".
    """
    return [f"""with t AS

        (
        SELECT geozone, yr_id, age_group.name, SUM(population) as population 
                                FROM fact.age as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.age_group
                                ON age_group.age_group_id = tbl.age_group_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.age_group_id, age_group.name
                                --ORDER BY mgra.geozone, yr_id, tbl.age_group_id
        )

        SELECT sum(population) AS total
        from t
                                """, f"""
                                with t AS

        (
        SELECT geozone, yr_id, age_group.name as age_group, sex.sex as sex, 
                                ethnicity.long_name as ethnicity, SUM(population) as population 
                                FROM fact.age_sex_ethnicity as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.age_group
                                ON age_group.age_group_id = tbl.age_group_id
                                INNER JOIN dim.sex
                                ON sex.sex_id = tbl.sex_id
                                INNER JOIN dim.ethnicity
                                ON ethnicity.ethnicity_id = tbl.ethnicity_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.age_group_id, 
                                tbl.sex_id, tbl.ethnicity_id, 
                                age_group.name, sex.sex, ethnicity.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.age_group_id,
                                --tbl.sex_id, tbl.ethnicity_id
        )

        SELECT sum(population) AS total
        from t

                                """, f"""
                                with t AS

        (
        SELECT geozone, yr_id, ethnicity.long_name, SUM(population) as population 
                                FROM fact.ethnicity as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.ethnicity
                                ON ethnicity.ethnicity_id = tbl.ethnicity_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.ethnicity_id, ethnicity.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.ethnicity_id
        )

        SELECT sum(population) AS total
        from t
                                """, f"""
                                with t AS

        (
        SELECT geozone, yr_id, income_group.name, SUM(households) as households 
                                FROM fact.household_income as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.income_group
                                ON income_group.income_group_id = tbl.income_group_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.income_group_id, income_group.name
                                --ORDER BY mgra.geozone, yr_id, tbl.income_group_id
        )

        SELECT sum(households) AS total
        from t

                                """, f"""
                                with t AS

        (
        SELECT geozone, yr_id, structure_type.long_name, SUM(units) as units, 
                                SUM(unoccupiable) as unoccupiable, SUM(occupied) as occupied, SUM(vacancy) as vacancy
                                FROM fact.housing as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.structure_type
                                ON structure_type.structure_type_id = tbl.structure_type_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.structure_type_id, structure_type.long_name
                                -- ORDER BY mgra.geozone, yr_id, tbl.structure_type_id
        )

        SELECT sum(units) + sum(unoccupiable) + sum(occupied) + sum(vacancy) AS total
        from t
                                """, f"""
                                with t AS

        (
        SELECT geozone, yr_id, employment_type.full_name, SUM(jobs) as jobs
                                FROM fact.jobs as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.employment_type
                                ON employment_type.employment_type_id = tbl.employment_type_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.employment_type_id, employment_type.full_name
                                --ORDER BY mgra.geozone, yr_id, tbl.employment_type_id
        )

        SELECT sum(jobs) AS total
        from t

                                """, f"""
                                with t AS

        (
        SELECT geozone, yr_id, housing_type.long_name, SUM(population) as population
                                FROM fact.population as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.housing_type
                                ON housing_type.housing_type_id = tbl.housing_type_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.housing_type_id, housing_type.long_name
                                --ORDER BY mgra.geozone, yr_id, tbl.housing_type_id
        )

        SELECT sum(population) AS total
        from t

                                """, f"""
                                with t AS

        (
        SELECT geozone, yr_id, sex.sex, SUM(population) as population
                                FROM fact.sex as tbl
                                INNER JOIN dim.mgra AS mgra
                                ON mgra.mgra_id = tbl.mgra_id
                                INNER JOIN dim.sex
                                ON sex.sex_id = tbl.sex_id
                                WHERE tbl.datasource_id IN (12,5) AND mgra.geotype = '{zone}'
                                GROUP BY mgra.geozone, yr_id, tbl.sex_id, sex.sex
                                --ORDER BY mgra.geozone, yr_id, tbl.sex_id
        )

        SELECT sum(population) AS total
        from t

                                """]


# A single connection serves every query and is always closed afterwards.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=DDAMWSQL16.sandag.org;'
                      'Database=demographic_warehouse;'
                      'Trusted_Connection=yes;')
try:
    for particular_zone in zones:
        # Sum of the eight per-table totals for this geotype
        temp_total = 0
        for total_query in _zone_total_queries(particular_zone):
            sql_data = pd.read_sql_query(total_query, conn)
            # Each query returns exactly one row; read it explicitly, since
            # calling float() on a whole Series is deprecated in pandas.
            temp_total = temp_total + float(sql_data['total'].iloc[0])

        print(particular_zone, temp_total)
        # .loc writes into final_total itself; chained indexing
        # (final_total['SQL'][...]) may assign to a temporary copy instead.
        final_total.loc[particular_zone, 'SQL'] = temp_total
finally:
    conn.close()