In [56]:
import os
import pandas as pd
import numpy as np
import pyodbc

# Create Builder Functions

In [57]:
# Import Data
def import_mgra_based_data(path):
    """Read the MGRA-based input CSV and return it without 'taz' and 'LUZ'.

    Parameters
    ----------
    path : str
        Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Every column of the file except 'taz' and 'LUZ'.

    Raises
    ------
    KeyError
        If the file has no 'taz' or no 'LUZ' column.
    """
    raw_table = pd.read_csv(path)

    # Geography roll-ups come from the mgra denorm lookup instead, so the
    # file's own 'taz' / 'LUZ' columns are not needed downstream.
    # NOTE(review): the CSV displayed later in this notebook carries an
    # 'Unnamed: 0' column; it is kept here and will be summed with the other
    # columns when the data is aggregated - confirm whether that is intended.
    return raw_table.drop(columns=['taz', 'LUZ'])

In [58]:
# Download sql_data
def download_mgra_denorm_data(geo_level):
    """Run the mgra denorm query and return the mgra -> geo_level lookup.

    The SQL text is read from ``sql_queries/mgra_denorm.sql`` (relative to the
    current working directory) and executed against the ``estimates`` database
    on ``DDAMWSQL16.sandag.org`` using a trusted (Windows) connection.

    Parameters
    ----------
    geo_level : str
        Name of the geography column to keep next to 'mgra'.

    Returns
    -------
    pandas.DataFrame
        Two columns taken from the query result: 'mgra' and ``geo_level``.

    Raises
    ------
    FileNotFoundError
        If the SQL file cannot be found.
    KeyError
        If the query result has no 'mgra' or no ``geo_level`` column.
    """
    # Read the query first so a missing file fails before a connection is opened.
    with open(os.path.join('sql_queries', 'mgra_denorm.sql'), 'r') as sql_file:
        sql_query = sql_file.read()

    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                          'Server=DDAMWSQL16.sandag.org;'
                          'Database=estimates;'
                          'Trusted_Connection=yes;')
    try:
        return pd.read_sql_query(sql_query, conn)[['mgra', geo_level]]
    finally:
        # This function is called once per geography level; close explicitly
        # so connections do not pile up (a pyodbc `with` block only commits,
        # it does not close).
        conn.close()

In [59]:
# Merge and Aggregate Data
def merge_and_aggregate(mgra_input_file, mgra_denorm, geo_level):
    """Attach the input data to the denorm lookup and sum it per geo_level.

    Parameters
    ----------
    mgra_input_file : pandas.DataFrame
        MGRA-level input data.
    mgra_denorm : pandas.DataFrame
        Lookup table; it is the left side of the merge, so every one of its
        rows is kept.
    geo_level : str
        Column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per ``geo_level`` value (as the index) holding column sums.
    """
    # No explicit key is given: pandas joins on every column name the two
    # frames have in common.
    merged = mgra_denorm.merge(mgra_input_file, how='left')

    # The 'mgra' identifier must not be summed unless it is the grouping key.
    if geo_level != 'mgra':
        merged = merged.drop(columns='mgra')

    return merged.groupby(geo_level).sum()

In [60]:
def hhs_adjustment(df):
    """Overwrite (or create) the 'hhs' column as 'hhp' divided by 'hh'.

    The dataframe is modified in place and the same object is returned.
    pandas division does not raise when 'hh' is 0; for numeric columns such
    rows come out as inf (or NaN when 'hhp' is 0 as well).
    """
    numerator = df['hhp']
    denominator = df['hh']
    df['hhs'] = numerator.div(denominator)
    return df

In [61]:
def export_data(output_folder_path, geo_level, version, df):
    """Write ``df`` to ``mgra_based_input_<geo_level>_<version>.xlsx``.

    Parameters
    ----------
    output_folder_path : str
        Folder the workbook is written into.
    geo_level : str
        Geography level name, used in the file name.
    version : str
        Version label, used in the file name.
    df : pandas.DataFrame
        Table to export (anything with a ``to_excel`` method works).
    """
    # os.path.join supplies the separator. The previous hand-written "\m"
    # inside a non-raw f-string is an invalid escape sequence and only works
    # on Windows.
    file_name = f"mgra_based_input_{geo_level}_{version}.xlsx"
    df.to_excel(os.path.join(output_folder_path, file_name))

In [62]:
def create_mgra_denorm_table(mgra_denorm_path, geo_level, output_folder_path, version):
    '''
    Build the MGRA-based input table aggregated to one geography level.

    Steps: read the input CSV, download the mgra -> geo_level lookup, merge
    and sum per geo_level, recompute 'hhs', then optionally export to Excel.

    Parameters
    ----------
    mgra_denorm_path : str
        Path of the MGRA-based input CSV (despite the name, this is the file
        handed to import_mgra_based_data). Pass Windows paths as raw strings
        (r'...') so backslashes are not treated as escapes.
    geo_level : str
        Geography column to aggregate to.
    output_folder_path : str or False
        Folder to write the Excel file into. Pass False (None or '' also
        work) to skip the export.
    version : str
        Version label used in the exported file name.

    Returns
    -------
    pandas.DataFrame
        The aggregated table, indexed by geo_level.
    '''
    mgra_input = import_mgra_based_data(path=mgra_denorm_path)

    geo_lookup = download_mgra_denorm_data(geo_level=geo_level)

    aggregated = merge_and_aggregate(mgra_input_file=mgra_input, mgra_denorm=geo_lookup, geo_level=geo_level)

    adjusted = hhs_adjustment(aggregated)

    # Truthiness instead of `!= False`: None no longer crashes inside
    # export_data, and an empty string no longer writes to the drive root.
    if output_folder_path:
        export_data(output_folder_path=output_folder_path, geo_level=geo_level, version=version, df=adjusted)

    return adjusted

# Create Outputs

In [63]:
# Geography levels to build, in the order the outputs are produced.
geo_levels = [
    'census_tract',
    'cpa',
    'jurisdiction',
    'sra',
    'luz',
    'region',
]

# MGRA-based input CSV that gets aggregated.
mgra_denorm_path = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-028 MGRA15 Input Table 2022\2023-028-02 [Employment]\Data\Version 5\mgra15_based_input_2022_02.csv'

# Folder the Excel outputs are written into (same folder as the input CSV).
output_folder_path = r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-028 MGRA15 Input Table 2022\2023-028-02 [Employment]\Data\Version 5'

# Label appended to every exported file name.
version = 'estimates_v5'

In [64]:
# Build and export the aggregated table for each configured geography level,
# reporting progress after every level.
for geo_level in geo_levels:
    create_mgra_denorm_table(
        mgra_denorm_path=mgra_denorm_path,
        geo_level=geo_level,
        output_folder_path=output_folder_path,
        version=version,
    )
    print(f"{geo_level} is completed.")

census_tract is completed.
cpa is completed.
jurisdiction is completed.
sra is completed.
luz is completed.
region is completed.


# Internal Consistency Check

In [65]:
# Unaggregated input CSV with all of its columns; used as the 'base' table
# in the consistency checks below.
base_mgra_denorm_data = pd.read_csv(mgra_denorm_path)

In [66]:
# Rebuild every aggregated table in memory only (output_folder_path=False
# skips the Excel export). The target names and the level names below are
# listed in the same order, so each table lands in the matching variable.
(
    census_tract_data,
    cpa_data,
    jurisdiction_data,
    sra_data,
    luz_data,
    region_data,
) = (
    create_mgra_denorm_table(
        mgra_denorm_path=mgra_denorm_path,
        geo_level=level,
        output_folder_path=False,
        version=version,
    )
    for level in ('census_tract', 'cpa', 'jurisdiction', 'sra', 'luz', 'region')
)

In [73]:
def internal_consistency(df):
    """Print whether each total column equals the sum of its component columns.

    Five identities are tested, and each must hold on EVERY row to pass:

    1. hh      == i1 + i2 + ... + i10
    2. hh      == hh_sf + hh_mf + hh_mh
    3. pop     == hhp + gq_civ + gq_mil
    4. emp_tot == sum of the sixteen emp_* component columns
    5. hs      == hs_sf + hs_mf + hs_mh

    For check number N the function prints 'True N' or 'False N'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing all of the columns named above.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    checks = [
        ('hh', [f'i{k}' for k in range(1, 11)]),
        ('hh', ['hh_sf', 'hh_mf', 'hh_mh']),
        ('pop', ['hhp', 'gq_civ', 'gq_mil']),
        ('emp_tot', [
            'emp_gov', 'emp_mil', 'emp_ag_min', 'emp_bus_svcs',
            'emp_fin_res_mgm', 'emp_educ', 'emp_hlth', 'emp_ret',
            'emp_trn_wrh_con', 'emp_utl_mnf_whl', 'emp_ent', 'emp_accm',
            'emp_food', 'emp_oth', 'emp_non_ws_wfh', 'emp_non_ws_oth',
        ]),
        ('hs', ['hs_sf', 'hs_mf', 'hs_mh']),
    ]

    for number, (total_col, part_cols) in enumerate(checks, start=1):
        # skipna=False keeps the behaviour of chained `+`: a missing component
        # makes the row sum NaN, so that row fails the comparison.
        parts_sum = df[part_cols].sum(axis=1, skipna=False)
        # .all() tests every row; the earlier `list(...)[0]` looked only at
        # the first row and reported success for the whole table.
        passed = bool((df[total_col] == parts_sum).all())
        print(f'{passed} {number}')

In [74]:
# Run the consistency checks on the unaggregated table and on every
# aggregated geography, printing a labelled section for each.
for test_table_name, test_table in (
    ('base', base_mgra_denorm_data),
    ('census_tract', census_tract_data),
    ('cpa', cpa_data),
    ('jurisdiction', jurisdiction_data),
    ('sra', sra_data),
    ('luz', luz_data),
    ('region', region_data),
):
    print('----------------')
    print(test_table_name)
    internal_consistency(df=test_table)

----------------
base
True 1
True 2
True 3
True 4
True 5
----------------
census_tract
True 1
True 2
True 3
True 4
True 5
----------------
cpa
True 1
True 2
True 3
True 4
True 5
----------------
jurisdiction
True 1
True 2
True 3
True 4
True 5
----------------
sra
True 1
True 2
True 3
True 4
True 5
----------------
luz
True 1
True 2
True 3
True 4
True 5
----------------
region
True 1
True 2
True 3
True 4
True 5


# Null Check

In [71]:
# Re-read the unaggregated input CSV (the same file loaded for the
# consistency check) and display it for a visual look at the columns.
base_mgra_denorm_data = pd.read_csv(mgra_denorm_path)
base_mgra_denorm_data

Unnamed: 0,mgra,taz,LUZ,pop,hhp,hs,hs_sf,hs_mf,hs_mh,hh,...,hotelroomtotal,parkactive,openspaceparkpreserve,beachactive,district27,milestocoast,acre,landacre,effective_acres,truckregiontype
0,1,3010,10,440,440,176,84,92,0,174,...,0,0.0,0.00000,0.0,9,4.35,18.837621,18.837621,18.837621,1
1,2,1797,28,130,68,56,0,56,0,48,...,0,0.0,0.00000,0.0,15,0.64,2.872330,2.872330,2.872330,1
2,3,4361,239,549,549,200,23,177,0,192,...,0,0.0,0.00000,0.0,13,12.22,25.713898,25.713898,25.713898,1
3,4,340,151,5,5,3,3,0,0,2,...,0,0.0,0.00000,0.0,2,0.17,2.678374,2.678374,2.678374,1
4,5,388,151,90,90,43,43,0,0,36,...,0,0.0,0.00000,0.0,2,0.47,4.057765,4.057765,4.057765,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24316,24317,3691,11,5,5,2,2,0,0,2,...,0,0.0,0.00000,0.0,9,7.76,0.648684,0.648684,0.648684,1
24317,24318,3683,212,136,136,46,46,0,0,46,...,0,0.0,0.00000,0.0,3,14.73,50.854228,50.854228,50.514707,1
24318,24319,4943,225,0,0,0,0,0,0,0,...,0,0.0,39188.40744,0.0,14,55.40,39632.715274,39632.715274,1892.855556,1
24319,24320,4940,227,0,0,0,0,0,0,0,...,0,0.0,47858.17530,0.0,14,68.04,47918.538908,47918.538908,-1.847529,1


In [72]:
# Total number of missing cells across the whole table: the first sum counts
# per column, the second adds the columns up. 0 means no nulls anywhere.
base_mgra_denorm_data.isna().sum().sum()

0