# Purpose 
This notebook performs additional QC checks that are not already covered by the standard QA process.

# Notes
All checks in this notebook are run against this table version: [sr15_staging].[2023_10_20].[mgrabase]

In [1]:
# Change to Markdown
import pandas as pd
import numpy as np
import pyodbc
import warnings
from data_prep_functions import *
warnings.filterwarnings('ignore')
import copy

# Import the Data

In [2]:
# Open the warehouse connection using the Trusted_Connection setting, so no
# credentials are stored in the notebook. `conn` is reused by later queries.
conn = pyodbc.connect(
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=DDAMWSQL16.sandag.org;'
    'Database=demographic_warehouse;'
    'Trusted_Connection=yes;'
)

# Select every column of the staged mgrabase table for the version under review.
mgrabase_query = '''SELECT *
FROM [sr15_staging].[2023_10_20].[mgrabase]  '''

# Read the table, expose `increment` under the name `year`, and discard the
# `taz` and `luz` columns, all in one chain so the cell builds a fresh frame
# every time it runs.
mgrabase_data = (
    pd.read_sql_query(mgrabase_query, conn)
    .rename(columns={'increment': 'year'})
    .drop(columns=['taz', 'luz'])
)
mgrabase_data

Unnamed: 0,scenario,year,mgra,zip,pseudomsa,district27,pop,hhp,gq,gq_mil,...,redev_ag_schools,redev_ag_roads,redev_emp_res,redev_emp_emp,infill_sf,infill_mf,infill_emp,dev,vac,unusable
0,0,2026,11239,92101,2,26,0,0,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,8.521490,1.287800,0.008186
1,0,2026,14247,92071,5,13,3,3,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,2391.349337,0.000000,34.760036
2,0,2026,9772,91911,4,11,627,627,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,9.733043,0.000000,0.000000
3,0,2026,9836,91910,4,11,407,407,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,56.369337,1.850525,0.000000
4,0,2026,6964,92124,3,6,418,418,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,154.565434,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72958,0,2026,14261,92126,3,4,0,0,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,9.799002,11.370750,0.000000,0.000000
72959,0,2022,6524,92127,3,4,365,365,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,15.024832,0.000000,0.000000
72960,0,2026,6359,91913,4,12,803,803,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,63.545508,0.000000,0.000000
72961,0,2026,543,92101,1,15,0,0,0,0,...,0.0,0.0,0.0,0.0000,0.0,0.0,0.000000,0.437755,1.380648,0.000000


# Roll up the data to specific geographies

In [3]:
# Build the MGRA-level table and one roll-up per coarser geography. Every call
# uses dsid='99' and to_jdrive=False.
# NOTE(review): mgra_output / rollup_data are not defined in this notebook
# (presumably they come from the data_prep_functions star import); confirm that
# dsid '99' refers to the same staging table that was loaded above.
# An earlier alternative, deep-copying `mgrabase_data` for the MGRA level, was
# left disabled in favour of mgra_output.
mgrabase_data_mgra = mgra_output(dsid='99', to_jdrive=False)
(
    mgrabase_data_cpa,
    mgrabase_data_luz,
    mgrabase_data_jurisdiction,
    mgrabase_data_region,
) = (
    rollup_data(dsid='99', geo_level=geo, to_jdrive=False)
    for geo in ('cpa', 'luz', 'jurisdiction', 'region')
)

In [8]:
# Gather every geography-level frame in one list, in the order they were built,
# so later steps can iterate over all of them.
rolled_up_files = [
    mgrabase_data_mgra,
    mgrabase_data_cpa,
    mgrabase_data_luz,
    mgrabase_data_jurisdiction,
    mgrabase_data_region,
]

# Check Internal Consistency 

In [10]:
# Make sure every geography-level frame has a total group-quarters column.
# `gq` is derived as gq_civ + gq_mil ONLY when the frame does not already carry
# one: a `gq` value delivered by the source data must not be overwritten here,
# otherwise the consistency checks that follow would be comparing a column
# against its own definition instead of against the source.
# The frames are updated in place so `rolled_up_files` sees the new column too.
for _frame in (
    mgrabase_data_mgra,
    mgrabase_data_cpa,
    mgrabase_data_luz,
    mgrabase_data_jurisdiction,
    mgrabase_data_region,
):
    if 'gq' not in _frame.columns:
        _frame['gq'] = _frame['gq_civ'] + _frame['gq_mil']

In [11]:
# Run the internal-consistency checks on the MGRA-level frame and display what
# they report. The outcome is kept in `result` because a later cell reads its
# 'row value' column to look up the flagged rows.
# NOTE(review): the saved output of this cell is a KeyError for
# 'gq_civ_college' / 'gq_civ_other'; confirm those columns exist in the frame
# before relying on `result`.
result = check_internal_consistency(mgrabase_data_mgra)
result

KeyError: "None of [Index(['gq_civ_college', 'gq_civ_other'], dtype='object')] are in the [columns]"

In [15]:
# Show the employment columns for the rows listed in result['row value'], so
# emp_tot can be compared with emp_civ and emp_mil side by side.
# NOTE(review): this relies on `result` from the consistency-check cell, whose
# saved output is an error; make sure that cell succeeded on this kernel first.
mgrabase_data_mgra.loc[
    list(result['row value']),
    ['emp_tot', 'emp_civ', 'emp_mil'],
]

Unnamed: 0,emp_tot,emp_civ,emp_mil
0,50,23,0
1,24,14,0
2,39,10,0
3,41,12,0
4,2,0,0
...,...,...,...
72957,12,1,0
72958,175,145,0
72959,31,19,0
72960,62,20,0


# Compare Against Regional Controls 
The regional control totals are taken from this table: [sr15_dev].[ccm].[2023_08_31_detailed_forecast]

In [5]:
# Regional control totals: pop, gq and hh summed per year from the detailed
# forecast table.
ccm_query = '''SELECT
	year,
	sum(pop) As 'pop',
	sum(gq) AS 'gq',
	sum(hh) AS 'hh'
FROM [sr15_dev].[ccm].[2023_08_31_detailed_forecast]
Group BY year'''

# Runs on the warehouse connection (`conn`) opened when the mgrabase table was
# loaded. The query has no ORDER BY, so the years come back in no particular
# order.
ccm_data = pd.read_sql_query(sql=ccm_query, con=conn)
ccm_data.head()

Unnamed: 0,year,pop,gq,hh
0,2044,3430894,115651,1328642
1,2053,3379625,113928,1330646
2,2030,3347438,114437,1235423
3,2024,3286406,112830,1184929
4,2033,3384660,116089,1259571


In [6]:
# Compare the jurisdiction-level roll-up against the regional control totals.
# Judging by the saved output, validate_df lists every year/column pair where
# the roll-up ("Found") differs from the control ("Expected").
validate_df(mgrabase_data_jurisdiction, ccm_data)

Found inconsistencies:
Year 2026, Column pop: Expected 3302237, Found 3302247
Year 2026, Column gq: Expected 113421, Found 113431
Year 2029, Column pop: Expected 3334675, Found 3334720
Year 2029, Column gq: Expected 114281, Found 114326
