# SANDAG Populationsim QC

In [1]:
import pandas as pd
import numpy as np
import pyodbc
import glob
import copy

# Data Preparation

### Download Populationsim data

In [2]:
# Locations of the PopulationSim household and person outputs in the QC project folder.
households_csv = 'C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-72 SANDPOPSIM Output QC/Population Sim Outputs/synthetic_households.csv'
persons_csv = 'C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-72 SANDPOPSIM Output QC/Population Sim Outputs/synthetic_persons.csv'

# One table of synthetic households and one table of synthetic persons.
synthetic_households = pd.read_csv(households_csv)
synthetic_persons = pd.read_csv(persons_csv)

### Download Crosswalk Data

In [3]:
# Pull the MGRA -> CPA / Jurisdiction / LUZ crosswalk from the GeoDepot database.
# The connection uses the ODBC Driver 17 for SQL Server with a trusted connection.
connection_string = (
    'Driver={ODBC Driver 17 for SQL Server};'
    'Server=SQL2014B8.sandag.org;'
    'Database=GeoDepot;'
    'Trusted_Connection=yes;'
)
conn = pyodbc.connect(connection_string)

# MGRA15 is the base table; the LEFT JOINs keep every MGRA15 row even when it has
# no matching record in CITIES or CITYCPA (the name is then NULL).
query = '''SELECT
      [MGRA]
	  ,cpa_data.NAME AS CPA
	  ,jur_data.Name AS Jurisdiction
      ,[LUZ]
  FROM [GeoDepot].[gis].[MGRA15] as mgra_base
  LEFT JOIN [GeoDepot].[gis].[CITIES] AS jur_data
  ON mgra_base.City = jur_data.City
  LEFT JOIN [GeoDepot].[gis].[CITYCPA] AS cpa_data
  ON mgra_base.CPA = cpa_data.CPA'''

# Columns returned: MGRA, CPA, Jurisdiction, LUZ.
crosswalk = pd.read_sql_query(sql=query, con=conn)

In [4]:
# Appending to synthetic data
# A left merge repeats a household/person row once for every crosswalk row that
# shares its MGRA, so the crosswalk must hold exactly one row per MGRA; otherwise
# every count taken after this cell is inflated.
# NOTE(review): the saved MGRA-level population comparison shows MGRA 4929 with
# popsim_pop exactly twice forecast_pop, which is consistent with such a fan-out -
# confirm against the crosswalk query.
duplicated_mgras = crosswalk.loc[crosswalk['MGRA'].duplicated(keep=False), 'MGRA'].unique()
if len(duplicated_mgras) > 0:
    print(f"Warning: {len(duplicated_mgras)} MGRA(s) have more than one crosswalk row; "
          f"keeping the first row for each: {sorted(duplicated_mgras.tolist())[:20]}")

# `crosswalk` itself is left untouched; only the de-duplicated copy is merged.
crosswalk_unique = crosswalk.drop_duplicates(subset='MGRA', keep='first')

# validate='many_to_one' makes pandas raise if the right-hand keys are ever not unique,
# so the row counts of the synthetic tables cannot change in this step.
synthetic_households = synthetic_households.merge(
    crosswalk_unique, left_on='mgra', right_on='MGRA', how='left', validate='many_to_one'
)
synthetic_persons = synthetic_persons.merge(
    crosswalk_unique, left_on='mgra', right_on='MGRA', how='left', validate='many_to_one'
)

In [19]:
# Tag every person and household row with the single region label so the region
# level can be summarised the same way as the other geographies.
for synthetic_table in (synthetic_persons, synthetic_households):
    synthetic_table['Region'] = 'San Diego'


Index(['household_id', 'tract', 'mgra', 'NP', 'HHADJINC', 'ADJINC', 'HHT',
       'WIF', 'HUPAC', 'VEH', 'numWorkers', 'GQ_type', 'MGRA', 'CPA',
       'Jurisdiction', 'LUZ', 'Region'],
      dtype='object')

# Calculating the range
Requested by Purva: the maximum, minimum, and range of household income (`HHADJINC`) within each MGRA.

In [25]:
# Distinct MGRA values present in the household table (set order, i.e. arbitrary).
unique_mgras = set(synthetic_households['MGRA'])
list_of_mgras = [*unique_mgras]

In [27]:
# Keep only the MGRA key and the household income column used for the range calculation.
hh_inc_filter = synthetic_households.loc[:, ['MGRA', 'HHADJINC']]

In [28]:
# Max, min and range (max - min) of HHADJINC for every MGRA.
# One groupby pass replaces filtering the whole household table twice per MGRA,
# which scaled with (number of rows) x (number of MGRAs).
# groupby also skips missing MGRA keys and missing income values; the builtin
# max()/min() used previously are not reliable when NaN is present.
income_bounds = hh_inc_filter.groupby('MGRA')['HHADJINC'].agg(['max', 'min'])
income_bounds['range'] = income_bounds['max'] - income_bounds['min']

# Same structure as before: {mgra: [max_value, min_value, range_value]}
final_df = dict(zip(income_bounds.index.tolist(), income_bounds.to_numpy().tolist()))

In [34]:
# Turn {mgra: [max, min, range]} into a table with one row per MGRA.
income_range_by_mgra = pd.DataFrame(final_df).transpose()
income_range_by_mgra.columns = ['Max', 'Min', 'Range']
final_output = income_range_by_mgra.rename_axis('mgra')
final_output

Unnamed: 0_level_0,Max,Min,Range
mgra,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,609400.0,30.0,609370.0
2,236000.0,0.0,236000.0
3,731200.0,590.0,730610.0
4,118900.0,21600.0,97300.0
5,360000.0,11600.0,348400.0
...,...,...,...
24313,160900.0,39000.0,121900.0
24314,937000.0,11600.0,925400.0
24316,926000.0,0.0,926000.0
24317,91600.0,36000.0,55600.0


### Geography Groupings

In [6]:
# Synthetic Households (SH)
# Number of synthetic household rows per geography. Each result is a one-column
# frame ('Total') indexed by the geography, with the index named after it.
# The count Series is renamed explicitly instead of renaming a column by the
# source column's name: from pandas 2.0 value_counts() names its result 'count',
# so the old rename silently did nothing and 'Total' was never created.
# value_counts() leaves out rows whose geography value is missing.
(sh_mgra, sh_tract, sh_CPA, sh_jurisdiction, sh_LUZ, sh_Region) = (
    synthetic_households[geography]
    .value_counts()
    .rename('Total')
    .rename_axis(geography)
    .to_frame()
    for geography in ('mgra', 'tract', 'CPA', 'Jurisdiction', 'LUZ', 'Region')
)

In [7]:
# Synthetic Persons (SP)
# Number of synthetic person rows per geography. Each result is a one-column
# frame ('Total') indexed by the geography, with the index named after it.
# The count Series is renamed explicitly instead of renaming a column by the
# source column's name: from pandas 2.0 value_counts() names its result 'count',
# so the old rename silently did nothing and 'Total' was never created.
# value_counts() leaves out rows whose geography value is missing.
(sp_mgra, sp_tract, sp_CPA, sp_jurisdiction, sp_LUZ, sp_Region) = (
    synthetic_persons[geography]
    .value_counts()
    .rename('Total')
    .rename_axis(geography)
    .to_frame()
    for geography in ('mgra', 'tract', 'CPA', 'Jurisdiction', 'LUZ', 'Region')
)

# Internal Consistency Checks
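Reminder: Tract and CPA totals are not expected to match the other geographies, because `value_counts()` leaves out rows whose tract or CPA is missing.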

In [8]:
# Synthetic Persons
print(f"MGRA total - {sp_mgra['Total'].sum()}")
print(f"Tract total - {sp_tract['Total'].sum()}")
print(f"CPA total - {sp_CPA['Total'].sum()}")
print(f"Jurisdiction Total - {sp_jurisdiction['Total'].sum()}")
print(f"LUZ total - {sp_LUZ['Total'].sum()}")
print(f"Region total - {sp_Region['Total'].sum()}")

MGRA total - 3361280
Tract total - 3247180
CPA total - 1440291
Jurisdiction Total - 3361280
LUZ total - 3361280
Region total - 3361280


In [9]:
# Synthetic Households
print(f"MGRA total - {sh_mgra['Total'].sum()}")
print(f"Tract total - {sh_tract['Total'].sum()}")
print(f"CPA total - {sh_CPA['Total'].sum()}")
print(f"Jurisdiction Total - {sh_jurisdiction['Total'].sum()}")
print(f"LUZ total - {sh_LUZ['Total'].sum()}")
print(f"Region total - {sh_Region['Total'].sum()}")

MGRA total - 1267147
Tract total - 1153047
CPA total - 579807
Jurisdiction Total - 1267147
LUZ total - 1267147
Region total - 1267147


# Comparison with Forecast Data
- Checks are made against the 2019 forecast values (these were run on MGRA series 15).
- Comparing population difference between the two. 

In [10]:
# 2019 forecast QA extracts, one CSV per geography level (MGRA, jurisdiction, region).
forecast_mgra_csv = 'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/mgra_2019_CSV_Data_ind_QA.csv'
forecast_jurisdiction_csv = 'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/jurisdiction_2019_CSV_Data_ind_QA.csv'
forecast_region_csv = 'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/region_2019_CSV_Data_ind_QA.csv'

forecast_data_mgra = pd.read_csv(forecast_mgra_csv)
forecast_data_jurisdiction = pd.read_csv(forecast_jurisdiction_csv)
forecast_data_region = pd.read_csv(forecast_region_csv)

## Population Comparison

### MGRA Level Check

In [11]:
# Forecast population per MGRA, matched to the synthetic person counts.
mgra_population = forecast_data_mgra[['mgra', 'pop']]
mgra_persons_comparison = sp_mgra.reset_index().merge(
    mgra_population, on='mgra', how='left'
)
mgra_persons_comparison.columns = ['mgra', 'popsim_pop', 'forecast_pop']

# Diff = synthetic minus forecast; largest over-counts are listed first.
mgra_persons_comparison = (
    mgra_persons_comparison
    .assign(Diff=lambda t: t['popsim_pop'] - t['forecast_pop'])
    .sort_values('Diff', ascending=False)
    .reset_index(drop=True)
)
mgra_persons_comparison

Unnamed: 0,mgra,popsim_pop,forecast_pop,Diff
0,4929,8528,4264,4264
1,7507,4228,3623,605
2,19747,3194,2742,452
3,5878,11396,11018,378
4,2428,3395,3028,367
...,...,...,...,...
18897,19499,1465,1610,-145
18898,2768,673,821,-148
18899,6963,1952,2105,-153
18900,13437,1852,2013,-161


### Jurisdiction Level Check

In [12]:
# Forecast population per jurisdiction, matched to the synthetic person counts.
jurisdiction_population = forecast_data_jurisdiction[['Jurisdiction', 'pop']]
jurisdiction_persons_comparison = sp_jurisdiction.reset_index().merge(
    jurisdiction_population, on='Jurisdiction', how='left'
)
jurisdiction_persons_comparison.columns = ['Jurisdiction', 'popsim_pop', 'forecast_pop']

# Diff = synthetic minus forecast.
# NOTE(review): pct_diff is (forecast - synthetic) / forecast, so its sign is the
# opposite of Diff - confirm this is intended.
jurisdiction_persons_comparison = (
    jurisdiction_persons_comparison
    .assign(
        Diff=lambda t: t['popsim_pop'] - t['forecast_pop'],
        pct_diff=lambda t: ((t['forecast_pop'] - t['popsim_pop']) / t['forecast_pop'] * 100).round(3),
    )
    .sort_values('pct_diff', ascending=False)
    .reset_index(drop=True)
)
jurisdiction_persons_comparison

Unnamed: 0,Jurisdiction,popsim_pop,forecast_pop,Diff,pct_diff
0,Coronado,22538,23813,-1275,5.354
1,National City,60298,62256,-1958,3.145
2,Encinitas,61329,62098,-769,1.238
3,Oceanside,176974,177245,-271,0.153
4,Escondido,152197,152359,-162,0.106
5,Carlsbad,113561,113623,-62,0.055
6,Vista,102443,102041,402,-0.394
7,Unincorporated,509309,507278,2031,-0.4
8,San Diego,1440291,1432100,8191,-0.572
9,San Marcos,97264,96610,654,-0.677


### Region Level Check

In [13]:
# Forecast population for the region, matched to the synthetic person count.
region_population = forecast_data_region[['Region', 'pop']]
region_persons_comparison = sp_Region.reset_index().merge(
    region_population, on='Region', how='left'
)
# The key column holds the region name, so it is labelled 'Region'
# (it was previously labelled 'Jurisdiction').
region_persons_comparison.columns = ['Region', 'popsim_pop', 'forecast_pop']

# Diff = synthetic minus forecast.
# NOTE(review): pct_diff is (forecast - synthetic) / forecast, so its sign is the
# opposite of Diff - confirm this is intended.
region_persons_comparison = (
    region_persons_comparison
    .assign(
        Diff=lambda t: t['popsim_pop'] - t['forecast_pop'],
        pct_diff=lambda t: ((t['forecast_pop'] - t['popsim_pop']) / t['forecast_pop'] * 100).round(3),
    )
    .sort_values('pct_diff', ascending=False)
    .reset_index(drop=True)
)
region_persons_comparison

Unnamed: 0,Jurisdiction,popsim_pop,forecast_pop,Diff,pct_diff
0,San Diego,3361280,3343626,17654,-0.528


## Household Number Comparison

### MGRA Level Check

In [14]:
# Forecast household count per MGRA, matched to the synthetic household counts.
mgra_households = forecast_data_mgra[['mgra', 'hh']]
mgra_households_comparison = sh_mgra.reset_index().merge(
    mgra_households, on='mgra', how='left'
)
mgra_households_comparison.columns = ['mgra', 'popsim_hh', 'forecast_hh']

# Diff = synthetic minus forecast; largest over-counts are listed first.
mgra_households_comparison = (
    mgra_households_comparison
    .assign(Diff=lambda t: t['popsim_hh'] - t['forecast_hh'])
    .sort_values('Diff', ascending=False)
    .reset_index(drop=True)
)
mgra_households_comparison

Unnamed: 0,mgra,popsim_hh,forecast_hh,Diff
0,8979,9763,0,9763
1,4929,8528,0,8528
2,5972,5197,0,5197
3,9161,5011,0,5011
4,12713,4776,0,4776
...,...,...,...,...
18897,17539,200,204,-4
18898,7201,38,42,-4
18899,19473,77,81,-4
18900,19531,93,98,-5


### Jurisdiction Level Check

In [15]:
# Forecast household count per jurisdiction, matched to the synthetic household counts.
# (The variable keeps its original name even though it holds the 'hh' column here.)
jurisdiction_population = forecast_data_jurisdiction[['Jurisdiction', 'hh']]
jurisdiction_households_comparison = sh_jurisdiction.reset_index().merge(
    jurisdiction_population, on='Jurisdiction', how='left'
)
jurisdiction_households_comparison.columns = ['Jurisdiction', 'popsim_hh', 'forecast_hh']

# Diff = synthetic minus forecast.
# NOTE(review): pct_diff is (forecast - synthetic) / forecast, so its sign is the
# opposite of Diff - confirm this is intended.
jurisdiction_households_comparison = (
    jurisdiction_households_comparison
    .assign(
        Diff=lambda t: t['popsim_hh'] - t['forecast_hh'],
        pct_diff=lambda t: ((t['forecast_hh'] - t['popsim_hh']) / t['forecast_hh'] * 100).round(3),
    )
    .sort_values('pct_diff', ascending=False)
    .reset_index(drop=True)
)
jurisdiction_households_comparison

Unnamed: 0,Jurisdiction,popsim_hh,forecast_hh,Diff,pct_diff
0,Del Mar,2083,2083,0,0.0
1,Solana Beach,5923,5923,0,0.0
2,Oceanside,62714,61779,935,-1.513
3,Chula Vista,84200,82551,1649,-1.998
4,Carlsbad,44748,43833,915,-2.087
5,Encinitas,25208,24680,528,-2.139
6,La Mesa,25797,25138,659,-2.622
7,Poway,16828,16276,552,-3.391
8,Lemon Grove,8910,8564,346,-4.04
9,Imperial Beach,10021,9577,444,-4.636


### Region Level Check 

In [16]:
# Forecast household count for the region, matched to the synthetic household count.
# (The variable keeps its original name even though it holds the 'hh' column here.)
region_population = forecast_data_region[['Region', 'hh']]
region_households_comparison = sh_Region.reset_index().merge(
    region_population, on='Region', how='left'
)
# The key column holds the region name, so it is labelled 'Region'
# (it was previously labelled 'Jurisdiction').
region_households_comparison.columns = ['Region', 'popsim_hh', 'forecast_hh']

# Diff = synthetic minus forecast.
# NOTE(review): pct_diff is (forecast - synthetic) / forecast, so its sign is the
# opposite of Diff - confirm this is intended.
region_households_comparison = (
    region_households_comparison
    .assign(
        Diff=lambda t: t['popsim_hh'] - t['forecast_hh'],
        pct_diff=lambda t: ((t['forecast_hh'] - t['popsim_hh']) / t['forecast_hh'] * 100).round(3),
    )
    .sort_values('pct_diff', ascending=False)
    .reset_index(drop=True)
)
region_households_comparison

Unnamed: 0,Jurisdiction,popsim_hh,forecast_hh,Diff,pct_diff
0,San Diego,1267147,1153032,114115,-9.897


# Comparison with Estimate Data

In [17]:
# Load the MGRA-, jurisdiction- and region-level tables used for the estimates comparison.
# NOTE(review): these three paths are identical to the forecast CSVs read into
# forecast_data_mgra / forecast_data_jurisdiction / forecast_data_region earlier in
# this notebook, so the "estimates" frames currently hold the forecast data -
# confirm the intended estimates source.
estimates_data_mgra = pd.read_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/mgra_2019_CSV_Data_ind_QA.csv')
estimates_data_jurisdiction = pd.read_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/jurisdiction_2019_CSV_Data_ind_QA.csv')
estimates_data_region = pd.read_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/region_2019_CSV_Data_ind_QA.csv')

# Comparison with LEHD Data