# SANDAG Populationsim QC

In [1]:
import pandas as pd
import numpy as np
import pyodbc
import glob
import copy
import warnings
warnings.filterwarnings('ignore')

# Data Preparation

### Download Populationsim data

In [2]:
synthetic_households = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-72 SANDPOPSIM Output QC/Population Sim Outputs/synthetic_households.csv')
synthetic_persons = pd.read_csv('C:/Users/cra/San Diego Association of Governments/SANDAG QA QC - Documents/Projects/2022/2022-72 SANDPOPSIM Output QC/Population Sim Outputs/synthetic_persons.csv')

In [6]:
synthetic_persons[synthetic_persons['COW'].isin([6,7])]

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker
4,18100,245,2,2,24,1,1.0,6.0,20.0,15.0,1,1,2.0,1
9,18100,245,4,1,41,2,1.0,6.0,40.0,,1,3,4.0,1
37,18100,836,16,1,39,1,1.0,6.0,55.0,,1,1,4.0,1
55,18100,836,24,1,62,2,1.0,6.0,60.0,,1,1,4.0,1
62,18100,836,26,1,67,2,6.0,6.0,3.0,,1,1,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3356851,17601,24280,1262719,1,48,1,6.0,6.0,40.0,,1,1,4.0,0
3356887,17601,24280,1262755,1,48,1,6.0,6.0,40.0,,1,1,4.0,0
3356897,17601,24280,1262765,1,46,1,6.0,6.0,,,1,1,4.0,0
3356910,17601,24280,1262778,1,38,1,6.0,6.0,20.0,,1,2,4.0,0


In [3]:
synthetic_persons.shape

(3356969, 14)

In [4]:
synthetic_persons[synthetic_persons['mgra'] == 608]

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker
2956132,11801,608,1063965,1,53,2,1.0,1.0,40.0,,9,2,4.0,1
2956133,11801,608,1063965,2,55,1,1.0,3.0,43.0,,1,2,4.0,1
2956134,11801,608,1063965,3,26,1,1.0,1.0,5.0,15.0,9,2,4.0,1
2956135,11801,608,1063966,1,62,1,1.0,5.0,40.0,,1,2,3.0,1
2956136,11801,608,1063966,2,57,2,6.0,,,,1,2,4.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3247277,11801,608,1153145,1,21,1,4.0,5.0,50.0,,1,1,1.0,1
3247278,11801,608,1153146,1,31,1,4.0,5.0,60.0,15.0,6,1,1.0,1
3247279,11801,608,1153147,1,24,1,4.0,5.0,43.0,,2,1,1.0,1
3247280,11801,608,1153148,1,34,1,4.0,5.0,60.0,,1,1,1.0,1


In [5]:
df = synthetic_persons[synthetic_persons['tract'].isna()]
df[df['mgra'] == 608]

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker


In [6]:
synthetic_persons.isna().sum()

tract                 0
mgra                  0
household_id          0
SPORDER               0
AGEP                  0
SEX                   0
ESR              699147
COW             1432081
WKHP            1669201
SCHG            2439621
RAC1P                 0
HISP                  0
MIL              744612
isWorker              0
dtype: int64

In [7]:
synthetic_households.head()

Unnamed: 0,household_id,tract,mgra,NP,HHADJINC,ADJINC,HHT,WIF,HUPAC,VEH,numWorkers,GQ_type
0,1,18100,245,3,121500.0,1010145,3.0,2.0,4.0,3.0,1.0,
1,2,18100,245,3,75400.0,1010145,1.0,2.0,1.0,2.0,2.0,
2,3,18100,245,3,45800.0,1010145,3.0,1.0,4.0,1.0,1.0,
3,4,18100,245,3,70000.0,1010145,1.0,2.0,1.0,2.0,2.0,
4,5,18100,245,1,35700.0,1010145,6.0,,4.0,1.0,1.0,


In [8]:
synthetic_persons[synthetic_persons['household_id'] == 2]

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker
3,18100,245,2,1,25,2,4.0,5.0,50.0,15.0,6,2,1.0,1
4,18100,245,2,2,24,1,1.0,6.0,20.0,15.0,1,1,2.0,1
5,18100,245,2,3,4,2,,,,1.0,6,2,,0


In [9]:
df = synthetic_households[(synthetic_households['HHT'] == 4) | (synthetic_households['HHT'] == 6)]
df[df['NP'] > 1]

Unnamed: 0,household_id,tract,mgra,NP,HHADJINC,ADJINC,HHT,WIF,HUPAC,VEH,numWorkers,GQ_type


In [10]:
synthetic_persons[synthetic_persons['isWorker']==1].shape[0]

1556474

# ACS Population Comparison

In [11]:
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = '''SELECT summary_level, geo_name, tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B01003' AND yr = 2019 AND county = 073 AND summary_level=140'''

acs_data = pd.read_sql_query(query,conn)

In [12]:
# Group the ACS Data by tract 
acs_grouped_data = acs_data[['tract', 'estimate']].groupby('tract').sum().reset_index()
acs_grouped_data['tract'] = acs_grouped_data['tract'].astype(int)
#acs_grouped_data.head(3)

In [13]:
# Group the synthetic households by tract  
sh_grouped_bytract = synthetic_households[['tract', 'NP']].groupby('tract').sum().reset_index()
#sh_grouped_bytract.head(3)

In [14]:
# Find the difference in Population
acs_sh_tract_comp = sh_grouped_bytract.merge(acs_grouped_data, how="left", on="tract")
acs_sh_tract_comp.columns = ['tract', 'Popsim Pop', 'ACS Pop']
acs_sh_tract_comp["Diff"] = acs_sh_tract_comp['Popsim Pop'] - acs_sh_tract_comp['ACS Pop']
acs_sh_tract_comp = acs_sh_tract_comp.sort_values('Diff')

#acs_sh_tract_comp.to_excel('C:/Users/cra/Desktop/summary_level_140_acs_popsim_diff.xlsx')

In [15]:
print(f"ACS data has a total population of: {sum(acs_data['estimate'])}")
print(f"Popsim data has a total population of: {sum(sh_grouped_bytract['NP'])}")
print(f"The difference is: {sum(acs_data['estimate']) - sum(sh_grouped_bytract['NP'])} which is a {round((sum(acs_data['estimate']) - sum(sh_grouped_bytract['NP']))/sum(sh_grouped_bytract['NP']) * 100, 2)} percent difference")

ACS data has a total population of: 3316073.0
Popsim data has a total population of: 3356969
The difference is: -40896.0 which is a -1.22 percent difference


In [16]:
acs_sh_tract_comp

Unnamed: 0,tract,Popsim Pop,ACS Pop,Diff
306,13310,28105,31118.0,-3013.0
45,2902,5312,7598.0,-2286.0
501,18611,8039,10106.0,-2067.0
554,20018,6692,8741.0,-2049.0
426,17032,13322,15361.0,-2039.0
...,...,...,...,...
101,5300,10604,5847.0,4757.0
565,20029,10900,5808.0,5092.0
620,21500,16410,11078.0,5332.0
99,5100,18448,7702.0,10746.0


# Comparison with ACS 5-yr

## Question 16
Median household income between household output and ACS dataset (non-GQ households) 

In [5]:
# Table: B19013
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = """SELECT tract, estimate
FROM census.acs.vw_summary_file 
WHERE subject_table = 'B19013' AND yr = 2019 AND county = 073 AND summary_level=140"""

acs_median_inc_data = pd.read_sql_query(query,conn)
acs_median_inc_data['tract'] = acs_median_inc_data['tract'].astype(int)

In [29]:
acs_median_inc_data

Unnamed: 0,tract,estimate
0,20018,50972.0
1,21600,87604.0
2,21400,87188.0
3,21303,135083.0
4,20801,120708.0
...,...,...
616,17104,126765.0
617,17010,134318.0
618,16614,106250.0
619,13909,64500.0


In [28]:
# Removing Groupquarters 
popsim_hhadjinc_tract = synthetic_households[synthetic_households['GQ_type'].isna()][['tract', 'HHADJINC', 'ADJINC']]
popsim_hhadjinc_tract['ADJINC'] = popsim_hhadjinc_tract['ADJINC'] * .000001
popsim_hhadjinc_tract['Adjusted_Inc'] = popsim_hhadjinc_tract['HHADJINC'] * popsim_hhadjinc_tract['ADJINC']
popsim_hhadjinc_tract = popsim_hhadjinc_tract[['tract', 'Adjusted_Inc']]
popsim_hhadjinc_tract = popsim_hhadjinc_tract.groupby('tract').median().reset_index()
popsim_hhadjinc_tract

Unnamed: 0,tract,Adjusted_Inc
0,100,141420.300000
1,201,82831.890000
2,202,66871.599000
3,300,71770.802250
4,400,72730.440000
...,...,...
617,21600,76296.251850
618,21800,114348.414000
619,21900,35451.038775
620,22000,41415.945000


In [31]:
# Merge The Data
# Merge the two and subtract
non_gq_acs_popsim_median_inc_comp = popsim_hhadjinc_tract.merge(acs_median_inc_data, how='left', on='tract')
non_gq_acs_popsim_median_inc_comp.columns = ['tract', 'popsim_median_inc', 'acs_median_inc']
non_gq_acs_popsim_median_inc_comp['Diff'] = non_gq_acs_popsim_median_inc_comp['popsim_median_inc'] - non_gq_acs_popsim_median_inc_comp['acs_median_inc']
non_gq_acs_popsim_median_inc_comp = non_gq_acs_popsim_median_inc_comp.sort_values('Diff', ascending= False).reset_index(drop=True)

non_gq_acs_popsim_median_inc_comp

Unnamed: 0,tract,popsim_median_inc,acs_median_inc,Diff
0,5000,43840.2930,31875.0,11965.2930
1,4000,43840.2930,35056.0,8784.2930
2,4900,43840.2930,36400.0,7440.2930
3,4800,43840.2930,36792.0,7048.2930
4,20029,49800.1485,42799.0,7001.1485
...,...,...,...,...
617,17303,141420.3000,182292.0,-40871.7000
618,8328,151824.7935,199613.0,-47788.2065
619,17109,143440.5900,192028.0,-48587.4100
620,8311,151521.7500,229583.0,-78061.2500


In [32]:
# non_gq_acs_popsim_median_inc_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\non_gq_acs_popsim_median_inc_comp.xlsx')

In [17]:
str(popsim_hhadjinc_tract['ADJINC'][0])

'1010145'

In [22]:
popsim_hhadjinc_tract['ADJINC'] = popsim_hhadjinc_tract['ADJINC'] * .000001

In [19]:
[str(x)[0] + '.' + str(x)[1:] for x in popsim_hhadjinc_tract['ADJINC']]

['1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',
 '1.010145',

In [12]:
str(popsim_hhadjinc_tract['ADJINC'][0])

'1010145'

In [18]:
str(popsim_hhadjinc_tract['ADJINC'][0])[0] + '.' + str(popsim_hhadjinc_tract['ADJINC'][0])[1:]

'1.010145'

In [14]:
str(popsim_hhadjinc_tract['ADJINC'][0])[1:]

'010145'

In [19]:
# popsim_hhadjinc_tract.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\non_gq_popsim_median_income_by_tract.csv')

## Question 17
No. Of households (non-GQ)between household output and ACS dataset  

### Non-GQ

In [20]:
# ACS Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = """SELECT tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B11016' AND yr = 2019 AND county = 073 AND summary_level=140 AND line_desc = 'Total:'"""

acs_hs_data = pd.read_sql_query(query,conn)
acs_hs_data['tract'] = acs_hs_data['tract'].astype(int)

In [21]:
# Populationsim (remove GQ households)
non_gq_hs = synthetic_households[synthetic_households['GQ_type'].isna()]
non_gq_popsim_hs = non_gq_hs[['tract', 'NP']].groupby('tract').count().reset_index()

In [22]:
# Merge the two and subtract
non_gq_popsim_acs_hs_comp = non_gq_popsim_hs.merge(acs_hs_data, how='left', on='tract')
non_gq_popsim_acs_hs_comp.columns = ['tract', 'popsim_hh', 'acs_hh']
non_gq_popsim_acs_hs_comp['Diff'] = non_gq_popsim_acs_hs_comp['popsim_hh'] - non_gq_popsim_acs_hs_comp['acs_hh']
non_gq_popsim_acs_hs_comp = non_gq_popsim_acs_hs_comp.sort_values('Diff', ascending= False).reset_index(drop=True)

non_gq_popsim_acs_hs_comp

Unnamed: 0,tract,popsim_hh,acs_hh,Diff
0,5100,5989,3892.0,2097.0
1,21500,4364,3475.0,889.0
2,7600,2904,2049.0,855.0
3,5300,3597,2752.0,845.0
4,8511,2926,2149.0,777.0
...,...,...,...,...
617,10900,724,923.0,-199.0
618,11100,1637,1851.0,-214.0
619,2902,2301,2548.0,-247.0
620,20013,4719,4970.0,-251.0


In [23]:
# non_gq_popsim_acs_hs_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\non_gq_popsim_acs_hs_comp.xlsx')

### Total

In [24]:
# ACS Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = """SELECT tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B11016' AND yr = 2019 AND county = 073 AND summary_level=140 AND line_desc = 'Total:'"""

acs_hs_data = pd.read_sql_query(query,conn)
acs_hs_data['tract'] = acs_hs_data['tract'].astype(int)

In [25]:
# Populationsim (all)
popsim_hs = synthetic_households[['tract', 'NP']].groupby('tract').count().reset_index()

In [26]:
# Merge the two and subtract
total_popsim_acs_hs_comp = popsim_hs.merge(acs_hs_data, how='left', on='tract')
total_popsim_acs_hs_comp.columns = ['tract', 'popsim_hh', 'acs_hh']
total_popsim_acs_hs_comp['Diff'] = total_popsim_acs_hs_comp['popsim_hh'] - total_popsim_acs_hs_comp['acs_hh']
total_popsim_acs_hs_comp = total_popsim_acs_hs_comp.sort_values('Diff', ascending= False).reset_index(drop=True)

total_popsim_acs_hs_comp

Unnamed: 0,tract,popsim_hh,acs_hh,Diff
0,18700,22080,6745.0,15335.0
1,8305,12045,558.0,11487.0
2,3800,7225,0.0,7225.0
3,9400,5820,532.0,5288.0
4,21900,5806,708.0,5098.0
...,...,...,...,...
621,17404,2620,2802.0,-182.0
622,10900,729,923.0,-194.0
623,11000,1445,1650.0,-205.0
624,11100,1637,1851.0,-214.0


In [27]:
acs_hs_data.isna().sum()

tract       0
estimate    0
dtype: int64

In [28]:
# total_popsim_acs_hs_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\total_popsim_acs_hs_comp.xlsx')

## Question 18
Total Population (non-GQ) between household output and ACS dataset 

### Non-GQ

In [29]:
# ACS Data (ACS Already does not have GQ)
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = '''SELECT summary_level, geo_name, tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B01003' AND yr = 2019 AND county = 073 AND summary_level=140'''

acs_data = pd.read_sql_query(query,conn)
acs_data['tract'] = acs_data['tract'].astype(int)


acs_pop = acs_data[['tract', 'estimate']].groupby('tract').sum().reset_index()
acs_pop


Unnamed: 0,tract,estimate
0,100,3093.0
1,201,1891.0
2,202,4542.0
3,300,5239.0
4,400,3801.0
...,...,...
623,21800,2403.0
624,21900,7107.0
625,22000,4681.0
626,22100,10005.0


In [30]:
acs_pop = acs_data[['tract', 'estimate']].groupby('tract').sum().reset_index()
acs_pop

Unnamed: 0,tract,estimate
0,100,3093.0
1,201,1891.0
2,202,4542.0
3,300,5239.0
4,400,3801.0
...,...,...
623,21800,2403.0
624,21900,7107.0
625,22000,4681.0
626,22100,10005.0


In [31]:
# Populationsim (remove GQ households)
non_gq_pop = synthetic_households[synthetic_households['GQ_type'].isna()]
non_gq_popsim_pop = non_gq_pop[['tract', 'NP']].groupby('tract').sum().reset_index()

In [32]:
# Merge the two 
non_gq_popsim_acs_pop_comp = non_gq_popsim_pop.merge(acs_pop, how='left', on='tract')
non_gq_popsim_acs_pop_comp.columns = ['tract', 'popsim_pop', 'acs_pop']
non_gq_popsim_acs_pop_comp['Diff'] = non_gq_popsim_acs_pop_comp['popsim_pop'] - non_gq_popsim_acs_pop_comp['acs_pop']
non_gq_popsim_acs_pop_comp = non_gq_popsim_acs_pop_comp.sort_values('Diff', ascending=False)


non_gq_popsim_acs_pop_comp

Unnamed: 0,tract,popsim_pop,acs_pop,Diff
98,5100,15597,7702.0,7895.0
616,21500,16410,11078.0,5332.0
561,20029,10637,5808.0,4829.0
191,8511,8542,4989.0,3553.0
606,21000,6146,2706.0,3440.0
...,...,...,...,...
47,2904,6969,10078.0,-3109.0
612,21302,4190,7616.0,-3426.0
302,13310,27635,31118.0,-3483.0
619,21900,1772,7107.0,-5335.0


In [33]:
# non_gq_popsim_acs_pop_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\non_gq_popsim_acs_pop_comp.xlsx')

### Total

In [34]:
# ACS Data (ACS Already does not have GQ)
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = '''SELECT summary_level, geo_name, tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B01003' AND yr = 2019 AND county = 073 AND summary_level=140'''

acs_data = pd.read_sql_query(query,conn)
acs_data['tract'] = acs_data['tract'].astype(int)


acs_pop = acs_data[['tract', 'estimate']].groupby('tract').sum().reset_index()
acs_pop

Unnamed: 0,tract,estimate
0,100,3093.0
1,201,1891.0
2,202,4542.0
3,300,5239.0
4,400,3801.0
...,...,...
623,21800,2403.0
624,21900,7107.0
625,22000,4681.0
626,22100,10005.0


In [35]:
acs_pop = acs_data[['tract', 'estimate']].groupby('tract').sum().reset_index()
acs_pop

Unnamed: 0,tract,estimate
0,100,3093.0
1,201,1891.0
2,202,4542.0
3,300,5239.0
4,400,3801.0
...,...,...
623,21800,2403.0
624,21900,7107.0
625,22000,4681.0
626,22100,10005.0


In [36]:
# Populationsim (remove GQ households)
total_popsim_pop = synthetic_households[['tract', 'NP']].groupby('tract').sum().reset_index()

In [37]:
# Merge the two 
total_popsim_acs_pop_comp = total_popsim_pop.merge(acs_pop, how='left', on='tract')
total_popsim_acs_pop_comp.columns = ['tract', 'popsim_pop', 'acs_pop']
total_popsim_acs_pop_comp['Diff'] = total_popsim_acs_pop_comp['popsim_pop'] - total_popsim_acs_pop_comp['acs_pop']
total_popsim_acs_pop_comp = total_popsim_acs_pop_comp.sort_values('Diff', ascending=False)

total_popsim_acs_pop_comp

Unnamed: 0,tract,popsim_pop,acs_pop,Diff
142,8305,13874,1460.0,12414.0
99,5100,18448,7702.0,10746.0
620,21500,16410,11078.0,5332.0
565,20029,10900,5808.0,5092.0
101,5300,10604,5847.0,4757.0
...,...,...,...,...
426,17032,13322,15361.0,-2039.0
554,20018,6692,8741.0,-2049.0
501,18611,8039,10106.0,-2067.0
45,2902,5312,7598.0,-2286.0


In [38]:
# total_popsim_acs_pop_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\total_popsim_acs_pop_comp.xlsx')

## Question 19
Median Household Size by census tracts (all households) 


In [39]:
# ACS Population
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = '''SELECT summary_level, geo_name, tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B01003' AND yr = 2019 AND county = 073 AND summary_level=140'''

acs_data = pd.read_sql_query(query,conn)
acs_data['tract'] = acs_data['tract'].astype(int)


acs_pop = acs_data[['tract', 'estimate']].groupby('tract').sum().reset_index()
acs_pop

# ACS Households
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = """SELECT tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B11016' AND yr = 2019 AND county = 073 AND summary_level=140 AND line_desc = 'Total:'"""

acs_hs_data = pd.read_sql_query(query,conn)
acs_hs_data['tract'] = acs_hs_data['tract'].astype(int)
acs_hs_data

# Merge and create the necessary output
merged_acs_info = acs_pop.merge(acs_hs_data, how='left', on='tract')

merged_acs_info.columns = ['tract', 'ACS_pop', 'ACS_hs_num']

merged_acs_info['ASC_hs_size'] = round(merged_acs_info['ACS_pop'] / merged_acs_info['ACS_hs_num'], 2)


merged_acs_info = merged_acs_info[['tract', 'ASC_hs_size']]

merged_acs_info


Unnamed: 0,tract,ASC_hs_size
0,100,2.46
1,201,1.75
2,202,1.95
3,300,1.74
4,400,1.87
...,...,...
623,21800,2.56
624,21900,10.04
625,22000,3.71
626,22100,2.79


In [40]:
# Same thing as above will be done from populationsim

# Population sim population 
popsim_population = synthetic_households[['tract', 'NP']].groupby('tract').sum().reset_index()

# Populationsim household number 
popsim_hs = synthetic_households[['tract', 'NP']].groupby('tract').count().reset_index()

# Merge and output 
popsim_merged_data = popsim_population.merge(popsim_hs, how='left', on='tract')
popsim_merged_data.columns = ['tract', 'popsim_pop', 'popsim_hs']
popsim_merged_data['hs_size'] = round(popsim_merged_data['popsim_pop'] / popsim_merged_data['popsim_hs'], 2)
popsim_merged_data = popsim_merged_data[['tract', 'hs_size']]
popsim_merged_data

Unnamed: 0,tract,hs_size
0,100,2.59
1,201,2.13
2,202,2.25
3,300,1.92
4,400,1.91
...,...,...
621,21600,1.76
622,21800,2.18
623,21900,1.20
624,22000,3.09


In [41]:
household_size_acs_popsim_comp = popsim_merged_data.merge(merged_acs_info, how='left', on='tract')
household_size_acs_popsim_comp.columns = ['tract', 'popsim_hs_size', 'ACS_hs_size']
household_size_acs_popsim_comp['Diff'] = household_size_acs_popsim_comp['popsim_hs_size'] - household_size_acs_popsim_comp['ACS_hs_size']
household_size_acs_popsim_comp = household_size_acs_popsim_comp.sort_values('Diff', ascending=False)
household_size_acs_popsim_comp

Unnamed: 0,tract,popsim_hs_size,ACS_hs_size,Diff
610,21000,2.92,2.00,0.92
481,18300,2.78,1.87,0.91
565,20029,3.50,2.63,0.87
285,12501,3.45,2.62,0.83
609,20904,2.82,2.07,0.75
...,...,...,...,...
237,9901,1.03,191.75,-190.72
103,5500,1.00,inf,-inf
85,3800,1.00,inf,-inf
110,6200,1.00,inf,-inf


In [42]:
# household_size_acs_popsim_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\household_size_acs_popsim_comp.xlsx')

## Question 20
Median age by census tracts (all households) 

In [43]:
# ACS Data (ACS Already does not have GQ)
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = '''SELECT tract, estimate FROM census.acs.vw_summary_file WHERE subject_table = 'B01002' AND yr = 2019 AND county = 073 AND summary_level=140'''

acs_data = pd.read_sql_query(query,conn)
acs_data['tract'] = acs_data['tract'].astype(int)
acs_data = acs_data.groupby('tract').median().reset_index()
acs_data

Unnamed: 0,tract,estimate
0,100,54.2
1,201,49.4
2,202,41.2
3,300,43.8
4,400,34.4
...,...,...
621,21600,26.0
622,21800,50.1
623,21900,25.0
624,22000,31.5


In [44]:
# ACS from CSV
acs_csv_median_age = pd.read_csv(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2022\2022-72 SANDPOPSIM Output QC\ACSST5Y2019.S0101-Data.csv')

In [45]:
acs_csv_df = acs_csv_median_age[['NAME', 'S0101_C01_032E']]
acs_csv_df = acs_csv_df.iloc[1:]
acs_csv_df.columns = ['name', 'median_age']
acs_csv_df

Unnamed: 0,name,median_age
1,"Census Tract 1, San Diego County, California",54.2
2,"Census Tract 2.01, San Diego County, California",49.4
3,"Census Tract 2.02, San Diego County, California",41.2
4,"Census Tract 3, San Diego County, California",43.8
5,"Census Tract 4, San Diego County, California",34.4
...,...,...
624,"Census Tract 218, San Diego County, California",50.1
625,"Census Tract 219, San Diego County, California",25
626,"Census Tract 220, San Diego County, California",31.5
627,"Census Tract 221, San Diego County, California",38.2


In [46]:
acs_csv_df['median_age'] = acs_csv_df['median_age'].replace(['-'],np.nan)

In [47]:
list_1 = [x.split(',')[0][13:] for x in acs_csv_df['name']]
list_2 = [x.replace('.', '') if '.' in x else x+'00' for x in list_1]

In [48]:
acs_csv_df['tract'] = list_2

In [49]:
acs_csv_df


Unnamed: 0,name,median_age,tract
1,"Census Tract 1, San Diego County, California",54.2,100
2,"Census Tract 2.01, San Diego County, California",49.4,201
3,"Census Tract 2.02, San Diego County, California",41.2,202
4,"Census Tract 3, San Diego County, California",43.8,300
5,"Census Tract 4, San Diego County, California",34.4,400
...,...,...,...
624,"Census Tract 218, San Diego County, California",50.1,21800
625,"Census Tract 219, San Diego County, California",25,21900
626,"Census Tract 220, San Diego County, California",31.5,22000
627,"Census Tract 221, San Diego County, California",38.2,22100


In [50]:
acs_csv_df = acs_csv_df[['tract', 'median_age']]
acs_csv_df['tract'] = acs_csv_df['tract'].astype(int)
acs_csv_df['median_age'] = acs_csv_df['median_age'].astype(float)
acs_csv_df

Unnamed: 0,tract,median_age
1,100,54.2
2,201,49.4
3,202,41.2
4,300,43.8
5,400,34.4
...,...,...
624,21800,50.1
625,21900,25.0
626,22000,31.5
627,22100,38.2


In [51]:
# Popsim
popsim_median_age = synthetic_persons[['tract', 'AGEP']].groupby('tract').median().reset_index()
popsim_median_age['tract'] = popsim_median_age['tract'].astype(int)
popsim_median_age

Unnamed: 0,tract,AGEP
0,100,49.0
1,201,44.0
2,202,41.0
3,300,44.0
4,400,39.0
...,...,...
621,21600,26.0
622,21800,45.0
623,21900,22.0
624,22000,31.0


In [52]:
# merge 
acs_popsim_median_age_comp = popsim_median_age.merge(acs_csv_df, how='left', on='tract')
acs_popsim_median_age_comp.columns = ['tract', 'popsim_median_age', 'ACS_median_age']
acs_popsim_median_age_comp['Diff'] = acs_popsim_median_age_comp['popsim_median_age'] - acs_popsim_median_age_comp['ACS_median_age']
acs_popsim_median_age_comp = acs_popsim_median_age_comp.sort_values('Diff', ascending=False)
acs_popsim_median_age_comp

Unnamed: 0,tract,popsim_median_age,ACS_median_age,Diff
159,8339,39.0,25.3,13.7
224,9510,31.0,17.9,13.1
161,8341,38.0,25.9,12.1
44,2804,33.0,22.0,11.0
45,2902,39.0,28.8,10.2
...,...,...,...,...
610,21000,46.0,62.0,-16.0
274,11902,31.0,48.1,-17.1
488,18512,45.0,63.3,-18.3
543,19808,40.0,66.3,-26.3


In [53]:
# acs_popsim_median_age_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\acs_popsim_median_age_comp.xlsx')

# Check 8
LEHD Comparison 

## RAC

In [54]:
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=dpoe_stage;'
                      'Trusted_Connection=yes;')

query = """SELECT SUM(C000)
FROM [dpoe_stage].[lehd_lodes].[rac_7_5_20211018]
WHERE yr = 2019 AND type = 'JT00' AND segment = 'S000'"""

lehd_RAC_jobs_sum = pd.read_sql_query(query,conn)

In [55]:
lehd_RAC_total = lehd_RAC_jobs_sum[''][0]
lehd_RAC_total

1467579

In [56]:
# From popsim
popsim_isworker = synthetic_persons[synthetic_persons['isWorker']==1].shape[0]
popsim_isworker

1556474

In [57]:
diff_worker_popsim_lehd = popsim_isworker - lehd_RAC_total
diff_worker_popsim_lehd

88895

In [58]:
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=dpoe_stage;'
                      'Trusted_Connection=yes;')

query = """SELECT SUBSTRING(h_geoid, 6, 6) AS tract, SUM(C000) AS 'sum of employment'
FROM [dpoe_stage].[lehd_lodes].[rac_7_5_20211018]
WHERE yr = 2019 AND type = 'JT00' AND segment = 'S000'
GROUP BY SUBSTRING(h_geoid, 6, 6);"""

lehd_RAC_jobs_tract_sum = pd.read_sql_query(query,conn)

In [59]:
lehd_RAC_jobs_tract_sum['tract'] = lehd_RAC_jobs_tract_sum['tract'].astype(int)
lehd_RAC_jobs_tract_sum

Unnamed: 0,tract,sum of employment
0,20013,5641
1,20807,1406
2,21206,1497
3,2902,2874
4,11100,1391
...,...,...
622,20106,1951
623,17041,3076
624,18511,2264
625,10010,2666


In [60]:
# popsim adjustments
popsim_isworker_output = synthetic_persons[synthetic_persons['isWorker']==1]
popsim_isworker_output['count'] = None
popsim_isworker_output

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,count
0,18100,245,1,1,52,2,1.0,1.0,40.0,,1,2,4.0,1,
3,18100,245,2,1,25,2,4.0,5.0,50.0,15.0,6,2,1.0,1,
4,18100,245,2,2,24,1,1.0,6.0,20.0,15.0,1,1,2.0,1,
6,18100,245,3,1,64,2,1.0,4.0,60.0,,1,1,4.0,1,
9,18100,245,4,1,41,2,1.0,6.0,40.0,,1,3,4.0,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3356832,17601,24280,1262700,1,21,2,1.0,1.0,8.0,,2,1,4.0,1,
3356853,17601,24280,1262721,1,81,1,1.0,2.0,10.0,,1,1,4.0,1,
3356857,17601,24280,1262725,1,19,1,1.0,5.0,60.0,,1,2,2.0,1,
3356920,17601,24280,1262788,1,81,1,1.0,2.0,10.0,,1,1,4.0,1,


In [61]:
popsim_isworker_output = popsim_isworker_output[['tract', 'isWorker']].groupby('tract').count().reset_index()
popsim_isworker_output.columns = ['tract', 'count']

popsim_isworker_output


Unnamed: 0,tract,count
0,100,1593
1,201,927
2,202,2803
3,300,3347
4,400,2775
...,...,...
621,21600,1942
622,21800,978
623,21900,5978
624,22000,1941


In [62]:
lehd_households_tract_comp = popsim_isworker_output.merge(lehd_RAC_jobs_tract_sum, how='left', on='tract')

lehd_households_tract_comp.columns = ['tract', 'popsim_worker', 'lehd_worker']

lehd_households_tract_comp['Diff'] = lehd_households_tract_comp['popsim_worker'] - lehd_households_tract_comp['lehd_worker']

lehd_households_tract_comp.sort_values('Diff', ascending=False)

lehd_households_tract_comp

Unnamed: 0,tract,popsim_worker,lehd_worker,Diff
0,100,1593,1249,344
1,201,927,933,-6
2,202,2803,2323,480
3,300,3347,2621,726
4,400,2775,2250,525
...,...,...,...,...
621,21600,1942,693,1249
622,21800,978,784,194
623,21900,5978,1951,4027
624,22000,1941,2201,-260


In [63]:
# lehd_households_tract_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\ACS_Data\popsim_lehd_worker_difference.xlsx')

# QC Check with Purva

In [64]:
df1 = synthetic_households[['household_id','mgra', 'HHADJINC','GQ_type', 'NP']][synthetic_households[['household_id','mgra', 'HHADJINC','GQ_type', 'NP']]['HHADJINC'] == 0]

df2 = df1[(df1['GQ_type'].isna())]

df2

df3 = synthetic_persons[synthetic_persons['household_id'].isin(df2['household_id'])]

# Which MGRAs are not present

In [65]:
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=SQL2014B8.sandag.org;'
                      'Database=GeoDepot;'
                      'Trusted_Connection=yes;')

query = '''SELECT
      [MGRA]
  FROM [GeoDepot].[gis].[MGRA15]'''

crosswalk = pd.read_sql_query(query,conn)

result = pd.DataFrame({'mgras_not_in_popsim':list(set(crosswalk['MGRA']) - set(synthetic_persons['mgra']))})
result

Unnamed: 0,mgras_not_in_popsim
0,16
1,36
2,67
3,78
4,79
...,...
5414,24311
5415,24315
5416,24319
5417,24320


In [66]:
#result.to_excel('C:/Users/cra/Desktop/mgras_not_in_popsim.xlsx')

# Comparison with 2019 Base Year Forecast

## Download Forecast Data

In [67]:
forecast_data_mgra = pd.read_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/mgra_2019_CSV_Data_ind_QA.csv')
forecast_data_jurisdiction = pd.read_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/jurisdiction_2019_CSV_Data_ind_QA.csv')
forecast_data_region = pd.read_csv('J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_15_2019_CSV_outputs/region_2019_CSV_Data_ind_QA.csv')

## Question 5 (Part 1)
 Non GQ Population Sum Comparison   
Note: 'hhp' = total household population (exclude gq pop) (https://github.com/SANDAG/ABM/wiki/input-files)

### MGRA

In [68]:
# From populationsim
hs_small_df = synthetic_households[['mgra', 'NP', 'GQ_type']]
non_gq = hs_small_df[(hs_small_df['GQ_type'].isna())]
non_gq_pop_sum = non_gq[['mgra', 'NP']].groupby('mgra').sum()

In [69]:
# Combine 
non_gq_pop_comparison = non_gq_pop_sum.reset_index().merge(forecast_data_mgra[['mgra', 'hhp']], how='left', on='mgra')
non_gq_pop_comparison.columns = ['mgra', 'Populationsim Pop', 'Forecast Pop']
non_gq_pop_comparison.set_index('mgra')
non_gq_pop_comparison['Diff'] = non_gq_pop_comparison['Populationsim Pop'] - non_gq_pop_comparison['Forecast Pop']

In [70]:
non_gq_pop_comparison = non_gq_pop_comparison.sort_values('Diff', ascending=False)
non_gq_pop_comparison#.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\non_gq_pop_comparison.csv')

Unnamed: 0,mgra,Populationsim Pop,Forecast Pop,Diff
6170,7507,4228,3623,605
15445,19747,3194,2742,452
4904,5878,9910,9532,378
2160,2428,3395,3028,367
14518,18520,1807,1452,355
...,...,...,...,...
15291,19499,1465,1610,-145
2456,2768,673,821,-148
5745,6963,1952,2105,-153
10388,13437,1852,2013,-161


### Region

In [71]:
non_gq_pop_comparison_region = pd.DataFrame(non_gq_pop_comparison[['Populationsim Pop', 'Forecast Pop', 'Diff']].sum(axis=0)).T

# non_gq_pop_comparison_region.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\non_gq_pop_comparison_region.xlsx', index=False)

In [72]:
# non_gq_pop_comparison_region.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\non_gq_pop_comparison_region.xlsx', index=False)

## Question 5 (Part 2)
GQ Population Sum Comparison 

### MGRA

In [73]:
# Grabbing Popsim info
gq_popsim = hs_small_df[~(hs_small_df['GQ_type'].isna())]
gq_pop_sum = gq_popsim[['mgra', 'NP']].groupby('mgra').sum()

In [74]:
# Grabbing Forecast Info
gq_pop_forecast = forecast_data_mgra[['mgra', 'pop', 'hhp']]
gq_pop_forecast['gq_pop'] = gq_pop_forecast['pop'] - gq_pop_forecast['hhp']

In [75]:
gq_pop_comparison = gq_pop_sum.reset_index().merge(gq_pop_forecast[['mgra', 'gq_pop']], how='left', on='mgra')
gq_pop_comparison.columns = ['mgra', 'Populationsim Pop', 'Forecast Pop']
gq_pop_comparison.set_index('mgra')
gq_pop_comparison['Diff'] = gq_pop_comparison['Populationsim Pop'] - gq_pop_comparison['Forecast Pop']


gq_pop_comparison#.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\gq_pop_comparison.csv')

Unnamed: 0,mgra,Populationsim Pop,Forecast Pop,Diff
0,25,2,2,0
1,28,1,1,0
2,50,1,1,0
3,53,4,4,0
4,55,1,1,0
...,...,...,...,...
1412,24024,641,641,0
1413,24079,2,2,0
1414,24084,16,16,0
1415,24254,3,3,0


### Region

In [76]:
gq_pop_comparison_region = pd.DataFrame(gq_pop_comparison[['Populationsim Pop', 'Forecast Pop', 'Diff']].sum(axis=0)).T
gq_pop_comparison_region
# gq_pop_comparison_region.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\gq_pop_comparison_region.xlsx', index=False)

Unnamed: 0,Populationsim Pop,Forecast Pop,Diff
0,109789,109789,0


## Question 6 (Part 1)
Non GQ Household Number Comparison  

### MGRA

In [77]:
# From popsim
hs_non_gq_sum = pd.DataFrame(non_gq['mgra'].value_counts())
hs_non_gq_sum = hs_non_gq_sum.reset_index()
hs_non_gq_sum.columns = ['mgra', 'hh_popsim']

In [78]:
# From forecast 
forecast_non_gq_hh_sum = forecast_data_mgra[['mgra','hh']]
forecast_non_gq_hh_sum.columns = ['mgra', 'hh_forecast']

In [79]:
non_gq_hh_comp = hs_non_gq_sum.merge(forecast_non_gq_hh_sum, how='left', on='mgra')
non_gq_hh_comp['Diff'] = non_gq_hh_comp['hh_popsim'] - non_gq_hh_comp['hh_forecast']
non_gq_hh_comp = non_gq_hh_comp.sort_values('Diff', ascending=False)
non_gq_hh_comp

Unnamed: 0,mgra,hh_popsim,hh_forecast,Diff
1381,19514,159,154,5
8792,21398,44,39,5
6488,19470,62,57,5
384,12873,272,268,4
5,1492,957,953,4
...,...,...,...,...
6850,10836,59,63,-4
1796,4237,141,145,-4
807,17539,200,204,-4
3835,19531,93,98,-5


In [80]:
# non_gq_hh_comp.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\non_gq_HH_comparison.csv')

### Region

In [81]:
non_gq_hh_comp_region = pd.DataFrame(non_gq_hh_comp[['hh_popsim', 'hh_forecast', 'Diff']].sum(axis=0)).T
non_gq_hh_comp_region
# non_gq_hh_comp_region.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\non_gq_hh_comp_region.xlsx', index=False)

Unnamed: 0,hh_popsim,hh_forecast,Diff
0,1153047,1153024,23


## Question 6 (Part 2) 
GQ Household Number Comparison

### MGRA

In [82]:
# Forecast
gq_hh_forecast = forecast_data_mgra[['mgra', 'gq_civ', 'gq_mil']]
gq_hh_forecast['gq_hh_total_forecast'] = gq_hh_forecast['gq_civ'] + gq_hh_forecast['gq_mil']
gq_hh_forecast = gq_hh_forecast[['mgra', 'gq_hh_total_forecast']]

In [83]:
# Populationsim 
gq_household_popsim = hs_small_df[~(hs_small_df['GQ_type'].isna())]
hs_gq_sum_pop_sim = pd.DataFrame(gq_household_popsim['mgra'].value_counts())
hs_gq_sum_pop_sim = hs_gq_sum_pop_sim.reset_index()
hs_gq_sum_pop_sim.columns = ['mgra', 'hh_gq_popsim']

In [84]:
hh_gq_sum_comparison = hs_gq_sum_pop_sim.merge(gq_hh_forecast, how='left', on='mgra')
hh_gq_sum_comparison['Diff'] = hh_gq_sum_comparison['hh_gq_popsim'] - hh_gq_sum_comparison['gq_hh_total_forecast']

hh_gq_sum_comparison#.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\gq_HH_comparison.xlsx')

hh_gq_sum_comparison[hh_gq_sum_comparison['Diff'] != 0]

Unnamed: 0,mgra,hh_gq_popsim,gq_hh_total_forecast,Diff


In [85]:
# hh_gq_sum_comparison.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\gq_HH_comparison.csv')

### Region

In [86]:
hh_gq_sum_comparison_region = pd.DataFrame(hh_gq_sum_comparison[['hh_gq_popsim', 'gq_hh_total_forecast', 'Diff']].sum(axis=0)).T
hh_gq_sum_comparison_region
# hh_gq_sum_comparison_region.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\hh_gq_sum_comparison_region.xlsx', index=False)

Unnamed: 0,hh_gq_popsim,gq_hh_total_forecast,Diff
0,109789,109789,0


# Question 23
Compare AGE and SEX distribution between the persons output and the 2019 estimates data  

In [106]:
q7_person_df = copy.deepcopy(synthetic_persons)
q7_person_df = q7_person_df[['AGEP', 'SEX']]

In [107]:
q7_person_df['SEX'] = np.where(q7_person_df["SEX"] == 1, 'Male', 'Female')

In [108]:
bins = [-1, 4, 9, 14, 17, 19,24,29,34,39,44,49,54,59,61,64,69,74,79,84,np.inf]
names = ['Under 5','5 to 9','10 to 14','15 to 17','18 and 19','20 to 24','25 to 29','30 to 34','35 to 39','40 to 44','45 to 49','50 to 54','55 to 59','60 and 61','62 to 64','65 to 69','70 to 74','75 to 79','80 to 84','85 and Older']

q7_person_df['AgeRange'] = pd.cut(q7_person_df['AGEP'], bins, labels=names)

In [109]:
set(q7_person_df[(q7_person_df['AgeRange'] == '62 to 64')]['AGEP'])

{62, 63, 64}

In [110]:
q7_person_df = q7_person_df.groupby(['AgeRange', 'SEX']).count().reset_index()
q7_person_df.columns = ['name', 'sex', 'Total_popsim']

In [111]:
estimates_age_sex_eth = pd.read_csv('J:/DataScience/DataQuality/QAQC/Estimates QC Automation/v_20220915/data/raw_data/QA_2021_01_region_age_sex_ethnicity.csv')

In [112]:
estimates_age_sex_eth['Total_est'] = list(estimates_age_sex_eth[['Hispanic', 'Non-Hispanic, White',
       'Non-Hispanic, Asian', 'Non-Hispanic, Hawaiian or Pacific Islander',
       'Non-Hispanic, American Indian or Alaska Native', 'Non-Hispanic, Other',
       'Non-Hispanic, Two or More Races', 'Non-Hispanic, Black']].sum(axis=1))

In [113]:
estimates_data_agg = estimates_age_sex_eth[['yr_id', 'name', 'sex', 'Total_est']][estimates_age_sex_eth['yr_id']==2019][['name', 'sex', 'Total_est']].reset_index(drop=True)

In [114]:
age_sex_estimmates_popsim_comp = q7_person_df.merge(estimates_data_agg, how='left', on=['name', 'sex'])
age_sex_estimmates_popsim_comp['Diff'] = age_sex_estimmates_popsim_comp['Total_popsim'] - age_sex_estimmates_popsim_comp['Total_est']
age_sex_estimmates_popsim_comp

Unnamed: 0,name,sex,Total_popsim,Total_est,Diff
0,Under 5,Female,97336,106562,-9226
1,Under 5,Male,115672,106919,8753
2,5 to 9,Female,108846,111848,-3002
3,5 to 9,Male,118221,118279,-58
4,10 to 14,Female,103832,107219,-3387
5,10 to 14,Male,112710,112902,-192
6,15 to 17,Female,64193,62720,1473
7,15 to 17,Male,70104,69506,598
8,18 and 19,Female,49765,46795,2970
9,18 and 19,Male,59296,52433,6863


In [87]:
# age_sex_estimmates_popsim_comp.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\age_sex_estimmates_popsim_comp_v2.xlsx')

In [85]:
print(f"Estimates has a total population here of {sum(age_sex_estimmates_popsim_comp['Total_est'])}")
print(f"Populationsim has a total population here of {sum(age_sex_estimmates_popsim_comp['Total_popsim'])}")
print(f"The difference between them is popsim has {sum(age_sex_estimmates_popsim_comp['Total_popsim']) - sum(age_sex_estimmates_popsim_comp['Total_est'])} more people")

Estimates has a total population here of 3333319
Populationsim has a total population here of 3356969
The difference between them is popsim has 23650 more people


### The proportion

In [3]:
age_sex_estimmates_popsim_comp.head()

NameError: name 'age_sex_estimmates_popsim_comp' is not defined

In [98]:
age_sex_est_popsim_proportion = age_sex_estimmates_popsim_comp[['name', 'Total_popsim', 'Total_est']].groupby('name').sum()
age_sex_est_popsim_proportion['Popsim_Proportion'] = round(age_sex_est_popsim_proportion['Total_popsim'] / sum(age_sex_est_popsim_proportion['Total_popsim'])*100, 2)
age_sex_est_popsim_proportion['Estimates_Proportion'] = round(age_sex_est_popsim_proportion['Total_est'] / sum(age_sex_est_popsim_proportion['Total_est'])*100, 2)

age_sex_est_popsim_proportion = age_sex_est_popsim_proportion[['Popsim_Proportion', 'Estimates_Proportion']]
age_sex_est_popsim_proportion

Unnamed: 0_level_0,Popsim_Proportion,Estimates_Proportion
name,Unnamed: 1_level_1,Unnamed: 2_level_1
10 to 14,6.45,6.6
15 to 17,4.0,3.97
18 and 19,3.25,2.98
20 to 24,7.63,7.9
25 to 29,6.34,6.3
30 to 34,6.58,6.48
35 to 39,6.94,6.98
40 to 44,6.21,6.2
45 to 49,6.13,6.26
5 to 9,5.28,6.9


In [99]:
# age_sex_est_popsim_proportion.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\age_sex_est_popsim_proportion.csv')

# Purva Median Income Request 

In [100]:
new_df = synthetic_households[['mgra', 'HHADJINC']].groupby('mgra').median()

In [101]:
synthetic_households[['mgra', 'HHADJINC']][synthetic_households[['mgra', 'HHADJINC']]['mgra'] == 9179]

Unnamed: 0,mgra,HHADJINC
1138568,9179,86700.0
1240392,9179,0.0
1240393,9179,950.0
1240394,9179,3000.0
1240395,9179,0.0
1240396,9179,0.0
1240397,9179,32000.0
1240398,9179,9600.0
1240399,9179,0.0
1240400,9179,0.0


In [102]:
new_df = synthetic_households[['mgra', 'HHADJINC']].groupby('mgra').median()
new_df.columns = ['Median Adj Income']

#new_df.to_excel('C:/Users/cra/Desktop/median_income.xlsx')

In [103]:
# Checking to see if the creation of that median file dropped any MGRAs
synthetic_households['mgra'].nunique()
synthetic_households[['mgra', 'HHADJINC']].groupby('mgra').median().reset_index()['mgra'].nunique()

18902

# Self Employment Figures

### Download Crosswalk Data

In [104]:
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=SQL2014B8.sandag.org;'
                      'Database=GeoDepot;'
                      'Trusted_Connection=yes;')

query = '''SELECT
      [MGRA]
	  ,cpa_data.NAME AS CPA
	  ,jur_data.Name AS Jurisdiction
      ,[LUZ]
  FROM [GeoDepot].[gis].[MGRA15] as mgra_base
  LEFT JOIN [GeoDepot].[gis].[CITIES] AS jur_data
  ON mgra_base.City = jur_data.City
  LEFT JOIN [GeoDepot].[gis].[CITYCPA] AS cpa_data
  ON mgra_base.CPA = cpa_data.CPA'''

crosswalk = pd.read_sql_query(query,conn)

In [105]:
# Appending to synthetic data
synthetic_households= synthetic_households.merge(crosswalk, left_on='mgra', right_on='MGRA', how='left')
synthetic_persons = synthetic_persons.merge(crosswalk, left_on='mgra', right_on='MGRA', how='left')

In [106]:
synthetic_persons['Region'] = 'San Diego'
synthetic_households['Region'] = 'San Diego'

In [107]:
synthetic_persons.head()

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,MGRA,CPA,Jurisdiction,LUZ,Region
0,18100,245,1,1,52,2,1.0,1.0,40.0,,1,2,4.0,1,245,,Oceanside,161,San Diego
1,18100,245,1,2,39,1,3.0,1.0,40.0,,1,2,4.0,0,245,,Oceanside,161,San Diego
2,18100,245,1,3,79,2,6.0,,,,1,2,4.0,0,245,,Oceanside,161,San Diego
3,18100,245,2,1,25,2,4.0,5.0,50.0,15.0,6,2,1.0,1,245,,Oceanside,161,San Diego
4,18100,245,2,2,24,1,1.0,6.0,20.0,15.0,1,1,2.0,1,245,,Oceanside,161,San Diego


# Calculating the range
Request from Purva. 

In [108]:
list_of_mgras = list(set(synthetic_households['MGRA']))

In [109]:
hh_inc_filter = synthetic_households[['MGRA', 'HHADJINC']]

In [110]:
final_df = dict()
for mgra in list_of_mgras:
    # Find the Max
    max_value = max(hh_inc_filter[hh_inc_filter['MGRA'] == mgra]['HHADJINC'])

    # find the min 
    min_value = min(hh_inc_filter[hh_inc_filter['MGRA'] == mgra]['HHADJINC'])

    # Range
    range_value = max_value - min_value

    # Add to dataframe
    final_df[mgra] = [max_value, min_value, range_value]

In [111]:
final_output = pd.DataFrame(final_df).T
final_output.columns = ['Max', 'Min', 'Range']
final_output.index.name = 'mgra'
final_output

Unnamed: 0_level_0,Max,Min,Range
mgra,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,609400.0,30.0,609370.0
2,236000.0,0.0,236000.0
3,731200.0,590.0,730610.0
4,118900.0,21600.0,97300.0
5,360000.0,11600.0,348400.0
...,...,...,...
24313,160900.0,39000.0,121900.0
24314,937000.0,11600.0,925400.0
24316,926000.0,0.0,926000.0
24317,91600.0,36000.0,55600.0


# Geography Rollup

In [112]:
# Synthetic Population (SP) -- with GQ
sh_mgra = pd.DataFrame(synthetic_households['mgra'].value_counts()).rename(columns={'mgra':'Total'})
sh_mgra.index.name = 'mgra'

sh_tract = pd.DataFrame(synthetic_households['tract'].value_counts()).rename(columns={'tract':'Total'})
sh_tract.index.name = 'tract'

sh_CPA = pd.DataFrame(synthetic_households['CPA'].value_counts()).rename(columns={'CPA':'Total'})
sh_CPA.index.name = 'CPA'

sh_jurisdiction = pd.DataFrame(synthetic_households['Jurisdiction'].value_counts()).rename(columns={'Jurisdiction':'Total'})
sh_jurisdiction.index.name = 'Jurisdiction'

sh_LUZ = pd.DataFrame(synthetic_households['LUZ'].value_counts()).rename(columns={'LUZ':'Total'})
sh_LUZ.index.name = 'LUZ'

sh_Region = pd.DataFrame(synthetic_households['Region'].value_counts()).rename(columns={'Region':'Total'})
sh_Region.index.name = 'Region'

In [113]:
# Synthetic Population (SP) -- with GQ
sh_mgra = pd.DataFrame(synthetic_households['mgra'].value_counts()).rename(columns={'mgra':'Total'})
sh_mgra.index.name = 'mgra'

sh_tract = pd.DataFrame(synthetic_households['tract'].value_counts()).rename(columns={'tract':'Total'})
sh_tract.index.name = 'tract'

sh_CPA = pd.DataFrame(synthetic_households['CPA'].value_counts()).rename(columns={'CPA':'Total'})
sh_CPA.index.name = 'CPA'

sh_jurisdiction = pd.DataFrame(synthetic_households['Jurisdiction'].value_counts()).rename(columns={'Jurisdiction':'Total'})
sh_jurisdiction.index.name = 'Jurisdiction'

sh_LUZ = pd.DataFrame(synthetic_households['LUZ'].value_counts()).rename(columns={'LUZ':'Total'})
sh_LUZ.index.name = 'LUZ'

sh_Region = pd.DataFrame(synthetic_households['Region'].value_counts()).rename(columns={'Region':'Total'})
sh_Region.index.name = 'Region'

In [114]:
# Synthetic Persons (SP) -- without GQ
sp_mgra = pd.DataFrame(synthetic_persons['mgra'].value_counts()).rename(columns={'mgra':'Total'})
sp_mgra.index.name = 'mgra'

sp_tract = pd.DataFrame(synthetic_persons['tract'].value_counts()).rename(columns={'tract':'Total'})
sp_tract.index.name = 'tract'

sp_CPA = pd.DataFrame(synthetic_persons['CPA'].value_counts()).rename(columns={'CPA':'Total'})
sp_CPA.index.name = 'CPA'

sp_jurisdiction = pd.DataFrame(synthetic_persons['Jurisdiction'].value_counts()).rename(columns={'Jurisdiction':'Total'})
sp_jurisdiction.index.name = 'Jurisdiction'

sp_LUZ = pd.DataFrame(synthetic_persons['LUZ'].value_counts()).rename(columns={'LUZ':'Total'})
sp_LUZ.index.name = 'LUZ'

sp_Region = pd.DataFrame(synthetic_persons['Region'].value_counts()).rename(columns={'Region':'Total'})
sp_Region.index.name = 'Region'

# Internal Consistency Checks

### Check 1 
Internal Consistency Check in aggregate values across geography levels for both persons and households outputs 

In [115]:
# Synthetic Persons
print(f"MGRA total - {sp_mgra['Total'].sum()}")
print(f"Tract total - {sp_tract['Total'].sum()}")
print(f"CPA total - {sp_CPA['Total'].sum()}")
print(f"Jurisdiction Total - {sp_jurisdiction['Total'].sum()}")
print(f"LUZ total - {sp_LUZ['Total'].sum()}")
print(f"Region total - {sp_Region['Total'].sum()}")

MGRA total - 3361280
Tract total - 3361280
CPA total - 1440291
Jurisdiction Total - 3361280
LUZ total - 3361280
Region total - 3361280


In [116]:
# Synthetic Households
print(f"MGRA total - {sh_mgra['Total'].sum()}")
print(f"Tract total - {sh_tract['Total'].sum()}")
print(f"CPA total - {sh_CPA['Total'].sum()}")
print(f"Jurisdiction Total - {sh_jurisdiction['Total'].sum()}")
print(f"LUZ total - {sh_LUZ['Total'].sum()}")
print(f"Region total - {sh_Region['Total'].sum()}")

MGRA total - 1267147
Tract total - 1267147
CPA total - 579807
Jurisdiction Total - 1267147
LUZ total - 1267147
Region total - 1267147


### Check 2
Compare total 'NP' in the households output with count of 'SPORDER' in the persons output  

In [117]:
synthetic_persons['SPORDER'].count()

3361280

In [118]:
sum(synthetic_households['NP'])

3361280

### Check 3
Compare number of households match between the households and persons output  

In [119]:
synthetic_households.shape[0]

1267147

In [120]:
synthetic_persons['household_id'].nunique()

1262836

### Check 4
Compare workers ('numworkers' and 'isWorker') from households and persons output 

In [121]:
sum(synthetic_households['numWorkers'])

1560775.0

In [122]:
synthetic_persons[synthetic_persons['isWorker'] == 1].shape[0]

1560775

# Check 9
Compare 'VEH' (vehicles available) between the household output with 2019 DMV data in the database  


In [123]:
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=dpoe_stage;'
                      'Trusted_Connection=yes;')

query = """SELECT COUNT(*)
  FROM [dpoe_stage].[veh_reg_dmv].[fact]
  WHERE yr = 2019 AND own LIKE 'Personal%'"""

dmv_data = pd.read_sql_query(query,conn)

In [124]:
# DMV Sum
dmv_sum_cars = dmv_data[''][0]
dmv_sum_cars

2431762

In [125]:
# Populationsim Sum
popsim_sum_cars = synthetic_households[['VEH']].sum()[0]
popsim_sum_cars

2283834.0

In [126]:
# Difference
print(f"DMV has {dmv_sum_cars - popsim_sum_cars} more cars than popsim has. That is a difference of {round((dmv_sum_cars - popsim_sum_cars)/popsim_sum_cars*100,2)} percent diff")

DMV has 147928.0 more cars than popsim has. That is a difference of 6.48 percent diff


In [127]:
synthetic_households[synthetic_households['VEH'] == 6]

Unnamed: 0,household_id,tract,mgra,NP,HHADJINC,ADJINC,HHT,WIF,HUPAC,VEH,numWorkers,GQ_type,MGRA,CPA,Jurisdiction,LUZ,Region
23,24,18100,836,5,161200.0,1010145,3.0,2.0,2.0,6.0,4.0,,836,,Oceanside,161,San Diego
59,60,18100,1423,5,161200.0,1010145,3.0,2.0,2.0,6.0,4.0,,1423,,Oceanside,161,San Diego
109,110,18100,1553,2,354500.0,1010145,5.0,,4.0,6.0,1.0,,1553,,Oceanside,161,San Diego
419,420,18100,4968,2,354500.0,1010145,5.0,,4.0,6.0,1.0,,4968,,Oceanside,161,San Diego
434,435,18100,5275,5,332400.0,1010145,1.0,3.0,4.0,6.0,4.0,,5275,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152886,1152887,10502,9234,9,85200.0,1010145,1.0,3.0,3.0,6.0,5.0,,9234,,Imperial Beach,90,San Diego
1152934,1152935,10502,9235,8,229800.0,1010145,1.0,3.0,2.0,6.0,4.0,,9235,,Imperial Beach,90,San Diego
1152977,1152978,10502,9236,8,229800.0,1010145,1.0,3.0,2.0,6.0,4.0,,9236,,Imperial Beach,237,San Diego
1153026,1153027,10502,9237,9,85200.0,1010145,1.0,3.0,3.0,6.0,5.0,,9237,,Imperial Beach,237,San Diego


### ACS Vehicle Info

In [128]:
# Download Data
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                      'Server=ddamwsql16.sandag.org;'
                      'Database=census;'
                      'Trusted_Connection=yes;')

query = """SELECT subject_table
    ,CAST(tract as INT) as trct
    ,line_desc
    ,estimate
  FROM [census].[acs].[vw_summary_file]
  WHERE yr = 2019 AND release_type = '5Y'
    AND county = '073'
    AND summary_level = 140
      AND subject_table = 'B08201'
      AND line_number IN (3, 4, 5, 6)
ORDER BY tract, line_number"""

ACS_veh_data = pd.read_sql_query(query,conn)

In [129]:
new_df = ACS_veh_data[ACS_veh_data['line_desc'].isin(['1 vehicle available',
 '2 vehicles available',
 '3 vehicles available',
 '4 or more vehicles available'])][['line_desc', 'estimate']]
new_df

Unnamed: 0,line_desc,estimate
0,1 vehicle available,269.0
1,2 vehicles available,580.0
2,3 vehicles available,274.0
3,4 or more vehicles available,76.0
4,1 vehicle available,517.0
...,...,...
2507,4 or more vehicles available,240.0
2508,1 vehicle available,0.0
2509,2 vehicles available,0.0
2510,3 vehicles available,0.0


In [130]:
conditions  = [ new_df['line_desc'] == '1 vehicle available',
new_df['line_desc'] == '2 vehicles available',
new_df['line_desc'] == '3 vehicles available',
new_df['line_desc'] == '4 or more vehicles available']


choices = [new_df['estimate']*1, 
new_df['estimate']*2, 
new_df['estimate']*3, 
new_df['estimate']*4]


new_df['Total Cars'] = np.select(conditions, choices, default=np.nan)

new_df.head(50)

Unnamed: 0,line_desc,estimate,Total Cars
0,1 vehicle available,269.0,269.0
1,2 vehicles available,580.0,1160.0
2,3 vehicles available,274.0,822.0
3,4 or more vehicles available,76.0,304.0
4,1 vehicle available,517.0,517.0
5,2 vehicles available,323.0,646.0
6,3 vehicles available,76.0,228.0
7,4 or more vehicles available,37.0,148.0
8,1 vehicle available,1069.0,1069.0
9,2 vehicles available,701.0,1402.0


In [131]:
ACS_veh_data_sum = new_df['Total Cars'].sum()
ACS_veh_data_sum

2156471.0

In [132]:
# ACS minus DMV car data
dmv_sum_cars

2431762

In [133]:
abs(ACS_veh_data_sum-dmv_sum_cars)

275291.0

## Vehicle Subcheck 1

In [134]:
vs1 = synthetic_households[synthetic_households['VEH'] > 2]
vs1 = vs1[vs1['HHADJINC'] < 30000].reset_index(drop=True)
vs1

Unnamed: 0,household_id,tract,mgra,NP,HHADJINC,ADJINC,HHT,WIF,HUPAC,VEH,numWorkers,GQ_type,MGRA,CPA,Jurisdiction,LUZ,Region
0,199,18100,2941,3,8600.0,1010145,1.0,0.0,2.0,3.0,0.0,,2941,,Oceanside,161,San Diego
1,441,18100,5275,5,5250.0,1010145,1.0,2.0,2.0,3.0,2.0,,5275,,Oceanside,161,San Diego
2,1571,18100,12539,1,16800.0,1010145,4.0,,4.0,4.0,0.0,,12539,,Oceanside,161,San Diego
3,2193,18100,23848,1,14000.0,1010145,6.0,,4.0,4.0,0.0,,23848,,Oceanside,161,San Diego
4,2552,18200,57,1,14000.0,1010145,6.0,,4.0,4.0,0.0,,57,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21527,1151707,10502,3789,3,0.0,1010145,1.0,0.0,4.0,4.0,0.0,,3789,,Imperial Beach,93,San Diego
21528,1151708,10502,3789,3,0.0,1010145,1.0,0.0,4.0,4.0,0.0,,3789,,Imperial Beach,93,San Diego
21529,1151922,10502,5079,2,11400.0,1010145,1.0,1.0,4.0,3.0,1.0,,5079,,Imperial Beach,237,San Diego
21530,1151924,10502,5079,2,21000.0,1010145,1.0,2.0,4.0,3.0,2.0,,5079,,Imperial Beach,237,San Diego


In [135]:
# vs1.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\more_than_2_cars_less_than_30k_income.csv')

## Vehicle Subcheck 2

In [136]:
vs2 = synthetic_households[synthetic_households['VEH'] == 0]
vs2 = vs2[vs2['HHADJINC'] > 100000].reset_index(drop=True)
vs2

Unnamed: 0,household_id,tract,mgra,NP,HHADJINC,ADJINC,HHT,WIF,HUPAC,VEH,numWorkers,GQ_type,MGRA,CPA,Jurisdiction,LUZ,Region
0,3975,18200,6183,4,247600.0,1010145,1.0,3.0,4.0,0.0,4.0,,6183,,Oceanside,161,San Diego
1,4332,18200,8790,4,247600.0,1010145,1.0,3.0,4.0,0.0,4.0,,8790,,Oceanside,161,San Diego
2,4984,18200,12520,4,143400.0,1010145,1.0,3.0,4.0,0.0,3.0,,12520,,Oceanside,161,San Diego
3,5278,18300,5424,2,125000.0,1010145,1.0,1.0,4.0,0.0,1.0,,5424,,Oceanside,161,San Diego
4,5317,18300,5751,2,125000.0,1010145,1.0,1.0,4.0,0.0,1.0,,5751,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3936,1134482,10104,9242,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,9242,Otay Mesa-Nestor,San Diego,90,San Diego
3937,1146592,10300,4973,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,4973,,Imperial Beach,92,San Diego
3938,1147072,10300,6655,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,6655,,Imperial Beach,91,San Diego
3939,1147199,10300,7099,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,7099,,Imperial Beach,237,San Diego


In [137]:
# vs2.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\no_cars_more_than_100k_income.xlsx', index=False)

## Vehicle Subcheck 3

In [138]:
vs3 = synthetic_households[synthetic_households['VEH'] == 0]
vs3['HHADJINC'].min()
vs3['HHADJINC'].max()
#vs3

1142000.0

## Vehicle Subcheck 4

In [139]:
vs4 = synthetic_households[synthetic_households['VEH'] == 0]
vs4 = vs4[vs4['HHADJINC'] > 100000].reset_index(drop=True)
vs4 = vs4[vs4['NP'] > 2]
vs4

Unnamed: 0,household_id,tract,mgra,NP,HHADJINC,ADJINC,HHT,WIF,HUPAC,VEH,numWorkers,GQ_type,MGRA,CPA,Jurisdiction,LUZ,Region
0,3975,18200,6183,4,247600.0,1010145,1.0,3.0,4.0,0.0,4.0,,6183,,Oceanside,161,San Diego
1,4332,18200,8790,4,247600.0,1010145,1.0,3.0,4.0,0.0,4.0,,8790,,Oceanside,161,San Diego
2,4984,18200,12520,4,143400.0,1010145,1.0,3.0,4.0,0.0,3.0,,12520,,Oceanside,161,San Diego
18,11918,18507,16243,4,247600.0,1010145,1.0,3.0,4.0,0.0,4.0,,16243,,Oceanside,163,San Diego
19,12057,18507,16244,4,247600.0,1010145,1.0,3.0,4.0,0.0,4.0,,16244,,Oceanside,163,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3936,1134482,10104,9242,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,9242,Otay Mesa-Nestor,San Diego,90,San Diego
3937,1146592,10300,4973,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,4973,,Imperial Beach,92,San Diego
3938,1147072,10300,6655,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,6655,,Imperial Beach,91,San Diego
3939,1147199,10300,7099,7,172800.0,1010145,1.0,3.0,2.0,0.0,4.0,,7099,,Imperial Beach,237,San Diego


In [140]:
# vs4.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\no_cars_more_than_100k_income_more_than_2_people_in_hs.xlsx', index=False)

## Vehicle Subcheck 5

In [141]:
vs5 = synthetic_households[synthetic_households['VEH'] >= 6]

vs5['HHADJINC'].min()
vs5['HHADJINC'].max()

1099000.0

## Vehicle Subcheck 6

In [142]:
vs6 = synthetic_households[synthetic_households['VEH'] ==0]
vs6

Unnamed: 0,household_id,tract,mgra,NP,HHADJINC,ADJINC,HHT,WIF,HUPAC,VEH,numWorkers,GQ_type,MGRA,CPA,Jurisdiction,LUZ,Region
9,10,18100,245,2,92120.0,1010145,1.0,0.0,4.0,0.0,0.0,,245,,Oceanside,161,San Diego
68,69,18100,1423,2,25570.0,1010145,1.0,1.0,4.0,0.0,1.0,,1423,,Oceanside,161,San Diego
91,92,18100,1553,1,23300.0,1010145,6.0,,4.0,0.0,0.0,,1553,,Oceanside,161,San Diego
228,229,18100,3085,1,38920.0,1010145,6.0,,4.0,0.0,0.0,,3085,,Oceanside,161,San Diego
234,235,18100,3085,1,13700.0,1010145,6.0,,4.0,0.0,0.0,,3085,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152998,1152999,10502,9237,1,10800.0,1010145,6.0,,4.0,0.0,0.0,,9237,,Imperial Beach,237,San Diego
1153005,1153006,10502,9237,2,41000.0,1010145,5.0,,4.0,0.0,1.0,,9237,,Imperial Beach,237,San Diego
1153006,1153007,10502,9237,5,70000.0,1010145,1.0,1.0,2.0,0.0,1.0,,9237,,Imperial Beach,237,San Diego
1153020,1153021,10502,9237,2,40300.0,1010145,3.0,1.0,4.0,0.0,1.0,,9237,,Imperial Beach,237,San Diego


In [143]:
# vs6.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\no_vehicles_in_household.xlsx', index=False)

# Check 11

## Check 11.5
ESR b or 3 or 6 have COW of b or 9 (b= NAN)


In [144]:
import math

In [145]:
synthetic_persons[synthetic_persons['ESR'].isna()]

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,MGRA,CPA,Jurisdiction,LUZ,Region
5,18100,245,2,3,4,2,,,,1.0,6,2,,0,245,,Oceanside,161,San Diego
11,18100,245,4,3,5,2,,,,2.0,1,3,,0,245,,Oceanside,161,San Diego
16,18100,245,7,3,1,1,,,,,3,2,,0,245,,Oceanside,161,San Diego
19,18100,245,8,3,10,2,,,,7.0,1,1,,0,245,,Oceanside,161,San Diego
20,18100,245,8,4,9,1,,,,6.0,1,1,,0,245,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3361153,17601,24280,1262710,1,13,1,,,,10.0,1,1,,0,24280,,Carlsbad,146,San Diego
3361172,17601,24280,1262729,1,13,1,,,,10.0,1,1,,0,24280,,Carlsbad,146,San Diego
3361207,17601,24280,1262764,1,13,1,,,,10.0,1,1,,0,24280,,Carlsbad,146,San Diego
3361211,17601,24280,1262768,1,13,1,,,,10.0,1,1,,0,24280,,Carlsbad,146,San Diego


In [146]:
c11_5 = synthetic_persons[(synthetic_persons['ESR'].isna())|(synthetic_persons['ESR'] == 3)|(synthetic_persons['ESR'] == 6)]
c11_5['COW'] = [np.nan if math.isnan(x) else x for x in c11_5['COW']]
c11_5 = c11_5[~((c11_5['COW'].isna())|(c11_5['COW']==9))]
c11_5

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,MGRA,CPA,Jurisdiction,LUZ,Region
1,18100,245,1,2,39,1,3.0,1.0,40.0,,1,2,4.0,0,245,,Oceanside,161,San Diego
14,18100,245,7,1,35,1,3.0,1.0,30.0,15.0,3,2,4.0,0,245,,Oceanside,161,San Diego
26,18100,245,11,2,35,2,6.0,1.0,40.0,,1,3,4.0,0,245,,Oceanside,161,San Diego
47,18100,836,21,1,56,1,3.0,1.0,,,1,1,4.0,0,836,,Oceanside,161,San Diego
62,18100,836,26,1,67,2,6.0,6.0,3.0,,1,1,4.0,0,836,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3361263,17601,24280,1262820,1,29,1,6.0,1.0,,,1,2,4.0,0,24280,,Carlsbad,146,San Diego
3361267,17601,24280,1262824,1,27,1,6.0,1.0,40.0,,1,2,4.0,0,24280,,Carlsbad,146,San Diego
3361269,17601,24280,1262826,1,33,1,6.0,1.0,40.0,,1,1,2.0,0,24280,,Carlsbad,146,San Diego
3361272,17601,24280,1262829,1,33,1,6.0,1.0,40.0,,1,1,2.0,0,24280,,Carlsbad,146,San Diego


In [147]:
synthetic_persons.head()

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,MGRA,CPA,Jurisdiction,LUZ,Region
0,18100,245,1,1,52,2,1.0,1.0,40.0,,1,2,4.0,1,245,,Oceanside,161,San Diego
1,18100,245,1,2,39,1,3.0,1.0,40.0,,1,2,4.0,0,245,,Oceanside,161,San Diego
2,18100,245,1,3,79,2,6.0,,,,1,2,4.0,0,245,,Oceanside,161,San Diego
3,18100,245,2,1,25,2,4.0,5.0,50.0,15.0,6,2,1.0,1,245,,Oceanside,161,San Diego
4,18100,245,2,2,24,1,1.0,6.0,20.0,15.0,1,1,2.0,1,245,,Oceanside,161,San Diego


In [148]:
print(synthetic_persons['COW'][2])
print(synthetic_persons['COW'][2] == np.nan)
print(pd.isna(synthetic_persons['COW'][2]))
print(math.isnan(synthetic_persons['COW'][2]))

nan
False
True
True


In [149]:
pd.isna(synthetic_persons['COW'][2])

True

In [150]:
synthetic_persons[synthetic_persons['COW'].isna()]['COW'][2]

nan

In [151]:
synthetic_persons.isna().sum()

tract                 0
mgra                  0
household_id          0
SPORDER               0
AGEP                  0
SEX                   0
ESR              699148
COW             1432086
WKHP            1669209
SCHG            2443823
RAC1P                 0
HISP                  0
MIL              744613
isWorker              0
MGRA                  0
CPA             1920989
Jurisdiction          0
LUZ                   0
Region                0
dtype: int64

In [152]:
c11_5.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\check_11.5.xlsx', index=False)

In [153]:
synthetic_persons.shape

(3361280, 19)

In [154]:
synthetic_persons.isna().sum()

tract                 0
mgra                  0
household_id          0
SPORDER               0
AGEP                  0
SEX                   0
ESR              699148
COW             1432086
WKHP            1669209
SCHG            2443823
RAC1P                 0
HISP                  0
MIL              744613
isWorker              0
MGRA                  0
CPA             1920989
Jurisdiction          0
LUZ                   0
Region                0
dtype: int64

In [155]:
set(c11_5['COW'])

{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}

In [156]:
list(c11_5['COW'])[2]

1.0

In [157]:
import math

In [158]:
math.isnan(1)

False

In [159]:
list(c11_5['COW'])[2] == list(c11_5['COW'])[2]

True

In [160]:
math.isnan(list(c11_5['COW'])[2])

False

In [161]:
c11_5['COW'] = [np.nan if math.isnan(x) else x for x in c11_5['COW']]

## Chech 11.13
MIL = 1 or 3 have age > 60

In [162]:
c11_13 = synthetic_persons[(synthetic_persons['MIL'] == 1)|(synthetic_persons['MIL'] == 3)]
print(set(c11_13['MIL']))

c11_13 = c11_13[c11_13['AGEP'] > 60]

c11_13 = c11_13.sort_values('AGEP', ascending=False)

c11_13

{1.0, 3.0}


Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,MGRA,CPA,Jurisdiction,LUZ,Region
2520953,3001,11685,928767,2,94,1,6.0,,,,2,1,3.0,0,11685,Southeastern:Encanto Neighborhoods,San Diego,12,San Diego
2558913,3112,3336,940452,2,94,1,6.0,,,,2,1,3.0,0,3336,Southeastern:Encanto Neighborhoods,San Diego,14,San Diego
2555855,3111,10983,939531,2,94,1,6.0,,,,2,1,3.0,0,10983,Southeastern:Encanto Neighborhoods,San Diego,14,San Diego
2556235,3111,10984,939643,2,94,1,6.0,,,,2,1,3.0,0,10984,Southeastern:Encanto Neighborhoods,San Diego,14,San Diego
2556834,3111,10986,939835,2,94,1,6.0,,,,2,1,3.0,0,10986,Southeastern:Encanto Neighborhoods,San Diego,14,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1829529,9504,8927,653982,1,61,1,6.0,,,,9,1,3.0,0,8927,Rancho Encantada,San Diego,70,San Diego
1844043,9507,399,659676,1,61,1,6.0,,,,9,1,3.0,0,399,Navajo,San Diego,69,San Diego
1853999,9510,6963,663558,1,61,1,6.0,,,,9,1,3.0,0,6963,Tierrasanta,San Diego,63,San Diego
1844803,9507,4248,659992,1,61,1,6.0,,,,9,1,3.0,0,4248,Tierrasanta,San Diego,69,San Diego


In [163]:
# c11_13.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\over_60_in_military.xlsx', index=False)

# Check 13
Are there any households with household income 0 where persons are employed in for profit organization?

In [164]:
P1: Subset households with HI == 0
P2: Use that list to subset persons (on household_ID) 

SyntaxError: invalid syntax (2562444213.py, line 1)

In [None]:
no_income_households = synthetic_households[synthetic_households['HHADJINC'] == 0]
no_income_household_id = list(no_income_households['household_id'])
#no_income_household_id

In [None]:
persons_in_a_no_income_household = synthetic_persons[synthetic_persons['household_id'].isin(no_income_household_id)].reset_index(drop=True)
persons_in_a_no_income_household

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,MGRA,CPA,Jurisdiction,LUZ,Region
0,18100.0,4904,345,1,51,1,6.0,,,,6,1,4.0,0,4904,,Oceanside,161,San Diego
1,18100.0,4904,345,2,52,2,6.0,,,,6,1,4.0,0,4904,,Oceanside,161,San Diego
2,18100.0,4904,345,3,15,2,,,,10.0,6,1,,0,4904,,Oceanside,161,San Diego
3,18100.0,4904,345,4,12,1,,,,8.0,6,1,,0,4904,,Oceanside,161,San Diego
4,18100.0,4904,345,5,70,2,6.0,,,,6,1,4.0,0,4904,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59950,,24280,1262820,1,29,1,6.0,1.0,,,1,2,4.0,0,24280,,Carlsbad,146,San Diego
59951,,24280,1262828,1,17,2,6.0,,,10.0,1,2,4.0,0,24280,,Carlsbad,146,San Diego
59952,,24280,1262830,1,36,1,6.0,,,,2,1,4.0,0,24280,,Carlsbad,146,San Diego
59953,,24280,1262831,1,47,1,6.0,,,,2,1,4.0,0,24280,,Carlsbad,146,San Diego


In [None]:
# persons_in_a_no_income_household.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\persons_in_a_no_income_household.xlsx', index=False)

# EDA Requests

In [None]:
self_employed_persons = synthetic_persons[synthetic_persons['COW'].isin([6,7,8])].reset_index(drop=True)

In [None]:
# self_employed_persons.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2022\2022-72 SANDPOPSIM Output QC\QA Data\self_employed_persons.xlsx', index=False)

In [None]:
self_employed_persons

Unnamed: 0,tract,mgra,household_id,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,RAC1P,HISP,MIL,isWorker,MGRA,CPA,Jurisdiction,LUZ,Region
0,18100.0,245,2,2,24,1,1.0,6.0,20.0,15.0,1,1,2.0,1,245,,Oceanside,161,San Diego
1,18100.0,245,4,1,41,2,1.0,6.0,40.0,,1,3,4.0,1,245,,Oceanside,161,San Diego
2,18100.0,836,16,1,39,1,1.0,6.0,55.0,,1,1,4.0,1,836,,Oceanside,161,San Diego
3,18100.0,836,24,1,62,2,1.0,6.0,60.0,,1,1,4.0,1,836,,Oceanside,161,San Diego
4,18100.0,836,26,1,67,2,6.0,6.0,3.0,,1,1,4.0,0,836,,Oceanside,161,San Diego
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
252650,,24280,1262755,1,48,1,6.0,6.0,40.0,,1,1,4.0,0,24280,,Carlsbad,146,San Diego
252651,,24280,1262765,1,46,1,6.0,6.0,,,1,1,4.0,0,24280,,Carlsbad,146,San Diego
252652,,24280,1262778,1,38,1,6.0,6.0,20.0,,1,2,4.0,0,24280,,Carlsbad,146,San Diego
252653,,24280,1262816,1,64,1,6.0,6.0,40.0,,1,1,4.0,0,24280,,Carlsbad,146,San Diego
