In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import pyodbc
import sqlalchemy as sql
version = 10
path = r"C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2023\Series 15 Subregional Forecast\2023-089 PopSim- Series 15 Subregional Baseline Forecast\Data\version{}".format(str(version))

year = 2029

# Download the data

In [2]:
mgra_based_data = pd.read_csv(path + "\mgra15_based_input_{}.csv".format(year))
print("MGRA based data is downloaded")
synthetic_households = pd.read_csv(path + "\synthetic_households_{}.csv".format(year))
print("Synthetic Households is downloaded")
synthetic_persons = pd.read_csv(path + "\synthetic_persons_{}.csv".format(year))
print("Synthetic Persons is downloaded")

MGRA based data is downloaded
Synthetic Households is downloaded
Synthetic Persons is downloaded


# Check 1 - Null Value and Negative Value check – synthetic persons, and synthetic households file  

In [3]:
mgra_based_data.isna().sum()[mgra_based_data.isna().sum() > 0]

Series([], dtype: int64)

In [4]:
synthetic_households.isna().sum()

household_id         0
mgra                 0
SERIALNO             0
NP                   0
HHADJINC        114281
HHT                  0
HUPAC                0
VEH             114281
BLD                  0
gq_type              0
workers              0
dtype: int64

# Check 2, 3, 4, 5 - Consistency in Household Population between MGRA-based input, synthetic persons, and synthetic households file

In [5]:
synthetic_persons_with_GQ = synthetic_persons.merge(synthetic_households[['household_id', 'gq_type']], how='left', on='household_id')
synthetic_persons_with_GQ

Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,HISP,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,gq_type
0,57,1,2017000779630,1.0,62.0,1,6.0,6.0,0.0,0,...,2,2.0,24.0,2100.0,0.0,5411,54,2310XX,23.0,0
1,57,2,2018HU0591666,1.0,64.0,1,6.0,1.0,0.0,0,...,1,4.0,18.0,7340.0,0.0,325M,32,499071,49.0,0
2,57,2,2018HU0591666,2.0,56.0,1,6.0,4.0,60.0,0,...,1,4.0,23.0,2310.0,3.0,6111,61,252020,25.0,0
3,57,2,2018HU0591666,3.0,30.0,1,1.0,1.0,40.0,0,...,1,4.0,18.0,9030.0,1.0,481,48,532010,53.0,0
4,57,3,2018HU0876663,1.0,57.0,1,6.0,1.0,0.0,0,...,1,4.0,21.0,9415.0,0.0,482,48,536061,53.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3327552,24280,1341552,2017001457253,1.0,55.0,1,6.0,1.0,0.0,0,...,2,4.0,17.0,6305.0,0.0,23,23,472070,47.0,3
3327553,24280,1341553,2019GQ0080520,1.0,59.0,1,6.0,0.0,0.0,0,...,1,4.0,19.0,0.0,0.0,,,,,3
3327554,24280,1341554,2017001436144,1.0,86.0,1,6.0,0.0,0.0,0,...,1,2.0,18.0,0.0,0.0,,,,,3
3327555,24280,1341555,2018GQ0016377,1.0,48.0,1,6.0,1.0,40.0,0,...,1,4.0,19.0,8225.0,3.0,3328,33,514XXX,51.0,3


In [6]:
# Synthetic Households Work 
sh_number_of_non_gq_households = len(synthetic_households[synthetic_households['gq_type'] == 0])
sh_number_of_gq_households = len(synthetic_households[synthetic_households['gq_type'] != 0])
sh_population_non_gq = sum(synthetic_households[synthetic_households['gq_type'] == 0]['NP'])
sh_population_gq = sum(synthetic_households[synthetic_households['gq_type'] != 0]['NP'])

In [7]:
# Synthetic Persons Work 
sp_number_of_non_gq_households = synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] == 0]['household_id'].nunique()
sp_number_of_gq_households = synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] != 0]['household_id'].nunique()
sp_population_non_gq = len(synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] == 0])
sp_population_gq = len(synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] != 0])
sp_population_gq

114281

In [8]:
# MGRA based information 
mb_number_of_non_gq_households = sum(mgra_based_data['hh'])
mb_population_non_gq = sum(mgra_based_data['hhp'])
mb_population_gq = sum(mgra_based_data['gq_civ'] + mgra_based_data['gq_mil'])

## Consistency in Household Population
This is non-gq

In [9]:
print(f"Synthetic Households non-gq population {sh_population_non_gq}")
print(f"Synthetic Persons non-gq population {sp_population_non_gq}")
print(f"MGRA Based non-gq population {mb_population_non_gq}")

Synthetic Households non-gq population 3213276.0
Synthetic Persons non-gq population 3213276
MGRA Based non-gq population 3220394


In [10]:
# # Synthetic Households non-gq dataframe
# sh_non_qg_pop_df = synthetic_households[synthetic_households['GQ_type'] == 0][['mgra', 'NP']]
# sh_non_qg_pop_df = sh_non_qg_pop_df.groupby('mgra').sum().reset_index()
# sh_non_qg_pop_df.columns = ['mgra', 'non_gq_pop_SH']

# # MGRA Based non-gq dataframe
# mgra_based_data_non_gq_pop = mgra_based_data[['mgra','hhp']]
# mgra_based_data_non_gq_pop.columns = ['mgra', 'non_gq_pop_MB']

In [11]:
# merged_sh_mb_non_gq_pop = sh_non_qg_pop_df.merge(mgra_based_data_non_gq_pop, how='left', on='mgra')
# merged_sh_mb_non_gq_pop['Diff'] = merged_sh_mb_non_gq_pop['non_gq_pop_SH'] - merged_sh_mb_non_gq_pop['non_gq_pop_MB']
# merged_sh_mb_non_gq_pop

## Consistency in Group Quarter Population

In [12]:
print(f"Synthetic Households gq population {sh_population_gq}")
print(f"Synthetic Persons gq population {sp_population_gq}")
print(f"MGRA Based gq population {mb_population_gq}")

Synthetic Households gq population 114281.0
Synthetic Persons gq population 114281
MGRA Based gq population 114281


## Consistency in number of non-gq households

In [13]:
print(f"Synthetic Households -  number of non-gq households: {sh_number_of_non_gq_households}")
print(f"Synthetic Persons -  number of non-gq households: {sp_number_of_non_gq_households}")
print(f"MGRA Based -  number of non-gq households: {mb_number_of_non_gq_households}")

Synthetic Households -  number of non-gq households: 1227275
Synthetic Persons -  number of non-gq households: 1227275
MGRA Based -  number of non-gq households: 1227275


## Consistency in number of gq households

In [14]:
print(f"Synthetic Households -  number of gq households: {sh_number_of_gq_households}")
print(f"Synthetic Persons -  number of gq households: {sp_number_of_gq_households}")

Synthetic Households -  number of gq households: 114281
Synthetic Persons -  number of gq households: 114281


# Check #6 - Compare total 'NP' in the synthetic households output with count of 'SPORDER' in the persons output 

In [15]:
synthetic_households['NP'].sum() == synthetic_persons['SPORDER'].count()

True

# Check #7 Compare distribution of household income categories between synthetic households file and households' distributions sent by Econ team [Region and MGRA level] 

In [16]:
bins = [0, 15000, 30000, 45000, 60000, 75000, 100000, 125000, 150000, 200000, np.inf]
labels = [
    'Less than 15,000', 
    '15,000 to 30,000', 
    '30,000 to 45,000', 
    '45,000 to 60,000', 
    '60,000 to 75,000', 
    '75,000 to 100,000',
    '100,000 to 125,000',
    '125,000 to 150,000',
    '150,000 to 200,000',
    '200,000 or more'
]

synthetic_households['Income Category'] = pd.cut(synthetic_households['HHADJINC'], bins=bins, labels=labels, include_lowest=True)

In [17]:
synthetic_households

Unnamed: 0,household_id,mgra,SERIALNO,NP,HHADJINC,HHT,HUPAC,VEH,BLD,gq_type,workers,Income Category
0,1,57,2017000779630,1.0,35292.0,4.0,4.0,1.0,2.0,0,0,"30,000 to 45,000"
1,2,57,2018HU0591666,3.0,298458.0,1.0,4.0,3.0,2.0,0,1,"200,000 or more"
2,3,57,2018HU0876663,2.0,131858.0,2.0,4.0,6.0,2.0,0,0,"125,000 to 150,000"
3,4,57,2019HU0394645,2.0,28756.0,7.0,4.0,1.0,5.0,0,0,"15,000 to 30,000"
4,5,57,2019HU0838385,3.0,138027.0,1.0,2.0,2.0,2.0,0,0,"125,000 to 150,000"
...,...,...,...,...,...,...,...,...,...,...,...,...
1341551,1341552,24280,2017001457253,1.0,,0.0,0.0,,0.0,3,0,
1341552,1341553,24280,2019GQ0080520,1.0,,0.0,0.0,,0.0,3,0,
1341553,1341554,24280,2017001436144,1.0,,0.0,0.0,,0.0,3,0,
1341554,1341555,24280,2018GQ0016377,1.0,,0.0,0.0,,0.0,3,0,


In [18]:
synthetic_households_only = synthetic_households[synthetic_households.gq_type == 0]
income_frequency_distribution = synthetic_households_only.groupby('Income Category').size().div(len(synthetic_households_only)).reset_index(name='SH_percentage')
income_frequency_distribution['SH_percentage'] = round(income_frequency_distribution['SH_percentage']*100, 2)
income_frequency_distribution

Unnamed: 0,Income Category,SH_percentage
0,"Less than 15,000",5.45
1,"15,000 to 30,000",8.17
2,"30,000 to 45,000",8.63
3,"45,000 to 60,000",8.75
4,"60,000 to 75,000",8.36
5,"75,000 to 100,000",11.82
6,"100,000 to 125,000",10.22
7,"125,000 to 150,000",8.15
8,"150,000 to 200,000",11.93
9,"200,000 or more",18.52


In [19]:
if year == 2022:
    income_frequency_distribution['Econ_percentage'] = [5.4, 8.2, 8.7, 8.6, 8.1, 12.1, 10.1, 8.3, 11.9, 18.6]
elif year == 2026:
    income_frequency_distribution['Econ_percentage'] = [5.21, 8.01, 8.66, 8.56, 8.11, 12.10, 10.17, 8.34, 12.08, 18.75]
elif year == 2029:
    income_frequency_distribution['Econ_percentage'] = [5.42, 8.16, 8.74, 8.60, 8.11, 12.06, 10.11, 8.27, 11.96, 18.56]
elif year == 2032:
    income_frequency_distribution['Econ_percentage'] = [5.61, 8.29, 8.81, 8.62, 8.11, 12.02, 10.05, 8.21, 11.86, 18.41]
elif year == 2035:
    income_frequency_distribution['Econ_percentage'] = [5.74, 8.39, 8.87, 8.65, 8.11, 12.00, 10.02, 8.17, 11.78, 18.28]
elif year == 2040:
    income_frequency_distribution['Econ_percentage'] = [5.98, 8.54, 8.94, 8.67, 8.10, 11.95, 9.94, 8.10, 11.66, 18.13]
elif year == 2050:
    income_frequency_distribution['Econ_percentage'] = [6.20, 8.71, 9.05, 8.73, 8.13, 11.93, 9.89, 8.03, 11.53, 17.80]
elif year == 2060:
    income_frequency_distribution['Econ_percentage'] = [6.22, 8.72, 9.05, 8.73, 8.13, 11.93, 9.89, 8.03, 11.52, 17.79]
income_frequency_distribution

Unnamed: 0,Income Category,SH_percentage,Econ_percentage
0,"Less than 15,000",5.45,5.42
1,"15,000 to 30,000",8.17,8.16
2,"30,000 to 45,000",8.63,8.74
3,"45,000 to 60,000",8.75,8.6
4,"60,000 to 75,000",8.36,8.11
5,"75,000 to 100,000",11.82,12.06
6,"100,000 to 125,000",10.22,10.11
7,"125,000 to 150,000",8.15,8.27
8,"150,000 to 200,000",11.93,11.96
9,"200,000 or more",18.52,18.56


In [20]:
income_frequency_distribution['Diff'] = income_frequency_distribution['SH_percentage'] - income_frequency_distribution['Econ_percentage']
income_frequency_distribution

Unnamed: 0,Income Category,SH_percentage,Econ_percentage,Diff
0,"Less than 15,000",5.45,5.42,0.03
1,"15,000 to 30,000",8.17,8.16,0.01
2,"30,000 to 45,000",8.63,8.74,-0.11
3,"45,000 to 60,000",8.75,8.6,0.15
4,"60,000 to 75,000",8.36,8.11,0.25
5,"75,000 to 100,000",11.82,12.06,-0.24
6,"100,000 to 125,000",10.22,10.11,0.11
7,"125,000 to 150,000",8.15,8.27,-0.12
8,"150,000 to 200,000",11.93,11.96,-0.03
9,"200,000 or more",18.52,18.56,-0.04


# Check #8 - Compare number of workers in non-GQ households between synthetic persons file and number of workers calculated by the Econ team [see production side jobs for workers] 

In [21]:
if year == 2022:
    econ_num = 1_598_836
elif year == 2026:
    econ_num = 1_623_691
elif year == 2029:
    econ_num = 1_642_717
elif year == 2032:
    econ_num = 1_662_848
elif year == 2035:
    econ_num = 1_682_424
elif year == 2040:
    econ_num = 1_705_327
elif year == 2050:
    econ_num = 1_710_936

In [22]:
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,3,4,5]))])
persons_emp - econ_num

57466

In [23]:
persons_emp

1700183

In [24]:
econ_num

1642717

In [25]:
# Workers - [ESR] = (1, 2, 4, 5) only controlled at the household level in categories of 0-worker households, 1-worker households, 2-worker households, and 3+ worker households
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,4,5]))])
print(persons_emp)
print(persons_emp - econ_num)

1591867
-50850


In [26]:
# Labor Force Participants - [ESR] = (1, 2, 3, 4, 5) controlled by Industry Type (job_1 to job_14)
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,3,4,5]))])
print(persons_emp)
print(persons_emp - econ_num)

1700183
57466


In [27]:
# Workers - [ESR] = (1, 2, 4, 5) only controlled at the household level in categories of 0-worker households, 1-worker households, 2-worker households, and 3+ worker households
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,3]))])
print(persons_emp)
print(persons_emp - econ_num)

1597720
-44997


In [28]:
# Have a labor force check as well 
'''The control for above is from the econ team'''

'The control for above is from the econ team'

# E&F table creation

In [29]:
emp_dict = []
for test_year in [2026, 2029, 2032, 2035, 2040, 2050]:
    test_persons = pd.read_csv(path + "\synthetic_persons_{}.csv".format(test_year))
    test_persons_emp = len(test_persons[(~test_persons['ESR'].isna()) & (test_persons['ESR'].isin([1,2,4,5]))])
    
    if test_year == 2022:
        test_econ_num = 1_598_836
    elif test_year == 2026:
        test_econ_num = 1_623_691
    elif test_year == 2029:
        test_econ_num = 1_642_717
    elif test_year == 2032:
        test_econ_num = 1_662_848
    elif test_year == 2035:
        test_econ_num = 1_682_424
    elif test_year == 2040:
        test_econ_num = 1_705_327
    elif test_year == 2050:
        test_econ_num = 1_710_936
        
    
    diff = test_persons_emp - test_econ_num
    
    
    emp_dict.append([test_year, test_persons_emp, test_econ_num, diff])

In [30]:
pd.DataFrame(emp_dict, columns=['year', 'ESR_1245', 'Econ', 'diff'])

Unnamed: 0,year,ESR_1245,Econ,diff
0,2026,1576223,1623691,-47468
1,2029,1591867,1642717,-50850
2,2032,1609660,1662848,-53188
3,2035,1629001,1682424,-53423
4,2040,1658487,1705327,-46840
5,2050,1680663,1710936,-30273


# Check #9 - Compare total jobs and jobs by sector with MGRA-based input file

In [31]:
naics2_xwalk = pd.read_excel(r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2023\2023-034 PopSim Series 15 Subregional forecast (2022) QC\Data\production_naics_xwalk.xlsx',
                            skiprows=[0]).rename(columns={'Unnamed: 3':'job'})
naics2_xwalk['NAICS2'] = naics2_xwalk['Unnamed: 4'].apply(lambda x: list(map(lambda x: x.replace('persons.NAICS2 == ','').strip("()' "), x.split('|'))))
naics2_xwalk = naics2_xwalk.explode(['NAICS2'])

syn_sectors = synthetic_persons.merge(naics2_xwalk[['job', 'NAICS2']], on='NAICS2', how='left').groupby('job').count().reset_index()[['job', 'mgra']].rename(columns={'mgra':'syn_jobs'})
syn_sectors

Unnamed: 0,job,syn_jobs
0,job_1,111207
1,job_10,194022
2,job_11,56339
3,job_12,19557
4,job_13,153177
5,job_14,93419
6,job_2,106948
7,job_3,12039
8,job_4,327076
9,job_5,169783


In [32]:
econ_sectors = pd.read_excel(r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2023\2023-078 REVISED- Regional Employment Estimates & Forecast (series 15)\Data\Post QA\Forecasted Employment, Regional Controls for 2022 through 2060_Sept 2023 Revised.xlsx',
                            sheet_name = '1b. Summary Forecast_Production',
                            skiprows=[0,1,30,31,32]).rename(columns={'Unnamed: 0':'Industry'})

# Race for popsim
def get_indxwalk(row):
        if row['Industry'] in ['State and Local Government', 'Federal Civilian']:
            return 'job_1'
        elif row['Industry'] in ['Federal Military']:
            return 'job_2'
        elif row['Industry'] in ['Forestry, fishing, and hunting', 'Farm', 'Mining', 'NAICS 11 (ag, fisheries and forestry)']:
            return 'job_3'
        elif row['Industry'] in ['Information',
                                'Professional, scientific, and technical services', 
                                 'Administrative, support, waste management, and remediation services']:
            return 'job_4'
        elif row['Industry'] in ['Finance and insurance','Real estate and rental and leasing',
                                 'Management of companies and enterprises']:
            return 'job_5'        
        elif row['Industry'] in ['Educational services; private']:
            return 'job_6'
        elif row['Industry'] in ['Health care and social assistance']:
            return 'job_7'
        elif row['Industry'] in ['Retail trade']:
            return 'job_8'
        elif row['Industry'] in ['Construction','Transportation and warehousing' ]:
            return 'job_9'
        elif row['Industry'] in ['Utilities','Manufacturing', 'Wholesale trade']:
            return 'job_10'
        elif row['Industry'] in ['Arts, entertainment, and recreation']:
            return 'job_11'
        elif row['Industry'] in ['Accommodation']:
            return 'job_12'
        elif row['Industry'] in ['Food Service']:
            return 'job_13'
        elif row['Industry'] in ['Other services (except public administration)']:
            return 'job_14'

econ_sectors['job'] = econ_sectors.apply(get_indxwalk, axis = 1)
econ_sectors = (econ_sectors[['job', year]].groupby('job').sum()*1000).reset_index().rename(columns={year:'mgra_jobs'})
econ_sectors

Unnamed: 0,job,mgra_jobs
0,job_1,97251.631745
1,job_10,171831.239088
2,job_11,45100.766095
3,job_12,17339.354718
4,job_13,131817.739962
5,job_14,77233.490434
6,job_2,104000.0
7,job_3,10388.245274
8,job_4,291478.665823
9,job_5,154076.057596


In [34]:
sector_diff = syn_sectors.merge(econ_sectors, on='job')
sector_diff['diff'] = sector_diff['syn_jobs'] - sector_diff['mgra_jobs']
sector_diff

Unnamed: 0,job,syn_jobs,mgra_jobs,diff
0,job_1,111207,97251.631745,13955.368255
1,job_10,194022,171831.239088,22190.760912
2,job_11,56339,45100.766095,11238.233905
3,job_12,19557,17339.354718,2217.645282
4,job_13,153177,131817.739962,21359.260038
5,job_14,93419,77233.490434,16185.509566
6,job_2,106948,104000.0,2948.0
7,job_3,12039,10388.245274,1650.754726
8,job_4,327076,291478.665823,35597.334177
9,job_5,169783,154076.057596,15706.942404


# -------------------------------------------------
# Check - Vehicles

In [56]:
low_inc_cars = len(synthetic_households[(synthetic_households.HHADJINC < 30000) & (synthetic_households.VEH > 2)])
high_inc_nocars = len(synthetic_households[((synthetic_households.HHADJINC > 100000) & (synthetic_households.VEH == 0))])

In [57]:
low_inc_cars

12242

In [58]:
high_inc_nocars

9320

# Check - Households with household income 0 where persons are employed in for profit organization

In [59]:
synthetic_households[synthetic_households['HHADJINC']==0]

Unnamed: 0,household_id,mgra,SERIALNO,NP,HHADJINC,HHT,HUPAC,VEH,BLD,gq_type,workers,Income Category
24,25,100,2018HU0425523,4.0,0.0,3.0,3.0,2.0,2.0,0,0,"Less than 15,000"
148,149,245,2017001001315,1.0,0.0,4.0,4.0,0.0,10.0,0,0,"Less than 15,000"
187,188,245,2021HU0337895,1.0,0.0,6.0,4.0,1.0,2.0,0,0,"Less than 15,000"
193,194,245,2021HU1213590,1.0,0.0,6.0,4.0,0.0,2.0,0,0,"Less than 15,000"
216,217,322,2018HU0425523,4.0,0.0,3.0,3.0,2.0,2.0,0,0,"Less than 15,000"
...,...,...,...,...,...,...,...,...,...,...,...,...
1227201,1227202,10222,2020HU0151891,1.0,0.0,6.0,4.0,1.0,9.0,0,0,"Less than 15,000"
1227202,1227203,10222,2020HU0151891,1.0,0.0,6.0,4.0,1.0,9.0,0,0,"Less than 15,000"
1227220,1227221,10222,2020HU0685229,3.0,0.0,3.0,3.0,0.0,6.0,0,0,"Less than 15,000"
1227229,1227230,10222,2020HU1045955,1.0,0.0,4.0,4.0,1.0,2.0,0,0,"Less than 15,000"


In [60]:
hhinc_working = synthetic_persons.merge(synthetic_households[['household_id', 'HHADJINC']], how='left', on='household_id')
hhinc_working[(hhinc_working.ESR.isin([1, 2, 4, 5]))& (hhinc_working.HHADJINC == 0)]

Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,HISP,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,HHADJINC
200576,16762,74467,2020HU0933627,1.0,60.0,2,1.0,8.0,4.0,0,...,1,4.0,17.0,6260.0,5.0,23,23,472061,47.0,0.0
226839,17248,83873,2020HU0933627,1.0,60.0,2,1.0,8.0,4.0,0,...,1,4.0,17.0,6260.0,5.0,23,23,472061,47.0,0.0
226899,17250,83896,2020HU0933627,1.0,60.0,2,1.0,8.0,4.0,0,...,1,4.0,17.0,6260.0,5.0,23,23,472061,47.0,0.0
228864,17364,84638,2020HU0933627,1.0,60.0,2,1.0,8.0,4.0,0,...,1,4.0,17.0,6260.0,5.0,23,23,472061,47.0,0.0
232791,17433,86075,2017001208011,1.0,47.0,1,1.0,6.0,40.0,0,...,1,2.0,19.0,310.0,1.0,722Z,722,119051,11.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2301881,11147,925112,2019HU0777110,1.0,36.0,1,1.0,6.0,30.0,0,...,1,4.0,22.0,2640.0,1.0,5414,54,27102X,27.0,0.0
2302238,11151,925355,2019HU0777110,1.0,36.0,1,1.0,6.0,30.0,0,...,1,4.0,22.0,2640.0,1.0,5414,54,27102X,27.0,0.0
2304655,14403,926703,2017000112925,1.0,26.0,1,1.0,6.0,50.0,0,...,1,4.0,21.0,2640.0,1.0,5414,54,27102X,27.0,0.0
2325656,1495,935154,2018HU0335286,1.0,56.0,2,1.0,6.0,54.0,0,...,1,4.0,19.0,4700.0,1.0,4533,45,411011,41.0,0.0


In [61]:
len(hhinc_working[(hhinc_working.ESR.isin([1, 2, 4, 5]))& (hhinc_working.HHADJINC == 0)]['household_id'].unique())

285

# Check - Compare student enrollment (k-8 and 9-12) with MGRA base file. 

In [62]:
print(synthetic_persons['SCHG'].isin([2,3,4,5,6,7,8,9,10]).sum())
synthetic_persons[synthetic_persons['SCHG'].isin([2,3,4,5,6,7,8,9,10])].groupby('mgra').count().reset_index()[['mgra', 'SCHG']]

337764


Unnamed: 0,mgra,SCHG
0,1,18
1,3,78
2,5,16
3,6,11
4,7,16
...,...,...
15749,24313,6
15750,24314,1
15751,24316,48
15752,24318,21


In [63]:
print(synthetic_persons['SCHG'].isin([11,12,13,14]).sum())
synthetic_persons[synthetic_persons['SCHG'].isin([11,12,13,14])].groupby('mgra').count().reset_index()[['mgra', 'SCHG']]

165060


Unnamed: 0,mgra,SCHG
0,1,14
1,3,43
2,5,7
3,6,7
4,7,1
...,...,...
14715,24312,2
14716,24314,4
14717,24316,28
14718,24318,17


In [64]:
print(mgra_based_data[['mgra', 'enrollgradekto8', 'enrollgrade9to12']].sum())
mgra_based_data[['mgra', 'enrollgradekto8', 'enrollgrade9to12']]

mgra                295767681
enrollgradekto8        348342
enrollgrade9to12       161189
dtype: int64


Unnamed: 0,mgra,enrollgradekto8,enrollgrade9to12
0,21274,0,0
1,21323,0,0
2,21374,0,0
3,21423,0,0
4,21474,0,0
...,...,...,...
24316,459,0,0
24317,8662,0,0
24318,2760,0,0
24319,11409,0,0


In [65]:
mgra_based_data.groupby('pseudomsa').sum()[['enrollgradekto8', 'enrollgrade9to12']]

Unnamed: 0_level_0,enrollgradekto8,enrollgrade9to12
pseudomsa,Unnamed: 1_level_1,Unnamed: 2_level_1
1,715,3446
2,53966,19772
3,86715,39161
4,52908,26019
5,55290,26458
6,43229,19123
7,52136,23660
8,3383,3550


In [66]:
synthetic_persons[['SCHG', 'SCHL']]

Unnamed: 0,SCHG,SCHL
0,0,24.0
1,0,18.0
2,0,23.0
3,0,18.0
4,0,21.0
...,...,...
3327552,0,17.0
3327553,0,19.0
3327554,0,18.0
3327555,0,19.0


In [67]:
school_enrollment_attainment = {2: [1, 2],
3:[3],
4:[4],
5:[5],
6:[6],
7:[7],
8:[8],
9:[9],
10:[10],
11:[11],
12:[12],
13:[13],
14:[14],
15:[16,17,18,19,20],
16:[21,22,23,24]}

In [68]:
enrollment_calcs = synthetic_persons[~synthetic_persons['SCHG'].isin([0,1])]
enrollment_calcs['possible_attainments'] = enrollment_calcs['SCHG'].apply(lambda x: school_enrollment_attainment[x])
enrollment_calcs['attainment_check'] = enrollment_calcs.apply(lambda x: x.SCHL in x.possible_attainments, axis=1)
enrollment_calcs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enrollment_calcs['possible_attainments'] = enrollment_calcs['SCHG'].apply(lambda x: school_enrollment_attainment[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enrollment_calcs['attainment_check'] = enrollment_calcs.apply(lambda x: x.SCHL in x.possible_attainments, axis=1)


Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,possible_attainments,attainment_check
6,57,4,2019HU0394645,1.0,25.0,2,6.0,2.0,0.0,15,...,4.0,19.0,4710.0,0.0,8129,81,411012,41.0,"[16, 17, 18, 19, 20]",True
7,57,4,2019HU0394645,2.0,27.0,1,6.0,1.0,0.0,15,...,4.0,18.0,4020.0,0.0,722Z,722,352010,35.0,"[16, 17, 18, 19, 20]",True
10,57,5,2019HU0838385,3.0,9.0,2,0.0,0.0,0.0,5,...,0.0,5.0,0.0,0.0,,,,,[5],True
24,57,12,2021HU0537552,1.0,30.0,1,6.0,0.0,0.0,15,...,2.0,20.0,0.0,0.0,,,,,"[16, 17, 18, 19, 20]",True
34,100,15,2017000152247,3.0,26.0,1,6.0,0.0,0.0,15,...,4.0,18.0,0.0,0.0,,,,,"[16, 17, 18, 19, 20]",True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3327506,24280,1341506,2018GQ0125534,1.0,17.0,1,6.0,0.0,0.0,13,...,4.0,13.0,0.0,0.0,,,,,[13],True
3327518,24280,1341518,2019GQ0039460,1.0,15.0,1,0.0,0.0,0.0,12,...,0.0,12.0,0.0,0.0,,,,,[12],True
3327520,24280,1341520,2021GQ0028539,1.0,16.0,1,6.0,0.0,0.0,13,...,0.0,13.0,0.0,0.0,,,,,[13],True
3327527,24280,1341527,2018GQ0077804,1.0,83.0,1,6.0,0.0,0.0,13,...,4.0,13.0,0.0,0.0,,,,,[13],True


In [69]:
enrollment_calcs[enrollment_calcs.possible_attainments == False]

Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,possible_attainments,attainment_check
