In [1]:
import pandas as pd
import numpy as np
import matplotlib as plt
import pyodbc
import sqlalchemy as sql
version = 10
path = r"C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2023\Series 15 Subregional Forecast- Baseline\2023-089 PopSim- Series 15 Subregional Baseline Forecast\Data\version{}".format(str(version))

year = 2050

# Download the data

In [2]:
mgra_based_data = pd.read_csv(path + "\mgra15_based_input_{}.csv".format(year))
print("MGRA based data is downloaded")
synthetic_households = pd.read_csv(path + "\synthetic_households_{}.csv".format(year))
print("Synthetic Households is downloaded")
synthetic_persons = pd.read_csv(path + "\synthetic_persons_{}.csv".format(year))
print("Synthetic Persons is downloaded")

MGRA based data is downloaded
Synthetic Households is downloaded
Synthetic Persons is downloaded


# Check 1 - Null Value and Negative Value check – synthetic persons, and synthetic households file  

In [3]:
mgra_based_data.isna().sum()[mgra_based_data.isna().sum() > 0]

Series([], dtype: int64)

In [4]:
synthetic_households.isna().sum()

household_id         0
mgra                 0
SERIALNO             0
NP                   0
HHADJINC        114574
HHT                  0
HUPAC                0
VEH             114574
BLD                  0
gq_type              0
workers              0
dtype: int64

# Check 2, 3, 4, 5 - Consistency in Household Population between MGRA-based input, synthetic persons, and synthetic households file

In [5]:
synthetic_persons_with_GQ = synthetic_persons.merge(synthetic_households[['household_id', 'gq_type']], how='left', on='household_id')
synthetic_persons_with_GQ

Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,HISP,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,gq_type
0,57,1,2017000303096,1.0,64.0,1,6.0,0.0,0.0,0,...,1,4.0,19.0,0.0,0.0,,,,,0
1,57,1,2017000303096,2.0,61.0,2,1.0,1.0,30.0,0,...,1,4.0,19.0,3649.0,1.0,622M,62,319097,31.0,0
2,57,2,2017000664896,1.0,59.0,2,1.0,1.0,40.0,0,...,1,2.0,22.0,3255.0,1.0,5613,56,291141,29.0,0
3,57,2,2017000664896,2.0,59.0,1,1.0,1.0,30.0,0,...,1,4.0,21.0,4920.0,1.0,531M,53,419020,41.0,0
4,57,3,2017000779630,1.0,62.0,1,6.0,6.0,0.0,0,...,2,2.0,24.0,2100.0,0.0,5411,54,2310XX,23.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3397433,24280,1449113,2018GQ0104735,1.0,66.0,1,6.0,0.0,0.0,0,...,24,4.0,19.0,0.0,0.0,,,,,3
3397434,24280,1449114,2019GQ0129680,1.0,50.0,1,6.0,0.0,0.0,0,...,1,4.0,14.0,0.0,0.0,,,,,3
3397435,24280,1449115,2017001456866,1.0,27.0,1,6.0,0.0,0.0,0,...,2,4.0,18.0,0.0,0.0,,,,,3
3397436,24280,1449116,2020GQ0009636,1.0,32.0,1,6.0,0.0,0.0,0,...,9,4.0,1.0,0.0,0.0,,,,,3


In [6]:
# Synthetic Households Work 
sh_number_of_non_gq_households = len(synthetic_households[synthetic_households['gq_type'] == 0])
sh_number_of_gq_households = len(synthetic_households[synthetic_households['gq_type'] != 0])
sh_population_non_gq = sum(synthetic_households[synthetic_households['gq_type'] == 0]['NP'])
sh_population_gq = sum(synthetic_households[synthetic_households['gq_type'] != 0]['NP'])

In [7]:
# Synthetic Persons Work 
sp_number_of_non_gq_households = synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] == 0]['household_id'].nunique()
sp_number_of_gq_households = synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] != 0]['household_id'].nunique()
sp_population_non_gq = len(synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] == 0])
sp_population_gq = len(synthetic_persons_with_GQ[synthetic_persons_with_GQ['gq_type'] != 0])
sp_population_gq

114574

In [8]:
# MGRA based information 
mb_number_of_non_gq_households = sum(mgra_based_data['hh'])
mb_population_non_gq = sum(mgra_based_data['hhp'])
mb_population_gq = sum(mgra_based_data['gq_civ'] + mgra_based_data['gq_mil'])

## Consistency in Household Population
This is non-gq

In [9]:
print(f"Synthetic Households non-gq population {sh_population_non_gq}")
print(f"Synthetic Persons non-gq population {sp_population_non_gq}")
print(f"MGRA Based non-gq population {mb_population_non_gq}")

Synthetic Households non-gq population 3282864.0
Synthetic Persons non-gq population 3282864
MGRA Based non-gq population 3285676


In [10]:
# # Synthetic Households non-gq dataframe
# sh_non_qg_pop_df = synthetic_households[synthetic_households['GQ_type'] == 0][['mgra', 'NP']]
# sh_non_qg_pop_df = sh_non_qg_pop_df.groupby('mgra').sum().reset_index()
# sh_non_qg_pop_df.columns = ['mgra', 'non_gq_pop_SH']

# # MGRA Based non-gq dataframe
# mgra_based_data_non_gq_pop = mgra_based_data[['mgra','hhp']]
# mgra_based_data_non_gq_pop.columns = ['mgra', 'non_gq_pop_MB']

In [11]:
# merged_sh_mb_non_gq_pop = sh_non_qg_pop_df.merge(mgra_based_data_non_gq_pop, how='left', on='mgra')
# merged_sh_mb_non_gq_pop['Diff'] = merged_sh_mb_non_gq_pop['non_gq_pop_SH'] - merged_sh_mb_non_gq_pop['non_gq_pop_MB']
# merged_sh_mb_non_gq_pop

## Consistency in Group Quarter Population

In [12]:
print(f"Synthetic Households gq population {sh_population_gq}")
print(f"Synthetic Persons gq population {sp_population_gq}")
print(f"MGRA Based gq population {mb_population_gq}")

Synthetic Households gq population 114574.0
Synthetic Persons gq population 114574
MGRA Based gq population 114574


## Consistency in number of non-gq households

In [13]:
print(f"Synthetic Households -  number of non-gq households: {sh_number_of_non_gq_households}")
print(f"Synthetic Persons -  number of non-gq households: {sp_number_of_non_gq_households}")
print(f"MGRA Based -  number of non-gq households: {mb_number_of_non_gq_households}")

Synthetic Households -  number of non-gq households: 1334543
Synthetic Persons -  number of non-gq households: 1334543
MGRA Based -  number of non-gq households: 1334543


## Consistency in number of gq households

In [14]:
print(f"Synthetic Households -  number of gq households: {sh_number_of_gq_households}")
print(f"Synthetic Persons -  number of gq households: {sp_number_of_gq_households}")

Synthetic Households -  number of gq households: 114574
Synthetic Persons -  number of gq households: 114574


# Check #6 - Compare total 'NP' in the synthetic households output with count of 'SPORDER' in the persons output 

In [15]:
synthetic_households['NP'].sum() == synthetic_persons['SPORDER'].count()

True

# Check #7 Compare distribution of household income categories between synthetic households file and households' distributions sent by Econ team [Region and MGRA level] 

In [16]:
bins = [0, 15000, 30000, 45000, 60000, 75000, 100000, 125000, 150000, 200000, np.inf]
labels = [
    'Less than 15,000', 
    '15,000 to 30,000', 
    '30,000 to 45,000', 
    '45,000 to 60,000', 
    '60,000 to 75,000', 
    '75,000 to 100,000',
    '100,000 to 125,000',
    '125,000 to 150,000',
    '150,000 to 200,000',
    '200,000 or more'
]

synthetic_households['Income Category'] = pd.cut(synthetic_households['HHADJINC'], bins=bins, labels=labels, include_lowest=True)

In [17]:
synthetic_households

Unnamed: 0,household_id,mgra,SERIALNO,NP,HHADJINC,HHT,HUPAC,VEH,BLD,gq_type,workers,Income Category
0,1,57,2017000303096,2.0,35535.0,5.0,4.0,3.0,2.0,0,1,"30,000 to 45,000"
1,2,57,2017000664896,2.0,189847.0,7.0,4.0,3.0,2.0,0,2,"150,000 to 200,000"
2,3,57,2017000779630,1.0,35292.0,4.0,4.0,1.0,2.0,0,0,"30,000 to 45,000"
3,4,57,2017000793877,5.0,304242.0,1.0,1.0,2.0,2.0,0,1,"200,000 or more"
4,5,57,2017001189777,2.0,42594.0,1.0,4.0,2.0,7.0,0,2,"30,000 to 45,000"
...,...,...,...,...,...,...,...,...,...,...,...,...
1449112,1449113,24280,2018GQ0104735,1.0,,0.0,0.0,,0.0,3,0,
1449113,1449114,24280,2019GQ0129680,1.0,,0.0,0.0,,0.0,3,0,
1449114,1449115,24280,2017001456866,1.0,,0.0,0.0,,0.0,3,0,
1449115,1449116,24280,2020GQ0009636,1.0,,0.0,0.0,,0.0,3,0,


In [18]:
synthetic_households_only = synthetic_households[synthetic_households.gq_type == 0]
income_frequency_distribution = synthetic_households_only.groupby('Income Category').size().div(len(synthetic_households_only)).reset_index(name='SH_percentage')
income_frequency_distribution['SH_percentage'] = round(income_frequency_distribution['SH_percentage']*100, 2)
income_frequency_distribution

Unnamed: 0,Income Category,SH_percentage
0,"Less than 15,000",6.25
1,"15,000 to 30,000",8.71
2,"30,000 to 45,000",8.9
3,"45,000 to 60,000",8.92
4,"60,000 to 75,000",8.49
5,"75,000 to 100,000",11.56
6,"100,000 to 125,000",9.99
7,"125,000 to 150,000",7.92
8,"150,000 to 200,000",11.49
9,"200,000 or more",17.76


In [19]:
if year == 2022:
    income_frequency_distribution['Econ_percentage'] = [5.4, 8.2, 8.7, 8.6, 8.1, 12.1, 10.1, 8.3, 11.9, 18.6]
elif year == 2026:
    income_frequency_distribution['Econ_percentage'] = [5.21, 8.01, 8.66, 8.56, 8.11, 12.10, 10.17, 8.34, 12.08, 18.75]
elif year == 2029:
    income_frequency_distribution['Econ_percentage'] = [5.42, 8.16, 8.74, 8.60, 8.11, 12.06, 10.11, 8.27, 11.96, 18.56]
elif year == 2032:
    income_frequency_distribution['Econ_percentage'] = [5.61, 8.29, 8.81, 8.62, 8.11, 12.02, 10.05, 8.21, 11.86, 18.41]
elif year == 2035:
    income_frequency_distribution['Econ_percentage'] = [5.74, 8.39, 8.87, 8.65, 8.11, 12.00, 10.02, 8.17, 11.78, 18.28]
elif year == 2040:
    income_frequency_distribution['Econ_percentage'] = [5.98, 8.54, 8.94, 8.67, 8.10, 11.95, 9.94, 8.10, 11.66, 18.13]
elif year == 2050:
    income_frequency_distribution['Econ_percentage'] = [6.20, 8.71, 9.05, 8.73, 8.13, 11.93, 9.89, 8.03, 11.53, 17.80]
elif year == 2060:
    income_frequency_distribution['Econ_percentage'] = [6.22, 8.72, 9.05, 8.73, 8.13, 11.93, 9.89, 8.03, 11.52, 17.79]
income_frequency_distribution

Unnamed: 0,Income Category,SH_percentage,Econ_percentage
0,"Less than 15,000",6.25,6.2
1,"15,000 to 30,000",8.71,8.71
2,"30,000 to 45,000",8.9,9.05
3,"45,000 to 60,000",8.92,8.73
4,"60,000 to 75,000",8.49,8.13
5,"75,000 to 100,000",11.56,11.93
6,"100,000 to 125,000",9.99,9.89
7,"125,000 to 150,000",7.92,8.03
8,"150,000 to 200,000",11.49,11.53
9,"200,000 or more",17.76,17.8


In [20]:
income_frequency_distribution['Diff'] = income_frequency_distribution['SH_percentage'] - income_frequency_distribution['Econ_percentage']
income_frequency_distribution

Unnamed: 0,Income Category,SH_percentage,Econ_percentage,Diff
0,"Less than 15,000",6.25,6.2,0.05
1,"15,000 to 30,000",8.71,8.71,0.0
2,"30,000 to 45,000",8.9,9.05,-0.15
3,"45,000 to 60,000",8.92,8.73,0.19
4,"60,000 to 75,000",8.49,8.13,0.36
5,"75,000 to 100,000",11.56,11.93,-0.37
6,"100,000 to 125,000",9.99,9.89,0.1
7,"125,000 to 150,000",7.92,8.03,-0.11
8,"150,000 to 200,000",11.49,11.53,-0.04
9,"200,000 or more",17.76,17.8,-0.04


# Check #8 - Compare number of workers in non-GQ households between synthetic persons file and number of workers calculated by the Econ team [see production side jobs for workers] 

In [21]:
if year == 2022:
    econ_num = 1_598_836
elif year == 2026:
    econ_num = 1_623_691
elif year == 2029:
    econ_num = 1_642_717
elif year == 2032:
    econ_num = 1_662_848
elif year == 2035:
    econ_num = 1_682_424
elif year == 2040:
    econ_num = 1_705_327
elif year == 2050:
    econ_num = 1_710_936

In [22]:
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,3,4,5]))])
persons_emp - econ_num

57673

In [23]:
persons_emp

1768609

In [24]:
econ_num

1710936

In [25]:
# Workers - [ESR] = (1, 2, 4, 5) only controlled at the household level in categories of 0-worker households, 1-worker households, 2-worker households, and 3+ worker households
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,4,5]))])
print(persons_emp)
print(persons_emp - econ_num)

1680663
-30273


In [26]:
# Labor Force Participants - [ESR] = (1, 2, 3, 4, 5) controlled by Industry Type (job_1 to job_14)
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,3,4,5]))])
print(persons_emp)
print(persons_emp - econ_num)

1768609
57673


In [27]:
# Workers - [ESR] = (1, 2, 4, 5) only controlled at the household level in categories of 0-worker households, 1-worker households, 2-worker households, and 3+ worker households
persons_emp = len(synthetic_persons[(~synthetic_persons['ESR'].isna()) & (synthetic_persons['ESR'].isin([1,2,3]))])
print(persons_emp)
print(persons_emp - econ_num)

1665801
-45135


In [28]:
# Have a labor force check as well 
'''The control for above is from the econ team'''

'The control for above is from the econ team'

# E&F Labor Force values table creation

In [29]:
emp_dict = []
for test_year in [2026, 2029, 2032, 2035, 2040, 2050]:
    test_persons = pd.read_csv(path + "\synthetic_persons_{}.csv".format(test_year))
    test_households = pd.read_csv(path + "\synthetic_households_{}.csv".format(test_year))
    
    test_persons_emp = test_persons.merge(test_households, how='left', on='household_id')
    test_persons_emp = len(test_persons_emp[(test_persons_emp['gq_type'] == 0) & (~test_persons_emp['ESR'].isna()) & (test_persons_emp['ESR'].isin([1,2,3,4,5]))])
    
    if test_year == 2022:
        test_econ_num = 1_598_836
    elif test_year == 2026:
        test_econ_num = 1_623_691
    elif test_year == 2029:
        test_econ_num = 1_642_717
    elif test_year == 2032:
        test_econ_num = 1_662_848
    elif test_year == 2035:
        test_econ_num = 1_682_424
    elif test_year == 2040:
        test_econ_num = 1_705_327
    elif test_year == 2050:
        test_econ_num = 1_710_936
        
    
    diff = test_persons_emp - test_econ_num
    
    
    emp_dict.append([test_year, test_persons_emp, test_econ_num, diff])

In [30]:
pd.DataFrame(emp_dict, columns=['year', 'ESR_12345', 'Econ-Labor Force Participants', 'diff'])

Unnamed: 0,year,ESR_12345,Econ-Labor Force Participants,diff
0,2026,1629217,1623691,5526
1,2029,1648010,1642717,5293
2,2032,1668497,1662848,5649
3,2035,1688203,1682424,5779
4,2040,1710353,1705327,5026
5,2050,1716465,1710936,5529


# Check #9 - Compare total jobs and jobs by sector with MGRA-based input file

In [31]:
naics2_xwalk = pd.read_excel(r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2023\2023-034 PopSim Series 15 Subregional forecast (2022) QC\Data\production_naics_xwalk.xlsx',
                            skiprows=[0]).rename(columns={'Unnamed: 3':'job'})
naics2_xwalk['NAICS2'] = naics2_xwalk['Unnamed: 4'].apply(lambda x: list(map(lambda x: x.replace('persons.NAICS2 == ','').strip("()' "), x.split('|'))))
naics2_xwalk = naics2_xwalk.explode(['NAICS2'])

syn_sectors = synthetic_persons.merge(naics2_xwalk[['job', 'NAICS2']], on='NAICS2', how='left').groupby('job').count().reset_index()[['job', 'mgra']].rename(columns={'mgra':'syn_jobs'})
syn_sectors

Unnamed: 0,job,syn_jobs
0,job_1,119138
1,job_10,216894
2,job_11,59792
3,job_12,17166
4,job_13,152822
5,job_14,89148
6,job_2,106874
7,job_3,12834
8,job_4,349341
9,job_5,163769


In [32]:
econ_sectors = pd.read_excel(r'C:\Users\jchu\OneDrive - San Diego Association of Governments\Projects\2023\2023-078 REVISED- Regional Employment Estimates & Forecast (series 15)\Data\Post QA\Forecasted Employment, Regional Controls for 2022 through 2060_Sept 2023 Revised.xlsx',
                            sheet_name = '1b. Summary Forecast_Production',
                            skiprows=[0,1,30,31,32]).rename(columns={'Unnamed: 0':'Industry'})

# Race for popsim
def get_indxwalk(row):
        if row['Industry'] in ['State and Local Government', 'Federal Civilian']:
            return 'job_1'
        elif row['Industry'] in ['Federal Military']:
            return 'job_2'
        elif row['Industry'] in ['Forestry, fishing, and hunting', 'Farm', 'Mining', 'NAICS 11 (ag, fisheries and forestry)']:
            return 'job_3'
        elif row['Industry'] in ['Information',
                                'Professional, scientific, and technical services', 
                                 'Administrative, support, waste management, and remediation services']:
            return 'job_4'
        elif row['Industry'] in ['Finance and insurance','Real estate and rental and leasing',
                                 'Management of companies and enterprises']:
            return 'job_5'        
        elif row['Industry'] in ['Educational services; private']:
            return 'job_6'
        elif row['Industry'] in ['Health care and social assistance']:
            return 'job_7'
        elif row['Industry'] in ['Retail trade']:
            return 'job_8'
        elif row['Industry'] in ['Construction','Transportation and warehousing' ]:
            return 'job_9'
        elif row['Industry'] in ['Utilities','Manufacturing', 'Wholesale trade']:
            return 'job_10'
        elif row['Industry'] in ['Arts, entertainment, and recreation']:
            return 'job_11'
        elif row['Industry'] in ['Accommodation']:
            return 'job_12'
        elif row['Industry'] in ['Food Service']:
            return 'job_13'
        elif row['Industry'] in ['Other services (except public administration)']:
            return 'job_14'

econ_sectors['job'] = econ_sectors.apply(get_indxwalk, axis = 1)
econ_sectors = (econ_sectors[['job', year]].groupby('job').sum()*1000).reset_index().rename(columns={year:'mgra_jobs'})
econ_sectors

Unnamed: 0,job,mgra_jobs
0,job_1,105708.420017
1,job_10,194903.377142
2,job_11,48810.366787
3,job_12,15234.933802
4,job_13,132477.197758
5,job_14,73724.925069
6,job_2,104000.0
7,job_3,10881.672394
8,job_4,315493.877486
9,job_5,147588.561059


In [33]:
sector_diff = syn_sectors.merge(econ_sectors, on='job')
sector_diff['diff'] = sector_diff['syn_jobs'] - sector_diff['mgra_jobs']
sector_diff

Unnamed: 0,job,syn_jobs,mgra_jobs,diff
0,job_1,119138,105708.420017,13429.579983
1,job_10,216894,194903.377142,21990.622858
2,job_11,59792,48810.366787,10981.633213
3,job_12,17166,15234.933802,1931.066198
4,job_13,152822,132477.197758,20344.802242
5,job_14,89148,73724.925069,15423.074931
6,job_2,106874,104000.0,2874.0
7,job_3,12834,10881.672394,1952.327606
8,job_4,349341,315493.877486,33847.122514
9,job_5,163769,147588.561059,16180.438941


# -------------------------------------------------
# Check - Vehicles

In [34]:
low_inc_cars = len(synthetic_households[(synthetic_households.HHADJINC < 30000) & (synthetic_households.VEH > 2)])
high_inc_nocars = len(synthetic_households[((synthetic_households.HHADJINC > 100000) & (synthetic_households.VEH == 0))])

In [35]:
low_inc_cars

12815

In [36]:
high_inc_nocars

10588

# Check - Households with household income 0 where persons are employed in for profit organization

In [37]:
synthetic_households[synthetic_households['HHADJINC']==0]

Unnamed: 0,household_id,mgra,SERIALNO,NP,HHADJINC,HHT,HUPAC,VEH,BLD,gq_type,workers,Income Category
94,95,165,2018HU0682435,2.0,0.0,3.0,4.0,1.0,9.0,0,0,"Less than 15,000"
115,116,218,2018HU0032019,5.0,0.0,1.0,3.0,0.0,3.0,0,0,"Less than 15,000"
133,134,218,2021HU0337895,1.0,0.0,6.0,4.0,1.0,2.0,0,0,"Less than 15,000"
138,139,245,2017000386819,1.0,0.0,4.0,4.0,1.0,1.0,0,0,"Less than 15,000"
182,183,245,2021HU0337895,1.0,0.0,6.0,4.0,1.0,2.0,0,0,"Less than 15,000"
...,...,...,...,...,...,...,...,...,...,...,...,...
1334322,1334323,10219,2020HU1045955,1.0,0.0,4.0,4.0,1.0,2.0,0,0,"Less than 15,000"
1334471,1334472,10222,2020HU0151891,1.0,0.0,6.0,4.0,1.0,9.0,0,0,"Less than 15,000"
1334472,1334473,10222,2020HU0151891,1.0,0.0,6.0,4.0,1.0,9.0,0,0,"Less than 15,000"
1334473,1334474,10222,2020HU0151891,1.0,0.0,6.0,4.0,1.0,9.0,0,0,"Less than 15,000"


In [38]:
hhinc_working = synthetic_persons.merge(synthetic_households[['household_id', 'HHADJINC']], how='left', on='household_id')
hhinc_working[(hhinc_working.ESR.isin([1, 2, 4, 5]))& (hhinc_working.HHADJINC == 0)]

Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,HISP,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,HHADJINC
186101,9159,73625,2017001208011,1.0,47.0,1,1.0,6.0,40.0,0,...,1,2.0,19.0,310.0,1.0,722Z,722,119051,11.0,0.0
192649,16558,76341,2017001208011,1.0,47.0,1,1.0,6.0,40.0,0,...,1,2.0,19.0,310.0,1.0,722Z,722,119051,11.0,0.0
193148,16563,76557,2017001208011,1.0,47.0,1,1.0,6.0,40.0,0,...,1,2.0,19.0,310.0,1.0,722Z,722,119051,11.0,0.0
194581,16588,77182,2017001208011,1.0,47.0,1,1.0,6.0,40.0,0,...,1,2.0,19.0,310.0,1.0,722Z,722,119051,11.0,0.0
195183,16588,77441,2020HU0933627,1.0,60.0,2,1.0,8.0,4.0,0,...,1,4.0,17.0,6260.0,5.0,23,23,472061,47.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2862315,9120,1188658,2019HU0006429,2.0,60.0,1,1.0,1.0,30.0,0,...,1,4.0,21.0,140.0,4.0,3254,32,113051,11.0,0.0
2891945,9547,1198420,2019HU0006429,2.0,60.0,1,1.0,1.0,30.0,0,...,1,4.0,21.0,140.0,4.0,3254,32,113051,11.0,0.0
2917114,9645,1206488,2019HU0006429,2.0,60.0,1,1.0,1.0,30.0,0,...,1,4.0,21.0,140.0,4.0,3254,32,113051,11.0,0.0
2919533,9648,1207267,2019HU0006429,2.0,60.0,1,1.0,1.0,30.0,0,...,1,4.0,21.0,140.0,4.0,3254,32,113051,11.0,0.0


In [39]:
len(hhinc_working[(hhinc_working.ESR.isin([1, 2, 4, 5]))& (hhinc_working.HHADJINC == 0)]['household_id'].unique())

395

# Check - Compare student enrollment (k-8 and 9-12) with MGRA base file. 

In [40]:
print(synthetic_persons['SCHG'].isin([2,3,4,5,6,7,8,9,10]).sum())
synthetic_persons[synthetic_persons['SCHG'].isin([2,3,4,5,6,7,8,9,10])].groupby('mgra').count().reset_index()[['mgra', 'SCHG']]

316655


Unnamed: 0,mgra,SCHG
0,1,14
1,3,72
2,5,13
3,6,9
4,7,15
...,...,...
15824,24313,5
15825,24314,4
15826,24316,34
15827,24318,14


In [41]:
print(synthetic_persons['SCHG'].isin([11,12,13,14]).sum())
synthetic_persons[synthetic_persons['SCHG'].isin([11,12,13,14])].groupby('mgra').count().reset_index()[['mgra', 'SCHG']]

149619


Unnamed: 0,mgra,SCHG
0,1,12
1,3,26
2,5,8
3,6,2
4,7,1
...,...,...
14395,24313,7
14396,24314,3
14397,24316,27
14398,24318,13


In [42]:
print(mgra_based_data[['mgra', 'enrollgradekto8', 'enrollgrade9to12']].sum())
mgra_based_data[['mgra', 'enrollgradekto8', 'enrollgrade9to12']]

mgra                295767681
enrollgradekto8        326853
enrollgrade9to12       146635
dtype: int64


Unnamed: 0,mgra,enrollgradekto8,enrollgrade9to12
0,16512,0,0
1,13476,0,0
2,13511,0,0
3,9782,0,0
4,9817,0,0
...,...,...,...
24316,13957,0,0
24317,19400,0,0
24318,19419,0,0
24319,19434,0,0


In [43]:
mgra_based_data.groupby('pseudomsa').sum()[['enrollgradekto8', 'enrollgrade9to12']]

Unnamed: 0_level_0,enrollgradekto8,enrollgrade9to12
pseudomsa,Unnamed: 1_level_1,Unnamed: 2_level_1
1,663,3146
2,50644,17975
3,81345,35593
4,49726,23682
5,51856,24080
6,40493,17430
7,48969,21504
8,3157,3225


In [44]:
synthetic_persons[['SCHG', 'SCHL']]

Unnamed: 0,SCHG,SCHL
0,0,19.0
1,0,19.0
2,0,22.0
3,0,21.0
4,0,24.0
...,...,...
3397433,0,19.0
3397434,0,14.0
3397435,0,18.0
3397436,0,1.0


In [45]:
school_enrollment_attainment = {2: [1, 2],
3:[3],
4:[4],
5:[5],
6:[6],
7:[7],
8:[8],
9:[9],
10:[10],
11:[11],
12:[12],
13:[13],
14:[14],
15:[16,17,18,19,20],
16:[21,22,23,24]}

In [46]:
enrollment_calcs = synthetic_persons[~synthetic_persons['SCHG'].isin([0,1])]
enrollment_calcs['possible_attainments'] = enrollment_calcs['SCHG'].apply(lambda x: school_enrollment_attainment[x])
enrollment_calcs['attainment_check'] = enrollment_calcs.apply(lambda x: x.SCHL in x.possible_attainments, axis=1)
enrollment_calcs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enrollment_calcs['possible_attainments'] = enrollment_calcs['SCHG'].apply(lambda x: school_enrollment_attainment[x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  enrollment_calcs['attainment_check'] = enrollment_calcs.apply(lambda x: x.SCHL in x.possible_attainments, axis=1)


Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,possible_attainments,attainment_check
13,57,6,2018HU1166882,2.0,9.0,1,0.0,0.0,0.0,6,...,0.0,6.0,0.0,0.0,,,,,[6],True
14,57,6,2018HU1166882,3.0,6.0,2,0.0,0.0,0.0,3,...,0.0,3.0,0.0,0.0,,,,,[3],True
24,57,12,2021HU0537552,1.0,30.0,1,6.0,0.0,0.0,15,...,2.0,20.0,0.0,0.0,,,,,"[16, 17, 18, 19, 20]",True
31,100,16,2017000327031,1.0,30.0,2,6.0,0.0,0.0,15,...,4.0,19.0,0.0,0.0,,,,,"[16, 17, 18, 19, 20]",True
33,100,16,2017000327031,3.0,7.0,2,0.0,0.0,0.0,4,...,0.0,4.0,0.0,0.0,,,,,[4],True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3397393,24280,1449073,2018GQ0125534,1.0,17.0,1,6.0,0.0,0.0,13,...,4.0,13.0,0.0,0.0,,,,,[13],True
3397405,24280,1449085,2019GQ0039460,1.0,15.0,1,0.0,0.0,0.0,12,...,0.0,12.0,0.0,0.0,,,,,[12],True
3397407,24280,1449087,2021GQ0028539,1.0,16.0,1,6.0,0.0,0.0,13,...,0.0,13.0,0.0,0.0,,,,,[13],True
3397414,24280,1449094,2018GQ0077804,1.0,83.0,1,6.0,0.0,0.0,13,...,4.0,13.0,0.0,0.0,,,,,[13],True


In [47]:
enrollment_calcs[enrollment_calcs.possible_attainments == False]

Unnamed: 0,mgra,household_id,SERIALNO,SPORDER,AGEP,SEX,ESR,COW,WKHP,SCHG,...,MIL,SCHL,OCCP,WKW,NAICSP,NAICS2,SOCP,SOC2,possible_attainments,attainment_check
