In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc
from mgra_denormalize_creation import create_and_merge_all_data

# MGRA denorm

In [2]:
mgra_denorm = create_and_merge_all_data()
mgra_denorm['region'] = 'San Diego'
mgra_denorm

Unnamed: 0,mgra_id,mgra,census_tract,LUZ,cpa,jurisdiction,SRA,region
0,1500000100,1,2705,10,Mid-City:Eastern Area,San Diego,MID-CITY,San Diego
1,1500000200,2,5601,28,Downtown,San Diego,CENTRAL SAN DIEGO,San Diego
2,1500000300,3,15407,112,*Not in a CPA*,El Cajon,EL CAJON,San Diego
3,1500000400,4,17407,151,*Not in a CPA*,Encinitas,SAN DIEGUITO,San Diego
4,1500000500,5,17407,151,*Not in a CPA*,Encinitas,SAN DIEGUITO,San Diego
...,...,...,...,...,...,...,...,...
24316,1502431700,24317,2905,11,County Islands,Unincorporated,LA MESA,San Diego
24317,1502431800,24318,20706,133,North County Metro,Unincorporated,ESCONDIDO,San Diego
24318,1502431900,24319,21001,225,Mountain Empire,Unincorporated,ANZA-BORREGO SPRINGS,San Diego
24319,1502432000,24320,21002,224,Desert,Unincorporated,ANZA-BORREGO SPRINGS,San Diego


In [6]:
set(mgra_denorm[mgra_denorm['SRA'] == 'ELLIOTT-NAVAJO']['jurisdiction'])

{'San Diego', 'Santee'}

# Ethnicity

In [45]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\ethnicity_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,mgra_id,yr_id,long_name,population
0,1500000100,2020,Hispanic,131
1,1500000100,2020,"Non-Hispanic, White",86
2,1500000100,2020,"Non-Hispanic, Black",58
3,1500000100,2020,"Non-Hispanic, American Indian or Alaska Native",0
4,1500000100,2020,"Non-Hispanic, Asian",169
...,...,...,...,...
583699,1502422300,2022,"Non-Hispanic, Two or More Races",0
583700,1502422400,2020,Hispanic,97
583701,1502422400,2020,"Non-Hispanic, White",12
583702,1502422400,2020,"Non-Hispanic, Black",4


In [46]:
def eth_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['long_name'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [5]:
output = eth_cleaning(df, geo_level='SRA')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\ethnicity\SRA_ethnicty_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Hispanic,"Non-Hispanic, American Indian or Alaska Native","Non-Hispanic, Asian","Non-Hispanic, Black","Non-Hispanic, Hawaiian or Pacific Islander","Non-Hispanic, Other","Non-Hispanic, Two or More Races","Non-Hispanic, White"
SRA,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
ALPINE,2020,2018,27,580,255,13,6,495,12373
ALPINE,2021,2572,0,203,69,1,5,180,13181
ALPINE,2022,2953,1,6,23,1,5,4,13120
ANZA-BORREGO SPRINGS,2020,452,1,0,39,0,0,67,1957
ANZA-BORREGO SPRINGS,2021,491,0,0,0,0,0,4,2037
...,...,...,...,...,...,...,...,...,...
VALLEY CENTER,2021,8303,931,564,125,0,2,671,14238
VALLEY CENTER,2022,9827,1048,237,19,0,2,388,13789
VISTA,2020,49794,327,4084,3574,1116,242,3789,45599
VISTA,2021,49550,154,3656,3110,913,80,3586,45228


In [9]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Income Group

In [6]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\income_group_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,mgra_id,yr_id,income_group,households
0,1500000100,2020,"Less than $15,000",14
1,1500000200,2020,"Less than $15,000",9
2,1500000300,2020,"Less than $15,000",28
3,1500000400,2020,"Less than $15,000",0
4,1500000500,2020,"Less than $15,000",2
...,...,...,...,...
729625,1501978900,2020,"$75,000 to $99,999",3
729626,1501979000,2020,"$75,000 to $99,999",0
729627,1501979100,2020,"$75,000 to $99,999",11
729628,1501979200,2020,"$75,000 to $99,999",6


In [7]:
def income_group_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='households', index=['mgra_id', 'yr_id'], columns=['income_group'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [9]:
output = income_group_cleaning(df, geo_level='SRA')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\income_group\SRA_income_group_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,"$100,000 to $124,999","$125,000 to $149,999","$15,000 to $29,999","$150,000 to $199,999","$200,000 or more","$30,000 to $44,999","$45,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","Less than $15,000"
SRA,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ALPINE,2020,714,456,506,865,1128,397,241,324,883,163
ALPINE,2021,719,527,461,772,1391,336,210,356,809,223
ALPINE,2022,708,526,471,771,1377,306,254,346,859,219
ANZA-BORREGO SPRINGS,2020,113,36,94,229,87,117,273,121,69,197
ANZA-BORREGO SPRINGS,2021,175,82,83,212,64,180,235,149,78,88
...,...,...,...,...,...,...,...,...,...,...,...
VALLEY CENTER,2021,1236,597,500,1623,1562,465,330,369,1091,405
VALLEY CENTER,2022,1244,628,514,1696,1583,406,408,374,1174,418
VISTA,2020,3884,2605,2704,3765,3972,2721,3270,3674,5292,1325
VISTA,2021,4590,2664,2484,3742,4538,3077,2987,3570,4498,1523


In [21]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Household Type

In [2]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\series_15_estimates_automation\SQL_Queries\household_type_query_2021.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df['region'] = 'San Diego'
df

Unnamed: 0,cpa,yr_id,household_type,households,region
0,*Not in a CPA*,2020,Family household:married-couple,0,San Diego
1,*Not in a CPA*,2020,"Family household:male householder, no wife pre...",0,San Diego
2,*Not in a CPA*,2020,"Family household:female householder, no husban...",0,San Diego
3,*Not in a CPA*,2020,"Nonfamily household:male householder, living a...",0,San Diego
4,*Not in a CPA*,2020,"Nonfamily household:male householder, not livi...",0,San Diego
...,...,...,...,...,...
382447,Greater North Park,2021,Family household:married-couple,0,San Diego
382448,Greater North Park,2021,"Family household:male householder, no wife pre...",0,San Diego
382449,Greater North Park,2021,"Family household:female householder, no husban...",0,San Diego
382450,Greater North Park,2021,"Nonfamily household:male householder, living a...",0,San Diego


In [82]:
geo_level = 'cpa'
output = df.groupby([f'{geo_level}', 'yr_id', 'household_type']).sum().reset_index()
output = pd.pivot(output, values='households', index=[f'{geo_level}', 'yr_id'], columns=['household_type'])
output['Total_Household_Type'] = output.sum(axis=1)
output.columns.name = ''
output = output.reset_index()

output

Unnamed: 0,cpa,yr_id,"Family household:female householder, no husband present","Family household:male householder, no wife present",Family household:married-couple,"Nonfamily household:female householder, living alone","Nonfamily household:female householder, not living alone","Nonfamily household:male householder, living alone","Nonfamily household:male householder, not living alone",Total_Household_Type
0,*Not in a CPA*,2020,87109,144849,89086,22195,12394,73277,42440,471350
1,*Not in a CPA*,2021,86928,148439,89437,21543,12500,73013,42361,474221
2,32nd Street Naval Station,2020,0,0,0,0,0,0,0,0
3,32nd Street Naval Station,2021,0,0,0,0,0,0,0,0
4,Alpine,2020,1138,2326,1232,201,112,932,457,6398
...,...,...,...,...,...,...,...,...,...,...
173,Valley Center,2021,1103,2506,1166,254,167,856,463,6515
174,Via De La Valle,2020,33,67,62,8,1,19,8,198
175,Via De La Valle,2021,31,68,61,9,3,18,8,198
176,Via De La Valle Reserve,2020,0,0,1,0,0,0,0,1


In [83]:
output.to_csv(rf'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Results\Test 5- Vintage Comparison\2021_01\households\QA_2021_01_{geo_level}_households.csv', index=False)

In [None]:
def household_type_2021_01_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='households', index=['mgra_id', 'yr_id'], columns=['household_type'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    age_output['Total_Household_Type'] = age_output.sum(axis=1)
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [13]:
def household_type_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='households', index=['mgra_id', 'yr_id'], columns=['household_type'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    age_output['Total_Household_Type'] = age_output.sum(axis=1)
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [14]:
test = household_type_cleaning(df, geo_level='region')
test

Unnamed: 0_level_0,Unnamed: 1_level_0,"Family household:female householder, no husband present","Family household:male householder, no wife present",Family household:married-couple,"Nonfamily household:female householder, living alone","Nonfamily household:female householder, not living alone","Nonfamily household:male householder, living alone","Nonfamily household:male householder, not living alone",Total_Household_Type
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
San Diego,2020,191908,377440,272996,32070,22109,169561,78186,1144270
San Diego,2021,192652,378902,274054,32195,22195,170219,78489,1148706
San Diego,2022,194626,382784,276861,32524,22422,171962,79293,1160472


In [22]:
output = household_type_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\households\region_households_est_2022_01_ind_QA.csv')
#output

In [15]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction', 'SRA',
       'region'],
      dtype='object')

# Housing

In [48]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\\series_15_estimates_automation\SQL_Queries\housing_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,mgra_id,yr_id,long_name,units,unoccupiable,occupied,vacancy
0,1500000100,2020,Single Family - Detached,84,,84,0
1,1500000200,2020,Single Family - Detached,0,,0,0
2,1500000300,2020,Single Family - Detached,18,,18,0
3,1500000400,2020,Single Family - Detached,0,,0,0
4,1500000500,2020,Single Family - Detached,5,,4,1
...,...,...,...,...,...,...,...
291847,1502431700,2022,Mobile Home,0,,0,0
291848,1502431800,2022,Mobile Home,0,,0,0
291849,1502431900,2022,Mobile Home,0,,0,0
291850,1502432000,2022,Mobile Home,0,,0,0


In [49]:
df['unoccupiable'] = [x if x != None else 0 for x in df['unoccupiable']]

In [50]:
df.describe()

Unnamed: 0,mgra_id,yr_id,units,unoccupiable,occupied,vacancy
count,291852.0,291852.0,291852.0,291852.0,291852.0,291852.0
mean,1501216000.0,2021.0,12.59168,0.0,11.832874,0.758806
std,702088.0,0.816498,40.587324,0.0,37.887113,4.721335
min,1500000000.0,2020.0,0.0,0.0,0.0,0.0
25%,1500608000.0,2020.0,0.0,0.0,0.0,0.0
50%,1501216000.0,2021.0,0.0,0.0,0.0,0.0
75%,1501824000.0,2022.0,4.0,0.0,3.0,0.0
max,1502432000.0,2022.0,2602.0,0.0,2381.0,834.0


In [31]:
def housing_cleaning(df, geo_level):
    output_1 = pd.pivot_table(df[['mgra_id', 'yr_id', 'long_name', 'units']], index=['mgra_id', 'yr_id'], columns=['long_name'], values='units')
    output_1.columns.name = ''
    grouping = df[['mgra_id', 'yr_id', 'units', 'unoccupiable', 'occupied','vacancy']].groupby(['mgra_id', 'yr_id']).sum()
    print(grouping.head())
    age_output = output_1.merge(grouping, left_index=True, right_index=True)
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    age_output['true_vacancy_rate'] = (age_output['vacancy']*100)/age_output['units']
    print(age_output.columns)
    #age_output['effective_vacancy_rate'] = ((age_output['vacancy']-age_output['unoccupiable'])*100)/age_output['units']
    # I don't see unoccupiable for effective vacancy rate 
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [32]:
housing_cleaning(df, geo_level='region')

                  units  occupied  vacancy
mgra_id    yr_id                          
1500000100 2020     176       176        0
           2021     176       174        2
           2022     176       174        2
1500000200 2020      56        49        7
           2021      56        48        8
Index(['Mobile Home', 'Multifamily', 'Single Family - Detached',
       'Single Family - Multiple Unit', 'units', 'occupied', 'vacancy',
       'true_vacancy_rate'],
      dtype='object')


Unnamed: 0_level_0,Unnamed: 1_level_0,Mobile Home,Multifamily,Single Family - Detached,Single Family - Multiple Unit,units,occupied,vacancy,true_vacancy_rate
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
San Diego,2020,42169,448296,567452,158606,1216523,1144270,72253,5.939304
San Diego,2021,42112,452544,569254,159048,1222958,1148706,74252,6.071509
San Diego,2022,42130,459767,571112,162417,1235426,1160472,74954,6.067057


In [25]:
for type in ['mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction', 'SRA','region']:
    housing_cleaning(df, geo_level=type).to_csv(rf'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\housing\{type}_housing_est_2022_01_ind_QA.csv')

# output = housing_cleaning(df, geo_level='region')
# output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\housing\region_housing_est_2022_01_ind_QA.csv')
# output

KeyError: 'unoccupiable'

In [26]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction', 'SRA',
       'region'],
      dtype='object')

# Population 

In [84]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\population_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\cra\\OneDrive - San Diego Association of Governments\\QA_Repository\\2023\\2023-024 Estimates 2022\\SQL_Queries\\population_query.sql'

In [30]:
def population_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['long_name'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    age_output['total_population'] = age_output.sum(axis=1)

    # Calculating total GQ 
    age_output['total_pop_GQ'] = age_output['Group Quarters - College'] + age_output['Group Quarters - Military'] + age_output['Group Quarters - Other']


    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [32]:
for type in ['mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction', 'SRA','region']:
    population_cleaning(df, geo_level=type).to_csv(rf'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\population\{type}_population_est_2022_01_ind_QA.csv')
    print(f"{type} is outputted")

mgra is outputted
census_tract is outputted
LUZ is outputted
cpa is outputted
jurisdiction is outputted
SRA is outputted
region is outputted


In [16]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Sex

In [51]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\sex_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,mgra_id,yr_id,sex,population
0,1500000100,2020,Female,250
1,1500000100,2020,Male,215
2,1500000100,2021,Female,252
3,1500000100,2021,Male,197
4,1500000100,2022,Female,251
...,...,...,...,...
145921,1502424000,2020,Male,12
145922,1502424000,2021,Female,4
145923,1502424000,2021,Male,12
145924,1502424000,2022,Female,4


In [52]:
def sex_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['sex'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [55]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction', 'SRA',
       'region'],
      dtype='object')

In [61]:
output = sex_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\sex\region_sex_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Female,Male
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1
San Diego,2020,1650535,1680744
San Diego,2021,1625992,1662511
San Diego,2022,1626747,1660559


In [30]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Children

In [38]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\children_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,yr_id,mgra,with_children,without_children
0,2020,1,42,134
1,2020,2,7,42
2,2020,3,88,104
3,2020,4,3,0
4,2020,5,14,23
...,...,...,...,...
72958,2022,24317,0,2
72959,2022,24318,0,46
72960,2022,24319,0,0
72961,2022,24320,0,0


In [39]:
def children_cleaning(df, geo_level):
    age_output = df
    if geo_level != 'mgra':
        age_output = age_output.merge(mgra_denorm[['mgra', geo_level]], how='left', on='mgra')
        age_output = age_output.drop('mgra', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [41]:
output = children_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\children\region_children_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,with_children,without_children
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1
San Diego,2020,371497,772773
San Diego,2021,372937,775769
San Diego,2022,376757,783715


In [41]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Workers

In [42]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\workers_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,yr_id,mgra,workers_0,workers_1,workers_2,workers_3plus
0,2020,1,58,57,16,45
1,2020,2,15,13,6,15
2,2020,3,65,62,16,49
3,2020,4,0,0,1,2
4,2020,5,10,11,4,12
...,...,...,...,...,...,...
72958,2022,24317,2,0,0,0
72959,2022,24318,35,9,2,0
72960,2022,24319,0,0,0,0
72961,2022,24320,0,0,0,0


In [43]:
def workers_cleaning(df, geo_level):
    age_output = df
    if geo_level != 'mgra':
        age_output = age_output.merge(mgra_denorm[['mgra', geo_level]], how='left', on='mgra')
        age_output = age_output.drop('mgra', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [44]:
output = workers_cleaning(df, geo_level='SRA')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\workers\SRA_workers_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,workers_0,workers_1,workers_2,workers_3plus
SRA,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ALPINE,2020,2084,1752,502,1339
ALPINE,2021,2135,1788,511,1370
ALPINE,2022,2003,1767,589,1478
ANZA-BORREGO SPRINGS,2020,488,416,119,313
ANZA-BORREGO SPRINGS,2021,494,417,117,318
...,...,...,...,...,...
VALLEY CENTER,2021,3021,2527,717,1913
VALLEY CENTER,2022,3008,2592,874,1971
VISTA,2020,12229,10264,2916,7803
VISTA,2021,12409,10407,2951,7906


In [66]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')