In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc
from mgra_denormalize_creation import create_and_merge_all_data

# MGRA denorm

In [2]:
# Build the MGRA lookup table that every *_cleaning function in this notebook joins
# against to map MGRAs to coarser geographies. create_and_merge_all_data comes from the
# local mgra_denormalize_creation module; its internals are not visible from here.
mgra_denorm = create_and_merge_all_data()
# Constant column so the whole table can be aggregated as one 'region' geography.
mgra_denorm['region'] = 'San Diego'
mgra_denorm

Unnamed: 0,mgra_id,mgra,census_tract,LUZ,cpa,jurisdiction,region
0,1500000100,1,2705,10,Mid-City:Eastern Area,San Diego,San Diego
1,1500000200,2,5601,28,Downtown,San Diego,San Diego
2,1500000300,3,15407,112,*Not in a CPA*,El Cajon,San Diego
3,1500000400,4,17407,151,*Not in a CPA*,Encinitas,San Diego
4,1500000500,5,17407,151,*Not in a CPA*,Encinitas,San Diego
...,...,...,...,...,...,...,...
24316,1502431700,24317,2905,11,County Islands,Unincorporated,San Diego
24317,1502431800,24318,20706,133,North County Metro,Unincorporated,San Diego
24318,1502431900,24319,21001,225,Mountain Empire,Unincorporated,San Diego
24319,1502432000,24320,21002,224,Desert,Unincorporated,San Diego


# Ethnicity

In [10]:
# Load the ethnicity query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\ethnicity_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,mgra_id,yr_id,long_name,population
0,1500000100,2020,Hispanic,131
1,1500000100,2020,"Non-Hispanic, White",86
2,1500000100,2020,"Non-Hispanic, Black",58
3,1500000100,2020,"Non-Hispanic, American Indian or Alaska Native",0
4,1500000100,2020,"Non-Hispanic, Asian",169
...,...,...,...,...
583699,1502422300,2022,"Non-Hispanic, Two or More Races",0
583700,1502422400,2020,Hispanic,97
583701,1502422400,2020,"Non-Hispanic, White",12
583702,1502422400,2020,"Non-Hispanic, Black",4


In [11]:
def eth_cleaning(df, geo_level):
    """Aggregate MGRA-level ethnicity population to the requested geography.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows with columns ``mgra_id``, ``yr_id``, ``long_name``
        (ethnicity label) and ``population``. Each (mgra_id, yr_id, long_name)
        combination must be unique, otherwise ``pd.pivot`` raises ``ValueError``.
    geo_level : str
        Column of the notebook-level ``mgra_denorm`` table to aggregate to
        (e.g. ``'region'``), or ``'mgra_id'`` to keep MGRA-level rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by (geo_level, yr_id) with one column per ethnicity holding the
        summed population. MGRAs with no match in ``mgra_denorm`` get a missing
        geography from the left join and are omitted by ``groupby``'s default
        handling of NaN keys.
    """
    wide = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['long_name'])
    wide.columns.name = ''
    wide = wide.reset_index()
    # mgra_id is already present as the join key; looking it up would select the
    # same label twice from mgra_denorm and make the merge fail.
    if geo_level != 'mgra_id':
        wide = wide.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
        wide = wide.drop('mgra_id', axis=1)
    return wide.groupby([geo_level, 'yr_id']).sum()

In [17]:
# Aggregate ethnicity to the region level, write the QA extract to the shared project
# folder, and display the result.
# NOTE(review): the output filename spells "ethnicty" - confirm nothing depends on that
# spelling before renaming.
output = eth_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\ethnicity\region_ethnicty_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Hispanic,"Non-Hispanic, American Indian or Alaska Native","Non-Hispanic, Asian","Non-Hispanic, Black","Non-Hispanic, Hawaiian or Pacific Islander","Non-Hispanic, Other","Non-Hispanic, Two or More Races","Non-Hispanic, White"
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
San Diego,2020,1087686,11463,388579,155508,11966,13008,131514,1531555
San Diego,2021,1098438,9541,366316,138084,9947,10534,127309,1528334
San Diego,2022,1160750,7374,317683,123049,7516,9404,110425,1551105


In [9]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Income Group

In [22]:
# Load the income-group query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\income_group_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,mgra_id,yr_id,income_group,households
0,1500000100,2020,"Less than $15,000",14
1,1500000200,2020,"Less than $15,000",9
2,1500000300,2020,"Less than $15,000",28
3,1500000400,2020,"Less than $15,000",0
4,1500000500,2020,"Less than $15,000",2
...,...,...,...,...
729625,1501607100,2020,"$125,000 to $149,999",32
729626,1501607200,2020,"$125,000 to $149,999",13
729627,1501607300,2020,"$125,000 to $149,999",0
729628,1501607400,2020,"$125,000 to $149,999",11


In [23]:
def income_group_cleaning(df, geo_level):
    """Aggregate MGRA-level household counts by income group to the requested geography.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows with columns ``mgra_id``, ``yr_id``, ``income_group``
        and ``households``. Each (mgra_id, yr_id, income_group) combination must
        be unique, otherwise ``pd.pivot`` raises ``ValueError``.
    geo_level : str
        Column of the notebook-level ``mgra_denorm`` table to aggregate to
        (e.g. ``'region'``), or ``'mgra_id'`` to keep MGRA-level rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by (geo_level, yr_id) with one column per income group holding
        the summed households. MGRAs with no match in ``mgra_denorm`` get a
        missing geography from the left join and are omitted by ``groupby``'s
        default handling of NaN keys.
    """
    wide = pd.pivot(df, values='households', index=['mgra_id', 'yr_id'], columns=['income_group'])
    wide.columns.name = ''
    wide = wide.reset_index()
    # mgra_id is already present as the join key; looking it up would select the
    # same label twice from mgra_denorm and make the merge fail.
    if geo_level != 'mgra_id':
        wide = wide.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
        wide = wide.drop('mgra_id', axis=1)
    return wide.groupby([geo_level, 'yr_id']).sum()

In [31]:
# Aggregate income groups to the region level, write the QA extract to the shared
# project folder, and display the result.
output = income_group_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\income_group\region_income_group_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,"$100,000 to $124,999","$125,000 to $149,999","$15,000 to $29,999","$150,000 to $199,999","$200,000 or more","$30,000 to $44,999","$45,000 to $59,999","$60,000 to $74,999","$75,000 to $99,999","Less than $15,000"
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
San Diego,2020,122510,95909,90120,130936,214126,87706,92428,94151,140754,75630
San Diego,2021,131499,95021,89780,131480,213094,94588,81799,97863,132462,81120
San Diego,2022,127502,95157,92088,131804,213482,91176,93988,95779,141660,77836


In [21]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Household Type

In [38]:
# Load the household-type query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\household_type_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,mgra_id,yr_id,household_type,households
0,1500000100,2020,Family household:married-couple,40
1,1500000200,2020,Family household:married-couple,31
2,1500000300,2020,Family household:married-couple,40
3,1500000400,2020,Family household:married-couple,1
4,1500000500,2020,Family household:married-couple,8
...,...,...,...,...
510736,1502431700,2022,"Nonfamily household:female householder, not li...",1
510737,1502431800,2022,"Nonfamily household:female householder, not li...",0
510738,1502431900,2022,"Nonfamily household:female householder, not li...",0
510739,1502432000,2022,"Nonfamily household:female householder, not li...",0


In [39]:
def household_type_cleaning(df, geo_level):
    """Aggregate MGRA-level household counts by household type to the requested geography.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows with columns ``mgra_id``, ``yr_id``, ``household_type``
        and ``households``. Each (mgra_id, yr_id, household_type) combination
        must be unique, otherwise ``pd.pivot`` raises ``ValueError``.
    geo_level : str
        Column of the notebook-level ``mgra_denorm`` table to aggregate to
        (e.g. ``'region'``), or ``'mgra_id'`` to keep MGRA-level rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by (geo_level, yr_id) with one column per household type holding
        the summed households. MGRAs with no match in ``mgra_denorm`` get a
        missing geography from the left join and are omitted by ``groupby``'s
        default handling of NaN keys.
    """
    wide = pd.pivot(df, values='households', index=['mgra_id', 'yr_id'], columns=['household_type'])
    wide.columns.name = ''
    wide = wide.reset_index()
    # mgra_id is already present as the join key; looking it up would select the
    # same label twice from mgra_denorm and make the merge fail.
    if geo_level != 'mgra_id':
        wide = wide.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
        wide = wide.drop('mgra_id', axis=1)
    return wide.groupby([geo_level, 'yr_id']).sum()

In [47]:
# Aggregate household types to the region level, write the QA extract to the shared
# project folder, and display the result.
output = household_type_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\households\region_households_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,"Family household:female householder, no husband present","Family household:male householder, no wife present",Family household:married-couple,"Nonfamily household:female householder, living alone","Nonfamily household:female householder, not living alone","Nonfamily household:male householder, living alone","Nonfamily household:male householder, not living alone"
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
San Diego,2020,191908,377440,272996,32070,22109,169561,78186
San Diego,2021,192652,378902,274054,32195,22195,170219,78489
San Diego,2022,194626,382784,276861,32524,22422,171962,79293


In [42]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Housing

In [3]:
# Load the housing query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\housing_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,mgra_id,yr_id,long_name,units,unoccupiable,occupied,vacancy
0,1500000100,2020,Single Family - Detached,84,,84,0
1,1500000200,2020,Single Family - Detached,0,,0,0
2,1500000300,2020,Single Family - Detached,18,,18,0
3,1500000400,2020,Single Family - Detached,0,,0,0
4,1500000500,2020,Single Family - Detached,5,,4,1
...,...,...,...,...,...,...,...
291847,1502431700,2022,Mobile Home,0,,0,0
291848,1502431800,2022,Mobile Home,0,,0,0
291849,1502431900,2022,Mobile Home,0,,0,0
291850,1502432000,2022,Mobile Home,0,,0,0


In [4]:
def housing_cleaning(df, geo_level):
    """Aggregate MGRA-level housing stock to the requested geography.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows with columns ``mgra_id``, ``yr_id``, ``long_name``
        (structure type), ``units``, ``unoccupiable``, ``occupied`` and
        ``vacancy``.
    geo_level : str
        Column of the notebook-level ``mgra_denorm`` table to aggregate to
        (e.g. ``'region'``), or ``'mgra_id'`` to keep MGRA-level rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by (geo_level, yr_id). Holds one column of summed units per
        structure type, followed by the summed ``units``, ``unoccupiable``,
        ``occupied`` and ``vacancy`` totals. MGRAs with no match in
        ``mgra_denorm`` get a missing geography from the left join and are
        omitted by ``groupby``'s default handling of NaN keys.
    """
    # Sum explicitly: pivot_table defaults to the mean, which would average any
    # repeated (mgra_id, yr_id, long_name) rows and make the per-type columns
    # disagree with the summed `units` total computed below.
    units_by_type = pd.pivot_table(
        df[['mgra_id', 'yr_id', 'long_name', 'units']],
        index=['mgra_id', 'yr_id'],
        columns=['long_name'],
        values='units',
        aggfunc='sum',
    )
    units_by_type.columns.name = ''
    totals = (
        df[['mgra_id', 'yr_id', 'units', 'unoccupiable', 'occupied', 'vacancy']]
        .groupby(['mgra_id', 'yr_id'])
        .sum()
    )
    combined = units_by_type.merge(totals, left_index=True, right_index=True)
    combined = combined.reset_index()
    # mgra_id is already present as the join key; looking it up would select the
    # same label twice from mgra_denorm and make the merge fail.
    if geo_level != 'mgra_id':
        combined = combined.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
        combined = combined.drop('mgra_id', axis=1)
    return combined.groupby([geo_level, 'yr_id']).sum()

In [13]:
# Aggregate housing to the region level, write the QA extract to the shared project
# folder, and display the result.
output = housing_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\housing\region_housing_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Mobile Home,Multifamily,Single Family - Detached,Single Family - Multiple Unit,units,occupied,vacancy
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
San Diego,2020,42169,448296,567452,158606,1216523,1144270,72253
San Diego,2021,42112,452544,569254,159048,1222958,1148706,74252
San Diego,2022,42130,459767,571112,162417,1235426,1160472,74954


In [6]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Population 

In [19]:
# Load the population query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\population_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,mgra_id,yr_id,long_name,population
0,1500000100,2020,Household Population,465
1,1500000200,2020,Household Population,73
2,1500000300,2020,Household Population,546
3,1500000400,2020,Household Population,8
4,1500000500,2020,Household Population,94
...,...,...,...,...
291847,1502334500,2022,Group Quarters - Other,0
291848,1502334600,2022,Group Quarters - Other,0
291849,1502334700,2022,Group Quarters - Other,0
291850,1502334800,2022,Group Quarters - Other,0


In [20]:
def population_cleaning(df, geo_level):
    """Aggregate MGRA-level population by housing category to the requested geography.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows with columns ``mgra_id``, ``yr_id``, ``long_name``
        (population category label) and ``population``. Each
        (mgra_id, yr_id, long_name) combination must be unique, otherwise
        ``pd.pivot`` raises ``ValueError``.
    geo_level : str
        Column of the notebook-level ``mgra_denorm`` table to aggregate to
        (e.g. ``'region'``), or ``'mgra_id'`` to keep MGRA-level rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by (geo_level, yr_id) with one column per category holding the
        summed population. MGRAs with no match in ``mgra_denorm`` get a missing
        geography from the left join and are omitted by ``groupby``'s default
        handling of NaN keys.
    """
    wide = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['long_name'])
    wide.columns.name = ''
    wide = wide.reset_index()
    # mgra_id is already present as the join key; looking it up would select the
    # same label twice from mgra_denorm and make the merge fail.
    if geo_level != 'mgra_id':
        wide = wide.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
        wide = wide.drop('mgra_id', axis=1)
    return wide.groupby([geo_level, 'yr_id']).sum()

In [27]:
# Aggregate population to the region level and display the result.
# NOTE(review): the CSV export below is commented out, so this section does not write
# a QA extract like the other sections do - confirm whether that is intentional.
output = population_cleaning(df, geo_level='region')
# output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\population\region_population_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Group Quarters - College,Group Quarters - Military,Group Quarters - Other,Household Population
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
San Diego,2020,26881,41611,43918,3218869
San Diego,2021,17810,41422,43767,3185504
San Diego,2022,30637,43210,42564,3170895


In [16]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Sex

In [28]:
# Load the sex query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\sex_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,mgra_id,yr_id,sex,population
0,1500000100,2020,Female,250
1,1500000100,2020,Male,215
2,1500000100,2021,Female,252
3,1500000100,2021,Male,197
4,1500000100,2022,Female,251
...,...,...,...,...
995,1500016600,2022,Male,199
996,1500016700,2020,Female,43
997,1500016700,2020,Male,48
998,1500016700,2021,Female,52


In [29]:
def sex_cleaning(df, geo_level):
    """Aggregate MGRA-level population by sex to the requested geography.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format rows with columns ``mgra_id``, ``yr_id``, ``sex`` and
        ``population``. Each (mgra_id, yr_id, sex) combination must be unique,
        otherwise ``pd.pivot`` raises ``ValueError``.
    geo_level : str
        Column of the notebook-level ``mgra_denorm`` table to aggregate to
        (e.g. ``'region'``), or ``'mgra_id'`` to keep MGRA-level rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by (geo_level, yr_id) with one column per sex holding the
        summed population. MGRAs with no match in ``mgra_denorm`` get a missing
        geography from the left join and are omitted by ``groupby``'s default
        handling of NaN keys.
    """
    wide = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['sex'])
    wide.columns.name = ''
    wide = wide.reset_index()
    # mgra_id is already present as the join key; looking it up would select the
    # same label twice from mgra_denorm and make the merge fail.
    if geo_level != 'mgra_id':
        wide = wide.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
        wide = wide.drop('mgra_id', axis=1)
    return wide.groupby([geo_level, 'yr_id']).sum()

In [37]:
# Aggregate sex to the region level, write the QA extract to the shared project
# folder, and display the result.
# NOTE(review): the saved load output for this section shows fewer than 1,000 rows and
# the region totals are only about 12k per sex, far below the population totals in the
# other sections. The query result looks truncated - verify before relying on this CSV.
output = sex_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\sex\region_sex_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,Female,Male
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1
San Diego,2020,12361,12458
San Diego,2021,12441,12034
San Diego,2022,12367,11848


In [30]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Children

In [55]:
# Load the children query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\children_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,yr_id,mgra,with_children,without_children
0,2020,1,42,134
1,2020,2,7,42
2,2020,3,88,104
3,2020,4,3,0
4,2020,5,14,23
...,...,...,...,...
72958,2022,24317,0,2
72959,2022,24318,0,46
72960,2022,24319,0,0
72961,2022,24320,0,0


In [56]:
def children_cleaning(df, geo_level):
    """Sum the children query columns per (geo_level, yr_id).

    ``df`` is keyed by ``mgra`` and ``yr_id``. For any ``geo_level`` other than
    ``'mgra'``, the geography is looked up in the notebook-level ``mgra_denorm``
    table with a left join and the ``mgra`` column is dropped before grouping.
    Every remaining column is summed.
    """
    group_keys = [geo_level, 'yr_id']

    if geo_level == 'mgra':
        # The frame already carries the grouping key; no lookup is needed.
        return df.groupby(group_keys).sum()

    geo_lookup = mgra_denorm[['mgra', geo_level]]
    with_geo = df.merge(geo_lookup, how='left', on='mgra').drop('mgra', axis=1)
    return with_geo.groupby(group_keys).sum()

In [62]:
# Aggregate children counts to the region level, write the QA extract to the shared
# project folder, and display the result.
# NOTE(review): the filename says "region_tract" while the other sections use
# "region_..." - confirm which geography the name is meant to convey.
output = children_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\children\region_tract_children_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,with_children,without_children
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1
San Diego,2020,371497,772773
San Diego,2021,372937,775769
San Diego,2022,376757,783715


In [41]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')

# Workers

In [73]:
# Load the workers query result from the `estimates` database.
# The SQL text is read from the QA repository file rather than embedded here.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
try:
    with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\workers_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    df = pd.read_sql_query(sql_query, conn)
finally:
    # Close explicitly; otherwise the connection stays open for the life of the kernel
    # (this notebook opens a new one in every section).
    conn.close()
df

Unnamed: 0,yr_id,mgra,workers_0,workers_1,workers_2,workers_3plus
0,2020,1,58,57,16,45
1,2020,2,15,13,6,15
2,2020,3,65,62,16,49
3,2020,4,0,0,1,2
4,2020,5,10,11,4,12
...,...,...,...,...,...,...
72958,2022,24317,2,0,0,0
72959,2022,24318,35,9,2,0
72960,2022,24319,0,0,0,0
72961,2022,24320,0,0,0,0


In [74]:
def workers_cleaning(df, geo_level):
    """Sum the workers query columns per (geo_level, yr_id).

    ``df`` is keyed by ``mgra`` and ``yr_id``. For any ``geo_level`` other than
    ``'mgra'``, the geography is looked up in the notebook-level ``mgra_denorm``
    table with a left join and the ``mgra`` column is dropped before grouping.
    Every remaining column is summed.
    """
    group_keys = [geo_level, 'yr_id']

    if geo_level == 'mgra':
        # The frame already carries the grouping key; no lookup is needed.
        return df.groupby(group_keys).sum()

    geo_lookup = mgra_denorm[['mgra', geo_level]]
    with_geo = df.merge(geo_lookup, how='left', on='mgra').drop('mgra', axis=1)
    return with_geo.groupby(group_keys).sum()

In [80]:
# Aggregate worker counts to the region level, write the QA extract to the shared
# project folder, and display the result.
output = workers_cleaning(df, geo_level='region')
output.to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\workers\region_workers_est_2022_01_ind_QA.csv')
output

Unnamed: 0_level_0,Unnamed: 1_level_0,workers_0,workers_1,workers_2,workers_3plus
region,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
San Diego,2020,421317,353753,100342,268858
San Diego,2021,422950,355125,100731,269900
San Diego,2022,427283,358762,101762,272665


In [66]:
# Inspection only: list the mgra_denorm columns (the values usable as geo_level).
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')