In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc
from mgra_denormalize_creation import create_and_merge_all_data

# Create the MGRA Denormalize Table

In [2]:
mgra_denorm = create_and_merge_all_data()
mgra_denorm['region'] = 'San Diego'
mgra_denorm

Unnamed: 0,mgra_id,mgra,census_tract,LUZ,cpa,jurisdiction,region
0,1500000100,1,2705,10,Mid-City:Eastern Area,San Diego,San Diego
1,1500000200,2,5601,28,Downtown,San Diego,San Diego
2,1500000300,3,15407,112,*Not in a CPA*,El Cajon,San Diego
3,1500000400,4,17407,151,*Not in a CPA*,Encinitas,San Diego
4,1500000500,5,17407,151,*Not in a CPA*,Encinitas,San Diego
...,...,...,...,...,...,...,...
24316,1502431700,24317,2905,11,County Islands,Unincorporated,San Diego
24317,1502431800,24318,20706,133,North County Metro,Unincorporated,San Diego
24318,1502431900,24319,21001,225,Mountain Empire,Unincorporated,San Diego
24319,1502432000,24320,21002,224,Desert,Unincorporated,San Diego


# Age Group Table

In [21]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\age_group_query.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,mgra_id,yr_id,name,population
0,1500000100,2020,Under 5,19
1,1500000100,2020,5 to 9,9
2,1500000100,2020,10 to 14,28
3,1500000100,2020,15 to 17,18
4,1500000100,2020,18 and 19,5
...,...,...,...,...
1459255,1502014700,2020,15 to 17,10
1459256,1502014700,2020,18 and 19,6
1459257,1502014700,2020,20 to 24,25
1459258,1502014700,2020,25 to 29,22


In [26]:
def age_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['name'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    age_output = age_output.merge(mgra_denorm[['mgra_id', geo_level]], how='left', on='mgra_id')
    age_output = age_output.drop('mgra_id', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id']).sum()
    first_column = age_output.pop('Under 5')
    age_output.insert(0, 'Under 5', first_column)

    return age_output

In [28]:
age_cleaning(df, geo_level='cpa')

Unnamed: 0_level_0,Unnamed: 1_level_0,Under 5,10 to 14,15 to 17,18 and 19,20 to 24,25 to 29,30 to 34,35 to 39,40 to 44,45 to 49,5 to 9,50 to 54,55 to 59,60 and 61,62 to 64,65 to 69,70 to 74,75 to 79,80 to 84,85 and Older
cpa,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
*Not in a CPA*,2020,97378,105426,57728,38003,103403,113490,105018,99386,90323,89614,95921,86231,84679,28942,40112,57066,40541,27237,19015,25595
*Not in a CPA*,2021,92116,104554,56221,37169,110774,115292,101947,98556,91236,87132,97842,80894,86472,29188,38378,56384,44239,27238,20061,26601
*Not in a CPA*,2022,75675,97646,63148,39898,107726,105529,112117,101292,92100,83986,98023,85896,83339,29716,42005,56698,46936,28277,20781,28558
32nd Street Naval Station,2020,0,0,0,1241,3842,932,216,128,82,17,0,1,2,0,0,0,0,0,0,0
32nd Street Naval Station,2021,0,0,0,1271,3945,955,222,130,84,18,0,2,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Valley Center,2021,2256,1333,710,409,1243,1521,1264,1593,1034,1067,1422,869,1691,623,503,805,652,462,261,317
Valley Center,2022,1832,1144,1058,469,1261,1631,1332,1658,1072,1110,1701,879,1407,851,540,807,694,528,247,342
Via De La Valle,2020,15,31,15,0,2,21,14,5,17,18,10,53,17,25,13,30,52,28,10,20
Via De La Valle,2021,19,28,13,2,0,17,20,7,11,14,12,56,18,7,16,44,47,26,8,21


In [22]:
age_output = pd.pivot(df, values='population', index=['mgra_id', 'yr_id'], columns=['name'])
age_output.columns.name = ''
age_output = age_output.reset_index()
age_output = age_output.merge(mgra_denorm[['mgra_id', 'mgra']], how='left', on='mgra_id')
age_output = age_output.drop('mgra_id', axis=1)
age_output = age_output.groupby(['mgra', 'yr_id']).sum()
first_column = age_output.pop('Under 5')
age_output.insert(0, 'Under 5', first_column)

age_output

Unnamed: 0_level_0,Unnamed: 1_level_0,Under 5,10 to 14,15 to 17,18 and 19,20 to 24,25 to 29,30 to 34,35 to 39,40 to 44,45 to 49,5 to 9,50 to 54,55 to 59,60 and 61,62 to 64,65 to 69,70 to 74,75 to 79,80 to 84,85 and Older
mgra,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2020,19,28,18,5,49,20,6,23,30,30,9,33,35,14,15,32,53,23,18,5
1,2021,12,26,18,7,60,25,5,21,35,38,12,21,33,10,9,34,41,15,21,6
1,2022,14,21,6,11,68,24,11,15,26,33,7,21,40,16,19,27,32,21,20,8
2,2020,2,1,1,1,6,8,12,19,9,8,0,6,6,6,8,11,7,4,3,12
2,2021,2,0,2,1,7,7,11,21,9,9,0,7,6,3,6,12,8,5,3,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24320,2021,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24320,2022,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
24321,2020,8,2,3,2,13,6,1,4,7,6,2,8,4,0,0,3,0,0,0,0
24321,2021,10,6,4,1,13,2,1,0,4,6,6,8,1,1,0,3,0,1,0,1


In [40]:
age_cleaning(df, geo_level='region').to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\age\region_age_est_2022_01_ind_QA.csv')

In [30]:
mgra_denorm

Unnamed: 0,mgra_id,mgra,census_tract,LUZ,cpa,jurisdiction,region
0,1500000100,1,2705,10,Mid-City:Eastern Area,San Diego,San Diego
1,1500000200,2,5601,28,Downtown,San Diego,San Diego
2,1500000300,3,15407,112,*Not in a CPA*,El Cajon,San Diego
3,1500000400,4,17407,151,*Not in a CPA*,Encinitas,San Diego
4,1500000500,5,17407,151,*Not in a CPA*,Encinitas,San Diego
...,...,...,...,...,...,...,...
24316,1502431700,24317,2905,11,County Islands,Unincorporated,San Diego
24317,1502431800,24318,20706,133,North County Metro,Unincorporated,San Diego
24318,1502431900,24319,21001,225,Mountain Empire,Unincorporated,San Diego
24319,1502432000,24320,21002,224,Desert,Unincorporated,San Diego


# Age Sex Ethnicity

In [4]:
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\SQL_Queries\age_sex_eth_2.sql', 'r') as sql_file:
    sql_query = sql_file.read()

df =  pd.read_sql_query(sql_query, conn)
df

Unnamed: 0,mgra,yr_id,age_group,sex,race,Unnamed: 6
0,16521,2022,20 to 24,Male,"Non-Hispanic, White",14
1,19041,2021,40 to 44,Female,"Non-Hispanic, Asian",5
2,8616,2020,62 to 64,Female,"Non-Hispanic, White",4
3,10795,2022,18 and 19,Female,"Non-Hispanic, Asian",0
4,11577,2022,60 and 61,Female,"Non-Hispanic, White",5
...,...,...,...,...,...,...
23348155,20301,2022,45 to 49,Male,"Non-Hispanic, Other",0
23348156,9741,2021,10 to 14,Female,Hispanic,47
23348157,21011,2022,60 and 61,Female,"Non-Hispanic, Asian",0
23348158,9311,2022,20 to 24,Male,"Non-Hispanic, Hawaiian or Pacific Islander",0


In [9]:
df.columns

Index(['mgra', 'yr_id', 'age_group', 'sex', 'race', ''], dtype='object')

In [10]:
def age_sex_eth_cleaning(df, geo_level):
    age_output = pd.pivot(df, values='', index=['mgra', 'yr_id', 'age_group', 'sex'], columns=['race'])
    age_output.columns.name = ''
    age_output = age_output.reset_index()
    if geo_level != 'mgra':
        age_output = age_output.merge(mgra_denorm[['mgra', geo_level]], how='left', on='mgra')
        age_output = age_output.drop('mgra', axis=1)
    age_output = age_output.groupby([geo_level, 'yr_id', 'age_group', 'sex']).sum()
    # first_column = age_output.pop('Under 5')
    # age_output.insert(0, 'Under 5', first_column)

    return age_output

In [12]:
output = age_sex_eth_cleaning(df, geo_level='mgra').to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\age_sex_ethnicity\mgra_age_sex_ethnicty_est_2022_01_ind_QA.csv')

output

In [None]:
age_sex_eth_cleaning(df, geo_level='census_tract').to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\age_sex_ethnicity\census_tract_age_sex_ethnicty_est_2022_01_ind_QA.csv')

In [None]:
age_sex_eth_cleaning(df, geo_level='LUZ').to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\age_sex_ethnicity\LUZ_age_sex_ethnicty_est_2022_01_ind_QA.csv')

In [None]:
age_sex_eth_cleaning(df, geo_level='cpa').to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\age_sex_ethnicity\cpa_age_sex_ethnicty_est_2022_01_ind_QA.csv')

In [None]:
age_sex_eth_cleaning(df, geo_level='jurisdiction').to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\age_sex_ethnicity\jurisdiction_age_sex_ethnicty_est_2022_01_ind_QA.csv')

In [None]:
age_sex_eth_cleaning(df, geo_level='region').to_csv(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-024 Estimates 2022\Data\age_sex_ethnicity\region_age_sex_ethnicty_est_2022_01_ind_QA.csv')

In [54]:
mgra_denorm.columns

Index(['mgra_id', 'mgra', 'census_tract', 'LUZ', 'cpa', 'jurisdiction',
       'region'],
      dtype='object')