# CRA Employment Checks 

In [1]:
# Libraries 
import numpy as np
import pandas as pd
import pyodbc

# EC2_Data_File1_Build

## 1. Reviewing SQL Data Extract
Ensuring that the SQL output matches what we have in the excel 

### Downloading Data From SQL

In [2]:
# Staging Data (SQL)
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=ws;'
                    'Trusted_Connection=yes;')

In [3]:
query = """
-- Population by Age
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_age_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

Pop_by_Age_SQL =  pd.read_sql_query(query, conn)

In [4]:
query = """
-- Population by HH & GQ
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_demographics_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

Pop_by_Type_SQL =  pd.read_sql_query(query, conn)

In [5]:
Pop_by_Type_SQL['employment_center_id'].nunique()

146

In [6]:
query = """
-- Population by Race/Ethnicity
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_ethnicity_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

Pop_by_Race_Eth_SQL =  pd.read_sql_query(query, conn)

In [7]:
Pop_by_Race_Eth_SQL['employment_center_id'].nunique()

146

In [8]:
query = """
-- Housing Units by Structure Type
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_housing_by_structure_type_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

HU_by_Type_SQL =  pd.read_sql_query(query, conn)

In [9]:
query = """
-- Land Area (in square miles) 
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_land_area]
		@release_id = 2
"""

EC_by_Land_Area_SQL =  pd.read_sql_query(query, conn)

# Building out the automation

In [20]:
def download_and_clean_employment_center_data(conn):
    query = """
            -- EC List Info 
            SELECT [employment_center_id],
                [employment_center_name],
                [tier],
                [parent]
            FROM [ws].[employment_centers].[dim_employment_center_2]
            """
    ec_list =  pd.read_sql_query(query, conn)

    ec_list['Type'] = np.where((ec_list['tier'] == 0) & (ec_list['parent'].isna()), 'Combined Center',
                            np.where((ec_list['tier'] == 0) & (~ec_list['parent'].isna()), 'Sub-Center',
                                     np.nan))
    ec_list['Type'].replace('nan', np.nan, inplace=True)
    ec_list = ec_list.rename(columns={'employment_center_id': 'EC_ID', 'employment_center_name': 'EC_Name', 'tier': 'Tier', 'parent':'Parent'})
    
    ec_list.sort_values("EC_ID", ascending=True, inplace=True)

    return ec_list

In [21]:
ec_list = download_and_clean_employment_center_data(conn)
ec_list

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type
64,1,San Diego Airport,3,,
65,2,Alpine,4,,
118,3,Barrio Logan,4,,
119,4,Carlsbad Palomar Airport,2,,
120,5,Carlsbad State Beach,3,,
...,...,...,...,...,...
70,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,71.0,Sub-Center
71,1040,Sorrento Valley West Sub-Center: UCSD,0,71.0,Sub-Center
72,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,71.0,Sub-Center
73,1042,Sorrento Valley West Sub-Center: UTC Office,0,71.0,Sub-Center


## EC_by_Land_Area Transformations

In [None]:
def ec_by_land_area_transformations(ec_list):

    query = """
            -- Land Area (in square miles) 
            DECLARE	@return_value int

            EXEC	@return_value = [employment_centers].[sp_land_area]
                    @release_id = 2
            """

    EC_by_Land_Area_SQL =  pd.read_sql_query(query, conn)


    ec_sq_miles_dict = dict(zip(EC_by_Land_Area_SQL.employment_center_id, EC_by_Land_Area_SQL.sq_miles))

    ec_list['Area_Sq_Mi'] = ec_list['EC_ID'].map(ec_sq_miles_dict)

    return ec_list


In [22]:
ec_sq_miles_dict = dict(zip(EC_by_Land_Area_SQL.employment_center_id, EC_by_Land_Area_SQL.sq_miles))

In [23]:
ec_list['Area_Sq_Mi'] = ec_list['EC_ID'].map(ec_sq_miles_dict)

In [24]:
ec_list

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi
64,1,San Diego Airport,3,,,2.58
65,2,Alpine,4,,,2.67
118,3,Barrio Logan,4,,,0.81
119,4,Carlsbad Palomar Airport,2,,,4.72
120,5,Carlsbad State Beach,3,,,1.57
...,...,...,...,...,...,...
70,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,71.0,Sub-Center,2.52
71,1040,Sorrento Valley West Sub-Center: UCSD,0,71.0,Sub-Center,1.13
72,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,71.0,Sub-Center,1.02
73,1042,Sorrento Valley West Sub-Center: UTC Office,0,71.0,Sub-Center,0.88


## Pop_by_Type Transformations

In [None]:
def pop_by_type_transformations():
    query = """
        -- Population by HH & GQ
        DECLARE	@return_value int

        EXEC	@return_value = [employment_centers].[sp_demographics_by_center]
                @release_id = 2,
                @demographic_warehouse_datasource_id = 45,
                @year = 2021
        """

    Pop_by_Type_SQL =  pd.read_sql_query(query, conn)

    pop_by_type_clean = Pop_by_Type_SQL[['employment_center_id', 'long_name', 'pop']]

    pop_by_type_pivot = pd.pivot_table(pop_by_type_clean, index='employment_center_id', columns='long_name', values='pop')
    pop_by_type_pivot['Total'] = pop_by_type_pivot.sum(axis=1)
    pop_by_type_pivot.columns.name = ''
    pop_by_type_pivot = pop_by_type_pivot.rename(columns={'Total':'Pop_Total', 'Household Population':'Pop_HH', 'Group Quarters - Military':'GQ_Mil', 'Group Quarters - College': 'GQ_Col', 'Group Quarters - Other': 'GQ_Oth'})
    pop_by_type_pivot['Pop_GQ'] = pop_by_type_pivot['GQ_Mil'] + pop_by_type_pivot['GQ_Col'] + pop_by_type_pivot['GQ_Oth']
    pop_by_type_pivot = pop_by_type_pivot[['Pop_Total', 'Pop_HH', 'Pop_GQ', 'GQ_Mil', 'GQ_Col', 'GQ_Oth']]
    pop_by_type_pivot['Pop_HH_%'] = round(pop_by_type_pivot['Pop_HH'] / pop_by_type_pivot['Pop_Total'], 4)*100
    pop_by_type_pivot['Pop_GQ_%'] = round(pop_by_type_pivot['Pop_GQ'] / pop_by_type_pivot['Pop_Total'], 4)*100 
    pop_by_type_pivot['GQ_Mil_%'] = round(pop_by_type_pivot['GQ_Mil'] / pop_by_type_pivot['Pop_Total'], 4)*100
    pop_by_type_pivot['GQ_Col_%'] = round(pop_by_type_pivot['GQ_Col'] / pop_by_type_pivot['Pop_Total'], 4)*100
    pop_by_type_pivot['GQ_Oth_%'] = round(pop_by_type_pivot['GQ_Oth'] / pop_by_type_pivot['Pop_Total'], 4)*100

    return pd.merge(ec_list, pop_by_type_pivot, how = 'inner', left_on='EC_ID', right_index=True)

In [25]:
pop_by_type_clean = Pop_by_Type_SQL[['employment_center_id', 'long_name', 'pop']]

In [26]:
pop_by_type_pivot = pd.pivot_table(pop_by_type_clean, index='employment_center_id', columns='long_name', values='pop')
pop_by_type_pivot['Total'] = pop_by_type_pivot.sum(axis=1)
pop_by_type_pivot.columns.name = ''
pop_by_type_pivot = pop_by_type_pivot.rename(columns={'Total':'Pop_Total', 'Household Population':'Pop_HH', 'Group Quarters - Military':'GQ_Mil', 'Group Quarters - College': 'GQ_Col', 'Group Quarters - Other': 'GQ_Oth'})
pop_by_type_pivot['Pop_GQ'] = pop_by_type_pivot['GQ_Mil'] + pop_by_type_pivot['GQ_Col'] + pop_by_type_pivot['GQ_Oth']
pop_by_type_pivot = pop_by_type_pivot[['Pop_Total', 'Pop_HH', 'Pop_GQ', 'GQ_Mil', 'GQ_Col', 'GQ_Oth']]
pop_by_type_pivot['Pop_HH_%'] = round(pop_by_type_pivot['Pop_HH'] / pop_by_type_pivot['Pop_Total'], 4)*100
pop_by_type_pivot['Pop_GQ_%'] = round(pop_by_type_pivot['Pop_GQ'] / pop_by_type_pivot['Pop_Total'], 4)*100 
pop_by_type_pivot['GQ_Mil_%'] = round(pop_by_type_pivot['GQ_Mil'] / pop_by_type_pivot['Pop_Total'], 4)*100
pop_by_type_pivot['GQ_Col_%'] = round(pop_by_type_pivot['GQ_Col'] / pop_by_type_pivot['Pop_Total'], 4)*100
pop_by_type_pivot['GQ_Oth_%'] = round(pop_by_type_pivot['GQ_Oth'] / pop_by_type_pivot['Pop_Total'], 4)*100

In [27]:
pop_by_type_pivot

Unnamed: 0_level_0,Pop_Total,Pop_HH,Pop_GQ,GQ_Mil,GQ_Col,GQ_Oth,Pop_HH_%,Pop_GQ_%,GQ_Mil_%,GQ_Col_%,GQ_Oth_%
employment_center_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,6540,6294,246,17,0,229,96.24,3.76,0.26,0.0,3.50
2,4435,4326,109,0,0,109,97.54,2.46,0.00,0.0,2.46
3,5420,5232,188,0,0,188,96.53,3.47,0.00,0.0,3.47
4,876,817,59,0,0,59,93.26,6.74,0.00,0.0,6.74
5,836,827,9,0,0,9,98.92,1.08,0.00,0.0,1.08
...,...,...,...,...,...,...,...,...,...,...,...
1039,4,4,0,0,0,0,100.00,0.00,0.00,0.0,0.00
1040,9356,0,9356,0,9356,0,0.00,100.00,0.00,100.0,0.00
1041,4244,4177,67,0,0,67,98.42,1.58,0.00,0.0,1.58
1042,1502,1502,0,0,0,0,100.00,0.00,0.00,0.0,0.00


In [28]:
ec_list = pd.merge(ec_list, pop_by_type_pivot, how = 'inner', left_on='EC_ID', right_index=True)
ec_list.head(3)

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi,Pop_Total,Pop_HH,Pop_GQ,GQ_Mil,GQ_Col,GQ_Oth,Pop_HH_%,Pop_GQ_%,GQ_Mil_%,GQ_Col_%,GQ_Oth_%
64,1,San Diego Airport,3,,,2.58,6540,6294,246,17,0,229,96.24,3.76,0.26,0.0,3.5
65,2,Alpine,4,,,2.67,4435,4326,109,0,0,109,97.54,2.46,0.0,0.0,2.46
118,3,Barrio Logan,4,,,0.81,5420,5232,188,0,0,188,96.53,3.47,0.0,0.0,3.47


## Pop_by_Age Transformations

In [None]:
def pop_by_age_transformations():
    
 

In [31]:
Pop_by_Age_SQL['age_group'] = Pop_by_Age_SQL['age_group'].str.replace(' ', '_')

In [32]:
Pop_by_Age_SQL

Unnamed: 0,tier,employment_center_id,employment_center_name,age_group_id,age_group,pop
0,0,80,Kearny Mesa,7,25_to_29,1203
1,0,80,Kearny Mesa,11,45_to_49,1199
2,0,80,Kearny Mesa,2,5_to_9,1339
3,0,80,Kearny Mesa,20,85_and_Older,360
4,0,80,Kearny Mesa,18,75_to_79,511
...,...,...,...,...,...,...
2915,6,108,Viejas Casino & Resort,7,25_to_29,34
2916,6,108,Viejas Casino & Resort,9,35_to_39,29
2917,6,108,Viejas Casino & Resort,13,55_to_59,28
2918,6,108,Viejas Casino & Resort,16,65_to_69,31


In [33]:
pd.pivot_table(Pop_by_Age_SQL, index='employment_center_id', columns='age_group_id', values='pop')

age_group_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20
employment_center_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,359,333,347,241,147,425,351,398,534,527,493,394,449,169,217,336,306,227,118,169
2,251,310,306,139,86,298,239,263,295,264,243,275,312,108,196,282,234,125,95,114
3,366,483,483,274,167,471,391,339,335,362,350,317,271,96,160,191,140,91,79,54
4,76,66,55,24,29,85,55,46,29,66,43,85,50,23,28,30,36,23,7,20
5,40,55,50,28,19,56,42,49,45,61,62,58,64,20,41,45,51,19,16,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1039,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1040,0,0,0,22,5792,3016,396,78,14,10,9,11,5,2,1,0,0,0,0,0
1041,315,339,271,128,97,223,238,327,489,432,298,212,207,59,119,153,143,79,65,50
1042,104,99,83,46,20,56,93,159,159,155,98,88,70,25,37,64,56,37,22,31


In [34]:
Pop_by_Age_pivot = pd.pivot_table(Pop_by_Age_SQL, index='employment_center_id', columns='age_group_id', values='pop')
Pop_by_Age_pivot.columns.name = ''
Pop_by_Age_pivot['Pop_Total'] = Pop_by_Age_pivot.sum(axis=1)

age_group_encoding = dict(zip(Pop_by_Age_SQL.age_group_id, Pop_by_Age_SQL.age_group))
for col in range(1,21):
    Pop_by_Age_pivot[f"Pop_{age_group_encoding[col]}_Yrs_%"] = round(Pop_by_Age_pivot[col] / Pop_by_Age_pivot['Pop_Total'], 4)*100

for key in age_group_encoding.keys():
    Pop_by_Age_pivot = Pop_by_Age_pivot.rename(columns={key: f"Pop_{age_group_encoding[key]}_Yrs"})

# Move Pop_Total to the first column 
pop_column = Pop_by_Age_pivot.pop('Pop_Total')
Pop_by_Age_pivot.insert(0, 'Pop_Total', pop_column)

In [35]:
ec_list = pd.merge(ec_list, Pop_by_Age_pivot, how = 'inner', left_on='EC_ID', right_index=True)
ec_list.head(3)

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi,Pop_Total_x,Pop_HH,Pop_GQ,GQ_Mil,...,Pop_45_to_49_Yrs_%,Pop_50_to_54_Yrs_%,Pop_55_to_59_Yrs_%,Pop_60_and_61_Yrs_%,Pop_62_to_64_Yrs_%,Pop_65_to_69_Yrs_%,Pop_70_to_74_Yrs_%,Pop_75_to_79_Yrs_%,Pop_80_to_84_Yrs_%,Pop_85_and_Older_Yrs_%
64,1,San Diego Airport,3,,,2.58,6540,6294,246,17,...,7.54,6.02,6.87,2.58,3.32,5.14,4.68,3.47,1.8,2.58
65,2,Alpine,4,,,2.67,4435,4326,109,0,...,5.48,6.2,7.03,2.44,4.42,6.36,5.28,2.82,2.14,2.57
118,3,Barrio Logan,4,,,0.81,5420,5232,188,0,...,6.46,5.85,5.0,1.77,2.95,3.52,2.58,1.68,1.46,1.0


## Pop_by_Race_Eth Transformations

In [36]:
Pop_by_Race_Eth_SQL['short_name'] = Pop_by_Race_Eth_SQL['short_name'].str.replace(' ', '_')

In [37]:
Pop_by_Race_pivot = pd.pivot_table(Pop_by_Race_Eth_SQL, index='employment_center_id', columns='ethnicity_id', values='pop')
Pop_by_Race_pivot.columns.name = ''
Pop_by_Race_pivot['Pop_Total'] = Pop_by_Race_pivot.sum(axis=1)

race_group_encoding = dict(zip(Pop_by_Race_Eth_SQL.ethnicity_id, Pop_by_Race_Eth_SQL.short_name))
for col in range(1,9):
    Pop_by_Race_pivot[f"{race_group_encoding[col]}_%"] = round(Pop_by_Race_pivot[col] / Pop_by_Race_pivot['Pop_Total'], 4)*100

for key in race_group_encoding.keys():
    Pop_by_Race_pivot = Pop_by_Race_pivot.rename(columns={key: f"Pop_{race_group_encoding[key]}"})

# Move Pop_Total to the first column 
pop_column = Pop_by_Race_pivot.pop('Pop_Total')
Pop_by_Race_pivot.insert(0, 'Pop_Total', pop_column)


In [38]:
ec_list = pd.merge(ec_list, Pop_by_Race_pivot, how = 'inner', left_on='EC_ID', right_index=True)
ec_list

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi,Pop_Total_x,Pop_HH,Pop_GQ,GQ_Mil,...,Pop_Other,Pop_Two_or_More,Hispanic_%,White_%,Black_%,American_Indian_%,Asian_%,Pacific_Islander_%,Other_%,Two_or_More_%
64,1,San Diego Airport,3,,,2.58,6540,6294,246,17,...,21,287,23.96,59.57,4.22,0.43,6.71,0.40,0.32,4.39
65,2,Alpine,4,,,2.67,4435,4326,109,0,...,10,150,19.21,69.33,2.39,0.36,4.80,0.29,0.23,3.38
118,3,Barrio Logan,4,,,0.81,5420,5232,188,0,...,2,108,74.17,14.45,5.28,0.30,3.45,0.33,0.04,1.99
119,4,Carlsbad Palomar Airport,2,,,4.72,876,817,59,0,...,2,25,22.83,58.11,1.71,0.34,13.36,0.57,0.23,2.85
120,5,Carlsbad State Beach,3,,,1.57,836,827,9,0,...,0,32,18.18,67.22,0.60,0.24,9.81,0.12,0.00,3.83
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,71.0,Sub-Center,2.52,4,4,0,0,...,0,0,0.00,100.00,0.00,0.00,0.00,0.00,0.00,0.00
71,1040,Sorrento Valley West Sub-Center: UCSD,0,71.0,Sub-Center,1.13,9356,0,9356,0,...,42,453,11.98,46.49,2.58,0.17,33.41,0.07,0.45,4.84
72,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,71.0,Sub-Center,1.02,4244,4177,67,0,...,8,216,28.75,39.44,2.62,0.24,23.30,0.38,0.19,5.09
73,1042,Sorrento Valley West Sub-Center: UTC Office,0,71.0,Sub-Center,0.88,1502,1502,0,0,...,5,71,9.19,40.75,2.26,0.53,41.41,0.80,0.33,4.73


## HU_by_Type Transformations

In [39]:
# Housing Units Work 
HU_by_Type_SQL_clean = HU_by_Type_SQL[HU_by_Type_SQL['structure_type_id'].isin([1,2,3,4])]
HU_by_Type_pivot_HU = pd.pivot_table(HU_by_Type_SQL_clean, index='employment_center_id', columns='structure_type_id', values='units')
HU_by_Type_pivot_HU
HU_by_Type_pivot_HU.columns.name = ''
HU_by_Type_pivot_HU['HU_Total'] = HU_by_Type_pivot_HU.sum(axis=1)
HU_by_Type_pivot_HU

HU_structure_type_group_encoding = {1:'HU_SFD', 2:'HU_SFA', 3:'HU_MF', 4:'HU_MH' }
for col in range(1,5):
    HU_by_Type_pivot_HU[f"{HU_structure_type_group_encoding[col]}_%"] = round(HU_by_Type_pivot_HU[col] / HU_by_Type_pivot_HU['HU_Total'], 4)*100

HU_by_Type_pivot_HU = HU_by_Type_pivot_HU.rename(columns=HU_structure_type_group_encoding)

HU_by_Type_pivot_HU['HU_SF'] = HU_by_Type_pivot_HU['HU_SFD'] + HU_by_Type_pivot_HU['HU_SFA']
HU_by_Type_pivot_HU['HU_SF_%'] = round(HU_by_Type_pivot_HU['HU_SF'] / HU_by_Type_pivot_HU['HU_Total'], 4)*100


hu_sf_column = HU_by_Type_pivot_HU.pop('HU_SF')
HU_by_Type_pivot_HU.insert(2, 'HU_SF', hu_sf_column)

HU_by_Type_pivot_HU

Unnamed: 0_level_0,HU_SFD,HU_SFA,HU_SF,HU_MF,HU_MH,HU_Total,HU_SFD_%,HU_SFA_%,HU_MF_%,HU_MH_%,HU_SF_%
employment_center_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,770,266,1036,1748,0,2784,27.66,9.55,62.79,0.00,37.21
2,816,55,871,649,154,1674,48.75,3.29,38.77,9.20,52.03
3,642,173,815,867,0,1682,38.17,10.29,51.55,0.00,48.45
4,76,21,97,245,0,342,22.22,6.14,71.64,0.00,28.36
5,1,0,1,272,108,381,0.26,0.00,71.39,28.35,0.26
...,...,...,...,...,...,...,...,...,...,...,...
1039,0,4,4,0,0,4,0.00,100.00,0.00,0.00,100.00
1040,0,0,0,0,0,0,,,,,
1041,33,131,164,1419,0,1583,2.08,8.28,89.64,0.00,10.36
1042,0,0,0,669,0,669,0.00,0.00,100.00,0.00,0.00


In [40]:
ec_list = pd.merge(ec_list, HU_by_Type_pivot_HU, how = 'inner', left_on='EC_ID', right_index=True)
ec_list

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi,Pop_Total_x,Pop_HH,Pop_GQ,GQ_Mil,...,HU_SFA,HU_SF,HU_MF,HU_MH,HU_Total,HU_SFD_%,HU_SFA_%,HU_MF_%,HU_MH_%,HU_SF_%
64,1,San Diego Airport,3,,,2.58,6540,6294,246,17,...,266,1036,1748,0,2784,27.66,9.55,62.79,0.00,37.21
65,2,Alpine,4,,,2.67,4435,4326,109,0,...,55,871,649,154,1674,48.75,3.29,38.77,9.20,52.03
118,3,Barrio Logan,4,,,0.81,5420,5232,188,0,...,173,815,867,0,1682,38.17,10.29,51.55,0.00,48.45
119,4,Carlsbad Palomar Airport,2,,,4.72,876,817,59,0,...,21,97,245,0,342,22.22,6.14,71.64,0.00,28.36
120,5,Carlsbad State Beach,3,,,1.57,836,827,9,0,...,0,1,272,108,381,0.26,0.00,71.39,28.35,0.26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,71.0,Sub-Center,2.52,4,4,0,0,...,4,4,0,0,4,0.00,100.00,0.00,0.00,100.00
71,1040,Sorrento Valley West Sub-Center: UCSD,0,71.0,Sub-Center,1.13,9356,0,9356,0,...,0,0,0,0,0,,,,,
72,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,71.0,Sub-Center,1.02,4244,4177,67,0,...,131,164,1419,0,1583,2.08,8.28,89.64,0.00,10.36
73,1042,Sorrento Valley West Sub-Center: UTC Office,0,71.0,Sub-Center,0.88,1502,1502,0,0,...,0,0,669,0,669,0.00,0.00,100.00,0.00,0.00


In [41]:
# HH Work 
HH_by_Type_SQL_clean = HU_by_Type_SQL[HU_by_Type_SQL['structure_type_id'].isin([1,2,3,4])]
HU_by_Type_pivot_HH = pd.pivot_table(HH_by_Type_SQL_clean, index='employment_center_id', columns='structure_type_id', values='hh')
HU_by_Type_pivot_HH
HU_by_Type_pivot_HH.columns.name = ''
HU_by_Type_pivot_HH['HH_Total'] = HU_by_Type_pivot_HH.sum(axis=1)
HU_by_Type_pivot_HH

HH_structure_type_group_encoding = {1:'HH_SFD', 2:'HH_SFA', 3:'HH_MF', 4:'HH_MH' }
for col in range(1,5):
    HU_by_Type_pivot_HH[f"{HH_structure_type_group_encoding[col]}_%"] = round(HU_by_Type_pivot_HH[col] / HU_by_Type_pivot_HH['HH_Total'], 4)*100

HU_by_Type_pivot_HH = HU_by_Type_pivot_HH.rename(columns=HH_structure_type_group_encoding)

HU_by_Type_pivot_HH['HH_SF'] = HU_by_Type_pivot_HH['HH_SFD'] + HU_by_Type_pivot_HH['HH_SFA']
HU_by_Type_pivot_HH['HH_SF_%'] = round(HU_by_Type_pivot_HH['HH_SF'] / HU_by_Type_pivot_HH['HH_Total'], 4)*100


hh_sf_column = HU_by_Type_pivot_HH.pop('HH_SF')
HU_by_Type_pivot_HH.insert(2, 'HH_SF', hh_sf_column)

HU_by_Type_pivot_HH

Unnamed: 0_level_0,HH_SFD,HH_SFA,HH_SF,HH_MF,HH_MH,HH_Total,HH_SFD_%,HH_SFA_%,HH_MF_%,HH_MH_%,HH_SF_%
employment_center_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,748,257,1005,1673,0,2678,27.93,9.60,62.47,0.00,37.53
2,777,52,829,626,150,1605,48.41,3.24,39.00,9.35,51.65
3,628,170,798,816,0,1614,38.91,10.53,50.56,0.00,49.44
4,74,18,92,193,0,285,25.96,6.32,67.72,0.00,32.28
5,1,0,1,250,87,338,0.30,0.00,73.96,25.74,0.30
...,...,...,...,...,...,...,...,...,...,...,...
1039,0,1,1,0,0,1,0.00,100.00,0.00,0.00,100.00
1040,0,0,0,0,0,0,,,,,
1041,32,128,160,1402,0,1562,2.05,8.19,89.76,0.00,10.24
1042,0,0,0,647,0,647,0.00,0.00,100.00,0.00,0.00


In [42]:
ec_list = pd.merge(ec_list, HU_by_Type_pivot_HH, how = 'inner', left_on='EC_ID', right_index=True)
ec_list

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi,Pop_Total_x,Pop_HH,Pop_GQ,GQ_Mil,...,HH_SFA,HH_SF,HH_MF,HH_MH,HH_Total,HH_SFD_%,HH_SFA_%,HH_MF_%,HH_MH_%,HH_SF_%
64,1,San Diego Airport,3,,,2.58,6540,6294,246,17,...,257,1005,1673,0,2678,27.93,9.60,62.47,0.00,37.53
65,2,Alpine,4,,,2.67,4435,4326,109,0,...,52,829,626,150,1605,48.41,3.24,39.00,9.35,51.65
118,3,Barrio Logan,4,,,0.81,5420,5232,188,0,...,170,798,816,0,1614,38.91,10.53,50.56,0.00,49.44
119,4,Carlsbad Palomar Airport,2,,,4.72,876,817,59,0,...,18,92,193,0,285,25.96,6.32,67.72,0.00,32.28
120,5,Carlsbad State Beach,3,,,1.57,836,827,9,0,...,0,1,250,87,338,0.30,0.00,73.96,25.74,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,71.0,Sub-Center,2.52,4,4,0,0,...,1,1,0,0,1,0.00,100.00,0.00,0.00,100.00
71,1040,Sorrento Valley West Sub-Center: UCSD,0,71.0,Sub-Center,1.13,9356,0,9356,0,...,0,0,0,0,0,,,,,
72,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,71.0,Sub-Center,1.02,4244,4177,67,0,...,128,160,1402,0,1562,2.05,8.19,89.76,0.00,10.24
73,1042,Sorrento Valley West Sub-Center: UTC Office,0,71.0,Sub-Center,0.88,1502,1502,0,0,...,0,0,647,0,647,0.00,0.00,100.00,0.00,0.00


# Cleaning the final data frame 

In [43]:
ec_list.fillna(0, inplace=True) # Dave filled all the columns where a zero was devided, to NaN - do at end

In [44]:
ec_list

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi,Pop_Total_x,Pop_HH,Pop_GQ,GQ_Mil,...,HH_SFA,HH_SF,HH_MF,HH_MH,HH_Total,HH_SFD_%,HH_SFA_%,HH_MF_%,HH_MH_%,HH_SF_%
64,1,San Diego Airport,3,0.0,0,2.58,6540,6294,246,17,...,257,1005,1673,0,2678,27.93,9.60,62.47,0.00,37.53
65,2,Alpine,4,0.0,0,2.67,4435,4326,109,0,...,52,829,626,150,1605,48.41,3.24,39.00,9.35,51.65
118,3,Barrio Logan,4,0.0,0,0.81,5420,5232,188,0,...,170,798,816,0,1614,38.91,10.53,50.56,0.00,49.44
119,4,Carlsbad Palomar Airport,2,0.0,0,4.72,876,817,59,0,...,18,92,193,0,285,25.96,6.32,67.72,0.00,32.28
120,5,Carlsbad State Beach,3,0.0,0,1.57,836,827,9,0,...,0,1,250,87,338,0.30,0.00,73.96,25.74,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,71.0,Sub-Center,2.52,4,4,0,0,...,1,1,0,0,1,0.00,100.00,0.00,0.00,100.00
71,1040,Sorrento Valley West Sub-Center: UCSD,0,71.0,Sub-Center,1.13,9356,0,9356,0,...,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00
72,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,71.0,Sub-Center,1.02,4244,4177,67,0,...,128,160,1402,0,1562,2.05,8.19,89.76,0.00,10.24
73,1042,Sorrento Valley West Sub-Center: UTC Office,0,71.0,Sub-Center,0.88,1502,1502,0,0,...,0,0,647,0,647,0.00,0.00,100.00,0.00,0.00


In [45]:
ec_list.columns

Index(['EC_ID', 'EC_Name', 'Tier', 'Parent', 'Type', 'Area_Sq_Mi',
       'Pop_Total_x', 'Pop_HH', 'Pop_GQ', 'GQ_Mil', 'GQ_Col', 'GQ_Oth',
       'Pop_HH_%', 'Pop_GQ_%', 'GQ_Mil_%', 'GQ_Col_%', 'GQ_Oth_%',
       'Pop_Total_y', 'Pop_Under_5_Yrs', 'Pop_5_to_9_Yrs', 'Pop_10_to_14_Yrs',
       'Pop_15_to_17_Yrs', 'Pop_18_and_19_Yrs', 'Pop_20_to_24_Yrs',
       'Pop_25_to_29_Yrs', 'Pop_30_to_34_Yrs', 'Pop_35_to_39_Yrs',
       'Pop_40_to_44_Yrs', 'Pop_45_to_49_Yrs', 'Pop_50_to_54_Yrs',
       'Pop_55_to_59_Yrs', 'Pop_60_and_61_Yrs', 'Pop_62_to_64_Yrs',
       'Pop_65_to_69_Yrs', 'Pop_70_to_74_Yrs', 'Pop_75_to_79_Yrs',
       'Pop_80_to_84_Yrs', 'Pop_85_and_Older_Yrs', 'Pop_Under_5_Yrs_%',
       'Pop_5_to_9_Yrs_%', 'Pop_10_to_14_Yrs_%', 'Pop_15_to_17_Yrs_%',
       'Pop_18_and_19_Yrs_%', 'Pop_20_to_24_Yrs_%', 'Pop_25_to_29_Yrs_%',
       'Pop_30_to_34_Yrs_%', 'Pop_35_to_39_Yrs_%', 'Pop_40_to_44_Yrs_%',
       'Pop_45_to_49_Yrs_%', 'Pop_50_to_54_Yrs_%', 'Pop_55_to_59_Yrs_%',
       'Pop_60_

In [46]:
(ec_list['Pop_Total_x'] == ec_list['Pop_Total_y']).sum()

146

In [47]:
(ec_list['Pop_Total_y'] == ec_list['Pop_Total']).sum()

146

In [48]:
# Beacuse I have confirmed they are equal i will begin dropping columns
ec_list = ec_list.drop(columns=['Pop_Total_x', 'Pop_Total_y'], axis=1)

In [49]:
# Move Pop_Total to the first column 
pop_column = ec_list.pop('Pop_Total')
ec_list.insert(6, 'Pop_Total', pop_column)

In [50]:
# ec_list.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\results\cra_file_1_creation.xlsx', index= False)

In [51]:
# ec_list.to_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\results\cra_file_1_creation.xlsx', index= False)

In [52]:
# HU_by_Type_excel = pd.read_excel(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\data\EC2_Data_File_1_Build.xlsx', sheet_name='HU_by_Type', header=2, usecols="A:G")

In [53]:
ec_list

Unnamed: 0,EC_ID,EC_Name,Tier,Parent,Type,Area_Sq_Mi,Pop_Total,Pop_HH,Pop_GQ,GQ_Mil,...,HH_SFA,HH_SF,HH_MF,HH_MH,HH_Total,HH_SFD_%,HH_SFA_%,HH_MF_%,HH_MH_%,HH_SF_%
64,1,San Diego Airport,3,0.0,0,2.58,6540,6294,246,17,...,257,1005,1673,0,2678,27.93,9.60,62.47,0.00,37.53
65,2,Alpine,4,0.0,0,2.67,4435,4326,109,0,...,52,829,626,150,1605,48.41,3.24,39.00,9.35,51.65
118,3,Barrio Logan,4,0.0,0,0.81,5420,5232,188,0,...,170,798,816,0,1614,38.91,10.53,50.56,0.00,49.44
119,4,Carlsbad Palomar Airport,2,0.0,0,4.72,876,817,59,0,...,18,92,193,0,285,25.96,6.32,67.72,0.00,32.28
120,5,Carlsbad State Beach,3,0.0,0,1.57,836,827,9,0,...,0,1,250,87,338,0.30,0.00,73.96,25.74,0.30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,71.0,Sub-Center,2.52,4,4,0,0,...,1,1,0,0,1,0.00,100.00,0.00,0.00,100.00
71,1040,Sorrento Valley West Sub-Center: UCSD,0,71.0,Sub-Center,1.13,9356,0,9356,0,...,0,0,0,0,0,0.00,0.00,0.00,0.00,0.00
72,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,71.0,Sub-Center,1.02,4244,4177,67,0,...,128,160,1402,0,1562,2.05,8.19,89.76,0.00,10.24
73,1042,Sorrento Valley West Sub-Center: UTC Office,0,71.0,Sub-Center,0.88,1502,1502,0,0,...,0,0,647,0,647,0.00,0.00,100.00,0.00,0.00
