# CRA Employment Checks 

In [85]:
# Libraries 
import numpy as np
import pandas as pd
import pyodbc

# EC2_Data_File1_Build

## 1. Reviewing SQL Data Extract
This section verifies that the output of the SQL stored procedures matches the data in the Excel workbook.

### Downloading Data From SQL

In [86]:
# Staging Data (SQL)
# Assemble the ODBC connection string from its individual settings, then open
# the connection that the query cells below reuse.
connection_string = ''.join([
    'Driver={ODBC Driver 17 for SQL Server};',
    'Server=DDAMWSQL16.sandag.org;',
    'Database=ws;',
    'Trusted_Connection=yes;',
])
conn = pyodbc.connect(connection_string)

In [87]:
# Population by age group per employment center.
# Runs the stored procedure [employment_centers].[sp_age_by_center] with
# release_id 2, demographic warehouse datasource 45 and year 2021, and loads
# its result set into a DataFrame through the open `conn` connection.
query = """
-- Population by Age
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_age_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

Pop_by_Age_SQL =  pd.read_sql_query(query, conn)
# Bare last expression: the notebook renders the frame as the cell output.
Pop_by_Age_SQL

Unnamed: 0,tier,employment_center_id,employment_center_name,age_group_id,age_group,pop
0,0,80,Kearny Mesa,7,25 to 29,1203
1,0,80,Kearny Mesa,11,45 to 49,1199
2,0,80,Kearny Mesa,2,5 to 9,1339
3,0,80,Kearny Mesa,20,85 and Older,360
4,0,80,Kearny Mesa,18,75 to 79,511
...,...,...,...,...,...,...
2915,6,108,Viejas Casino & Resort,7,25 to 29,34
2916,6,108,Viejas Casino & Resort,9,35 to 39,29
2917,6,108,Viejas Casino & Resort,13,55 to 59,28
2918,6,108,Viejas Casino & Resort,16,65 to 69,31


In [88]:
# Population by housing type (household / group quarters) per employment center.
# Runs the stored procedure [employment_centers].[sp_demographics_by_center]
# with release_id 2, demographic warehouse datasource 45 and year 2021, and
# loads its result set into a DataFrame through the open `conn` connection.
query = """
-- Population by HH & GQ
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_demographics_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

Pop_by_Type_SQL =  pd.read_sql_query(query, conn)
# Bare last expression: the notebook renders the frame as the cell output.
Pop_by_Type_SQL

Unnamed: 0,tier,employment_center_id,employment_center_name,housing_type_id,long_name,pop
0,0,80,Kearny Mesa,1,Household Population,19610
1,0,80,Kearny Mesa,2,Group Quarters - Military,0
2,0,80,Kearny Mesa,3,Group Quarters - College,0
3,0,80,Kearny Mesa,4,Group Quarters - Other,71
4,0,81,Sorrento Valley,1,Household Population,48782
...,...,...,...,...,...,...
579,6,107,Valley View Casino & Hotel,4,Group Quarters - Other,1
580,6,108,Viejas Casino & Resort,1,Household Population,538
581,6,108,Viejas Casino & Resort,2,Group Quarters - Military,0
582,6,108,Viejas Casino & Resort,3,Group Quarters - College,0


In [89]:
# Population by race/ethnicity per employment center.
# Runs the stored procedure [employment_centers].[sp_ethnicity_by_center] with
# release_id 2, demographic warehouse datasource 45 and year 2021, and loads
# its result set into a DataFrame through the open `conn` connection.
query = """
-- Population by Race/Ethnicity
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_ethnicity_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

Pop_by_Race_Eth_SQL =  pd.read_sql_query(query, conn)
# Bare last expression: the notebook renders the frame as the cell output.
Pop_by_Race_Eth_SQL

Unnamed: 0,tier,employment_center_id,employment_center_name,ethnicity_id,short_name,pop,pct
0,0,80,Kearny Mesa,1,Hispanic,5306,26.96
1,0,80,Kearny Mesa,2,White,8415,42.76
2,0,80,Kearny Mesa,3,Black,1534,7.79
3,0,80,Kearny Mesa,4,American Indian,93,0.47
4,0,80,Kearny Mesa,5,Asian,3579,18.19
...,...,...,...,...,...,...,...
1163,6,108,Viejas Casino & Resort,4,American Indian,11,2.04
1164,6,108,Viejas Casino & Resort,5,Asian,13,2.42
1165,6,108,Viejas Casino & Resort,6,Pacific Islander,1,0.19
1166,6,108,Viejas Casino & Resort,7,Other,2,0.37


In [90]:
# Housing units by structure type per employment center.
# Runs the stored procedure
# [employment_centers].[sp_housing_by_structure_type_by_center] with
# release_id 2, demographic warehouse datasource 45 and year 2021, and loads
# its result set into a DataFrame through the open `conn` connection.
query = """
-- Housing Units by Structure Type
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_housing_by_structure_type_by_center]
		@release_id = 2,
		@demographic_warehouse_datasource_id = 45,
		@year = 2021
"""

HU_by_Type_SQL =  pd.read_sql_query(query, conn)
# Bare last expression: the notebook renders the frame as the cell output.
HU_by_Type_SQL

Unnamed: 0,tier,employment_center_id,employment_center_name,structure_type_id,long_name,units,hh
0,0,80,Kearny Mesa,1,Single Family - Detached,1010,984
1,0,80,Kearny Mesa,2,Single Family - Multiple Unit,283,272
2,0,80,Kearny Mesa,3,Multifamily,5954,5735
3,0,80,Kearny Mesa,4,Mobile Home,325,297
4,0,80,Kearny Mesa,5,Single-family Detached,0,0
...,...,...,...,...,...,...,...
871,6,108,Viejas Casino & Resort,2,Single Family - Multiple Unit,3,2
872,6,108,Viejas Casino & Resort,3,Multifamily,0,0
873,6,108,Viejas Casino & Resort,4,Mobile Home,34,33
874,6,108,Viejas Casino & Resort,5,Single-family Detached,0,0


In [91]:
# Land area per employment center.
# Runs the stored procedure [employment_centers].[sp_land_area] with
# release_id 2 (this procedure takes no datasource or year argument here) and
# loads its result set into a DataFrame through the open `conn` connection.
query = """
-- Land Area (in square miles) 
DECLARE	@return_value int

EXEC	@return_value = [employment_centers].[sp_land_area]
		@release_id = 2
"""

EC_by_Land_Area_SQL =  pd.read_sql_query(query, conn)
# Bare last expression: the notebook renders the frame as the cell output.
EC_by_Land_Area_SQL

Unnamed: 0,release_id,employment_center_id,employment_center_name,tier,sq_miles
0,2,1,San Diego Airport,3,2.58
1,2,2,Alpine,4,2.67
2,2,3,Barrio Logan,4,0.81
3,2,4,Carlsbad Palomar Airport,2,4.72
4,2,5,Carlsbad State Beach,3,1.57
...,...,...,...,...,...
142,2,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,2.52
143,2,1040,Sorrento Valley West Sub-Center: UCSD,0,1.13
144,2,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,1.02
145,2,1042,Sorrento Valley West Sub-Center: UTC Office,0,0.88


### Downloading the data from Excel

In [109]:
# Read the land-area sheet (column labels come from the third row, header=2)
# and rename its area column to the name used by the SQL extract.
EC_by_Land_Area_excel = (
    pd.read_excel(
        r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\data\EC2_Data_File_1_Build.xlsx',
        sheet_name='EC_by_Land_Area',
        header=2,
    )
    .rename(columns={'Area_Sq_Mi': 'sq_miles'})
)
EC_by_Land_Area_excel

Unnamed: 0,release_id,employment_center_id,employment_center_name,tier,sq_miles
0,2,1,San Diego Airport,3,2.58
1,2,2,Alpine,4,2.67
2,2,3,Barrio Logan,4,0.81
3,2,4,Carlsbad Palomar Airport,2,4.72
4,2,5,Carlsbad State Beach,3,1.57
...,...,...,...,...,...
142,2,1039,Sorrento Valley West Sub-Center: Scripps_Research,0,2.52
143,2,1040,Sorrento Valley West Sub-Center: UCSD,0,1.13
144,2,1041,Sorrento Valley West Sub-Center: UCSD East Hea...,0,1.02
145,2,1042,Sorrento Valley West Sub-Center: UTC Office,0,0.88


In [93]:
# Read the population-by-housing-type sheet: column labels come from the third
# row (header=2) and only spreadsheet columns A through F are kept.
Pop_by_Type_excel = pd.read_excel(
    r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\data\EC2_Data_File_1_Build.xlsx',
    sheet_name='Pop_by_Type',
    header=2,
    usecols="A:F",
)
Pop_by_Type_excel

Unnamed: 0,tier,employment_center_id,employment_center_name,housing_type_id,long_name,pop
0,3,1,San Diego Airport,1,Household Population,6294
1,3,1,San Diego Airport,2,Group Quarters - Military,17
2,3,1,San Diego Airport,3,Group Quarters - College,0
3,3,1,San Diego Airport,4,Group Quarters - Other,229
4,4,2,Alpine,1,Household Population,4326
...,...,...,...,...,...,...
579,0,1042,Sorrento Valley West Sub-Center: UTC Office,4,Group Quarters - Other,0
580,0,1043,Sorrento Valley West Sub-Center: UTC Retail,1,Household Population,0
581,0,1043,Sorrento Valley West Sub-Center: UTC Retail,2,Group Quarters - Military,0
582,0,1043,Sorrento Valley West Sub-Center: UTC Retail,3,Group Quarters - College,0


In [94]:
# Read the population-by-age sheet: column labels come from the third row
# (header=2) and only spreadsheet columns A through F are kept.
Pop_by_Age_excel = pd.read_excel(
    r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\data\EC2_Data_File_1_Build.xlsx',
    sheet_name='Pop_by_Age',
    header=2,
    usecols="A:F",
)
Pop_by_Age_excel

Unnamed: 0,tier,employment_center_id,employment_center_name,age_group_id,age_group,pop
0,3,1,San Diego Airport,1,Under 5,359
1,3,1,San Diego Airport,2,5 to 9,333
2,3,1,San Diego Airport,3,10 to 14,347
3,3,1,San Diego Airport,4,15 to 17,241
4,3,1,San Diego Airport,5,18 and 19,147
...,...,...,...,...,...,...
2915,0,1043,Sorrento Valley West Sub-Center: UTC Retail,16,65 to 69,0
2916,0,1043,Sorrento Valley West Sub-Center: UTC Retail,17,70 to 74,0
2917,0,1043,Sorrento Valley West Sub-Center: UTC Retail,18,75 to 79,0
2918,0,1043,Sorrento Valley West Sub-Center: UTC Retail,19,80 to 84,0


In [95]:
# Read the population-by-race/ethnicity sheet: column labels come from the
# third row (header=2) and only spreadsheet columns A through G are kept.
Pop_by_Race_Eth_excel = pd.read_excel(
    r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\data\EC2_Data_File_1_Build.xlsx',
    sheet_name='Pop_by_Race-Eth',
    header=2,
    usecols="A:G",
)
Pop_by_Race_Eth_excel

Unnamed: 0,tier,employment_center_id,employment_center_name,ethnicity_id,short_name,pop,pct
0,3,1,San Diego Airport,1,Hispanic,1567,23.96
1,3,1,San Diego Airport,2,White,3896,59.57
2,3,1,San Diego Airport,3,Black,276,4.22
3,3,1,San Diego Airport,4,American Indian,28,0.43
4,3,1,San Diego Airport,5,Asian,439,6.71
...,...,...,...,...,...,...,...
1163,0,1043,Sorrento Valley West Sub-Center: UTC Retail,4,American Indian,0,
1164,0,1043,Sorrento Valley West Sub-Center: UTC Retail,5,Asian,0,
1165,0,1043,Sorrento Valley West Sub-Center: UTC Retail,6,Pacific Islander,0,
1166,0,1043,Sorrento Valley West Sub-Center: UTC Retail,7,Other,0,


In [96]:
# Read the housing-units-by-structure-type sheet: column labels come from the
# third row (header=2) and only spreadsheet columns A through G are kept.
HU_by_Type_excel = pd.read_excel(
    r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-006 Employment Centers 2.0\data\EC2_Data_File_1_Build.xlsx',
    sheet_name='HU_by_Type',
    header=2,
    usecols="A:G",
)
HU_by_Type_excel

Unnamed: 0,tier,employment_center_id,employment_center_name,structure_type_id,long_name,units,hh
0,3,1,San Diego Airport,1,Single Family - Detached,770,748
1,3,1,San Diego Airport,2,Single Family - Multiple Unit,266,257
2,3,1,San Diego Airport,3,Multifamily,1748,1673
3,3,1,San Diego Airport,4,Mobile Home,0,0
4,3,1,San Diego Airport,5,Single-family Detached,0,0
...,...,...,...,...,...,...,...
871,0,1043,Sorrento Valley West Sub-Center: UTC Retail,2,Single Family - Multiple Unit,0,0
872,0,1043,Sorrento Valley West Sub-Center: UTC Retail,3,Multifamily,0,0
873,0,1043,Sorrento Valley West Sub-Center: UTC Retail,4,Mobile Home,0,0
874,0,1043,Sorrento Valley West Sub-Center: UTC Retail,5,Single-family Detached,0,0


### Test For Equivalency

In [114]:
def compare_dataframes(sql_df, excel_df):
    """Return True when both frames contain the same rows, ignoring row order.

    The frames are joined on every column of ``sql_df`` with an outer merge
    and the merge indicator is inspected: the frames are equivalent only if
    every merged row was found on both sides and the merge did not change the
    row count. The previous implementation used a left merge and compared
    shapes, which could never fail for same-shaped frames because a left merge
    always keeps every left row regardless of whether it matched.

    Parameters
    ----------
    sql_df : pandas.DataFrame
        Frame loaded from SQL; its column labels are used as the join keys.
    excel_df : pandas.DataFrame
        Frame loaded from Excel to compare against ``sql_df``.

    Returns
    -------
    bool
        True if the frames have the same shape, the same column labels and
        the same rows (in any order); False otherwise.

    Notes
    -----
    * The row index is ignored; only column values are compared.
    * pandas matches null keys against each other in a merge, so a NaN in the
      same cell of both frames counts as equal.
    * Fully duplicated rows that appear more than once in both frames multiply
      in the merge and are reported as a mismatch (conservative result).
    """
    # Cheap rejection first: differing row/column counts can never match.
    if sql_df.shape != excel_df.shape:
        return False

    key_columns = list(sql_df.columns)

    # Same shape but different column labels means the tables do not line up;
    # report a mismatch instead of letting the merge raise a KeyError.
    if set(key_columns) != set(excel_df.columns):
        return False

    # Nothing to compare (and merge requires at least one key column).
    if not key_columns:
        return True

    # indicator=True adds a '_merge' column telling where each row was found:
    # 'both', 'left_only' (SQL only) or 'right_only' (Excel only).
    merged = sql_df.merge(excel_df, how='outer', on=key_columns, indicator=True)

    # A changed row count means unmatched or duplicated rows were introduced.
    if len(merged) != len(sql_df):
        return False

    return bool((merged['_merge'] == 'both').all())

In [115]:
# The two lists are parallel: the frame at each position in SQL_data is
# compared with the frame at the same position in excel_data.
SQL_data = [
    EC_by_Land_Area_SQL,
    HU_by_Type_SQL,
    Pop_by_Race_Eth_SQL,
    Pop_by_Type_SQL,
    Pop_by_Age_SQL,
]
excel_data = [
    EC_by_Land_Area_excel,
    HU_by_Type_excel,
    Pop_by_Race_Eth_excel,
    Pop_by_Type_excel,
    Pop_by_Age_excel,
]

In [116]:
# Walk the paired SQL/Excel frames in order and print one result per dataset.
for sql_frame, excel_frame in zip(SQL_data, excel_data):
    print(compare_dataframes(sql_frame, excel_frame))

True
True
True
True
True
