In [1]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Census Redistricting Data

## Housing Data

### Grabbing Data From SQL

In [2]:
# Read the census-tract housing query text first, so that a missing or
# unreadable query file fails before any database connection is opened.
with open(r'C:\Users\cra\OneDrive - San Diego Association of Governments\QA_Repository\2023\2023-024 Estimates 2022\sql_queries\ct_housing.sql', 'r') as sql_file:
    sql_query = sql_file.read()

# Connect to the `estimates` database using Windows (trusted) authentication.
conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')

# Close the connection explicitly, even when the query raises, so the server
# session is not left open for the lifetime of the notebook kernel.
try:
    census_redistricting_ct_data = pd.read_sql_query(sql_query, conn)
finally:
    conn.close()

# Display the loaded census redistricting housing table.
census_redistricting_ct_data

Unnamed: 0,census_tract,units,occupied,vacancy
0,100.0,1295,1221,74
1,201.0,1139,1074,65
2,202.0,2449,2301,148
3,301.0,1394,1300,94
4,302.0,1988,1758,230
...,...,...,...,...
732,21900.0,956,775,181
733,22000.0,1450,1348,102
734,22101.0,1115,1075,40
735,22102.0,2720,2598,122


### Grabbing Internal Data

In [3]:
# Load the internal estimates extract, keep only the rows for year 2020, and
# drop the year column once it has served as the filter.
ct_housing_2022_01_data = (
    pd.read_csv(
        r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\housing\census_tract_housing_est_2022_01_ind_QA.csv',
        usecols=['yr_id', 'census_tract', 'units', 'occupied', 'vacancy'],
    )
    .loc[lambda estimates: estimates['yr_id'] == 2020]
    .drop(columns='yr_id')
)
ct_housing_2022_01_data

Unnamed: 0,census_tract,units,occupied,vacancy
0,100,1317,1238,79
3,201,1204,1157,47
6,202,2479,2336,143
9,301,1499,1472,27
12,302,1966,1662,304
...,...,...,...,...
2193,21800,1036,857,179
2196,21900,963,915,48
2199,22000,1273,1241,32
2202,22101,1115,780,335


### Check census tract differences

In [4]:
# Materialize each tract list once (source order is preserved for reporting).
cr_tracts = list(census_redistricting_ct_data['census_tract'])
est_tracts = list(ct_housing_2022_01_data['census_tract'])

# Build the lookup sets once instead of rebuilding a list for every element
# tested. Equal int and float tract ids (e.g. 100 and 100.0) hash alike, so
# set membership gives the same answer as the list-based `in` check.
cr_tract_set = set(cr_tracts)
est_tract_set = set(est_tracts)

in_cr_not_in_est = [x for x in cr_tracts if x not in est_tract_set]
print(f"The following CT are in census redistricting but not in estimates: {in_cr_not_in_est}")

in_est_not_in_cr = [x for x in est_tracts if x not in cr_tract_set]
print(f"The following CT are in estimates but not in census redistricting: {in_est_not_in_cr}")

The following CT are in census redistricting but not in estimates: [990100.0]
The following CT are in estimamtes but not in census redistricting: []


### Create the diff

In [5]:
# Index the census redistricting table by tract so it can be aligned with the
# estimates table. The guard makes the cell safe to re-run: once the index is
# already `census_tract` the column no longer exists and a second `set_index`
# would raise KeyError.
if census_redistricting_ct_data.index.name != 'census_tract':
    census_redistricting_ct_data = census_redistricting_ct_data.set_index('census_tract')
census_redistricting_ct_data

Unnamed: 0_level_0,units,occupied,vacancy
census_tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100.0,1295,1221,74
201.0,1139,1074,65
202.0,2449,2301,148
301.0,1394,1300,94
302.0,1988,1758,230
...,...,...,...
21900.0,956,775,181
22000.0,1450,1348,102
22101.0,1115,1075,40
22102.0,2720,2598,122


In [6]:
# Index the estimates table by tract so it can be aligned with the census
# redistricting table. The guard makes the cell safe to re-run: once the index
# is already `census_tract` the column no longer exists and a second
# `set_index` would raise KeyError.
if ct_housing_2022_01_data.index.name != 'census_tract':
    ct_housing_2022_01_data = ct_housing_2022_01_data.set_index('census_tract')
ct_housing_2022_01_data

Unnamed: 0_level_0,units,occupied,vacancy
census_tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,1317,1238,79
201,1204,1157,47
202,2479,2336,143
301,1499,1472,27
302,1966,1662,304
...,...,...,...
21800,1036,857,179
21900,963,915,48
22000,1273,1241,32
22101,1115,780,335


In [7]:
# Restrict the census redistricting table to the tracts present in the
# estimates table, taking rows in the order of the estimates index (all
# columns are kept).
census_redistricting_ct_data = census_redistricting_ct_data.loc[
    ct_housing_2022_01_data.index, :
]
census_redistricting_ct_data

Unnamed: 0_level_0,units,occupied,vacancy
census_tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,1295,1221,74
201,1139,1074,65
202,2449,2301,148
301,1394,1300,94
302,1988,1758,230
...,...,...,...
21800,1022,806,216
21900,956,775,181
22000,1450,1348,102
22101,1115,1075,40


In [8]:
# Per-tract difference: estimates minus census redistricting, aligned on the
# census_tract index and on the shared column names.
diff = ct_housing_2022_01_data.sub(census_redistricting_ct_data)
diff

Unnamed: 0_level_0,units,occupied,vacancy
census_tract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,22,17,5
201,65,83,-18
202,30,35,-5
301,105,172,-67
302,-22,-96,74
...,...,...,...
21800,14,51,-37
21900,7,140,-133
22000,-177,-107,-70
22101,0,-295,295


In [9]:
# Write the tract-level comparison workbook using xlsxwriter as the engine.
# The context manager saves and closes the file on exit (also when a write
# fails); the explicit `writer.save()` call is no longer available in current
# pandas releases.
with pd.ExcelWriter(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Results\census_redistricting_data\census_tract_diff_2022_01_est_minus_census_redistricting_housing.xlsx', engine='xlsxwriter') as writer:
    # One sheet per data frame; census_tract is moved from the index back into
    # a regular column so it appears in the sheet.
    ct_housing_2022_01_data.reset_index().to_excel(writer, sheet_name='Estimates Data', index=False)
    census_redistricting_ct_data.reset_index().to_excel(writer, sheet_name='Census Redistricting Data', index=False)
    diff.reset_index().to_excel(writer, sheet_name='Diff', index=False)

### Region

In [10]:
# Tract-level frames to aggregate to region totals, keyed by the sheet each
# one is written to (insertion order determines sheet order).
region_sheets = {
    'Estimates Data': ct_housing_2022_01_data,
    'Census Redistricting Data': census_redistricting_ct_data,
    'Diff': diff,
}

# Write the region-level comparison workbook. The context manager saves and
# closes the file on exit (also when a write fails); the explicit
# `writer.save()` call is no longer available in current pandas releases.
with pd.ExcelWriter(r'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Results\census_redistricting_data\region_diff_2022_01_est_minus_census_redistricting_housing.xlsx') as writer:
    for sheet_name, tract_frame in region_sheets.items():
        # Column sums across all tracts, reshaped into a single-row frame.
        region_totals = pd.DataFrame(tract_frame.reset_index(drop=True).sum(axis=0)).T
        # NOTE(review): reset_index() adds an `index` column holding 0 to each
        # sheet; kept so the workbook layout is unchanged -- confirm whether
        # that column is actually wanted.
        region_totals.reset_index().to_excel(writer, sheet_name=sheet_name, index=False)

In [11]:
# Region-wide totals of the estimates: sum every column over all tracts and
# show the result as a single-row frame.
(
    ct_housing_2022_01_data
    .sum(axis=0)
    .to_frame()
    .T
)

Unnamed: 0,units,occupied,vacancy
0,1216523,1144270,72253
