In [10]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

# Download PL94 Data

In [11]:
def download_pl94_housing_data():
    """Load the PL94 census-tract data from the ``estimates`` SQL Server database.

    The query text is read from ``ct_population.sql`` in the ``sql_queries``
    folder (a relative path, so it is resolved against the current working
    directory) and executed over a trusted ODBC connection.

    Returns:
        pandas.DataFrame: The query result indexed by ``census_tract``.

    Raises:
        OSError: If the SQL file cannot be opened.
        KeyError: If the query result has no ``census_tract`` column.
    """
    # Read the query before connecting so a missing file fails fast without
    # ever opening a database connection.
    with open(r'sql_queries\ct_population.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
    try:
        df = pd.read_sql_query(sql_query, conn)
    finally:
        # Close explicitly: using a pyodbc connection as a context manager only
        # commits, it does not close, so try/finally is what releases the handle.
        conn.close()

    return df.set_index('census_tract')

# Download Internal Data

In [12]:
def download_internal_housing_data(estimates_version):
    """Read the internal estimates QA workbook and keep only its 2020 rows.

    Args:
        estimates_version: Version label substituted into both the folder name
            and the file name of the workbook path.

    Returns:
        pandas.DataFrame: Rows whose ``yr_id`` equals 2020, with the ``yr_id``
        column removed and ``census_tract`` as the index.
    """
    workbook_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}\census_tract_est_{estimates_version}_ethnicity_ind_QA.xlsx'
    estimates = pd.read_excel(workbook_path)

    # Only the 2020 rows are kept; once filtered, yr_id carries no information.
    is_2020 = estimates['yr_id'] == 2020
    return (
        estimates.loc[is_2020]
        .drop(columns='yr_id')
        .set_index('census_tract')
    )

# Create Diff

In [13]:
def create_diff(pl94_data, ind_file):
    """Subtract the PL94 values from the internal file, row by row.

    Args:
        pl94_data: DataFrame that must contain every index label present in
            ``ind_file``; the ``.loc`` lookup raises ``KeyError`` otherwise.
        ind_file: DataFrame whose index selects (and orders) the PL94 rows.

    Returns:
        pandas.DataFrame: ``ind_file`` minus the matching ``pl94_data`` rows.
    """
    # Restrict PL94 to the rows present in the internal file so that rows
    # existing only in PL94 do not show up in the result.
    matching_pl94_rows = pl94_data.loc[ind_file.index]
    return ind_file.sub(matching_pl94_rows)

# Push To Excel

In [14]:
def push_to_excel_census_tract_level(pl94_data, ind_file, diff, estimates_version):
    """Write the census-tract comparison workbook (three sheets).

    Sheets, in order:
        ``'<estimates_version> Data'``: the rows of ``ind_file``.
        ``'Census Redistricting Data'``: the ``pl94_data`` rows selected by
            ``ind_file.index``.
        ``'Diff'``: the rows of ``diff``.

    Each frame's index is written as an ordinary leading column.

    Args:
        pl94_data: PL94 (census redistricting) DataFrame; must contain every
            index label of ``ind_file``.
        ind_file: Internal estimates DataFrame.
        diff: DataFrame of differences to write to the ``Diff`` sheet.
        estimates_version: Version label used in the output folder, the file
            name, and the estimates sheet name.

    Raises:
        KeyError: If ``ind_file`` has an index label missing from ``pl94_data``.
    """
    output_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\pl94_diff_files\{estimates_version}\census_tract_est_{estimates_version}_minus_pl94_data_ethnicity_QA.xlsx'

    # The context manager saves and closes the workbook even if a write fails;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        # The estimates belong on the version-labelled sheet and the PL94 rows
        # on the census redistricting sheet (these were previously swapped).
        ind_file.reset_index().to_excel(writer, sheet_name=f'{estimates_version} Data', index=False)
        pl94_data.loc[ind_file.index].reset_index().to_excel(writer, sheet_name='Census Redistricting Data', index=False)
        diff.reset_index().to_excel(writer, sheet_name='Diff', index=False)

In [15]:
def _region_totals(frame):
    """Collapse ``frame`` to a single row holding the sum of each column."""
    # reset_index() keeps the leading ``index`` column (value 0) that the
    # region workbook has always contained, so the sheet layout is unchanged.
    return frame.sum(axis=0).to_frame().T.reset_index()


def push_to_excel_region_level(pl94_data, ind_file, diff, estimates_version):
    """Write the region-level comparison workbook (column totals, three sheets).

    Every sheet holds one row: the per-column sum over all rows of the
    corresponding frame.

    Sheets, in order:
        ``'<estimates_version> Data'``: totals of ``ind_file``.
        ``'Census Redistricting Data'``: totals of the ``pl94_data`` rows
            selected by ``ind_file.index``.
        ``'Diff'``: totals of ``diff``.

    Args:
        pl94_data: PL94 (census redistricting) DataFrame; must contain every
            index label of ``ind_file``.
        ind_file: Internal estimates DataFrame.
        diff: DataFrame of differences.
        estimates_version: Version label used in the output folder, the file
            name, and the estimates sheet name.

    Raises:
        KeyError: If ``ind_file`` has an index label missing from ``pl94_data``.
    """
    output_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\pl94_diff_files\{estimates_version}\region_est_{estimates_version}_minus_pl94_data_ethnicity_QA.xlsx'

    # The context manager saves and closes the workbook even if a write fails;
    # ExcelWriter.save() no longer exists in pandas 2.x.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        # The estimates belong on the version-labelled sheet and the PL94 totals
        # on the census redistricting sheet (these were previously swapped).
        _region_totals(ind_file).to_excel(writer, sheet_name=f'{estimates_version} Data', index=False)
        _region_totals(pl94_data.loc[ind_file.index]).to_excel(writer, sheet_name='Census Redistricting Data', index=False)
        _region_totals(diff).to_excel(writer, sheet_name='Diff', index=False)

# Main Function

In [16]:
def create_ethnicity_pl94_ouptput(estimates_version):
    """Compare the internal estimates with PL94 data and write both workbooks.

    Steps: load both data sets, print the census tracts present in only one of
    them, compute estimates minus PL94, then write the census-tract and
    region-level Excel outputs.

    Args:
        estimates_version: Version label of the internal estimates file; it is
            also used in the output paths and sheet names.

    Raises:
        KeyError: From ``create_diff`` when the estimates contain a census tract
            that PL94 lacks (the offending tracts are printed beforehand).
    """
    pl94_data = download_pl94_housing_data()
    ind_file = download_internal_housing_data(estimates_version)

    pl94_tracts = list(pl94_data.index)
    estimates_tracts = list(ind_file.index)
    # Sets give constant-time membership checks instead of rebuilding and
    # scanning a list for every tract.
    pl94_tract_set = set(pl94_tracts)
    estimates_tract_set = set(estimates_tracts)

    # Report coverage mismatches BEFORE diffing: create_diff raises KeyError
    # when the estimates hold a tract PL94 lacks, which previously meant the
    # second message could never be printed with a non-empty list.
    in_cr_not_in_est = [x for x in pl94_tracts if x not in estimates_tract_set]
    print(f"The following CT are in census redistricting but not in estimates: {in_cr_not_in_est}")

    in_est_not_in_cr = [x for x in estimates_tracts if x not in pl94_tract_set]
    print(f"The following CT are in estimates but not in census redistricting: {in_est_not_in_cr}")

    diff = create_diff(pl94_data, ind_file)

    push_to_excel_census_tract_level(pl94_data, ind_file, diff, estimates_version)

    push_to_excel_region_level(pl94_data, ind_file, diff, estimates_version)

    print('Outputs Completed')

# Run

In [17]:
# Build the census-tract and region-level PL94 comparison workbooks for
# estimates version 2022_01.
create_ethnicity_pl94_ouptput(estimates_version='2022_01')

The following CT are in census redistricting but not in estimates: [990100.0]
The following CT are in estimamtes but not in census redistricting: []
Outputs Completed
