In [3]:
import pandas as pd
import os
import numpy as np

# Helpful functions

In [4]:
def filter_for_our_years(df, years=(2020, 2021)):
    """Return the rows of ``df`` whose ``yr_id`` is one of ``years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``yr_id`` column.
    years : iterable, optional
        Year values to keep. Defaults to ``(2020, 2021)``, the two years this
        function was originally hard-coded to.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df``, with their original index labels.
    """
    # A single isin() mask replaces one equality test per year OR-ed together.
    return df[df['yr_id'].isin(list(years))]

# Get Input Data

In [5]:
def get_input_data(category, geo_level, old_estimates_version, new_estimates_version):
    """Load the old and new estimates workbooks for one category and geography.

    Both workbooks are read with ``pd.read_excel`` from the hard-coded
    ``individual_files`` folder on the J: drive. When ``geo_level`` is
    ``'sra'`` the ``sra`` column is lower-cased. Each frame is then indexed by
    ``[geo_level, 'yr_id']``.

    Returns
    -------
    tuple
        ``(df_new, df_old)`` -- note that the new version comes first.
    """
    old_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\mgra_series_15\individual_files\{old_estimates_version}\{geo_level}_est_{old_estimates_version}_{category}_ind_QA.xlsx'
    new_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\mgra_series_15\individual_files\{new_estimates_version}\{geo_level}_est_{new_estimates_version}_{category}_ind_QA.xlsx'

    def _read_indexed(workbook_path):
        # Read one workbook and apply the preparation shared by both versions.
        frame = pd.read_excel(workbook_path)
        if geo_level == 'sra':
            frame['sra'] = frame['sra'].str.lower()
        return frame.set_index([geo_level, 'yr_id'])

    # The old version is read first, then the new one.
    df_old = _read_indexed(old_path)
    df_new = _read_indexed(new_path)

    return df_new, df_old

In [18]:
def get_input_data_ase(category, geo_level, old_estimates_version, new_estimates_version):
    """Load the old and new estimates workbooks without any further preparation.

    Reads the same file locations as ``get_input_data`` but returns the frames
    exactly as ``pd.read_excel`` produces them: no lower-casing and no index
    is set.

    Returns
    -------
    tuple
        ``(df_new, df_old)`` -- note that the new version comes first.
    """
    old_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\mgra_series_15\individual_files\{old_estimates_version}\{geo_level}_est_{old_estimates_version}_{category}_ind_QA.xlsx'
    new_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\mgra_series_15\individual_files\{new_estimates_version}\{geo_level}_est_{new_estimates_version}_{category}_ind_QA.xlsx'

    # Read in the original order: old workbook first, then new.
    df_old, df_new = (pd.read_excel(workbook_path) for workbook_path in (old_path, new_path))

    return df_new, df_old

# Subtraction Function

In [7]:
def subtract_dataframes(df1, df2):
    """Return ``df1 - df2`` restricted to the rows and columns both frames share.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare. Rows are matched on index labels and columns on
        column names; the shared columns must support subtraction.

    Returns
    -------
    pandas.DataFrame
        One row per index label present in both frames and one column per
        column name present in both frames (in ``df2``'s column order),
        holding ``df1`` minus ``df2``.
    """
    # Take the shared index labels directly. Merging the two frames just to
    # read the merged index materialises every column of both inputs.
    shared_rows = df1.index.intersection(df2.index)

    # Columns present in both frames, kept in df2's column order.
    shared_cols = [col for col in df2.columns if col in df1.columns]

    # Both inputs are trimmed to the same rows and columns before subtracting.
    left = df1.loc[shared_rows][shared_cols]
    right = df2.loc[shared_rows][shared_cols]

    return left.subtract(right)

# Write to excel function

In [8]:
def write_to_excel(df_new, df_old, old_estimates_version, new_estimates_version, category, geo_level):
    """Write the new data, the old data and their difference to one workbook.

    The difference is ``subtract_dataframes(df_new, df_old)``, i.e. new minus
    old. The workbook is written with the ``xlsxwriter`` engine into the
    hard-coded ``diff_files\\<new_estimates_version>`` folder on the J: drive
    and contains three sheets: ``'<new version> Data'``,
    ``'<old version> Data'`` and ``'Diff'``. Each frame's index is reset so
    the index levels are written as ordinary columns.

    Returns
    -------
    None
    """
    diff = subtract_dataframes(df_new, df_old)

    output_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\mgra_series_15\diff_files\{new_estimates_version}\{geo_level}_{category}_est_{new_estimates_version}_minus_{old_estimates_version}_QA.xlsx'

    # The context manager saves and closes the workbook on exit, including
    # when a to_excel call raises. ExcelWriter.save() was removed in
    # pandas 2.0, so it is no longer called explicitly.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        # Write each data frame to a different sheet
        df_new.reset_index().to_excel(writer, sheet_name=f'{new_estimates_version} Data', index=False)
        df_old.reset_index().to_excel(writer, sheet_name=f'{old_estimates_version} Data', index=False)
        diff.reset_index().to_excel(writer, sheet_name='Diff', index=False)

# Run

In [10]:
# Estimate categories to compare in the Series 15 run below.
categories = [
    'age',
    'ethnicity',
    'households',
    'housing',
    'income',
    'population',
    'sex',
]
# Alternative category set, currently disabled:
#categories = ['workers', 'children']

# Geography levels to compare in the Series 15 run below.
geo_levels = [
    'mgra',
    'sra',
    'cpa',
    'jurisdiction',
    'region',
]
# Alternative geography sets, currently disabled:
#geo_levels = ['census_tract', 'luz', 'mgra', 'sra', 'cpa', 'jurisdiction', 'region']
#geo_levels = ['mgra']

## For Series 15 Data

In [19]:
# Estimates versions being compared; the diff workbook holds new minus old.
# Kept in one place so the read and write calls cannot drift apart.
versions = {
    'old_estimates_version': '2022_03',
    'new_estimates_version': '2022_04',
}

# One diff workbook per (category, geography level) combination.
for category in categories:
    for geo_level in geo_levels:
        df_new, df_old = get_input_data(category=category, geo_level=geo_level, **versions)
        write_to_excel(
            df_new=df_new,
            df_old=df_old,
            category=category,
            geo_level=geo_level,
            **versions,
        )
        print(f'{category}-{geo_level} is complete')

age-mgra is complete
age-sra is complete
age-cpa is complete
age-jurisdiction is complete
age-region is complete
ethnicity-mgra is complete
ethnicity-sra is complete
ethnicity-cpa is complete
ethnicity-jurisdiction is complete
ethnicity-region is complete
households-mgra is complete
households-sra is complete
households-cpa is complete
households-jurisdiction is complete
households-region is complete
housing-mgra is complete
housing-sra is complete
housing-cpa is complete
housing-jurisdiction is complete
housing-region is complete
income-mgra is complete
income-sra is complete
income-cpa is complete
income-jurisdiction is complete
income-region is complete
population-mgra is complete
population-sra is complete
population-cpa is complete
population-jurisdiction is complete
population-region is complete
sex-mgra is complete
sex-sra is complete
sex-cpa is complete
sex-jurisdiction is complete
sex-region is complete


# Age Sex Ethnicity
Run this section last because it takes a long time, or run it only at the larger geographies (e.g. jurisdiction and region).

In [24]:
category = 'age_sex_ethnicity'

# Only the larger geographies are processed here; iterate over geo_levels
# instead to cover every level.
ase_geo_levels = ['jurisdiction', 'region']

for geo_level in ase_geo_levels:
    df_new, df_old = get_input_data_ase(
        category=category,
        geo_level=geo_level,
        old_estimates_version='2022_03',
        new_estimates_version='2022_04',
    )

    # Sum every remaining column within each geography / year / age group / sex
    # combination, so both frames are indexed by these four keys.
    # NOTE(review): from pandas 2.0, groupby().sum() no longer drops non-numeric
    # columns by default. If these workbooks carry a text column (schema not
    # visible here), consider passing numeric_only=True -- TODO confirm.
    group_keys = [geo_level, 'yr_id', 'age group', 'sex']
    df_new = df_new.reset_index(drop=True).groupby(group_keys).sum()
    df_old = df_old.reset_index(drop=True).groupby(group_keys).sum()

    write_to_excel(
        df_new=df_new,
        df_old=df_old,
        old_estimates_version='2022_03',
        new_estimates_version='2022_04',
        category=category,
        geo_level=geo_level,
    )
    print(f'{geo_level}-is complete')

jurisdiction-is complete
region-is complete
