In [11]:
import pandas as pd
import os
import numpy as np

# Helpful functions

In [12]:
def filter_for_our_years(df, years=(2020, 2021)):
    '''Return the rows of ``df`` whose ``yr_id`` is one of ``years``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``yr_id`` column.
    years : iterable, optional
        ``yr_id`` values to keep. Defaults to ``(2020, 2021)``, the two years
        this notebook has always compared.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the matching rows, with their original index
        labels. It is an explicit copy, so callers may add or overwrite
        columns on it without pandas emitting ``SettingWithCopyWarning``.
    '''
    keep_rows = df['yr_id'].isin(list(years))
    # .copy() detaches the result from ``df``; the loaders assign columns on
    # the filtered frame, which pandas otherwise flags as chained assignment.
    return df[keep_rows].copy()

# Data input functions (for the diff)

In [13]:
def get_input_data_across_series(category, geo_level, new_estimates_version):
    '''Load the 2021_01 QA workbook and a series 15 QA workbook for one category and geography.

    This function only loads and prepares the two frames; it does not compare
    them itself.

    Parameters
    ----------
    category : str
        Category name used in both workbook file names.
    geo_level : str
        Geography column name; used in the file names and as the first index
        level. When it is ``'sra'`` the ``sra`` values are lower-cased in both
        frames.
    new_estimates_version : str
        Series 15 version used in the new workbook's folder and file name.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_new, df_old)``, each passed through ``filter_for_our_years`` and
        indexed by ``[geo_level, 'yr_id']``.
    '''
    index_cols = [geo_level, 'yr_id']

    # Old
    df_old = pd.read_excel(rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_2021_01\individual_files\QA_2021_01_{geo_level}_{category}.xlsx')
    # Normalize on the freshly loaded frame *before* filtering: assigning a
    # column on the filtered slice raises SettingWithCopyWarning, and this
    # order matches how the new data is handled below.
    if geo_level == 'sra':
        df_old['sra'] = df_old['sra'].str.lower()
    df_old = filter_for_our_years(df_old).set_index(index_cols)
    if category == 'age_sex_ethnicity':
        # NOTE(review): the target name is 'age group' (with a space), while a
        # later cell in this notebook groups by 'age_group' -- confirm which
        # spelling the series 15 workbooks actually use.
        df_old = df_old.rename(columns={'name': 'age group'})

    # New
    df_new = pd.read_excel(rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{new_estimates_version}\{geo_level}_est_{new_estimates_version}_{category}_ind_QA.xlsx')
    if geo_level == 'sra':
        df_new['sra'] = df_new['sra'].str.lower()
    df_new = filter_for_our_years(df_new).set_index(index_cols)

    return df_new, df_old

In [14]:
def get_input_data_both_series15(category, geo_level, old_estimates_version, new_estimates_version):
    '''Load two series 15 QA workbooks (an old and a new version) for one category and geography.

    Both frames are indexed by ``[geo_level, 'yr_id']``; when ``geo_level`` is
    ``'sra'`` the ``sra`` values are lower-cased first. No year filtering is
    applied here.

    Returns ``(df_new, df_old)`` -- note the new frame comes first.
    '''
    index_cols = [geo_level, 'yr_id']

    def _prepare(frame):
        # Lower-case SRA names so the two versions line up on the index.
        if geo_level == 'sra':
            frame['sra'] = frame['sra'].str.lower()
        return frame.set_index(index_cols)

    # Old
    df_old = _prepare(
        pd.read_excel(rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{old_estimates_version}\{geo_level}_est_{old_estimates_version}_{category}_ind_QA.xlsx')
    )

    # New
    df_new = _prepare(
        pd.read_excel(rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{new_estimates_version}\{geo_level}_est_{new_estimates_version}_{category}_ind_QA.xlsx')
    )

    return df_new, df_old

In [15]:
# def get_input_data_combo_files_series15(geo_level):
#     # 2022_01
#     df_2022_01 = pd.read_csv(rf'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Data\combo_files\{geo_level}_housing_population_households_combo_QA.csv')
#     df_2022_01 = filter_for_our_years(df_2022_01)
#     if geo_level != 'LUZ':
#         df_2022_01[geo_level] = df_2022_01[geo_level].str.lower()
#     df_2022_01 = df_2022_01.set_index([geo_level, 'yr_id'])
#     #print('good')

#     # 2021_01 
#     if geo_level == 'SRA':
#         geo_level = 'sra'
#     df_2021_01 = pd.read_csv(rf'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Results\Test 5- Vintage Comparison\2021_01\combo_files\2021_01_{geo_level}_housing_population_households_combo_QA.csv')
#     #print('good')
#     df_2021_01 = filter_for_our_years(df_2021_01)
#     #print(df_2021_01.columns)
#     if geo_level == 'sra':
#         df_2021_01['SRA'] = df_2021_01['sra']
#         df_2021_01 = df_2021_01.drop('sra', axis=1)
#         geo_level = 'SRA'
#         df_2021_01['SRA'] = np.where(df_2021_01['SRA'] == 'harbison-crest', 'harbison crest', df_2021_01['SRA'])
#     if geo_level != 'LUZ':
#         df_2021_01[geo_level] = df_2021_01[geo_level].str.lower()
#     df_2021_01 = df_2021_01.set_index([geo_level, 'yr_id'])
    
#     return df_2022_01, df_2021_01

# Subtraction Function

In [16]:
def subtract_dataframes(df1, df2):
    '''Return ``df1 - df2`` restricted to the rows and columns the two frames share.

    Rows are matched on index labels (an inner merge on both indexes decides
    which labels survive). Columns are limited to those present in both
    frames, kept in ``df2``'s column order.
    '''
    # Index labels present in both frames.
    shared_rows = pd.merge(df1, df2, left_index=True, right_index=True).index

    # Columns present in both frames, ordered as they appear in df2.
    in_both = set(df1.columns) & set(df2.columns)
    shared_cols = [col for col in df2.columns if col in in_both]

    minuend = df1.loc[shared_rows][shared_cols]
    subtrahend = df2.loc[shared_rows][shared_cols]
    return minuend.subtract(subtrahend)

# Write to excel function

In [17]:
def write_to_excel(df_new, df_old, old_estimates_version, new_estimates_version, category, geo_level):
    '''Write the new data, the old data and their difference to one Excel workbook.

    Parameters
    ----------
    df_new, df_old : pandas.DataFrame
        Frames to compare; the difference is ``subtract_dataframes(df_new, df_old)``.
    old_estimates_version, new_estimates_version : str
        Version labels used in the sheet names and in the output file name.
        The new version also names the output folder.
    category, geo_level : str
        Used in the output file name.

    The workbook gets three sheets: ``'<new version> Data'``,
    ``'<old version> Data'`` and ``'Diff'``. Each frame's index is reset so
    the index levels are written as ordinary columns.
    '''
    diff = subtract_dataframes(df_new, df_old)
    output_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\diff_files\{new_estimates_version}\{geo_level}_{category}_est_{new_estimates_version}_minus_{old_estimates_version}_QA.xlsx'

    # Use the writer as a context manager: ExcelWriter.save() was deprecated
    # in pandas 1.5 and removed in 2.0, and the with-block also closes the
    # file if one of the sheet writes raises.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        # Write each data frame to a different sheet
        df_new.reset_index().to_excel(writer, sheet_name=f'{new_estimates_version} Data', index=False)
        df_old.reset_index().to_excel(writer, sheet_name=f'{old_estimates_version} Data', index=False)
        diff.reset_index().to_excel(writer, sheet_name='Diff', index=False)

In [18]:
# def write_to_excel_combo_files(df_2022, df_2021, geo_level):
#     diff = subtract_dataframes(df_2022, df_2021)
#     # Create a Pandas Excel writer using xlsxwriter as the engine
#     writer = pd.ExcelWriter(rf'C:\Users\cra\San Diego Association of Governments\SANDAG QA QC - Documents\Projects\2023\2023-023 Estimates 2022\Results\Test 5- Vintage Comparison\diff_outputs\combo_files\{geo_level}_diff_2022_minus_2021.xlsx', engine='xlsxwriter')

#     # Write each data frame to a different sheet
#     df_2022.reset_index().to_excel(writer, sheet_name='2022_01 Data', index=False)
#     df_2021.reset_index().to_excel(writer, sheet_name='2021_01 Data', index=False)
#     diff.reset_index().to_excel(writer, sheet_name='Diff', index=False)

#     # Save the writer
#     writer.save()

# Run

In [19]:
# Categories to diff; each one is part of the input and output workbook names.
# To debug a single case, temporarily narrow a list, e.g. categories = ['age'].
categories = [
    'age',
    'ethnicity',
    'households',
    'housing',
    'income',
    'population',
    'sex',
]

# Geography levels to diff, e.g. geo_levels = ['jurisdiction'] for a quick run.
geo_levels = [
    'sra',
    'cpa',
    'jurisdiction',
    'region',
]

## Across Series Versions

In [20]:
# Diff 2022_02 against 2021_01 for every category / geography combination
# (categories vary slowest, matching the order of the progress messages).
for category, geo_level in ((cat, geo) for cat in categories for geo in geo_levels):
    df_new, df_old = get_input_data_across_series(
        category=category,
        geo_level=geo_level,
        new_estimates_version='2022_02',
    )
    write_to_excel(
        df_new=df_new,
        df_old=df_old,
        old_estimates_version='2021_01',
        new_estimates_version='2022_02',
        category=category,
        geo_level=geo_level,
    )
    print(f'{category}-{geo_level} is complete')

age-sra is complete
age-cpa is complete
age-jurisdiction is complete
age-region is complete
ethnicity-sra is complete
ethnicity-cpa is complete
ethnicity-jurisdiction is complete
ethnicity-region is complete
households-sra is complete
households-cpa is complete
households-jurisdiction is complete
households-region is complete
housing-sra is complete
housing-cpa is complete
housing-jurisdiction is complete
housing-region is complete
income-sra is complete
income-cpa is complete
income-jurisdiction is complete
income-region is complete
population-sra is complete
population-cpa is complete
population-jurisdiction is complete
population-region is complete
sex-sra is complete
sex-cpa is complete
sex-jurisdiction is complete
sex-region is complete


## For Series 15 Data

In [21]:
# df_new = pd.read_excel(rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\2022_02\cpa_est_2022_02_housing_ind_QA.xlsx')
# df_new

In [22]:
# Diff series 15 version 2022_02 against 2022_01 for every category /
# geography combination (categories vary slowest).
for category, geo_level in ((cat, geo) for cat in categories for geo in geo_levels):
    df_new, df_old = get_input_data_both_series15(
        category=category,
        geo_level=geo_level,
        old_estimates_version='2022_01',
        new_estimates_version='2022_02',
    )
    write_to_excel(
        df_new=df_new,
        df_old=df_old,
        old_estimates_version='2022_01',
        new_estimates_version='2022_02',
        category=category,
        geo_level=geo_level,
    )
    print(f'{category}-{geo_level} is complete')

age-sra is complete
age-cpa is complete
age-jurisdiction is complete
age-region is complete
ethnicity-sra is complete
ethnicity-cpa is complete
ethnicity-jurisdiction is complete
ethnicity-region is complete
households-sra is complete
households-cpa is complete
households-jurisdiction is complete
households-region is complete
housing-sra is complete
housing-cpa is complete
housing-jurisdiction is complete
housing-region is complete
income-sra is complete
income-cpa is complete
income-jurisdiction is complete
income-region is complete
population-sra is complete
population-cpa is complete
population-jurisdiction is complete
population-region is complete
sex-sra is complete
sex-cpa is complete
sex-jurisdiction is complete
sex-region is complete


In [9]:
# Diff the combined age/sex/ethnicity files: both frames are re-aggregated
# (summed) by geography, year, age group and sex before being written out.
# NOTE(review): this cell has an older execution count than the cells above
# and looks stale -- confirm before relying on it:
#   * `get_input_data` is not defined in the cells shown in this notebook;
#     presumably it predates the get_input_data_* loaders -- TODO confirm
#     which loader (and which versions) should be used here.
#   * `write_to_excel` as defined in this notebook also requires
#     `old_estimates_version` and `new_estimates_version`, which are not
#     passed below.
category = 'age_sex_ethnicity'
for geo_level in geo_levels:
    df_2022_01, df_2021_01 = get_input_data(category=category, geo_level=geo_level)
    # Move the index levels back to columns, then sum over every column not
    # listed in the groupby keys.
    df_2022_01 = df_2022_01.reset_index().groupby([geo_level, 'yr_id', 'age_group', 'sex']).sum()
    df_2021_01 = df_2021_01.reset_index().groupby([geo_level, 'yr_id', 'age_group', 'sex']).sum()
    write_to_excel(df_2022_01, df_2021_01, category = category, geo_level = geo_level)
    print(f'{geo_level}-is complete')

SRA-is complete


In [125]:
# NOTE(review): scratch cell -- it reuses `df_2021_01` and `geo_level` left in
# the kernel by an earlier cell and overwrites `df_2021_01` in place, so it
# only works when run after that cell. Consider removing it.
df_2021_01 = df_2021_01.reset_index().groupby([geo_level, 'yr_id', 'age_group', 'sex']).sum()

In [126]:
# NOTE(review): scratch cell -- depends on variables left in the kernel by
# earlier cells, and the call omits the `old_estimates_version` /
# `new_estimates_version` arguments that `write_to_excel` (as defined in this
# notebook) requires.
write_to_excel(df_2022_01, df_2021_01, category = category, geo_level = geo_level)

In [None]:
# TODO: add name and sex to the groupby in the data input step