In [1]:
import pandas as pd
import os
import numpy as np

# Find the necessary files

In [2]:
def find_files(url, geo_level, category, estimates_version):
    """Return the paths of files in a folder whose names contain every search term.

    Matching is plain substring containment on the file name, so a term that is
    a substring of another term will match both.

    Parameters
    ----------
    url : str
        Folder to search. Backslashes are converted to forward slashes before
        the folder is listed.
    geo_level : str
        Geography label that must appear in the file name (e.g. 'cpa').
    category : str
        Category label that must appear in the file name. The value 'age' is
        searched for as 'age_i'.
    estimates_version : str
        Version label that must appear in the file name (e.g. '2022_02').

    Returns
    -------
    list of str
        Matching paths (folder joined with file name), sorted by file name.
        Empty when nothing matches.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the folder cannot be listed.
    """
    path = url.replace("\\", "/")

    # NOTE(review): 'age' is narrowed to 'age_i', presumably so that other
    # age-related workbooks are not picked up as well - confirm against the
    # actual file naming scheme.
    if category == 'age':
        category = 'age_i'

    required_terms = (geo_level, category, estimates_version)

    # os.listdir makes no ordering guarantee; callers take element [0], so sort
    # the listing to make that choice reproducible from run to run.
    return [
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        if all(term in filename for term in required_terms)
    ]

# Create the combo file

In [14]:
# Root folder that holds one sub-folder of per-category workbooks per estimates version.
_INDIVIDUAL_FILES_ROOT = r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files'


def _load_category_frame(version_dir, geo_level, category, estimates_version):
    """Read the first workbook matching one category and index it by (geo_level, 'yr_id')."""
    matches = find_files(url=version_dir, geo_level=geo_level, category=category, estimates_version=estimates_version)
    if not matches:
        # Without this guard the failure is a bare IndexError from matches[0].
        raise FileNotFoundError(
            f"No '{category}' file found for geo_level='{geo_level}', "
            f"estimates_version='{estimates_version}' in {version_dir}"
        )
    return pd.read_excel(matches[0]).set_index([geo_level, 'yr_id'])


def create_combo_file(geo_level, estimates_version, base_dir=_INDIVIDUAL_FILES_ROOT):
    """Combine the housing, population and households workbooks into one frame.

    Parameters
    ----------
    geo_level : str
        Geography label; used both to locate the workbooks and as the name of
        the geography column that becomes the first index level.
    estimates_version : str
        Version label; names the sub-folder of ``base_dir`` to search and must
        appear in the workbook file names.
    base_dir : str, optional
        Folder containing one sub-folder per estimates version. Defaults to
        the shared-drive location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        Indexed by (geo_level, 'yr_id'), holding the columns of all three
        workbooks plus 'hhs' (household population per household, 2 decimals).

    Raises
    ------
    FileNotFoundError
        If no workbook matches one of the three categories.
    """
    version_dir = os.path.join(base_dir, estimates_version)

    housing_df = _load_category_frame(version_dir, geo_level, 'housing', estimates_version)
    population_df = _load_category_frame(version_dir, geo_level, 'population', estimates_version)
    households_df = _load_category_frame(version_dir, geo_level, 'households', estimates_version)

    # Index-on-index merges use pandas' default inner join, so only
    # (geo_level, yr_id) pairs present in all three workbooks are kept.
    merged_df = (
        housing_df
        .merge(population_df, left_index=True, right_index=True)
        .merge(households_df, left_index=True, right_index=True)
    )

    # Average household size. Rows with zero households and zero household
    # population come out as NaN (0 / 0) and are deliberately left that way.
    merged_df['hhs'] = (merged_df['Household Population'] / merged_df['Total Households']).round(2)
    return merged_df


In [15]:
# Spot-check: build the combined frame for one geography/version and display it.
test = create_combo_file(geo_level='cpa', estimates_version='2022_02')
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Mobile Home,Multifamily,Single Family - Detached,Single Family - Multiple Unit,units,occupied,vacancy,Group Quarters - College,Group Quarters - Military,Group Quarters - Other,...,Total GQ Population,1,2,3,4,5,6,7,Total Households,hhs
cpa,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
32nd Street Naval Station,2020,0,0,0,0,0,0,0,0,6674,0,...,6674,0,0,0,0,0,0,0,0,
32nd Street Naval Station,2021,0,0,0,0,0,0,0,0,6441,0,...,6441,0,0,0,0,0,0,0,0,
32nd Street Naval Station,2022,0,0,0,0,0,0,0,0,7610,0,...,7610,0,0,0,0,0,0,0,0,
Alpine,2020,272,778,4980,690,6720,6349,371,0,0,159,...,159,1404,2207,1182,987,384,130,55,6349,2.77
Alpine,2021,272,778,4995,690,6735,6485,250,0,0,189,...,189,1363,2244,1180,1049,424,146,79,6485,2.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Valley Center,2021,555,224,6082,220,7081,6442,639,0,0,49,...,49,1031,2058,1181,1189,566,233,184,6442,3.10
Valley Center,2022,555,223,6370,220,7368,6709,659,0,0,50,...,50,1071,2130,1216,1244,615,236,197,6709,3.06
Via De La Valle,2020,0,42,175,7,224,183,41,0,0,0,...,0,79,74,21,9,0,0,0,183,2.16
Via De La Valle,2021,0,42,175,7,224,183,41,0,0,0,...,0,75,79,19,10,0,0,0,183,2.11


# Output the combo file

In [16]:
def create_and_output_combo_file(geo_level, estimates_version):
    """Build the combo frame for one geography/version and write it to Excel.

    The workbook is written to the ``combo_files`` folder on the shared drive,
    in a sub-folder named after the estimates version, as
    ``{geo_level}_est_{estimates_version}_combo_file_QA.xlsx``.

    Parameters
    ----------
    geo_level : str
        Geography label passed through to ``create_combo_file``.
    estimates_version : str
        Version label passed through to ``create_combo_file``; also names the
        output sub-folder.

    Returns
    -------
    None
    """
    # Build the frame first: if the inputs for this version are missing, this
    # raises before any output folder is created.
    combo_file = create_combo_file(geo_level, estimates_version).reset_index()

    output_dir = rf"J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\combo_files\{estimates_version}"
    # to_excel does not create missing folders, so a new estimates version
    # would otherwise fail until its folder is made by hand.
    os.makedirs(output_dir, exist_ok=True)

    output_path = rf"{output_dir}\{geo_level}_est_{estimates_version}_combo_file_QA.xlsx"
    combo_file.to_excel(output_path, index=False)

# Run and create combo files

In [18]:
# Geographies and estimates versions to export; one workbook is written per pair.
geo_levels_to_run = ['census_tract', 'cpa', 'jurisdiction', 'luz', 'mgra', 'region', 'sra']
versions_to_run = ['2022_02']

for geo_level in geo_levels_to_run:
    for estimates_version in versions_to_run:
        create_and_output_combo_file(
            geo_level=geo_level,
            estimates_version=estimates_version,
        )
        # Progress line so a long run shows which pair finished last.
        print(f"{geo_level}-{estimates_version} is complete")

census_tract-2022_02 is complete
cpa-2022_02 is complete
jurisdiction-2022_02 is complete
luz-2022_02 is complete
mgra-2022_02 is complete
region-2022_02 is complete
sra-2022_02 is complete
