In [1]:
import pandas as pd
import os
import numpy as np

# Find the necessary files

In [2]:
def find_files(url, geo_level, category, estimates_version):
    """Return the paths of files in a folder whose names contain all search terms.

    Matching is a plain substring test on the file name, so every one of
    ``geo_level``, ``category`` and ``estimates_version`` must appear somewhere
    in the name for the file to be returned.

    Parameters
    ----------
    url : str
        Folder to search. Backslashes are converted to forward slashes
        before the folder is listed.
    geo_level : str
        Substring that must appear in the file name (e.g. 'cpa').
    category : str
        Substring that must appear in the file name. The value 'age' is
        searched for as 'age_i' instead.
    estimates_version : str
        Substring that must appear in the file name (e.g. '2022_01').

    Returns
    -------
    list of str
        Folder path joined with each matching file name, ordered by file
        name. Empty when nothing matches.
    """
    path = url.replace("\\", "/")

    if category == 'age':
        category = 'age_i'
    search_terms = (geo_level, category, estimates_version)

    # os.listdir returns names in arbitrary order; sorting makes the result
    # (and in particular element [0]) stable from run to run.
    return [
        os.path.join(path, filename)
        for filename in sorted(os.listdir(path))
        if all(term in filename for term in search_terms)
    ]

# Create the combo file

In [3]:
def _load_indexed_category(geo_level, category, estimates_version):
    """Read the first matching individual file for one category, indexed by (geo_level, 'yr_id')."""
    matches = find_files(
        url=r'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files',
        geo_level=geo_level,
        category=category,
        estimates_version=estimates_version,
    )
    if not matches:
        # Fail with a message that names what is missing instead of a bare
        # IndexError from taking element [0] of an empty list.
        raise FileNotFoundError(
            f"No individual file found for geo_level={geo_level!r}, "
            f"category={category!r}, estimates_version={estimates_version!r}"
        )
    return pd.read_excel(matches[0]).set_index([geo_level, 'yr_id'])


def create_combo_file(geo_level, estimates_version):
    """Combine the housing, population and households files into one table.

    For each of the three categories, the first matching file is read and
    indexed by (``geo_level``, 'yr_id'). The three tables are then merged on
    that index and an 'hhs' column is added.

    Parameters
    ----------
    geo_level : str
        Geography to load (e.g. 'cpa'). It is used both to find the files
        and as the name of the geography column used for the index.
    estimates_version : str
        Estimates version to load (e.g. '2022_01'). The same version is
        used for all three categories.

    Returns
    -------
    pandas.DataFrame
        Merged table indexed by (``geo_level``, 'yr_id') with an added
        'hhs' column equal to 'Household Population' divided by
        'Total Households', rounded to 2 decimals.

    Raises
    ------
    FileNotFoundError
        If no file matches for one of the categories.
    """
    housing_df = _load_indexed_category(geo_level, 'housing', estimates_version)
    # The population file is looked up with the requested version, like the
    # other two categories, so versions are never mixed within one combo file.
    population_df = _load_indexed_category(geo_level, 'population', estimates_version)
    households_df = _load_indexed_category(geo_level, 'households', estimates_version)

    # Merge Data
    # Default (inner) merges on the index: only (geo_level, yr_id) pairs
    # present in all three tables are kept.
    merged_df = (
        housing_df
        .merge(population_df, left_index=True, right_index=True)
        .merge(households_df, left_index=True, right_index=True)
    )

    # Add in HHS
    # Rows with zero 'Total Households' yield NaN/inf here rather than raising.
    merged_df['hhs'] = (
        merged_df['Household Population'] / merged_df['Total Households']
    ).round(2)
    return merged_df


In [4]:
# Spot-check: build the combo table for a single geography ('cpa') and display it
test = create_combo_file(geo_level='cpa', estimates_version='2022_01')
test

Unnamed: 0_level_0,Unnamed: 1_level_0,Mobile Home,Multifamily,Single Family - Detached,Single Family - Multiple Unit,units,occupied,vacancy,Group Quarters - College,Group Quarters - Military,Group Quarters - Other,...,Total GQ Population,1,2,3,4,5,6,7,Total Households,hhs
cpa,yr_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
*Not in a CPA*,2020,24869,169099,234054,66383,494405,467399,27006,1833,9073,14997,...,25903,95449,151332,82605,76384,36218,14982,10429,467399,2.95
*Not in a CPA*,2021,24866,171307,234840,66974,497987,472359,25628,611,10664,15050,...,26325,95658,152306,84431,77653,36621,15182,10508,472359,2.91
*Not in a CPA*,2022,24882,173382,235767,68716,502747,476890,25857,1862,10473,14896,...,27231,95498,153806,85384,78736,37276,15414,10776,476890,2.88
32nd Street Naval Station,2020,0,0,0,0,0,0,0,0,6461,0,...,6461,0,0,0,0,0,0,0,0,
32nd Street Naval Station,2021,0,0,0,0,0,0,0,0,6627,0,...,6627,0,0,0,0,0,0,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Valley Center,2021,555,224,6082,220,7081,6442,639,0,0,50,...,50,1027,2057,1181,1196,556,239,186,6442,3.10
Valley Center,2022,555,223,6370,220,7368,6709,659,0,0,52,...,52,1072,2129,1214,1243,617,241,193,6709,3.06
Via De La Valle,2020,0,42,175,7,224,183,41,0,0,0,...,0,76,76,18,12,1,0,0,183,2.16
Via De La Valle,2021,0,42,175,7,224,183,41,0,0,0,...,0,73,79,21,10,0,0,0,183,2.11


# Output the combo file

In [5]:
def create_and_output_combo_file(geo_level, estimates_version):
    """Build the combo table for one geography/version and save it to Excel.

    Parameters
    ----------
    geo_level : str
        Geography passed to ``create_combo_file`` and used in the file name.
    estimates_version : str
        Estimates version passed to ``create_combo_file`` and used in the
        file name.

    Returns
    -------
    None
        The result is written to the combo_files folder as
        ``{geo_level}_est_{estimates_version}_combo_file_QA.xlsx``.
    """
    # Turn the index levels back into ordinary columns so they are written as data
    flat_combo = create_combo_file(geo_level, estimates_version).reset_index()

    # Destination workbook for this geography/version pair
    destination = rf"J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\combo_files\{geo_level}_est_{estimates_version}_combo_file_QA.xlsx"
    flat_combo.to_excel(destination, index=False)

# Run and create combo files

In [6]:
# Geographies and estimates versions to build combo files for
GEO_LEVELS = ['census_tract', 'cpa', 'jurisdiction', 'luz', 'mgra', 'region', 'sra']
ESTIMATES_VERSIONS = ['2022_01']

# One workbook is written per (geography, version) pair; the print marks progress
for geo_level in GEO_LEVELS:
    for estimates_version in ESTIMATES_VERSIONS:
        create_and_output_combo_file(geo_level=geo_level, estimates_version=estimates_version)
        print(f"{geo_level}-{estimates_version} is complete")

census_tract-2022_01 is complete
cpa-2022_01 is complete
jurisdiction-2022_01 is complete
luz-2022_01 is complete
mgra-2022_01 is complete
region-2022_01 is complete
sra-2022_01 is complete
