In [14]:
import os 
import pandas as pd
import numpy as np
import glob
import pyodbc
import gc

In [15]:
def dof_data_pull():
    r"""Run the DOF query against the ``estimates`` database and return the result.

    The SQL text is read from ``sql_queries\dof_query.sql`` (relative to the
    current working directory) and executed over a trusted ODBC connection to
    the SQL Server named in the connection string below.

    Returns:
        pandas.DataFrame: the query result indexed by ``['region', 'yr_id']``.

    Raises:
        FileNotFoundError: if the query file cannot be opened.
        pyodbc.Error: if the connection or the query fails.
    """
    # Read the query first so a missing .sql file fails before a connection is opened.
    with open(r'sql_queries\dof_query.sql', 'r') as sql_file:
        sql_query = sql_file.read()

    conn = pyodbc.connect('Driver={ODBC Driver 17 for SQL Server};'
                    'Server=DDAMWSQL16.sandag.org;'
                    'Database=estimates;'
                    'Trusted_Connection=yes;')
    try:
        dof_data = pd.read_sql_query(sql_query, conn)
    finally:
        # Always release the connection, even when the query raises;
        # the previous version never closed it.
        conn.close()

    return dof_data.set_index(['region', 'yr_id'])

In [16]:
# Helper used below to load the region-level housing, population, and households files
def find_and_download_files(url, estimates_version, category):
    """Load the region-level Excel file in a folder that matches a category and version.

    A file matches when its name contains the literal text ``'region'``, the
    ``category`` string, and the ``estimates_version`` string.

    Args:
        url: Directory to search. Backslashes are converted to forward slashes.
        estimates_version: Substring that must appear in the file name
            (for example ``'2022_01'``).
        category: Substring that must appear in the file name
            (for example ``'housing'``).

    Returns:
        pandas.DataFrame: contents of the alphabetically first matching file,
        as read by ``pd.read_excel``.

    Raises:
        FileNotFoundError: if the directory does not exist or contains no
            matching file.
    """
    path = url.replace("\\", "/")

    # os.listdir returns names in arbitrary order, so sort the matches to make
    # the selected file deterministic. Names starting with "~$" are the
    # temporary lock files Excel creates next to an open workbook; they are not
    # readable workbooks and must not be picked up.
    matching_files = sorted(
        os.path.join(path, filename)
        for filename in os.listdir(path)
        if not filename.startswith('~$')
        and 'region' in filename
        and category in filename
        and estimates_version in filename
    )

    if not matching_files:
        # Previously this surfaced as an uninformative IndexError on [0].
        raise FileNotFoundError(
            f"No file containing 'region', '{category}' and "
            f"'{estimates_version}' found in {path}"
        )

    return pd.read_excel(matching_files[0])

In [17]:
def housing_data_pull(estimates_version):
    """Read the region housing file for a version and add an ``unoccupied`` column.

    Args:
        estimates_version: Version string used both as the sub-folder name and
            as a required substring of the file name.

    Returns:
        pandas.DataFrame: the housing data with ``unoccupied`` set to
        ``units - occupied``.
    """
    source_dir = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}'
    housing = find_and_download_files(
        url=source_dir,
        estimates_version=estimates_version,
        category='housing',
    )
    # Derived column: units that are not occupied.
    housing['unoccupied'] = housing['units'] - housing['occupied']
    return housing

In [18]:
def population_data_pull(estimates_version):
    """Read the region population file for a version.

    Args:
        estimates_version: Version string used both as the sub-folder name and
            as a required substring of the file name.

    Returns:
        pandas.DataFrame: the population data exactly as loaded.
    """
    source_dir = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}'
    return find_and_download_files(
        url=source_dir,
        estimates_version=estimates_version,
        category='population',
    )

In [19]:
def households_data_pull(estimates_version):
    """Read the region households file for a version and add two grouped columns.

    Args:
        estimates_version: Version string used both as the sub-folder name and
            as a required substring of the file name.

    Returns:
        pandas.DataFrame: the households data with ``two_to_four`` (sum of
        columns 2, 3, 4) and ``five_plus`` (sum of columns 5, 6, 7) added.
    """
    source_dir = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\individual_files\{estimates_version}'
    households = find_and_download_files(
        url=source_dir,
        estimates_version=estimates_version,
        category='households',
    )

    # The source columns are labelled with integers, not strings.
    two_to_four = households[2] + households[3] + households[4]
    households['two_to_four'] = two_to_four

    five_plus = households[5] + households[6] + households[7]
    households['five_plus'] = five_plus

    return households

In [20]:
def pull_excel_data_and_merge(estimates_version):
    """Load households, housing, and population data and join them into one frame.

    The three frames are merged (default inner join) on ``region`` and
    ``yr_id``, which then become the index of the result.

    Args:
        estimates_version: Version string forwarded to each loader.

    Returns:
        pandas.DataFrame: the merged data indexed by ``['region', 'yr_id']``.
    """
    join_keys = ['region', 'yr_id']

    # Start from households, then attach housing and population in turn.
    combined = households_data_pull(estimates_version)
    combined = combined.merge(housing_data_pull(estimates_version), on=join_keys)
    combined = combined.merge(population_data_pull(estimates_version), on=join_keys)

    return combined.set_index(join_keys)

In [21]:
def subtract_dataframes(df1, df2):
    """Return ``df1 - df2`` restricted to the rows and columns both frames share.

    Args:
        df1: Frame to subtract from.
        df2: Frame to subtract.

    Returns:
        pandas.DataFrame: element-wise difference over the index labels
        present in both frames and the columns present in both frames, with
        columns ordered as they appear in ``df2``.
    """
    # An inner join on the index yields exactly the labels present in both frames.
    shared_rows = pd.merge(df1, df2, left_index=True, right_index=True).index

    # Keep df2's column order, dropping anything df1 does not have.
    left_columns = set(df1.columns)
    shared_cols = [col for col in df2.columns if col in left_columns]

    minuend = df1.loc[shared_rows][shared_cols]
    subtrahend = df2.loc[shared_rows][shared_cols]

    return minuend.sub(subtrahend)

In [22]:
def percent_diff_dataframes(df1, df2):
    """Return the percent difference of ``df1`` relative to ``df2``.

    Computes ``(df1 - df2) / df2 * 100`` over the index labels and columns
    present in both frames, rounded to two decimals.

    Args:
        df1: Frame being compared.
        df2: Baseline frame (the denominator).

    Returns:
        pandas.DataFrame: percent differences, with columns ordered as they
        appear in ``df2``.
    """
    # An inner join on the index yields exactly the labels present in both frames.
    shared_rows = pd.merge(df1, df2, left_index=True, right_index=True).index

    # Keep df2's column order, dropping anything df1 does not have.
    left_columns = set(df1.columns)
    shared_cols = [col for col in df2.columns if col in left_columns]

    compared = df1.loc[shared_rows][shared_cols]
    baseline = df2.loc[shared_rows][shared_cols]

    # Relative change against the baseline, expressed in percent.
    percent = compared.sub(baseline).div(baseline) * 100
    return percent.round(decimals=2)

In [23]:
def create_output(excel_data, dof_data, estimates_data):
    """Build the difference and percent-difference tables (estimates vs. DOF).

    Args:
        excel_data: Estimates frame indexed by ``['region', 'yr_id']``. Pass
            ``None`` to have it loaded via ``pull_excel_data_and_merge``.
        dof_data: DOF frame indexed by ``['region', 'yr_id']``. Pass ``None``
            to have it loaded via ``dof_data_pull``.
        estimates_data: Estimates version string; only used when
            ``excel_data`` has to be loaded here.

    Returns:
        tuple: ``(diff_file, pct_diff_file)`` where ``diff_file`` is
        ``excel_data - dof_data`` and ``pct_diff_file`` is the percent
        difference relative to ``dof_data``.
    """
    # Use the frames the caller supplied. The previous version overwrote both
    # arguments with fresh pulls, ignoring the inputs and repeating the
    # database query and Excel reads on every call.
    if dof_data is None:
        dof_data = dof_data_pull()
    if excel_data is None:
        excel_data = pull_excel_data_and_merge(estimates_data)

    diff_file = subtract_dataframes(excel_data, dof_data)
    pct_diff_file = percent_diff_dataframes(excel_data, dof_data)

    return diff_file, pct_diff_file

In [24]:
def write_to_excel(excel_data, dof_data, estimates_version):
    """Write the estimates, DOF, difference, and percent-difference sheets to one workbook.

    The workbook is named ``region_est_minus_dof_QA.xlsx`` and is written to
    the ``dof_comparison`` sub-folder for ``estimates_version``.

    Args:
        excel_data: Estimates frame; written to the ``'<version> Data'`` sheet.
        dof_data: DOF frame; written to the ``'DOF Data'`` sheet.
        estimates_version: Version string used in the output folder and in the
            first sheet's name.
    """
    diff, pct = create_output(excel_data, dof_data, estimates_version)
    output_path = rf'J:\DataScience\DataQuality\QAQC\Estimates QC Automation\v_series15\dof_comparison\{estimates_version}\region_est_minus_dof_QA.xlsx'

    # The context manager saves and closes the workbook on exit, including when
    # a sheet write raises. ExcelWriter.save() was removed in pandas 2.0.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        # Write each data frame to a different sheet
        excel_data.reset_index().to_excel(writer, sheet_name=f'{estimates_version} Data', index=False)
        dof_data.reset_index().to_excel(writer, sheet_name='DOF Data', index=False)
        diff.reset_index().to_excel(writer, sheet_name='Diff', index=False)
        pct.reset_index().to_excel(writer, sheet_name='PCT Diff', index=False)

In [25]:
def main(estimates_version):
    """Run the DOF comparison for one estimates version and write the QA workbook.

    Args:
        estimates_version: Version string identifying which estimates files to
            load and which output folder to write to (for example ``'2022_01'``).
    """
    dof_data = dof_data_pull()
    estimates_data = pull_excel_data_and_merge(estimates_version)
    # write_to_excel builds the difference tables itself via create_output, so
    # the extra create_output call whose results were discarded is removed.
    write_to_excel(estimates_data, dof_data, estimates_version)


In [28]:
# Estimates version to compare against DOF; change this to run another version.
ESTIMATES_VERSION = '2022_01'
main(estimates_version=ESTIMATES_VERSION)