# Estimates Analysis Long Format Outputs

<font size="3">

Purpose:  
QA analysis of estimates outputs. Generate YOY and diff files in long format to facilitate analysis in Excel as needed.

Inputs:  
Files generated from Estimates QC Automation process: J:/DataScience/DataQuality/QAQC/Estimates QC Automation

Outputs:
- [YOY threshold analysis for current vintage](#yoy-analysis)
- [Difference threshold analysis between two vintages](#diff-analysis)

Author: Dante Lee  
Date Created: 3/29/2023  
Last Updated: 5/4/2023
</font>

In [1]:
import pandas as pd
import os
from openpyxl.workbook import Workbook

## Data Preparation

In [6]:
# Vintage identifiers. `current` is the vintage analysed in the YOY section;
# the Diff section compares `current` against `previous`.
current = '2022_04'
previous = '2022_03'
# Root of the input workbooks; the YOY section reads from
# <folder_input>/individual_files/<current>/.
folder_input = 'J:/DataScience/DataQuality/QAQC/Estimates QC Automation/v_series15'
# Root of the results tree; the YOY section writes under 'Test 3 - Threshold YOY'
# and the Diff section reads and appends to workbooks under
# 'Test 4- Vintage Comparison/diff_outputs'.
# NOTE(review): user-specific absolute path - presumably must be edited per machine.
folder_output = 'C:/Users/dle/OneDrive - San Diego Association of Governments/Projects/2023/2023-023 Estimates 2022/Results'

# GEOGRAPHY LEVELS AVAILABLE:
# 'region', 'jurisdiction', 'cpa', 'sra', 'census_tract', 'luz', 'mgra'
# Both analysis sections loop over every entry of `geography`; the commented
# assignments below are alternative selections kept for quick switching.
geography = ['jurisdiction', 'cpa']
# geography = ['cpa']
# geography = ['region', 'jurisdiction', 'cpa', 'sra', 'census_tract', 'luz', 'mgra']

# VARIABLES AVAILABLE:
# 'age', 'children', 'ethnicity', 'households', 'housing', 'income', 'population', 'sex', 'workers', 'age_sex_ethnicity'
# Both analysis sections loop over every entry of `variable` for each geography;
# the commented assignments are alternative selections.
# variable = ['age_sex_ethnicity', 'income']
variable = ['sex']
# variable = ['age', 'children', 'ethnicity', 'households', 'housing', 'population', 'sex', 'workers']

## YOY Analysis
Transform data into long format and calculate numeric and percentage year-over-year changes within each geography/variable group.

In [4]:
# Generate YOY change outputs for current vintage.
#
# For every geography/variable pair this cell reads the wide-format QA workbook,
# unpivots it to long format, adds year-over-year `diff` and `diff_percent`
# columns, and writes the result to
# <folder_output>/Test 3 - Threshold YOY/<current>/<var>/<geo>_<var>_est_<current>_YOY.xlsx
print(current)

for geo in geography:
    for var in variable:
        file_name = geo + '_est_' + current + '_' + var + '_ind_QA.xlsx'
        df = pd.read_excel(folder_input + '/individual_files/' + current + '/' + file_name)

        # Rename geography level to generic 'geography' so the same group keys
        # work for every geography level.
        df = df.rename(columns={df.columns[0]: 'geography'})

        # Unpivot table: the first two columns stay as identifiers, every other
        # column becomes a ('variable', 'value') row.
        # Pass column labels (not sub-DataFrames) to melt.
        # NOTE(review): for 'age_sex_ethnicity' the group keys below also need
        # 'age group' and 'sex' to survive the melt - confirm that input's layout.
        df_unpivot = pd.melt(df,
                             id_vars=list(df.columns[0:2]),
                             value_vars=list(df.columns[2:]))

        # Calculate value differences and percentage differences for threshold analysis
        if var == 'age_sex_ethnicity':
            group_keys = ['geography', 'age group', 'sex', 'variable']
        else:
            group_keys = ['geography', 'variable']
        grouped_values = df_unpivot.groupby(group_keys)['value']

        # Assumes rows within each group are already in chronological order.
        df_unpivot['diff'] = grouped_values.diff()

        # Percentage difference from previous value. The shift is taken within
        # the same groups as the diff so the denominator is always the previous
        # row of the SAME group, even if a group's rows are not contiguous.
        df_unpivot['diff_percent'] = df_unpivot['diff'] / grouped_values.shift(1)

        # f-string: the placeholders must be interpolated, otherwise every
        # iteration would target the same literal '{current}/{var}' folder.
        path = f'{folder_output}/Test 3 - Threshold YOY/{current}/{var}'
        os.makedirs(path, exist_ok=True)

        # Save results to Excel file; the context manager closes the workbook
        # even if writing fails.
        with pd.ExcelWriter(f'{path}/{geo}_{var}_est_{current}_YOY.xlsx') as writer:
            df_unpivot.to_excel(writer, sheet_name=var, merge_cells=False, index=False)

        print(f'{geo} - {var} complete')

2022_04
jurisdiction - sex complete
cpa - sex complete


## Diff Analysis
Combine current and previous vintages into a single sheet in long format, and calculate numeric and percentage differences between the two vintages.

In [5]:
# Generate vintage-to-vintage difference outputs.
#
# For every geography/variable pair this cell opens the existing diff workbook,
# unpivots its '<current> Data' and '<previous> Data' sheets to long format,
# joins them, adds `diff` and `diff_percent` columns, and writes the result
# back into the same workbook as sheet 'QC diff' (replacing it if present).
def _vintage_sheet_to_long(workbook_path, vintage):
    """Read the '<vintage> Data' sheet and return it in long format.

    The first two columns are kept as identifiers and all remaining columns are
    unpivoted. The first column is renamed to 'geography' and the melted value
    column is renamed to `vintage`.
    """
    wide = pd.read_excel(workbook_path, sheet_name=vintage + ' Data')
    # Pass column labels (not sub-DataFrames) to melt.
    long_form = pd.melt(wide,
                        id_vars=list(wide.columns[0:2]),
                        value_vars=list(wide.columns[2:]))
    return long_form.rename(columns={long_form.columns[0]: 'geography',
                                     long_form.columns[-1]: vintage})


print('{0}-{1}'.format(current, previous))

for geo in geography:
    for var in variable:
        file_diff = geo + '_' + var + '_est_' + current + '_minus_' + previous + '_QA.xlsx'
        file_diff_path = folder_output + '/Test 4- Vintage Comparison/diff_outputs/' + current + ' and ' + previous + ' diff/' + var + '/' + file_diff

        # One long-format frame per vintage; the value column is named after the vintage.
        df1_unpivot = _vintage_sheet_to_long(file_diff_path, current)
        df2_unpivot = _vintage_sheet_to_long(file_diff_path, previous)

        # Join dataframes together and calculate diff and percent diff.
        # Left join: rows present only in the previous vintage are not kept.
        # NOTE(review): the join requires the second identifier column to be
        # named 'yr_id' in both sheets - confirm for every variable.
        df_diff = pd.merge(df1_unpivot, df2_unpivot, how='left',
                           on=['geography', 'yr_id', 'variable'])

        df_diff['diff'] = df_diff[current] - df_diff[previous]
        df_diff['diff_percent'] = df_diff['diff'] / df_diff[previous]
        df_diff = df_diff.sort_values(by=['geography', 'yr_id'])

        # Save results to Excel file (append mode keeps the workbook's other sheets)
        with pd.ExcelWriter(file_diff_path, engine='openpyxl', mode='a', if_sheet_exists='replace') as writer:
            df_diff.to_excel(writer, sheet_name='QC diff', index=False)

        # f-string so the actual geography/variable is reported.
        print(f'{geo} - {var} complete')

2022_04-2022_03
cpa - ethnicity complete
