# Data QC Functions
This script houses all of the data QC functions.

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import yaml
import os
import pyodbc
import glob
import copy
import traceback
import warnings
warnings.filterwarnings('ignore')

# Input Data

In [2]:
# Information
DATASOURCE_IDS = ['99']
data_input_folder = 'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_13_outputs_CSV_data/aggregated_data/'
data_output_folders = 'J:/DataScience/DataQuality/QAQC/forecast_automation/mgra_series_13_outputs_CSV_data'
geography_levels = ['mgra', 'region']

# Download Data

In [4]:
# download all of the data 
all_data = {}
for ds_id in DATASOURCE_IDS:
    temp_dict = {}
    for file in os.listdir(data_input_folder):
        if (ds_id in file):
            print(f"Now creating work for {file}")
            # TODO: Have two folders input and output and have this grab from the input folder 
            temp_dict[file] = pd.read_csv(data_input_folder + file)
            print(f"{file} is uploaded")
    all_data[ds_id] = temp_dict

Now creating work for luz_DS99_ind_QA.csv
luz_DS99_ind_QA.csv is uploaded
Now creating work for mgra_DS99_ind_QA.csv
mgra_DS99_ind_QA.csv is uploaded
Now creating work for mgra_SQLDS99_ind_QA.csv
mgra_SQLDS99_ind_QA.csv is uploaded
Now creating work for region_DS99_ind_QA.csv
region_DS99_ind_QA.csv is uploaded
Now creating work for region_SQLDS99_ind_QA.csv
region_SQLDS99_ind_QA.csv is uploaded
Now creating work for taz_DS99_ind_QA.csv
taz_DS99_ind_QA.csv is uploaded


# Internal Consistency Check

In [83]:
def internal_consistency(DSID, specific_columns:False, level_comparison):
    level_1_df = all_data[DSID][level_comparison[0] + f"_DS{DSID}_ind_QA.csv"]
    level_2_df = all_data[DSID][level_comparison[1] + f"_DS{DSID}_ind_QA.csv"]

    # Check the years 
    non_shared_years = [x for x in set(level_1_df['year']) if x not in set(level_2_df['year'])]
    if len(non_shared_years) != 0:
        print('Years are not consistent accross geography levels')
    else:
        print('Years are consistent accross geography levels')


    if specific_columns != False:
        shared_columns = specific_columns
    else:
        shared_columns = [x for x in level_1_df.columns if x in level_2_df.columns]
        shared_columns = list(set(shared_columns) - set(['year', level_comparison[0], level_comparison[1]]))


    level_1_df = level_1_df[shared_columns]
    level_1_df = pd.DataFrame(level_1_df.sum(), columns=[level_comparison[0]])

    level_2_df = level_2_df[shared_columns]
    level_2_df = pd.DataFrame(level_2_df.sum(), columns=[level_comparison[1]])

    final_df = level_1_df.merge(level_2_df, left_index=True, right_index=True)
    final_df['Diff'] = final_df[level_comparison[0]] - final_df[level_comparison[1]]
    final_df = final_df[final_df['Diff'] != 0]

    if len(final_df) == 0:
        return f"Column Sums are consistent accross {level_comparison[0]} and {level_comparison[1]}"
    else:
        return final_df

In [86]:
internal_consistency('99', specific_columns = False, level_comparison = ['mgra', 'region'])

Years are consistent accross geography levels


Unnamed: 0,mgra,region,Diff
beachactive,17905.31,17905.31,-3.637979e-12
milestocoast,3188995.0,3188995.0,-4.656613e-10
effective_acres,16266230.0,16266230.0,-1.862645e-09
acres,35453650.0,35453650.0,-1.490116e-08
parkactive,85134.91,85134.91,1.455192e-11
land_acres,35081650.0,35081650.0,-7.450581e-09
