In [None]:
#point the working directory at the aggregation code folder before running anything else
#fill in the folder path below, it does not need a "/" at the end
import os

aggregation_code_folder = ''
os.chdir(aggregation_code_folder)

In [None]:
#read in required packages
#yaml is used to parse the config and pandas to read and convert the input data
import yaml
import pandas as pd
import numpy as np
#os is already imported in the first cell, repeating the import here is harmless
import os
from yaml.loader import SafeLoader
import sys
#add "../" to the module search path at position 1
#a relative entry like this is resolved against the current working directory when modules are imported
sys.path.insert(1, "../")

In [None]:
#load the yaml config kept in the aggregation code folder, it holds the file paths and field names
#used by the rest of the notebook
#update and save the config every time the inputs to the notebook change
#note: '__file__' is a string literal here, so realpath resolves it against the current working
#directory and the folder path below ends up being the parent of the working directory
pseudo_notebook_path = os.path.realpath('__file__')
clustering_refactor_folder_path = os.path.abspath(os.path.join(pseudo_notebook_path, '../..'))

#the config file itself is opened relative to the current working directory
config_path = "config.yaml"
with open(config_path, encoding="utf-8") as config_file:
    loaded_config = yaml.safe_load(config_file)

In [None]:
#import functions from other scripts, these include functions to aggregate, reshape and export the data.
#these scripts also include functions to fill newly created geographies and calculate rates and percentages.
#the wildcard imports are expected to provide the helpers called later in this notebook:
#update_boundaries_keep_column, add_percentages, unstack_multiple_values,
#get_all_desired_geographies_keep_column, check_missing_geographies and export_to_xlsx
#which script defines each helper is not visible from here, confirm before switching to explicit imports
from function_scripts.export_and_qa import *
from function_scripts.update_boundaries import *
from function_scripts.metric_calculation import *
from function_scripts.reshape_data import *
from function_scripts.aggregate_data import *

In [None]:
#make sure an output folder exists in the current working directory, outputs are saved there
#a message reports whether the folder was found or had to be created
output_folder = 'Outputs'
if os.path.isdir(output_folder):
    print(output_folder, "folder already exists.")
else:
    os.makedirs(output_folder)
    print("created folder : ", output_folder)

In [None]:
#read the raw input file, its location is built from the "lu" and "rawfile" entries in the config
#swap pd.read_csv for another reader if your input data is not a csv
#values listed under "nalist" in the config (common suppression symbols) are read as NAs
#so numeric operations can be undertaken on the data columns
#add to that list in the config if your data uses a new suppression symbol
input_location = loaded_config["lu"]
input_file_name = loaded_config["rawfile"]
datapath = input_location + input_file_name
raw_data = pd.read_csv(datapath, na_values=loaded_config["nalist"])

In [None]:
#pull the column names and output name out of the config into notebook level variables
(
    df_areacode,
    df_areaname,
    numerator_column,
    denominator_column,
    outname,
    keep_variable,
) = (
    loaded_config[config_key]
    for config_key in (
        'df_areacode',
        'df_areaname',
        'numerator_column',
        'denominator_column',
        "outname",
        "keep_variable",
    )
)


In [None]:
#keep only the rows that have an area code, rows with a missing area code are dropped
#any other subsetting or data cleaning should also be done in this cell
#after this cell the dataframe should hold the data you want to aggregate and nothing else
has_area_code = raw_data[df_areacode].notna()
raw_data = raw_data.loc[has_area_code]

In [None]:
#reduce the raw data to the area code, area name, keep variable and the two value columns
columns_to_keep = [
    df_areacode,
    df_areaname,
    keep_variable,
    numerator_column,
    denominator_column,
]
raw_data = raw_data[columns_to_keep]

In [None]:
#make value columns numeric, this must be done at this stage as the next stage involves numeric operations
#raw_data was produced by slicing an earlier dataframe, so take an explicit copy before assigning
#columns; writing straight onto the slice raises pandas' SettingWithCopyWarning
raw_data = raw_data.copy()
for value_column in (numerator_column, denominator_column):
    #pd.to_numeric raises if a value cannot be parsed as a number, so unexpected
    #suppression symbols surface here rather than silently becoming NAs
    raw_data[value_column] = pd.to_numeric(raw_data[value_column])

In [None]:
#generate any new geographies that your data does not already include
#the function takes the input data and the config and uses a lookup to estimate new geography values
#this step may not be required if all new geographies are already present
#the "keep_column" version of the function is the one to use with time series data
working_data = update_boundaries_keep_column(df=raw_data, loaded_config=loaded_config)

In [None]:
#build the dataset for the geography you are aggregating from, with the new geographies added
#it is exported in the output file for governance
#a deep copy is taken so working_data is left untouched, and the area code column is
#renamed to the consistent "AREACD"
lowest_geography_data = (
    working_data
    .copy(deep=True)
    .rename(columns={df_areacode: 'AREACD'})
)

In [None]:
#generate the value for the lowest geography data with the percentages function
#it uses the data and the columns specified in the loaded config to produce the variable of interest
#the metric_calculation script also has a function to calculate rates instead
lowest_geography_data = add_percentages(data=lowest_geography_data, loaded_config=loaded_config)

In [None]:
#reshape the lowest geography data into wide format so it is easier to read
#this stage is optional, keep the long format if the data feeds further coding
#replace "percent" below with the new name if you are using a rate or other value column
lowest_geography_value_columns = [numerator_column, denominator_column, "percent"]
lowest_geography_data = unstack_multiple_values(
    df=lowest_geography_data,
    loaded_config=loaded_config,
    value_cols=lowest_geography_value_columns,
)

In [None]:
#aggregate the working data up to every desired geography
#this needs a lookup holding the original geography code and all geography codes you want to
#aggregate to, the desired geography column names are specified in the config
#the function merges the lookup to the data and then groups by each of the new geography columns
#it returns 1 dataframe containing all specified geographies
#a larger geography is returned as a missing cell if any of its underlying data is missing
#the "keep_column" version of the function is the one to use with time series data
aggregated_data = get_all_desired_geographies_keep_column(data=working_data, loaded_config=loaded_config)

In [None]:
#generate the value for the aggregated data with the percentages function
#it uses the data and the columns specified in the loaded config to produce the variable of interest
#the metric_calculation script also has a function to calculate rates instead
aggregated_data = add_percentages(data=aggregated_data, loaded_config=loaded_config)


In [None]:
#reshape the aggregated data into wide format so it is easier to read
#this stage is optional, keep the long format if the data feeds further coding
#replace "percent" below with the new name if you are using a rate or other value column
aggregated_value_columns = [numerator_column, denominator_column, "percent"]
aggregated_data = unstack_multiple_values(
    df=aggregated_data,
    loaded_config=loaded_config,
    value_cols=aggregated_value_columns,
)

In [None]:
#isolate the missing results in your underlying data and merge them to your geography lookup
#the resulting dataframe lists all the missing underlying data together with the geographies
#that will be missing in your aggregation output
missing_geographies = check_missing_geographies(data=working_data, loaded_config=loaded_config)


In [None]:
#export the required data in xlsx format into the outputs folder
#the frames exported are the data you are aggregating, the aggregated data and the missing
#geographies, keyed by the names given below
#the output file name comes from the config
export_frames = {
    'lowest_geography_data': lowest_geography_data,
    'aggregated_data': aggregated_data,
    'missing_geographies': missing_geographies,
}
export_to_xlsx(frames=export_frames, file_path="Outputs", file_name=outname)