In [None]:
# Make the clustering refactor folder the working directory, so that relative
# paths used in later cells (e.g. "config.yaml") are resolved from there
import os

working_directory = 'FILEPATH'
os.chdir(working_directory)


In [None]:
#Imports the functions from other scripts
#If you make any changes to the underlying scripts you will need to rerun the code from here for those changes to apply
import os
import sys

import pandas as pd
import yaml
from yaml.loader import SafeLoader

# The parent folder has to be on sys.path before the project modules are imported
sys.path.insert(1, "../")

from data_prep.subnat_data_clean import *
from data_prep.subnat_data_import import *
from Cluster_code.cluster_functions import *
from Cluster_code import cluster_functions

In [None]:
# Read the YAML config file into `loaded_config`
# NOTE(review): '__file__' is a string literal here, not the module attribute, so this
# path works out as the parent of the current working directory; the variable is not
# used by any other cell visible in this notebook - confirm whether it is still needed.
clustering_refactor_folder_path = os.path.abspath(
    os.path.join(os.path.realpath('__file__'), '../..')
)
# Relative path: resolved against the current working directory
config_path = "config.yaml"
with open(config_path, encoding="utf-8") as config_file:
    loaded_config = yaml.safe_load(config_file)

In [None]:
# Load the data and split it into individual metrics
indicator_columns = ["AREACD", "AREANM", "Indicator", "Value"]
indicators_table_name = loaded_config["subnational_indicators_table_name"]

datasets = import_data(
    loaded_config=loaded_config,
    cols_to_select=indicator_columns,
    table_name=indicators_table_name,
)

In [None]:
# Cleans the data, including UTLA imputation and boundary changes.
# The value returned by clean_groups is stored back under its key: only rebinding a
# loop variable would leave `datasets` holding whatever it held before.
# NOTE(review): assumes clean_groups returns the cleaned data - confirm against
# data_prep.subnat_data_clean.
datasets = {
    key: clean_groups(loaded_config, value)
    for key, value in datasets.items()
}

In [None]:
# Pivot each metric dataset into a table, keyed the same way as `datasets`;
# your specified data is then available as tables['custom_metrics']
tables = {name: metrics_to_table(dataset) for name, dataset in datasets.items()}


In [None]:
# Display up to 500 rows of a frame so data can be spot checked in the notebook
pd.options.display.max_rows = 500

In [None]:
# Isolate the desired geography type from a specified code column in the lookup file.
# Editing the lookup file lets this run on any geography type or subset of geography.
cluster_df = get_desired_geography(
    loaded_config=loaded_config,
    df=tables['custom_metrics'],
    geography_col="AREACD",
)
cluster_df

In [None]:
# Compute Pearson's correlation between all metrics in the dataset
correlation_matrix = get_correlation_matrix(df=cluster_df)
correlation_matrix

In [None]:
# Output a data frame of the winsorization thresholds (if required) for QA and
# governance purposes. The 0.01 / 0.99 percentiles here are the same values that
# are passed to the winsorizing step itself.
thresholds = get_winsorization_thresholds(
    df=cluster_df,
    lower_threshold=0.01,
    upper_threshold=0.99,
)
thresholds

In [None]:
# Winsorize the data: values below the lower percentile or above the upper percentile
# are set to that percentile threshold. The thresholds can be altered; the winsorized
# data is returned.
cluster_df_win = winsorze(
    df=cluster_df,
    lower_threshold=0.01,
    upper_threshold=0.99,
)

In [None]:
# Run the k-means model on the winsorized data (pass other data through `metrics` if preferred).
# n_init: number of times the model is run - 100 is recommended for an initial run
#         and 10000 for the final output.
# min_k / max_k: range of potential cluster numbers; wider ranges take longer to run.
# Outputs: a geodataframe including clusters, the cluster centres (for the radar plot)
#          and a silhouette score df.
clustering_outputs = make_clustering_model(
    loaded_config=loaded_config,
    metrics=cluster_df_win,
    n_init=10000,
    min_k=4,
    max_k=15,
)
cluster_geodataframe, cluster_centres, sil_score = clustering_outputs
cluster_geodataframe

In [None]:
# Build the cluster table: takes the geodataframe, drops geodata columns and adds an
# area name column.
# The function is called through its module because the result is bound to the same
# name (`cluster_table`); calling the bare name would fail with "object is not
# callable" the second time this cell is run.
cluster_table = cluster_functions.cluster_table(
    loaded_config=loaded_config,
    clusters_table=cluster_geodataframe,
)
cluster_table

In [None]:
# Create a map from the geodataframe showing the cluster of each area.
# The map is automatically saved into the output folder and can be called into the
# excel output at the end of the script.
# The function is called through its module because the result is bound to the same
# name (`cluster_map`); calling the bare name would fail when this cell is re-run.
cluster_map = cluster_functions.cluster_map(
    clusters=cluster_geodataframe,
)

In [None]:
# Create a radar plot from the geodataframe, the cluster centres and the dataframe
# containing the metrics.
# The plot is automatically saved into the output folder and can be called into the
# excel output at the end of the script.
# For models with more than 6 metrics, variable names may overlap.
# The function is called through its module because the result is bound to the same
# name (`radar_plot`); calling the bare name would fail when this cell is re-run.
radar_plot = cluster_functions.radar_plot(
    loaded_config=loaded_config,
    metrics=cluster_df,
    clusters=cluster_geodataframe,
    centres=cluster_centres,
)

In [None]:
# Create an ITL1 pivot table by cluster from the cluster geodataframe, using lookups
ITL1_table = ITL1_summary(
    clusters_table=cluster_geodataframe,
    loaded_config=loaded_config,
)
ITL1_table

In [None]:
# Table of mean values for each variable by cluster, built from the cluster
# geodataframe and the df of metrics.
# The total column shows an average over the desired geographies only: it is not
# the same as a UK average and should not be treated as such.
mean_table = clusters_summary_stats(
    table_metrics=cluster_df,
    clusters_table=cluster_geodataframe,
    stats="mean",
)
mean_table

In [None]:
# Table of median values for each variable by cluster, built from the cluster
# geodataframe and the df of metrics.
# The total column shows an average over the desired geographies only: it is not
# the same as a UK average and should not be treated as such.
median_table = clusters_summary_stats(
    table_metrics=cluster_df,
    clusters_table=cluster_geodataframe,
    stats="median",
)
median_table

In [None]:
# Export all relevant data to a single xlsx file in the outputs folder.
# `include_maps` controls whether the visualisations are included.
# If not all data is required, trim `export_frames` down to the desired sheets.
# File path and file name must be specified: leave off the final "/" of the file path
# and the ".xlsx" extension of the file name, or the export won't work.
export_frames = {
    'Cluster_table': cluster_table,
    'ITL1_table': ITL1_table,
    'Silhoutte_score': sil_score,
    'Cluster_medians': median_table,
    'Cluster_means': mean_table,
    'correlation_matrix': correlation_matrix,
    'data': cluster_df,
    'winsorized_data': cluster_df_win,
}
export_to_xlsx(
    frames=export_frames,
    file_path="FILEPATH",
    file_name="FILENAME",
    include_maps=True,
)
