In [1]:
# clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
import networkx as nx

# import packages
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
# import holidays

import os
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
from joblib import dump, load
import joblib
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import mean_absolute_error

from quantile_forest import RandomForestQuantileRegressor

from scipy.cluster.hierarchy import dendrogram, linkage


import pickle
import numpy as np
from functools import lru_cache
from pathlib import Path

import h3


In [2]:
def mean_squared_error(y_true, y_pred, squared=False):
    if squared:
        return np.mean((y_true - y_pred)**2)
    else:
        return np.sqrt(np.mean((y_true - y_pred)**2))

In [3]:
def root_mean_squared_log_error(actual,predicted):
    squared_diffs = np.zeros(len(predicted))
    for i in range(len(predicted)):
        squared_diffs = (np.log(predicted[i]+1) - np.log(actual[i]+1))**2
    rmsle = np.sqrt(np.mean(squared_diffs))
    return rmsle

def variance_residual(y_true, y_pred):
    residuals = y_true - y_pred
    return round(np.std(residuals), 3)

def prediction_evaluate(y_pred, y_true):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred, squared=False)
    rmsle = root_mean_squared_log_error(y_true, y_pred)
    resid_std = variance_residual(y_true, y_pred)
    return mae, rmse, rmsle, resid_std

In [4]:
def MAE(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred))
    # return mean_absolute_error(y_true, y_pred).round(3)

def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))
    # return mean_squared_error(y_true, y_pred, squared=False).round(3)

def RMSLE(y_true, y_pred):
    return np.sqrt(np.mean((np.log1p(y_true+1) - np.log1p(y_pred+1))**2))

def MAPE(y_true, y_pred, c=1):
    return np.mean(np.abs((y_true - y_pred +c) / (y_true+c)) * 100)

def AE(y_true, y_pred):
    return np.abs(y_true - y_pred)

def prediction_evaluate(y_pred, y_true):
    mae = MAE(y_true, y_pred)
    rmse = RMSE(y_true, y_pred)
    rmsle = RMSLE(y_true, y_pred)
    # mape = MAPE(y_true, y_pred)
    # ae = AE(y_true, y_pred)
    resid_std = variance_residual(y_true, y_pred)
    return mae, rmse, rmsle, resid_std

## Create graph of h3 grids and their connectivity info

In [None]:
# load zone address from txt file
zone_addresses = None

# find the h3 index for each zone
h3_indexes = [h3.latlng_to_cell(lat, lon, 8) for (lat, lon) in zone_addresses.values()]

# a function to check whether two zones are neighbors
def are_zones_neighbors(h1, h2):
    return h3.are_neighbor_cells(h1, h2)  

def return_connected_zones_index_on_list(h3_indexes):
    G = nx.Graph()
    for i in range(len(h3_indexes)):
        for j in range(i + 1, len(h3_indexes)):
            if are_zones_neighbors(h3_indexes[i], h3_indexes[j]):
                G.add_edge(i, j)
    return G

zone_graph = return_connected_zones_index_on_list(h3_indexes)

# check the connected nodes for each node on zone_graph
connected_pairs = []
for node in zone_graph.nodes():
    neighbors = list(zone_graph.neighbors(node))
    # print(f"Node {node} is connected to {neighbors}")
    # for each pair of neighbor, record it as a tuple
    for i in range(len(neighbors)):
        connected_pairs.append((node, neighbors[i]))
    # if (i,j) is inlcuded in connected_pairs, keep only (i,j), remove (j,i)
    connected_pairs = [(i,j) for (i,j) in connected_pairs if i < j]

connected_pairs

[(1, 7),
 (1, 11),
 (2, 3),
 (2, 10),
 (2, 15),
 (3, 8),
 (3, 15),
 (10, 12),
 (10, 18),
 (8, 15),
 (4, 14),
 (4, 16),
 (14, 16),
 (14, 17),
 (16, 17),
 (5, 9),
 (5, 11),
 (5, 16),
 (5, 17),
 (5, 19),
 (9, 11),
 (9, 19),
 (17, 19),
 (6, 13),
 (13, 14),
 (13, 17),
 (12, 18),
 (18, 19)]

# Data PreProcessing

In [5]:
df_15min = pd.read_csv('Data/case 1/processed_data.csv')
df_test = df_15min.iloc[-20*308:]
df_train_21w = df_15min.iloc[-20*308*22:-20*308]
# df_train_4w = df_15min.iloc[-20*308*5:-20*308]
print(len(df_train_21w)/20)
# print(len(df_train_4w)/20)
print(len(df_test)/20)

6468.0
308.0


In [6]:
df_test.head()

Unnamed: 0,date,DayofWeek,Hour,Quarter,OrZone,temp,wspd,prep,Is_Holiday,AR1,AR2,AR3,AR4,counts,zone_id
138520,2020-09-07,0,21,1,zone 1,154,40,0,0,0.0,1.0,0.0,2.0,1.0,2
138521,2020-09-07,0,21,1,zone 10,154,40,0,0,0.0,0.0,0.0,1.0,0.0,7
138522,2020-09-07,0,21,1,zone 11,154,40,0,0,0.0,0.0,1.0,2.0,0.0,15
138523,2020-09-07,0,21,1,zone 12,154,40,0,0,2.0,3.0,3.0,2.0,0.0,3
138524,2020-09-07,0,21,1,zone 13,154,40,0,0,1.0,3.0,3.0,2.0,0.0,19


In [7]:
# load data
df_15min = pd.read_csv('Data/case 1/processed_data.csv')
df_15min_train = df_15min.iloc[-20*308*22:-20*308]
df_15min = df_15min.iloc[-20*308:]

In [None]:
# create historical avergae dictionary
list_of_ha_dict_21 = []
for z in range(20):
    df_train_21w_ = df_train_21w[df_train_21w.OrZone == f'zone {z+1}']
    df_train_21w_['wdhr'] = [(x, y) for x, y in zip(df_train_21w_['DayofWeek'], df_train_21w_['Hour'])]
    temp = df_train_21w_.groupby('wdhr')['counts'].mean()
    list_of_ha_dict_21.append(temp.to_dict())

## CRPS evaluation for HA and LDQRF, QRF

In [45]:
def read_model(num_week, model_name,zone_name):
    'model reading'
    file_name = f'Case_1_Models/week_{num_week}/{model_name}_{zone_name}.joblib'
    model = load(file_name)
    return model

In [50]:
'probabilistic prediction evaluation: CRPD'
from scipy.integrate import quad

def CRPS(df_, pred_model, quantiles, with_LD = True):
    'continuous ranked probability score'
    def empirical_cdf(predicted_values, quantiles):
        # Construct empirical CDF from predicted quantiles
        def cdf(x):
            return np.interp(x, predicted_values, quantiles, left=0, right=1)
        return cdf
    # Define CRPS calculation
    def crps(cdf_function, observed_value):
        def integrand(z):
            return (cdf_function(z) - (z >= observed_value)) ** 2
        crps_value, _ = quad(integrand, -np.inf, np.inf)
        return crps_value
    y_true = df_['counts'].values
    if with_LD:
        X_ = df_[['DayofWeek','Hour','temp', 'wspd', 'prep', 'Is_Holiday','AR1','AR2','AR3','AR4']]
    else:
        X_ = df_[['DayofWeek','Hour','temp', 'wspd', 'prep', 'Is_Holiday']]
    crps_values = np.zeros(len(y_true))
    for i in tqdm(range(len(y_true))):
        predicted_values = pred_model.predict(X_.iloc[i].values.reshape(1,-1), quantiles=quantiles)[0]
        observed_value = y_true[i]
        cdf_function = empirical_cdf(predicted_values, quantiles)
        crps_values[i] = crps(cdf_function, observed_value)
    return crps_values

def CRPS_evaluation_per_grid_LDQRF(df_train, df_test, order_of_grids, 
                                   num_week = 4, model_name = 'LDQRF',
                                   quantiles=[0.1, 0.3, 0.5, 0.7, 0.9]):
    length = 20
    crps_train = np.zeros(length)
    crps_train_std = np.zeros(length)
    crps_test = np.zeros(length)
    crps_test_std = np.zeros(length)

    for i, grid in tqdm(enumerate(order_of_grids)):
        df_train_grid = df_train[df_train['OrZone'] == grid]
        df_test_grid = df_test[df_test['OrZone'] == grid]
        model = read_model(num_week = num_week, model_name=model_name,zone_name=grid)
        if 'LD' in model_name:
            crps_train_temp = CRPS(df_train_grid, model, quantiles, with_LD = True)
            crps_test_temp = CRPS(df_test_grid, model, quantiles, with_LD = True)
        else:
            crps_train_temp = CRPS(df_train_grid, model, quantiles, with_LD = False)
            crps_test_temp = CRPS(df_test_grid, model, quantiles, with_LD = False)

        crps_train[i] = np.mean(crps_train_temp)
        crps_train_std[i] = np.std(crps_train_temp)

        crps_test[i] = np.mean(crps_test_temp)
        crps_test_std[i] = np.std(crps_test_temp)
        
    print(f'Train: CRPS: {np.mean(crps_train):.3f}({np.std(crps_train):.3f})')
    print(f'Test: CRPS: {np.mean(crps_test):.3f}({np.std(crps_test):.3f})')
    
    return crps_train, crps_test, crps_train_std, crps_test_std

In [None]:
'LDQRF - 21 week training'

order_of_grids = [f'zone {i+1}' for i in range(20)]
quantiles_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
crps_train, crps_test, crps_train_std, crps_test_std = CRPS_evaluation_per_grid_LDQRF(df_train_21w, df_test, order_of_grids,
                                                                                    num_week = 21, model_name = 'LDQRF',
                                                                                    quantiles = quantiles_list)

np.save('Results_Saving_Case_1/CRPS/crps_train_21week_LDQRF.npy', crps_train)
np.save('Results_Saving_Case_1/CRPS/crps_test_21week_LDQRF.npy', crps_test)
np.save('Results_Saving_Case_1/CRPS/crps_train_std_21week_LDQRF.npy', crps_train_std)
np.save('Results_Saving_Case_1/CRPS/crps_test_std_21week_LDQRF.npy', crps_test_std)

In [None]:
'QRF - 21 week training'

order_of_grids = [f'zone {i+1}' for i in range(20)]
quantiles_list = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
crps_train, crps_test, crps_train_std, crps_test_std = CRPS_evaluation_per_grid_LDQRF(df_train_21w, df_test, order_of_grids,
                                                                                    num_week = 21, model_name = 'QRF',
                                                                                    quantiles = quantiles_list)


np.save('Results_Saving_Case_1/CRPS/crps_train_21week_QRF.npy', crps_train)
np.save('Results_Saving_Case_1/CRPS/crps_test_21week_QRF.npy', crps_test)
np.save('Results_Saving_Case_1/CRPS/crps_train_std_21week_QRF.npy', crps_train_std)
np.save('Results_Saving_Case_1/CRPS/crps_test_std_21week_QRF.npy', crps_test_std)

In [63]:
# create quantile predictions from the weekday-hour grouped data points
def weekday_hour_quantile(df, quantiles= [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]):
    df = df[['OrZone','DayofWeek', 'Hour','counts']]
    grouped = df.groupby(['OrZone','DayofWeek', 'Hour'])
    # Function to calculate quantiles
    def calculate_quantiles(group):
        return group['counts'].quantile(quantiles)
    # Apply the function to each group
    quantile_results = grouped.apply(calculate_quantiles).reset_index()
    quantile_dict = {}

    for index, row in quantile_results.iterrows():
        key = (row['OrZone'], row['DayofWeek'], row['Hour'])
        values = row[3:].values.astype(float)
        quantile_dict[key] = values
    return quantile_dict

# Example function to retrieve quantiles
def get_quantiles(quantile_dict, grid, day_of_week, hour):
    key = (grid, day_of_week, hour)
    return quantile_dict.get(key, np.array([]))  # Return an empty array if key not found

def CRPS_historical(df_, quantile_dict, quantiles):
    'continuous ranked probability score'
    def empirical_cdf(predicted_values, quantiles):
        # Construct empirical CDF from predicted quantiles
        def cdf(x):
            return np.interp(x, predicted_values, quantiles, left=0, right=1)
        return cdf
    # Define CRPS calculation
    def crps(cdf_function, observed_value):
        def integrand(z):
            return (cdf_function(z) - (z >= observed_value)) ** 2
        crps_value, _ = quad(integrand, -np.inf, np.inf)
        return crps_value
    
    y_true = df_['counts'].values
    X_ = df_[['OrZone', 'DayofWeek','Hour']]
    crps_values = np.zeros(len(y_true))
    for i in tqdm(range(len(y_true))):
        predicted_values = get_quantiles(quantile_dict, X_.iloc[i].OrZone, X_.iloc[i].DayofWeek, X_.iloc[i].Hour)
        observed_value = y_true[i]
        cdf_function = empirical_cdf(predicted_values, quantiles)
        temp = crps(cdf_function, observed_value)
        crps_values[i] = temp
    return crps_values

def CRPS_evaluation_per_grid_HA(df_train, df_test, order_of_grids, quantile_dict, quantiles=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]):
    length = 20
    crps_train = np.zeros(length)
    crps_train_std = np.zeros(length)
    crps_test = np.zeros(length)
    crps_test_std = np.zeros(length)

    for i, grid in tqdm(enumerate(order_of_grids)):
        df_train_grid = df_train[df_train['OrZone'] == grid]
        df_test_grid = df_test[df_test['OrZone'] == grid]
        # model = read_model(model_name='ARQRF',zone_name=grid)
        crps_train_temp = CRPS_historical(df_train_grid, quantile_dict, quantiles)
        crps_train[i] = np.mean(crps_train_temp)
        crps_train_std[i] = np.std(crps_train_temp)
        crps_test_temp = CRPS_historical(df_test_grid, quantile_dict, quantiles)
        crps_test[i] = np.mean(crps_test_temp)
        crps_test_std[i] = np.std(crps_test_temp)

    print(f'Train: CRPS: {np.mean(crps_train):.3f}({np.std(crps_train):.3f})')
    print(f'Test: CRPS: {np.mean(crps_test):.3f}({np.std(crps_test):.3f})')
    
    return crps_train, crps_test, crps_train_std, crps_test_std

In [None]:
'Historical: 21 week - Day-of-a-week, Hour-of-a-day Quantile Prediction'

quantile_dict = weekday_hour_quantile(df_train_21w, quantiles_list)
crps_train, crps_test, crps_train_std, crps_test_std = CRPS_evaluation_per_grid_HA(df_train_21w, df_test, order_of_grids, 
                                                                                   quantile_dict, quantiles_list)


np.save('Results_Saving_Case_1/CRPS/CRPS_train_historical_21week.npy', crps_train)
np.save('Results_Saving_Case_1/CRPS/CRPS_test_historical_21week.npy', crps_test)
np.save('Results_Saving_Case_1/CRPS/CRPS_train_std_historical_21week.npy', crps_train_std)
np.save('Results_Saving_Case_1/CRPS/CRPS_test_std_historical_21week.npy', crps_test_std)

In [None]:
crps_train_LDQRF = np.load('Results_Saving_Case_1/CRPS/CRPS_train.npy') 
crps_test_LDQRF = np.load('Results_Saving_Case_1/CRPS/CRPS_test.npy')
crps_peak_LDQRF = np.load('Results_Saving_Case_1/CRPS/CRPS_peak.npy')
crps_train_std_LDQRF = np.load('Results_Saving_Case_1/CRPS/CRPS_train_std.npy')
crps_test_std_LDQRF = np.load('Results_Saving_Case_1/CRPS/CRPS_test_std.npy')
crps_peak_std_LDQRF = np.load('Results_Saving_Case_1/CRPS/CRPS_peak_std.npy')

crps_train_HA = np.load('Results_Saving_Case_1/CRPS/CRPS_train_historical.npy')
crps_test_HA = np.load('Results_Saving_Case_1/CRPS/CRPS_test_historical.npy')
crps_peak_HA = np.load('Results_Saving_Case_1/CRPS/CRPS_peak_historical.npy')
crps_train_std_HA = np.load('Results_Saving_Case_1/CRPS/CRPS_train_std_historical.npy')
crps_test_std_HA = np.load('Results_Saving_Case_1/CRPS/CRPS_test_std_historical.npy')
crps_peak_std_HA = np.load('Results_Saving_Case_1/CRPS/CRPS_peak_std_historical.npy')

evas = [crps_train_LDQRF, crps_train_HA, crps_test_LDQRF, crps_test_HA]
stds = [crps_train_std_LDQRF, crps_train_std_HA, crps_test_std_LDQRF, crps_test_std_HA]
test_names = [f'LDQRF: {np.mean(crps_train_LDQRF):.3f} ({np.mean(crps_train_std_LDQRF):.3f})', 
              f'Historical: {np.mean(crps_train_HA):.3f} ({np.mean(crps_train_std_HA):.3f})', 
              f'LDQRF: {np.mean(crps_test_LDQRF):.3f} ({np.mean(crps_test_std_LDQRF):.3f})',  
              f'Historical: {np.mean(crps_test_HA):.3f} ( {np.mean(crps_test_std_HA):.3f})']
zone_names = [f'{i+1}' for i in range(25)]
colors = ['darkmagenta', 'orchid', 'darkgreen',  'mediumseagreen']

fig, axs = plt.subplots(2, 1, figsize=(9, 6))  # 1 row, 2 columns

# Plot for training set
for i, name in enumerate(test_names[:2]):  # Only take the first two elements for training
    axs[0].plot(zone_names, evas[i], 'X-', label=test_names[i], color=colors[i])

axs[0].set_xlabel("Pick-up Zone", fontsize=14)
axs[0].set_ylabel("CRPS", fontsize=14)
axs[0].tick_params(labelsize=13)
axs[0].legend(fontsize=12, ncol=1)
axs[0].set_title("CPRS measured over the training set of case: Delivery Hero")

# Plot for testing set
for i, name in enumerate(test_names[2:]):  # Only take the last two elements for testing
    axs[1].plot(zone_names, evas[i+2], 'X-', label=test_names[i+2], color=colors[i+2])

axs[1].set_xlabel("Pick-up Zone", fontsize=14)
axs[1].set_ylabel("CRPS", fontsize=14)
axs[1].tick_params(labelsize=13)
axs[1].legend(fontsize=12, ncol=1)
axs[1].set_title("CPRS measured over the testing set of case: Delivery Hero")

plt.tight_layout()
plt.show()

## Regular models

In [None]:
all_TBATS_models = {}
all_RF_models = {}
all_ARRF_models = {}
all_XGB_models = {}
all_ARXGB_models = {}
all_QRF_models = {}
all_LDQRF_models = {}

# read trained models - model trained by 21-week data
for i in range(20):
    zone_name = f'zone {i+1}'
    all_TBATS_models[zone_name] = joblib.load(f'Case_1_Models/TBATS/{zone_name}.sav')
    all_RF_models[zone_name] = joblib.load(f'Case_1_Models/week_21/RF_{zone_name}.joblib')
    all_ARRF_models[zone_name]= joblib.load(f'Case_1_Models/week_21/LDRF_{zone_name}.joblib')
    all_XGB_models[zone_name] = joblib.load(f'Case_1_Models/week_21/XGB_{zone_name}.joblib')
    all_ARXGB_models[zone_name] = joblib.load(f'Case_1_Models/week_21/LDXGB_{zone_name}.joblib')
    all_QRF_models[zone_name] = joblib.load(f'Case_1_Models/week_21/QRF_{zone_name}.joblib')
    all_LDQRF_models[zone_name] = joblib.load(f'Case_1_Models/week_21/LDQRF_{zone_name}.joblib')

## Create predicted attributes for clustering

In [13]:
def get_pred(X_test, predictor, model=None, q_=None, HA_dict=None):
    if predictor == 'QRF':
        X_test = X_test.drop(['AR1', 'AR2', 'AR3', 'AR4'], axis=1)
        y_pred = model.predict(X_test, quantiles=q_)
    elif predictor == 'LDQRF':
        y_pred = model.predict(X_test, quantiles=q_)
    elif predictor == 'RF':
        X_test = X_test.drop(['AR1', 'AR2', 'AR3', 'AR4'], axis=1)
        y_pred = model.predict(X_test)
    elif predictor == 'LDRF':
        y_pred = model.predict(X_test)
    elif predictor == 'RF':
        X_test = X_test.drop(['AR1', 'AR2', 'AR3', 'AR4'], axis=1)
        y_pred = model.predict(X_test)
    elif predictor == 'LDXGB':
        y_pred = model.predict(X_test)
    elif predictor == 'XGB':
        X_test = X_test.drop(['AR1', 'AR2', 'AR3', 'AR4'], axis=1)
        y_pred = model.predict(X_test)
    elif predictor == 'TBATS':
        y_pred = model.forecast(steps=len(X_test))
    elif predictor == 'HA':
        X_test['wdhr'] = [(x, y) for x, y in zip(X_test['DayofWeek'], X_test['Hour'])]
        X_test['HA'] = X_test.wdhr.map(HA_dict)
        y_pred = X_test['HA'].values
    elif predictor == 'Myopic':
        y_pred = X_test['AR1'].values
    return np.array(y_pred)

In [7]:
'evaluate deterministic prediction performance of models'

# note that a good benchmark will be historical average according to the zone's weekday and hour
def forecasting_per_grid(df_test, dict_of_predictors={}, predictor='myopic',quantile_ = 0.5, list_of_ha_dict=None, print_ = True):
    length = 20
    mae_test, rmse_test, rmsle_test, resid_std_test = np.zeros(length),  np.zeros(length), np.zeros(length), np.zeros(length)
    QRF_names = ['QRF','LDQRF']
    list_of_data = []
    for i in range(20):
        grid = f'zone {i+1}'
        df_test_grid = df_test[df_test['OrZone'] == grid]
        X_test = df_test_grid[['DayofWeek', 'Hour', 'temp', 'wspd', 'prep', 'Is_Holiday', 'AR1', 'AR2', 'AR3', 'AR4']]
        y_test = df_test_grid['counts'].values
        if predictor in QRF_names:
            model = dict_of_predictors[grid]
            pred_test = get_pred(X_test, predictor, model=model, q_=quantile_, HA_dict=None)
            df_test_grid[f'{predictor}_{quantile_}'] = pred_test
        elif predictor == 'HA':
            pred_test = get_pred(X_test, predictor, model=None, q_=None, HA_dict=list_of_ha_dict[i])
            df_test_grid[predictor] = pred_test
        elif predictor == 'Myopic':
            pred_test = get_pred(X_test, predictor, model=None, q_=None, HA_dict=None)
            df_test_grid[predictor] = pred_test
        else:
            model = dict_of_predictors[grid]
            pred_test = get_pred(X_test, predictor, model=model, q_=None, HA_dict=None)
            df_test_grid[predictor] = pred_test
        if print_:
            mae_test[i], rmse_test[i], rmsle_test[i], resid_std_test[i] = prediction_evaluate(pred_test, y_test)
        list_of_data.append(df_test_grid)

    if print_:
        print(f'Test: MAE: {np.mean(mae_test):.3f}({np.std(mae_test):.3f}), \n RMSE: {np.mean(rmse_test):.3f}({np.std(rmse_test):.3f}), \n RMSLE: {np.mean(rmsle_test):.3f}({np.std(rmsle_test):.3f}), \n Residual Std: {np.mean(resid_std_test):.3f}({np.std(resid_std_test):.3f})')
        print('\n')
    
    # create new dataframe from list_of_data
    df_test = pd.concat(list_of_data)
    # re-arrange rows based on date, hour, quarter, then OrZone
    df_test = df_test.sort_values(by=['date', 'Hour', 'Quarter', 'OrZone'])
    return df_test

## predictions

In [None]:
length = 20
mae_test, rmse_test, rmsle_test, resid_std_test = np.zeros(length),  np.zeros(length), np.zeros(length), np.zeros(length)
for i in range(length):
    grid = f'zone {i+1}'
    df_test_grid = df_train_21w[df_train_21w['OrZone'] == grid]
    X_ = df_test_grid[['DayofWeek', 'Hour', 'temp', 'wspd', 'prep', 'Is_Holiday', 'AR1', 'AR2', 'AR3', 'AR4']]
    y_ = df_test_grid['counts'].values
    pred_test = get_pred(X_, 'HA', model=None, q_=None, HA_dict=list_of_ha_dict_21[i])
    mae_test[i], rmse_test[i], rmsle_test[i], resid_std_test[i] = prediction_evaluate(pred_test, y_)
print(f'Training (21W): MAE: {np.mean(mae_test):.3f}({np.std(mae_test):.3f}), \n RMSE: {np.mean(rmse_test):.3f}({np.std(rmse_test):.3f}), \n RMSLE: {np.mean(rmsle_test):.3f}({np.std(rmsle_test):.3f}), \n Residual Std: {np.mean(resid_std_test):.3f}({np.std(resid_std_test):.3f})')
print('\n')


length = 20
mae_test, rmse_test, rmsle_test, resid_std_test = np.zeros(length),  np.zeros(length), np.zeros(length), np.zeros(length)
for i in range(length):
    grid = f'zone {i+1}'
    df_test_grid = df_test[df_test['OrZone'] == grid]
    X_ = df_test_grid[['DayofWeek', 'Hour', 'temp', 'wspd', 'prep', 'Is_Holiday', 'AR1', 'AR2', 'AR3', 'AR4']]
    y_ = df_test_grid['counts'].values
    pred_test = get_pred(X_, 'HA', model=None, q_=None, HA_dict=list_of_ha_dict_21[i])
    mae_test[i], rmse_test[i], rmsle_test[i], resid_std_test[i] = prediction_evaluate(pred_test, y_)
print(f'Testing (21W): MAE: {np.mean(mae_test):.3f}({np.std(mae_test):.3f}), \n RMSE: {np.mean(rmse_test):.3f}({np.std(rmse_test):.3f}), \n RMSLE: {np.mean(rmsle_test):.3f}({np.std(rmsle_test):.3f}), \n Residual Std: {np.mean(resid_std_test):.3f}({np.std(resid_std_test):.3f})')
print('\n')


Training (21W): MAE: 0.851(0.290), 
 RMSE: 1.209(0.382), 
 RMSLE: 0.431(0.262), 
 Residual Std: 1.209(0.382)


Testing (21W): MAE: 0.837(0.269), 
 RMSE: 1.159(0.337), 
 RMSLE: 0.395(0.229), 
 Residual Std: 1.148(0.339)




In [None]:
'weekend dinner time'
dinner_hours = [17, 18, 19, 20]
df_test_dinner = df_test[(df_test['DayofWeek'] >= 5) & (df_test['Hour'].isin(dinner_hours))]
dict_of_predictors = all_ARRF_models

length = 20
mae_test, rmse_test, rmsle_test, resid_std_test = np.zeros(length),  np.zeros(length), np.zeros(length), np.zeros(length)
for i in range(length):
    grid = f'zone {i+1}'
    df_test_grid = df_test_dinner[df_test_dinner['OrZone'] == grid]
    X_ = df_test_grid[['DayofWeek', 'Hour', 'temp', 'wspd', 'prep', 'Is_Holiday', 'AR1', 'AR2', 'AR3', 'AR4']]
    y_ = df_test_grid['counts'].values
    model = dict_of_predictors[grid]
    pred_test = get_pred(X_, 'LDRF', model=model, q_=None, HA_dict=list_of_ha_dict_21[i])
    mae_test[i], rmse_test[i], rmsle_test[i], resid_std_test[i] = prediction_evaluate(pred_test, y_)
print(f'Testing (21W): MAE: {np.mean(mae_test):.3f}({np.std(mae_test):.3f}), \n RMSE: {np.mean(rmse_test):.3f}({np.std(rmse_test):.3f}), \n RMSLE: {np.mean(rmsle_test):.3f}({np.std(rmsle_test):.3f}), \n Residual Std: {np.mean(resid_std_test):.3f}({np.std(resid_std_test):.3f})')
print('\n')


Testing (4W): MAE: 1.290(0.412), 
 RMSE: 1.634(0.520), 
 RMSLE: 0.556(0.346), 
 Residual Std: 1.607(0.517)


Testing (21W): MAE: 1.290(0.412), 
 RMSE: 1.634(0.520), 
 RMSLE: 0.556(0.346), 
 Residual Std: 1.607(0.517)




In [None]:
length = 20
mae_test, rmse_test, rmsle_test, resid_std_test = np.zeros(length),  np.zeros(length), np.zeros(length), np.zeros(length)
for i in range(length):
    grid = f'zone {i+1}'
    df_test_grid = df_train_21w[df_train_21w['OrZone'] == grid]
    X_ = df_test_grid[['DayofWeek', 'Hour', 'temp', 'wspd', 'prep', 'Is_Holiday', 'AR1', 'AR2', 'AR3', 'AR4']]
    y_ = df_test_grid['counts'].values
    pred_test = get_pred(X_, 'Myopic', model=None, q_=None, HA_dict=None)
    mae_test[i], rmse_test[i], rmsle_test[i], resid_std_test[i] = prediction_evaluate(pred_test, y_)
print(f'Training (21W): MAE: {np.mean(mae_test):.3f}({np.std(mae_test):.3f}), \n RMSE: {np.mean(rmse_test):.3f}({np.std(rmse_test):.3f}), \n RMSLE: {np.mean(rmsle_test):.3f}({np.std(rmsle_test):.3f}), \n Residual Std: {np.mean(resid_std_test):.3f}({np.std(resid_std_test):.3f})')
print('\n')

In [None]:
df_15min_ = forecasting_per_grid(df_15min, dict_of_predictors=None, 
                                predictor='HA',quantile_ = None, list_of_ha_dict=list_of_ha_dict_21, print_ = True)

In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=None, 
                                predictor='Myopic',quantile_ = None, list_of_ha_dict=None, print_ = True)


In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_TBATS_models, 
                                predictor='TBATS',quantile_ = None, list_of_ha_dict=None, print_ = True)

In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_ARXGB_models, 
                                predictor='LDXGB',quantile_ = None, list_of_ha_dict=None, print_ = True)

In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_ARRF_models, 
                                predictor='LDRF',quantile_ = None, list_of_ha_dict=None, print_ = True)

In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_LDQRF_models, 
                                predictor='LDQRF',quantile_ = 0.10, list_of_ha_dict=None, print_ = False)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_LDQRF_models, 
                                predictor='LDQRF',quantile_ = 0.25, list_of_ha_dict=None, print_ = False)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_LDQRF_models, 
                                predictor='LDQRF',quantile_ = 0.5, list_of_ha_dict=None, print_ = True)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_LDQRF_models, 
                                predictor='LDQRF',quantile_ = 0.75, list_of_ha_dict=None, print_ = False)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_LDQRF_models, 
                                predictor='LDQRF',quantile_ = 0.90, list_of_ha_dict=None, print_ = False)

In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_XGB_models, 
                                predictor='XGB',quantile_ = None, list_of_ha_dict=None, print_ = True)

In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_RF_models, 
                                predictor='RF',quantile_ = None, list_of_ha_dict=None, print_ = True)

In [None]:
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_QRF_models, 
                                predictor='QRF',quantile_ = 0.10, list_of_ha_dict=None, print_ = False)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_QRF_models, 
                                predictor='QRF',quantile_ = 0.25, list_of_ha_dict=None, print_ = False)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_QRF_models, 
                                predictor='QRF',quantile_ = 0.5, list_of_ha_dict=None, print_ = True)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_QRF_models, 
                                predictor='QRF',quantile_ = 0.75, list_of_ha_dict=None, print_ = False)
df_15min_ = forecasting_per_grid(df_15min_, dict_of_predictors=all_QRF_models, 
                                predictor='QRF',quantile_ = 0.90, list_of_ha_dict=None, print_ = False)

In [None]:
df_15min_.to_csv('Data/case 1/df_test_21week.csv', index=False)

# Attributes

In [None]:
df_15min_ = pd.read_csv('Data/case 1/df_test_21week.csv')

In [7]:
df_15min_.columns

Index(['date', 'DayofWeek', 'Hour', 'Quarter', 'OrZone', 'temp', 'wspd',
       'prep', 'Is_Holiday', 'AR1', 'AR2', 'AR3', 'AR4', 'counts', 'zone_id',
       'HA', 'TBATS', 'LDXGB', 'LDRF', 'LDQRF_0.1', 'LDQRF_0.25', 'LDQRF_0.5',
       'LDQRF_0.75', 'LDQRF_0.9', 'XGB', 'RF', 'QRF_0.1', 'QRF_0.25',
       'QRF_0.5', 'QRF_0.75', 'QRF_0.9'],
      dtype='object')

In [9]:
'return the list of indices of rows that we will discard from df_15min_'
# Define the condition: for example, where column 'A' is greater than 20
condition_1 = (df_15min_['date'] == '2020-09-07')
condition_2 = (df_15min_['Hour'] == 10) & (df_15min_['Quarter'] == 3)

# Get list of indices where the condition is met
indices_1 = df_15min_.index[condition_1].tolist()
indices_2 = df_15min_.index[condition_2].tolist()
indices = indices_1 + indices_2

In [11]:
df_lst = []
for i in range(20):
    df_temp = df_15min_[df_15min_['OrZone'] == f'zone {i+1}']
    # print(len(df_temp))
    pred_SARIMA = np.load(f'Results_Saving_Case_1/SARIMA/zone {i+1}.npy')
    # print(len(pred_SARIMA))
    pred_SARIMAX = np.load(f'Results_Saving_Case_1/SARIMAX/zone {i+1}.npy')
    df_temp['SARIMA'] = pred_SARIMA
    df_temp['SARIMAX'] = pred_SARIMAX
    df_lst.append(df_temp)
df_15min_ = pd.concat(df_lst)

In [12]:
df_15min_.head(2)

Unnamed: 0,date,DayofWeek,Hour,Quarter,OrZone,temp,wspd,prep,Is_Holiday,AR1,...,LDQRF_0.9,XGB,RF,QRF_0.1,QRF_0.25,QRF_0.5,QRF_0.75,QRF_0.9,SARIMA,SARIMAX
0,2020-09-07,0,21,1,zone 1,154,40,0,0,0.0,...,2.0,0.575611,0.664446,0.0,0.0,1.0,1.0,2.0,0.758285,0.573147
20,2020-09-07,0,21,2,zone 1,154,40,0,0,1.0,...,2.0,0.575611,0.664446,0.0,0.0,1.0,1.0,2.0,0.600581,0.47711


In [None]:
def create_attributes_list(df_, features_):
    # create a list of sub-dataframes for each zone_id
    zone_dfs = []
    for i in range(20):
        # TODO: ALLERT: ZONE_DF order is NOW OrZone order
        temp = df_[df_.OrZone == f'zone {i+1}']
        # temp = df_[df_.zone_id == i]
        zone_dfs.append(temp[features_])
    attributes_biglist = []
    for j in range(len(zone_dfs[0])): # time
        attributes_ = []
        for z in range(len(zone_dfs)): # zone id
            attributes_.append(zone_dfs[z].iloc[j].values)
        attributes_biglist.append(attributes_) # list of T sublist: each sublist is from grid 1 - 20
        # shape of attributes_biglist: (T, 20 * num_features)
    return attributes_biglist

In [None]:
attributes_real_demand = create_attributes_list(df_15min_, ['counts'])
attributes_myopic = create_attributes_list(df_15min_, ['AR1'])
attributes_HA = create_attributes_list(df_15min_, ['HA'])
attributes_TBATS = create_attributes_list(df_15min_, ['TBATS'])
attributes_LDRF = create_attributes_list(df_15min_, ['LDRF'])
attributes_RF = create_attributes_list(df_15min_, ['RF'])
attributes_LDXGB = create_attributes_list(df_15min_, ['LDXGB'])
attributes_XGB = create_attributes_list(df_15min_, ['XGB'])
attributes_LDQRF = create_attributes_list(df_15min_, ['LDQRF_0.25', 'LDQRF_0.5', 'LDQRF_0.75'])
attributes_QRF = create_attributes_list(df_15min_, [ 'QRF_0.25', 'QRF_0.5', 'QRF_0.75'])
attributes_SARIMA = create_attributes_list(df_15min_, ['SARIMA'])
attributes_SARIMAX = create_attributes_list(df_15min_, ['SARIMAX'])

In [None]:
attributes_QRF_median = create_attributes_list(df_15min_, ['QRF_0.5'])
attributes_LDQRF_median = create_attributes_list(df_15min_, ['LDQRF_0.5'])

In [15]:
'layer 1: from t1 to tN, layer 2: demand val from zone 1 to zone 20, layer 3: demand val from each zone'
print('layer 1', len(attributes_real_demand))
print('layer 2', len(attributes_real_demand[0]))
print('layer 3', len(attributes_real_demand[0][0]))


layer 1 308
layer 2 20
layer 3 1


# Load Pre-saved predictions, cont here

In [16]:
@lru_cache(maxsize=None)
def _load_pred_dict(path: str):
    with open(path, 'rb') as f:
        return pickle.load(f)

def create_attributes_array_savedpred(model_name, test_length, 
                                      quantile_switch = False,
                                      quantiles=(25, 75), 
                                      base_dir="Predictions/1_week_predictions",
                                      zone_order=None, dtype=np.float32, to_list=False):
    """
    Returns array of shape (T, Z, P) where:
        T = test_length
        Z = number of zones (e.g. 20)
        P = number of prediction variants (1 + len(quantiles) for quantile models)
    """
    paths = []
    if model_name in {'QRF', 'LDQRF'}:
        paths.append(f"{base_dir}/{model_name}_pred_test.pkl")          # mean (or main) prediction
        if quantile_switch:
            for q in quantiles:
                paths.append(f"{base_dir}/{model_name}_{q}_pred_test.pkl")  # quantile predictions
    else:
        paths.append(f"{base_dir}/{model_name}_pred_test.pkl")

    # Load all prediction dicts (cached) – each: {zone_name: array_like (len >= test_length)}
    pred_dicts = [_load_pred_dict(p) for p in paths]

    # Establish consistent zone order
    if zone_order is None:
        # Use intersection order from first dict; sort for determinism
        zone_order = [f'zone {i}' for i in range(1, 21)]

    Z = len(zone_order)
    P = len(pred_dicts)
    T = test_length

    # Preallocate (P, Z, T)
    cube = np.empty((P, Z, T), dtype=dtype)
    for p_idx, dct in enumerate(pred_dicts):
        # Optionally validate keys once
        # Fill per zone (vectorized over time)
        for z_idx, z in enumerate(zone_order):
            arr = np.asarray(dct[z])
            if arr.shape[0] < T:
                raise ValueError(f"Zone {z} has only {arr.shape[0]} preds (< {T}).")
            cube[p_idx, z_idx, :] = arr[:T]

    # Reorder axes to (T, Z, P)
    attributes = np.transpose(cube, (2, 1, 0)) # (T, Z, P)

    if to_list:
        return attributes.tolist(), zone_order
    return attributes, zone_order

In [17]:
attributes_LSTM, _ = create_attributes_array_savedpred('LSTM', 307, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

first_row = attributes_LSTM[0:1].copy()          # shape (1, 20, 1)
attributes_LSTM = np.concatenate([first_row, attributes_LSTM], axis=0)

attributes_LSTM.shape

(308, 20, 1)

In [None]:
attributes_SARIMA, _ = create_attributes_array_savedpred('SARIMA', 308, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

attributes_SARIMAX, _ = create_attributes_array_savedpred('SARIMAX', 308, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

attributes_TBATS, zone_order = create_attributes_array_savedpred('TBATS', 308,
                            base_dir="Predictions/1_week_predictions",
                            zone_order=None, dtype=np.float32, to_list=False)

# attributes_LSTM, _ = create_attributes_array_savedpred('LSTM', 308, quantiles=(25, 75),
#                         base_dir="Predictions/1_week_predictions",
#                         zone_order=None, dtype=np.float32, to_list=False)

attributes_RF, _ = create_attributes_array_savedpred('RF', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

attributes_LDRF, _ = create_attributes_array_savedpred('LDRF', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

attributes_XGB, _ = create_attributes_array_savedpred('XGB', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

attributes_LDXGB, _ = create_attributes_array_savedpred('LDXGB', 308, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

attributes_QRF_median, _ = create_attributes_array_savedpred('QRF', 308, quantiles=(25,75),
                                                             base_dir="Predictions/1_week_predictions",
                                                             zone_order=None, dtype=np.float32, to_list=False)

attributes_LDQRF_median, _ = create_attributes_array_savedpred('LDQRF', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)


In [None]:
attributes_LDQRF, _ = create_attributes_array_savedpred('LDQRF', 308, quantiles=(25, 75),
                                                     quantile_switch = True,
                                                    base_dir="Predictions/1_week_predictions",
                                                    zone_order=None, dtype=np.float32, to_list=False)

attributes_QRF, _ = create_attributes_array_savedpred('QRF', 308, quantiles=(25, 75),quantile_switch = True,
                        base_dir="Predictions/1_week_predictions",
                        zone_order=None, dtype=np.float32, to_list=False)

In [52]:
attributes_LDQRF_median.shape

(308, 20, 1)

In [53]:
attributes_QRF.shape

(308, 20, 3)

In [47]:
median_df = pd.DataFrame()

# Hierarchical Clustering

In [None]:
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.sparse import csr_matrix

def calculate_last_merge_distance(clustering, distance_matrix, num_current_cluster_):
  n_samples = distance_matrix.shape[0]
  # Retrieve the children attribute
  children = clustering.children_

  # Function to compute average linkage distance
  def compute_avg_linkage_dist(i, j, dist_matrix):
      cluster_i_points = get_cluster_points(i, children, n_samples)
      cluster_j_points = get_cluster_points(j, children, n_samples)
      return np.mean([dist_matrix[p1, p2] for p1 in cluster_i_points for p2 in cluster_j_points])

  # Helper function to get all points in a cluster
  def get_cluster_points(cluster_idx, children, n_samples):
      if cluster_idx < n_samples:
          return [cluster_idx]
      else:
          cluster = []
          for child in children[cluster_idx - n_samples]:
              cluster.extend(get_cluster_points(child, children, n_samples))
          return cluster

  # Compute the distances of each merge
  merge_distances = []
  for i, (child1, child2) in enumerate(children):
      dist = compute_avg_linkage_dist(child1, child2, distance_matrix)
      merge_distances.append(dist)

  # Get the distance of the last merge before termination
  if merge_distances:
      last_merge_distance = merge_distances[-(num_current_cluster_ - 1)]
      return last_merge_distance
  else:
      return None

# network of pickup zones
def create_zone_graph(edges, num_nodes=20):
    G = nx.Graph()
    for node in range(num_nodes):
        G.add_node(node)
    for edge in edges:
        G.add_edge(*edge)
    return G

def get_neighbors(G_general):
    return {node: list(G_general.neighbors(node)) for node in G_general.nodes}

def create_cluster_graph(G_zone, cluster_list):
    # cluster_list: [zone ji] for all zones i belong to cluster j
    G_cluster = nx.Graph()
    for j in range(len(cluster_list)):
        G_cluster.add_node(j)
    for i, cluster1 in enumerate(cluster_list):
        for j, cluster2 in enumerate(cluster_list):
            if cluster1 != cluster2 and any(G_zone.has_edge(node1, node2) for node1 in cluster1 for node2 in cluster2):
                G_cluster.add_edge(i, j)
    return G_cluster

def get_inner_neighbors(num_zones, cluster_list):
    # cluster_list: [zone ji] for all zones i belong to cluster j
    inner_neighbors = {}
    for i in range(num_zones):
        for cluster in cluster_list:
            if i in cluster:
                inner_neighbors[i] = cluster.copy()
                inner_neighbors[i].remove(i)
    return inner_neighbors

def get_outer_neighbors(num_zones, max_cluster_size, cluster_list, G_cluster):
    cluster_outer_neighbor_dict = get_neighbors(G_cluster)
    zone_outer_neighbor_dict = {}

    # Create a new list that only includes clusters with size less than or equal to max_cluster_size
    valid_clusters = [cluster for cluster in cluster_list if len(cluster) < max_cluster_size]

    # Update cluster_outer_neighbor_dict to only include valid clusters
    cluster_outer_neighbor_dict = {i: cluster_outer_neighbor_dict[i] for i in range(len(valid_clusters))}

    for i in range(num_zones):
        neighbors_ = []
        for c_id, cluster in enumerate(valid_clusters):
            if i in cluster:
                cluster_neighbors = cluster_outer_neighbor_dict[c_id]
                for nb_cluster_id in cluster_neighbors:
                    if nb_cluster_id in cluster_outer_neighbor_dict.keys():
                        if len(valid_clusters[nb_cluster_id]) + len(cluster) <= max_cluster_size:
                            neighbors_ += valid_clusters[nb_cluster_id]
        zone_outer_neighbor_dict[i] = list(set(neighbors_))

    return zone_outer_neighbor_dict

def create_linkage_matrix(num_zones, zone_outer_neighbor_dict, zone_inner_neighbor_dict):
    # create the potential linkage matrix
    potential_linkages = np.zeros((num_zones, num_zones))
    for zone, outer_neighbors in zone_outer_neighbor_dict.items():
        for outer_neighbor in outer_neighbors:
            potential_linkages[zone][outer_neighbor] = 1
    assert np.allclose(potential_linkages, potential_linkages.T, rtol=1e-05, atol=1e-08)
    assert np.allclose(np.diag(potential_linkages), np.zeros(num_zones), rtol=1e-05, atol=1e-08)

    # create the must linkage matrix
    must_linkages = np.zeros((num_zones, num_zones))
    for zone, inner_neighbors in zone_inner_neighbor_dict.items():
        for inner_neighbor in inner_neighbors:
            must_linkages[zone][inner_neighbor] = 1
    # the diagonal of the must linkage matrix should be zero, and the matrix should be symmetric
    assert np.allclose(must_linkages, must_linkages.T, rtol=1e-05, atol=1e-08)
    assert np.allclose(np.diag(must_linkages), np.zeros(num_zones), rtol=1e-05, atol=1e-08)

    return potential_linkages, must_linkages

# TODO: implement the weighted Euclidean distance in main code
def weighted_euclidean_distance(v1, v2, w=np.array([1,1,1])):
    # Calculate the weighted Euclidean distance
    diff = v1 - v2
    weighted_diff = w * (diff ** 2)
    weighted_distance = np.sqrt(np.sum(weighted_diff))
    return weighted_distance

def get_distance_metrix(num_zones, attributes_, potential_linkages, must_linkages, attribute_type = 'single'):
    # attributes_ = df[feature_list], a 2D array of the features of all zones, for specific time step t
    # attributes_[i]:  the feature vector of zone i
    distance_matrix = np.zeros((num_zones, num_zones))
    for i in range(num_zones):
        for j in range(num_zones):
            if i != j:
                if attribute_type == 'single':
                    distance_matrix[i, j] = np.linalg.norm(attributes_[i] - attributes_[j])
                elif attribute_type == 'multiple':
                    # distance_matrix[i, j] = weighted_euclidean_distance(attributes_[i], attributes_[j], w=np.array([1,2,1]))
                    distance_matrix[i, j] = weighted_euclidean_distance(attributes_[i], attributes_[j], w=np.array([1,1,1]))
                if potential_linkages[i, j] == 0:
                    # if two zones are impossible to link, set their distance to a large number
                    distance_matrix[i, j] = 99
                if must_linkages[i, j] == 1:
                    # if two zones must be linked, set their distance to zero
                    distance_matrix[i, j] = 0
    
    # Ensure the distance matrix is symmetric
    distance_matrix = np.minimum(distance_matrix, distance_matrix.T)

    return distance_matrix

def contingency_constrained_hierarchical_clustering(attributes,
                                                    zone_edges,
                                                    zone_actual_demand, 
                                                    zone_pred_demand,
                                                    num_zones = 20,
                                                    ultimate_num_clusters = 5, 
                                                    max_cluster_size = 6, 
                                                    distance_threshold_=9,
                                                    distance_measure = 'ward',
                                                    print_ = False,
                                                    attribute_type = 'single'):
    '''
    This function generates the clustering of the zones based on their predicted demand values,
    by the contingency constrained hierarchical clustering process

    Inputs: 
    df: the dataframe containing the predicted and previous demand values of all pick-up zones, for each time step t
    feature_list: the list of features that will be used as criterion of clustering

    Constraints:
    max_num_clusters: the maximum number of clusters that can be generated, default = 5
    min_num_clusters: the minimum number of clusters that can be generated, default = 2
    max_cluster_size: the maximum number of zones that can be in a cluster, default = 9
    min_cluster_size: the minimum number of zones that can be in a cluster, default = 1
    (contingency constraints are updated in the linkage matrix, during the clustering process)
    distance_measure: the distance measure used in the clustering process, default = 'ward', we should perform sensitivity analysis over this parameter

    Outputs:
    clustering.labels_: the cluster label of each zone
    cluster_demand_actual: the average demand value of each cluster, calculated using actual demand values, based on the clustering result over predicted attributes
    cluster_demand_pred: the average predicted demand value of each cluster, calculated using predicted demand values, based on the clustering result over predicted attributes
    '''
    # initialize helper variables
    cluster_list= [[i] for i in range(num_zones)] # [zone ji] for all zones i belong to cluster j
    # print(cluster_list)
    cluster_sizes = np.ones(num_zones)
    # temp_max_cluster_size = np.max(cluster_sizes)
    last_merge_distance_ = 99

    # intialize the network of zones & clusters using networkx
    G_zone = create_zone_graph(zone_edges)
    # zone_geoconnected_dict = get_neighbors(G_zone)

    # Perform hierarchical clustering: 
    num_current_cluster_ = num_zones
    violations_ = False
    
    while num_current_cluster_ > ultimate_num_clusters and violations_ == False:
        num_current_cluster_ -= 1
        # print(f'Current number of clusters: {num_current_cluster_}')
        # create the cluster graph
        G_cluster = create_cluster_graph(G_zone, cluster_list)
        zone_outer_neighbor_dict = get_outer_neighbors(num_zones, max_cluster_size, cluster_list, G_cluster) # this is used to update the potential linkage matrix
        zone_inner_neighbor_dict = get_inner_neighbors(num_zones, cluster_list) # this is used to update the must linkage matrix

        # how to create the linkage matrix from zone_outer_neighbor_dict and zone_inner_neighbor_dict
        potential_linkages, must_linkages = create_linkage_matrix(num_zones, zone_outer_neighbor_dict, zone_inner_neighbor_dict)

        # Modify the distance matrix to enforce can-link constraints
        distance_matrix = get_distance_metrix(num_zones, attributes, potential_linkages, must_linkages, attribute_type)
        # print('--The next smallest distance in line', linkage(distance_matrix, method='average'))
        
        clustering = AgglomerativeClustering(
            n_clusters=num_current_cluster_,
            affinity='precomputed', # we use the precomputed distance matrix
            linkage=distance_measure,
            compute_full_tree=False
        )

        # Fit the clustering
        clustering.fit(distance_matrix)

        # size of each cluster after the clustering
        cluster_sizes = np.zeros(num_zones)
        for i in range(num_zones):
            cluster_sizes[clustering.labels_[i]] += 1

        # cluster similarity threshold via last merge distance
        last_merge_distance = calculate_last_merge_distance(clustering, distance_matrix, num_current_cluster_)
        # if last_merge_distance > 10:
        #     print('last merge distance', last_merge_distance)
        #     print(cluster_sizes)
        if last_merge_distance > distance_threshold_: 
            violations_ = True

        # create cluster list, which is a 2D list of zones in each cluster
        cluster_list = [[] for _ in range(num_current_cluster_)]
        for i in range(num_zones):
            cluster_list[clustering.labels_[i]].append(i)

        # check whether zones that do not included by zone_inner_neighbor_dict and zone_outer_neighbor_dict are in the same cluster
        for i in range(num_zones):
            if violations_ == False:
                # get the list of zones in the same cluster with zone i
                cluster_i = cluster_list[clustering.labels_[i]]
                # check whether any zone j in cluster_i is neither in zone_inner_neighbor_dict nor in zone_outer_neighbor_dict
                for j in cluster_i:
                    if j != i: 
                        if j not in zone_inner_neighbor_dict[i] and j not in zone_outer_neighbor_dict[i]:
                            violations_ = True
                            break

        # revert to the previous clustering
        if violations_:
            num_current_cluster_ += 1
            if print_:
                print('Contingency violation detected, reverting to the previous clustering')
            clustering = AgglomerativeClustering(n_clusters=num_current_cluster_,
                                                affinity='precomputed', # we use the precomputed distance matrix
                                                linkage=distance_measure,
                                                compute_full_tree=False)

            # Fit the clustering
            clustering.fit(distance_matrix)

    # size of each cluster after the clustering
    cluster_sizes = np.zeros(num_current_cluster_)
    for i in range(num_zones):
        cluster_sizes[clustering.labels_[i]] += 1

    cluster_list = [[] for _ in range(num_current_cluster_)]
    for i in range(num_zones):
        cluster_list[clustering.labels_[i]].append(i)

    # for each cluster, calculate the average demand value using zone_actual_demand
    cluster_medi_demand_actual = []
    cluster_medi_demand_pred = []
    # cluster_avg_demand_actual = []
    # cluster_avg_demand_pred = []
    for cluster in cluster_list:
        # demand = 0
        # demand_pred = 0
        demand = []
        demand_pred = []
        for zone in cluster:
            # demand += zone_actual_demand[zone]
            # demand_pred += zone_pred_demand[zone]
            demand.append(zone_actual_demand[zone])
            demand_pred.append(zone_pred_demand[zone])
        # cluster_avg_demand_actual.append(demand / len(cluster))
        # cluster_avg_demand_pred.append(demand_pred / len(cluster))
        cluster_medi_demand_actual.append(np.median(demand))
        cluster_medi_demand_pred.append(np.median(demand_pred))
    
    if print_:
        print('number of clusters', num_current_cluster_)
        print('final labels', clustering.labels_)
        print('final cluster sizes', cluster_sizes)
        # print('cluster average demand (actual)', cluster_avg_demand_actual)
        # print('cluster average demand (predict)', cluster_avg_demand_pred)
        print('cluster median demand (actual)', cluster_medi_demand_actual)
        print('cluster median demand (predict)', cluster_medi_demand_pred)

    # create an array that contains the cluster average demand for each zone
    cluster_demand_actual = np.zeros(num_zones)
    cluster_demand_pred = np.zeros(num_zones)
    for i in range(num_zones):
        # print('zone', i, 'cluster', clustering.labels_[i])
        # cluster_demand_actual[i] = cluster_avg_demand_actual[clustering.labels_[i]]
        # cluster_demand_pred[i] = cluster_avg_demand_pred[clustering.labels_[i]]
        cluster_demand_actual[i] = cluster_medi_demand_actual[clustering.labels_[i]]
        cluster_demand_pred[i] = cluster_medi_demand_pred[clustering.labels_[i]]
    
    return clustering.labels_, cluster_demand_actual, cluster_demand_pred

## Hier-Clustering with real demand 

In [21]:
import time

def hier_clustering_per_timestep(attributes_biglist,
                                 zone_wise_demand_actual,
                                 zone_wise_demand_pred, 
                                 zone_edges_,
                                 min_num_cluster = 2,
                                 max_size_per_cluster = 7,
                                 distance_threshold_ = 9,
                                 proximacy_measure = 'average',
                                 attribute_type = 'single'):
    # # ID order
    # zone_edges_ = [(1,3), (1,4), (1,6), (2,5), (2,13), (3,6), (4,6), (4,5),
    #                (5,13), (7,11), (8,10), (8,9), (9,10), (9,11), (9,14),
    #                (10,14), (10,18), (11,14), (12,14), (12,13), (14,18),
    #                (15,19), (16,17), (16,18), (16,19),(17,18)]
    num_timestep = len(attributes_biglist)
    list_of_labels = []
    list_of_cluster_demand_actual = []
    list_of_cluster_demand_pred = []
    # new for computational time recording
    start_time = time.time()
    for i in range(num_timestep):
        clustering_labels_, cluster_demand_actual, cluster_demand_pred =contingency_constrained_hierarchical_clustering(
                                                attributes = attributes_biglist[i],
                                                zone_edges = zone_edges_,
                                                zone_actual_demand = zone_wise_demand_actual[i], 
                                                zone_pred_demand = zone_wise_demand_pred[i],
                                                num_zones = 20,
                                                distance_threshold_ = distance_threshold_,
                                                ultimate_num_clusters = min_num_cluster, 
                                                max_cluster_size = max_size_per_cluster, 
                                                distance_measure = proximacy_measure,
                                                attribute_type = attribute_type)
        # COLLECT all the labels, cluster_actual and cluster_pred into a full length array
        list_of_labels.append(clustering_labels_) # at timestep t and for all 20 grids
        list_of_cluster_demand_actual.append(cluster_demand_actual)
        list_of_cluster_demand_pred.append(cluster_demand_pred)
    # print the time elapsed for each clustering
    print(f'Average Time elapsed for clustering at time step {i}: {(time.time() - start_time)/num_timestep:.3f} seconds')

    list_of_labels = np.array(list_of_labels).flatten()
    list_of_cluster_demand_actual = np.array(list_of_cluster_demand_actual).flatten()
    list_of_cluster_demand_pred = np.array(list_of_cluster_demand_pred).flatten()
    return list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred

def clustering_evaluation(df_15min, clustering_labels, cluster_demand_actual, cluster_demand_pred):
    '''
    This universal evaluation func compares the estimated courier resource needed post clustering
    
    Measure: Error/Difference from {estimated per cluster resource (using actual demand values attributes) - estimated per cluster resource (using pred demand attributes)}

    Note that, the measure accounts for influence from
    - (1) forecasting error from the point predicted demand for each zone per 15min time window
    - (2) the clustering deviation caused by using (potentially flawed) predicted demand rather than perfect predictions (the actual demand)
    - * the deterministic predictions of Quantile Regression Forest are should be more or less the same as those generated from Random Forest
    - * as they follow the very same tree regressor generation criterion

    '''
    # create df_full via arranging df by date, hour, quarter, zone_id
    df_full = df_15min.sort_values(by=['date', 'Hour', 'Quarter', 'zone_id'])
    df_full['cluster_label'] = clustering_labels
    df_full['cluster_demand_actual'] = cluster_demand_actual
    df_full['cluster_demand_pred'] = cluster_demand_pred

    # calculate the error/difference between the actual and predicted demand for each cluster
    zone_wise_MAE = np.zeros(20)
    zone_wise_RMSE = np.zeros(20)
    zone_wise_RMSLE = np.zeros(20)
    zone_wise_residual = np.zeros(20)

    for i in range(20):
        zone_name = f'zone {i+1}'
        temp = df_full[df_full.OrZone == zone_name]
        mae, rmse, rmsle, resid_std = prediction_evaluate(temp.cluster_demand_pred.values, temp.cluster_demand_actual.values)
        zone_wise_MAE[i] = mae
        zone_wise_RMSE[i] = rmse
        zone_wise_RMSLE[i] = rmsle
        zone_wise_residual[i] = resid_std
    
    # print the evaluation results
    print(f'MAE: {np.mean(zone_wise_MAE):.3f}({np.std(zone_wise_MAE):.3f})')
    print(f'RMSE: {np.mean(zone_wise_RMSE):.3f}({np.std(zone_wise_RMSE):.3f})')
    print(f'RMSLE: {np.mean(zone_wise_RMSLE):.3f}({np.std(zone_wise_RMSLE):.3f})')
    print(f'Residual STD: {np.mean(zone_wise_residual):.3f}({np.std(zone_wise_residual):.3f})')


## CCHC results

In [22]:
cluster_outcomes = pd.DataFrame()

global_distance_threshold = 9 
# the threshold used to determine the distance between zones in the clustering

In [None]:
list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_real_demand,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_real_demand, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')
cluster_outcomes['actual_labels'] = list_of_labels
cluster_outcomes['actual_cluster_demand'] = list_of_cluster_demand_pred

In [None]:
list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_real_demand,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_real_demand, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred)

In [None]:
list_of_labels, SARIMA_list_of_cluster_demand_actual, SARIMA_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_SARIMA,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_SARIMA, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMA_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMA_list_of_cluster_demand_actual)


In [None]:
list_of_labels, SARIMAX_list_of_cluster_demand_actual, SARIMAX_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_SARIMAX,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_SARIMAX, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMAX_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMAX_list_of_cluster_demand_actual)


In [None]:
list_of_labels, TBATS_list_of_cluster_demand_actual, TBATS_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_TBATS,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_TBATS, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, TBATS_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, TBATS_list_of_cluster_demand_actual)


In [None]:
list_of_labels, LSTM_list_of_cluster_demand_actual, LSTM_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_LSTM,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LSTM, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LSTM_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LSTM_list_of_cluster_demand_actual)

In [None]:
list_of_labels, RF_list_of_cluster_demand_actual, RF_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_RF,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_RF, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, RF_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, RF_list_of_cluster_demand_actual)


In [None]:
list_of_labels, LDRF_list_of_cluster_demand_actual, LDRF_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_LDRF,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDRF, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            distance_threshold_ = global_distance_threshold,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDRF_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDRF_list_of_cluster_demand_actual)

In [None]:
list_of_labels, XGB_list_of_cluster_demand_actual, XGB_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_XGB,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_XGB, 
                                                                                                            zone_edges_=connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, XGB_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, XGB_list_of_cluster_demand_actual)

In [None]:
list_of_labels, LDXGB_list_of_cluster_demand_actual, LDXGB_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_LDXGB,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDXGB, 
                                                                                                            zone_edges_= connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDXGB_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDXGB_list_of_cluster_demand_actual)

In [None]:
'Median/Single QRF 21 week'
list_of_labels, medi_QRF_list_of_cluster_demand_actual, medi_QRF_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_QRF_median,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_QRF_median, 
                                                                                                            zone_edges_= connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average',
                                                                                                            attribute_type = 'single')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_QRF_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_QRF_list_of_cluster_demand_actual)

In [None]:
'Quantile_QRF 21 week'
list_of_labels, QRF_list_of_cluster_demand_actual, QRF_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_QRF,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_QRF_median, 
                                                                                                            zone_edges_= connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average',
                                                                                                            attribute_type = 'multiple')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, QRF_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, QRF_list_of_cluster_demand_actual)

In [None]:
'Median/Single LDQRF 21 week'
list_of_labels, medi_LDQRF_list_of_cluster_demand_actual, medi_LDQRF_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_LDQRF_median,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDQRF_median, 
                                                                                                            zone_edges_= connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average',
                                                                                                            attribute_type = 'single')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_LDQRF_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_LDQRF_list_of_cluster_demand_actual)

cluster_outcomes['medi_LDQRF_labels'] = list_of_labels
cluster_outcomes['medi_LDQRF_cluster_demand'] = medi_LDQRF_list_of_cluster_demand_pred


In [None]:
'Quantile LDQRF 21 week'
list_of_labels, LDQRF_list_of_cluster_demand_actual, LDQRF_list_of_cluster_demand_pred = hier_clustering_per_timestep(attributes_biglist = attributes_LDQRF,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDQRF_median, 
                                                                                                            zone_edges_= connected_pairs,
                                                                                                            min_num_cluster = 3,
                                                                                                            max_size_per_cluster = 9,
                                                                                                            # max_size_per_cluster = 7,
                                                                                                            proximacy_measure = 'average',
                                                                                                            attribute_type = 'multiple')
print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDQRF_list_of_cluster_demand_pred)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDQRF_list_of_cluster_demand_actual)

cluster_outcomes['Quan_LDQRF_labels'] = list_of_labels
cluster_outcomes['Quan_LDQRF_cluster_demand'] = LDQRF_list_of_cluster_demand_pred

# Constrained K-Means

## misc

In [91]:
def CKMC_clustering(attributes, attribute_type, zone_dict,zone_actual_demand,zone_pred_demand):
    rows = []

    if attribute_type in ['QRF','LDQRF']:
        for j,z in enumerate(list(zone_dict.keys())[:20]): # follow the order of
            address = zone_dict[z]
            combined_pred = 1*attributes[j][0] + 2* attributes[j][1] + 1*attributes[j][2]
            rows.append({'prediction':combined_pred,
                        'latitude':address[0],
                        'longitude':address[1]})
    else:
        for j,z in enumerate(list(zone_dict.keys())[:20]): # follow the order of zone_dict keys zone 1, 2, ... 20
            address = zone_dict[z]
            rows.append({'prediction':attributes[j],
                        'latitude':address[0],
                        'longitude':address[1]})

    pred_dict = pd.DataFrame(rows, columns=['prediction','latitude','longitude'])

    cluster_labels, _, _ = clustering(data_normalization(pred_dict), min_k=3, max_k=6, cluster_method='constrained_k-means')
    
    num_current_cluster_ = len(set(cluster_labels))
    num_zones = len(attributes)
    # add each zone to its corresponding cluster
    cluster_list = [[] for _ in range(num_current_cluster_)]
    for i in range(num_zones):
        cluster_list[cluster_labels[i]].append(i)
    
    # for each cluster, calculate the average demand value using zone_actual_demand
    cluster_medi_demand_actual = []
    cluster_medi_demand_pred = []

    for cluster in cluster_list:
        demand = []
        demand_pred = []
        for zone in cluster:
            demand.append(zone_actual_demand[zone])
            demand_pred.append(zone_pred_demand[zone])
        cluster_medi_demand_actual.append(np.median(demand))
        cluster_medi_demand_pred.append(np.median(demand_pred))
    
    # create an array that contains the cluster average demand for each zone
    cluster_demand_actual = np.zeros(num_zones)
    cluster_demand_pred = np.zeros(num_zones)
    for i in range(num_zones):
        cluster_demand_actual[i] = cluster_medi_demand_actual[cluster_labels[i]]
        cluster_demand_pred[i] = cluster_medi_demand_pred[cluster_labels[i]]

    return cluster_labels, cluster_demand_actual, cluster_demand_pred



def CKMC_per_timestep(attributes_biglist,
                      zone_wise_demand_actual,
                      zone_wise_demand_pred, 
                      zone_dict,
                      method='constrained_k-means',
                      prediction_type='actual'):

    # DEBUGGER check attribute_type
    print(f'Attribute type: {prediction_type}')
    print('check input attribute shape', attributes_biglist.shape)
    print('check input zone_wise_demand_pred shape', zone_wise_demand_pred.shape)

    num_timestep = attributes_biglist.shape[0]
    list_of_labels = []
    list_of_cluster_demand_actual = []
    list_of_cluster_demand_pred = []

    start_time = time.time()
    # new for computational time recording
    for i in tqdm(range(num_timestep)):
        cluster_labels, cluster_demand_actual, cluster_demand_pred = CKMC_clustering(attributes_biglist[i], 
                                 prediction_type, 
                                 zone_dict,
                                 zone_wise_demand_actual[i],
                                 zone_wise_demand_pred[i])
        list_of_labels.append(cluster_labels)
        list_of_cluster_demand_actual.append(cluster_demand_actual)
        list_of_cluster_demand_pred.append(cluster_demand_pred)
    print(f'Average Time elapsed for clustering at time step {i}: {(time.time() - start_time)/num_timestep:.3f} seconds')
    
    list_of_labels = np.array(list_of_labels).flatten()
    list_of_cluster_demand_actual = np.array(list_of_cluster_demand_actual).flatten()
    list_of_cluster_demand_pred = np.array(list_of_cluster_demand_pred).flatten()

    return list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred

## experiment

In [None]:
df_15min_ = pd.read_csv('Data/EU_use_case/df_test_21week.csv')
zone_id_dict = {'zone 1':1, 'zone 2': 2, 'zone 3': 3, 'zone 4': 4, 'zone 5': 5, 'zone 6': 6, 'zone 7': 7, 'zone 8': 8, 'zone 9': 9, 'zone 10': 10,
                'zone 11': 11, 'zone 12': 12, 'zone 13': 13, 'zone 14': 14, 'zone 15': 15, 'zone 16': 16, 'zone 17': 17, 'zone 18': 18, 'zone 19': 19, 'zone 20': 20}
df_15min_['zone_order'] = df_15min_['OrZone'].map(zone_id_dict)
df_15min_ = df_15min_.sort_values(by=['date', 'Hour', 'Quarter', 'zone_order']).reset_index(drop=True)
del df_15min_['zone_id']
zone_order = df_15min_['OrZone'].unique()

# load zone address from txt file
zone_addresses = None

df_15min_['lateitude'] = df_15min_['OrZone'].map(lambda x: zone_addresses[x][0])
df_15min_['longitude'] = df_15min_['OrZone'].map(lambda x: zone_addresses[x][1])

In [67]:
@lru_cache(maxsize=None)
def _load_pred_dict(path: str):
    with open(path, 'rb') as f:
        return pickle.load(f)

def create_attributes_array_savedpred(model_name, test_length, 
                                      quantile_switch = False,
                                      quantiles=(25, 75), 
                                      base_dir="Predictions/1_week_predictions",
                                      zone_order=None, dtype=np.float32, to_list=False):
    """
    Returns array of shape (T, Z, P) where:
        T = test_length
        Z = number of zones (e.g. 20)
        P = number of prediction variants (1 + len(quantiles) for quantile models)
    """
    paths = []
    if model_name in {'QRF', 'LDQRF'}:
        paths.append(f"{base_dir}/{model_name}_pred_test.pkl")          # mean (or main) prediction
        if quantile_switch:
            for q in quantiles:
                paths.append(f"{base_dir}/{model_name}_{q}_pred_test.pkl")  # quantile predictions
    elif model_name == 'actual':
        paths.append(f"{base_dir}/LDRF_actual_test.pkl")
    else:
        paths.append(f"{base_dir}/{model_name}_pred_test.pkl")

    # Load all prediction dicts (cached) – each: {zone_name: array_like (len >= test_length)}
    pred_dicts = [_load_pred_dict(p) for p in paths]

    # Establish consistent zone order
    if zone_order is None:
        # Use intersection order from first dict; sort for determinism
        zone_order = [f'zone {i}' for i in range(1, 21)]

    Z = len(zone_order)
    P = len(pred_dicts)
    T = test_length

    # Preallocate (P, Z, T)
    cube = np.empty((P, Z, T), dtype=dtype)
    for p_idx, dct in enumerate(pred_dicts):
        # Optionally validate keys once
        # Fill per zone (vectorized over time)
        for z_idx, z in enumerate(zone_order):
            arr = np.asarray(dct[z])
            if arr.shape[0] < T:
                raise ValueError(f"Zone {z} has only {arr.shape[0]} preds (< {T}).")
            cube[p_idx, z_idx, :] = arr[:T]

    # Reorder axes to (T, Z, P)
    attributes = np.transpose(cube, (2, 1, 0)) # (T, Z, P)

    if to_list:
        return attributes.tolist(), zone_order
    return attributes, zone_order

In [68]:
attributes_real_demand, _ = create_attributes_array_savedpred('actual', 308, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_LSTM, _ = create_attributes_array_savedpred('LSTM', 307, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

first_row = attributes_LSTM[0:1].copy()          # shape (1, 20, 1)
attributes_LSTM = np.concatenate([first_row, attributes_LSTM], axis=0)

attributes_SARIMA, _ = create_attributes_array_savedpred('SARIMA', 308, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_SARIMAX, _ = create_attributes_array_savedpred('SARIMAX', 308, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_TBATS, zone_order = create_attributes_array_savedpred('TBATS', 308,
                            base_dir="Predictions/1_week_predictions",
                            zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_RF, _ = create_attributes_array_savedpred('RF', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_LDRF, _ = create_attributes_array_savedpred('LDRF', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_XGB, _ = create_attributes_array_savedpred('XGB', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_LDXGB, _ = create_attributes_array_savedpred('LDXGB', 308, quantiles=(25, 75),
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_QRF_median, _ = create_attributes_array_savedpred('QRF', 308, quantiles=(25,75),
                                                             base_dir="Predictions/1_week_predictions",
                                                             zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_LDQRF_median, _ = create_attributes_array_savedpred('LDQRF', 308, quantiles=(25, 75), 
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_LDQRF, _ = create_attributes_array_savedpred('LDQRF', 308, quantiles=(25, 75),
                                                     quantile_switch = True,
                                                    base_dir="Predictions/1_week_predictions",
                                                    zone_order=zone_order, dtype=np.float32, to_list=False)

attributes_QRF, _ = create_attributes_array_savedpred('QRF', 308, quantiles=(25, 75),quantile_switch = True,
                        base_dir="Predictions/1_week_predictions",
                        zone_order=zone_order, dtype=np.float32, to_list=False)

In [None]:
list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_real_demand,
                                                                                                zone_wise_demand_actual = attributes_real_demand,
                                                                                                zone_wise_demand_pred = attributes_real_demand, 
                                                                                                zone_dict = zone_dict,
                                                                                                method='constrained_k-means',
                                                                                                prediction_type='actual')
median_df['actual'] = list_of_cluster_demand_pred
cluster_outcomes['actual_labels'] = list_of_labels
cluster_outcomes['actual_cluster_demand'] = list_of_cluster_demand_pred

In [None]:
list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_real_demand,
                                                                                                zone_wise_demand_actual = attributes_real_demand,
                                                                                                zone_wise_demand_pred = attributes_real_demand, 
                                                                                                zone_dict = zone_dict,
                                                                                                method='constrained_k-means',
                                                                                                prediction_type='actual')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, list_of_cluster_demand_pred)

In [None]:
list_of_labels, SARIMA_list_of_cluster_demand_actual, SARIMA_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_SARIMA,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_SARIMA, 
                                                                                                            zone_dict = zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='SARIMA')


print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMA_list_of_cluster_demand_pred, print_out=True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMA_list_of_cluster_demand_actual, print_out=True)


In [None]:
list_of_labels, SARIMAX_list_of_cluster_demand_actual, SARIMAX_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_SARIMAX,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_SARIMAX, 
                                                                                                            zone_dict = zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='SARIMA')


print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMAX_list_of_cluster_demand_pred, print_out=True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, SARIMAX_list_of_cluster_demand_actual, print_out=True)


In [None]:
list_of_labels, TBATS_list_of_cluster_demand_actual, TBATS_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_TBATS,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_TBATS, 
                                                                                                            zone_dict = zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='tbats')


print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, TBATS_list_of_cluster_demand_pred, print_out=True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, TBATS_list_of_cluster_demand_actual, print_out=True)


In [None]:
list_of_labels, LSTM_list_of_cluster_demand_actual, LSTM_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_LSTM,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LSTM, 
                                                                                                            zone_dict = zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='LSTM')


print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LSTM_list_of_cluster_demand_pred, print_out=True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LSTM_list_of_cluster_demand_actual, print_out=True)


In [None]:
list_of_labels, RF_list_of_cluster_demand_actual, RF_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_RF,
                                                                                                    zone_wise_demand_actual = attributes_real_demand,
                                                                                                    zone_wise_demand_pred = attributes_RF, 
                                                                                                    zone_dict = zone_dict,
                                                                                                    method='constrained_k-means',
                                                                                                    prediction_type='RF')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, RF_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, RF_list_of_cluster_demand_actual, print_out = True)

In [None]:
list_of_labels, LDRF_list_of_cluster_demand_actual, LDRF_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_LDRF,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDRF, 
                                                                                                            zone_dict = zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='LDRF')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDRF_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDRF_list_of_cluster_demand_actual, print_out = True)

In [None]:
list_of_labels, XGB_list_of_cluster_demand_actual, XGB_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_XGB,
                                                                                                        zone_wise_demand_actual = attributes_real_demand,
                                                                                                        zone_wise_demand_pred = attributes_XGB, 
                                                                                                        zone_dict= zone_dict,
                                                                                                        method='constrained_k-means',
                                                                                                        prediction_type='XGB')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, XGB_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, XGB_list_of_cluster_demand_actual, print_out = True)

In [None]:
list_of_labels, LDXGB_list_of_cluster_demand_actual, LDXGB_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_LDXGB,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDXGB, 
                                                                                                            zone_dict= zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='LDXGB')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDXGB_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDXGB_list_of_cluster_demand_actual, print_out = True)


In [None]:
'Median/Single QRF 21 week'
list_of_labels, medi_QRF_list_of_cluster_demand_actual, medi_QRF_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_QRF_median,
                                                                                                                zone_wise_demand_actual = attributes_real_demand,
                                                                                                                zone_wise_demand_pred = attributes_QRF_median, 
                                                                                                                zone_dict= zone_dict,
                                                                                                                method='constrained_k-means',
                                                                                                                prediction_type='QRF_median')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_QRF_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_QRF_list_of_cluster_demand_actual, print_out = True)

In [None]:
'Quantile_QRF 21 week'
list_of_labels, QRF_list_of_cluster_demand_actual, QRF_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_QRF,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_QRF_median, 
                                                                                                            zone_dict= zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='QRF')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, QRF_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, QRF_list_of_cluster_demand_actual, print_out = True)

In [None]:
'Median/Single LDQRF 21 week'
list_of_labels, medi_LDQRF_list_of_cluster_demand_actual, medi_LDQRF_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_LDQRF_median,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDQRF_median, 
                                                                                                            zone_dict= zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='LDQRF_median')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_LDQRF_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, medi_LDQRF_list_of_cluster_demand_actual, print_out = True)

In [None]:
'Quantile LDQRF 21 week'
list_of_labels, LDQRF_list_of_cluster_demand_actual, LDQRF_list_of_cluster_demand_pred = CKMC_per_timestep(attributes_biglist = attributes_LDQRF,
                                                                                                            zone_wise_demand_actual = attributes_real_demand,
                                                                                                            zone_wise_demand_pred = attributes_LDQRF_median, 
                                                                                                            zone_dict= zone_dict,
                                                                                                            method='constrained_k-means',
                                                                                                            prediction_type='LDQRF')

print('Computing actual zone-wise resource from "actual clusters" and predicted zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDQRF_list_of_cluster_demand_pred, print_out = True)
print('\n')
print('Computing actual zone-wise resource from "actual clusters" and actual zone-wise resource from "predicted clusters"')
clustering_evaluation(df_15min_, list_of_labels, list_of_cluster_demand_actual, LDQRF_list_of_cluster_demand_actual, print_out = True)