# Autoclustering

Are there any patterns or groupings in the data that we are missing?

NB tslearn needs numpy <= 1.21 (the current version is 1.23.2), hence I need to run this notebook in a separate environment.

Sources for this are:

https://www.kaggle.com/izzettunc/introduction-to-time-series-clustering
and tslearn documentation i.e. https://tslearn.readthedocs.io/en/stable/index.html

In [None]:
#Import libraries
from helpers import *  # kept first so the explicit imports below still win any name clash

import datetime as dt
import math
import os
from urllib.parse import quote_plus

import dateutil
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import linkage
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sqlalchemy import create_engine
from tslearn.barycenters import dtw_barycenter_averaging
from tslearn.clustering import KernelKMeans
from tslearn.clustering import TimeSeriesKMeans
from tslearn.clustering import silhouette_score
from tslearn.preprocessing import TimeSeriesScalerMinMax
%matplotlib inline

#This is suppress all warnings in the notebook - turn on when happy code works
# import warnings
# warnings.filterwarnings('ignore')

In [None]:
#Redshift user credentials - read from the environment so secrets never live in the notebook.
#Export REDSHIFT_USER and REDSHIFT_PASSWORD before starting Jupyter; a missing variable
#raises KeyError here instead of failing later with an obscure connection error.
USER = os.environ["REDSHIFT_USER"]
PASSWORD = os.environ["REDSHIFT_PASSWORD"]

FCST_PERIOD = 9   #How many months I want to forecast ahead

In [None]:
#Create SQLAlchemy engine for Redshift database
user = USER
password = PASSWORD
host = os.environ["REDSHIFT_HOST"]   #Cluster endpoint, supplied via the environment like the credentials
port='5439'
dbname='prod'

#URL-escape user and password: characters such as '@', ':' or '/' would otherwise corrupt the URL
url = "postgresql+psycopg2://{0}:{1}@{2}:{3}/{4}".format(
    quote_plus(user), quote_plus(password), host, port, dbname
)
engine = create_engine(url)

# A. Get data from Redshift

Let's focus on Spain

And select everything where there was some demand in the period that we want to forecast


In [None]:
MIN_DEMAND = 0

#Month-end demand totals per key (isbn + country) for Spain. Only keys whose summed demand over
#the last FCST_PERIOD months exceeds MIN_DEMAND are kept, and isbns starting '555' are excluded
#from that filter. NB the inner column is called qty_last_12m but its window is FCST_PERIOD months.
query = f"""
select
    isbn + ship_to_country_key as key,
    last_day(date) as month,
    sum(quantity_demanded) as qty
from r2ibp.f_demand_actual
where ship_to_country_key = 'ES'
and month <= current_date
and key in
(
select key
from
(
select
     isbn + ship_to_country_key as key,
     sum(quantity_demanded) as qty_last_12m
from r2ibp.f_demand_actual
where ship_to_country_key = 'ES'
and last_day(date) <= current_date
and last_day(date) > dateadd(month, -{FCST_PERIOD}, current_date)
and isbn not like '555%%'
group by key
)
where qty_last_12m > {MIN_DEMAND}
)
group by key, month
order by key, month asc

"""
#The context manager releases the connection even if the query raises
with engine.connect() as conn:
    df = pd.read_sql_query(query, conn)


# B. Pivot into a dataframe

NB Drop non-positive values (negative or zero quantities) and replace NaNs (i.e. missing values) with zeroes

Also simplify the columns index

In [None]:
#Keep only strictly positive demand, spread the months across columns (one row per key),
#and treat any month without a record as zero demand.
positive_demand = df.loc[df['qty'] > 0]
df_pivoted = positive_demand.pivot(index='key', columns='month', values='qty').fillna(0)

In [None]:
#Peek at the last few rows of the pivoted table as a sanity check
df_pivoted.tail()

# C. Prepare the data for modelling

I will make use of tslearn's TimeSeriesScalerMinMax scaler and scale between 0 and 1. NB I want to retain the zero values where these exist.

In [None]:
#tslearn min-max scaler with its default arguments
scaler = TimeSeriesScalerMinMax()

#Scale the pivoted values: one row per key, one column per month
X = scaler.fit_transform(df_pivoted.to_numpy())

#Reshape for plotting later
print('Original shape of X', X.shape)

#Collapse every trailing axis so X is 2-D: (number of series, everything else)
num_ts = len(X)
X = X.reshape(num_ts, -1)

print('Reshaped X', X.shape)

In [None]:
## Draw a fixed-seed 10% sample of the series to speed up calculating silhouette scores
df_sample = df_pivoted.sample(frac=0.1, random_state=1234)

#Scale the sample with the same scaler object, then flatten to 2-D like the full set
X_sample = scaler.fit_transform(df_sample.to_numpy())

num_ts = len(X_sample)
X_sample = X_sample.reshape(num_ts, -1)

X_sample.shape

# D. Determining the number of clusters

Use the Silhouette Score or Elbow plot to determine the best number of clusters

## D.1 Silhouette Scores

The full set is going to take forever so use X_sample

NB. Small number of clusters take the longest

In [None]:
#This is to track progress
start = dt.datetime.now()

#One silhouette score per candidate cluster count, in the same order as num_clusters_list
scores = []

num_clusters_list = range(2, 28)

for n_clusters in num_clusters_list:

    #Fixed seed (same value as the sampling seed) so the silhouette curve is reproducible between runs
    kmeans = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", random_state=1234)
#    kmeans = KernelKMeans(n_clusters=n_clusters)
    cluster_assignment = kmeans.fit_predict(X_sample)
    scores.append(silhouette_score(X_sample, cluster_assignment))

    print('Calc complete for silhouette score for', n_clusters, 'clusters after', dt.datetime.now() - start)

In [None]:
#Silhouette score against the number of clusters tried
fig, ax = plt.subplots()
ax.plot(num_clusters_list, scores)
ax.set_xlabel('Number of clusters')
ax.set_ylabel('Silhouette Score')
ax.set_title('K-means Clustering')
plt.show()

In [None]:
#Pair each candidate cluster count with its silhouette score for a numeric read-out
list(zip(num_clusters_list, scores))

#Lowest score at 10
#NOTE(review): observation recorded from a past run - re-check it after re-running the cell above

## D.2 Elbow Plots

Same as above i.e. using X_sample

In [None]:
#Helper function to calculate the within cluster variance

def cluster_variance(points):
    """Return the within-cluster variance of a set of points.

    The quantity is the mean, over all ordered pairs of rows (i, j), of half the
    squared Euclidean distance between row i and row j. That equals the mean squared
    distance of the rows to their centroid, i.e. the sum of the per-feature population
    variances, so it is computed that way instead of with a double Python loop.

    Parameters
    ----------
    points : array-like, first axis indexes the points
        Any trailing axes are flattened into a single feature axis.

    Returns
    -------
    float
        The variance; 0.0 for an empty set (previously a ZeroDivisionError) and
        for a single point.
    """
    points = np.asarray(points, dtype=float)
    N = points.shape[0]

    #An empty cluster has no spread; avoid dividing by zero
    if N == 0:
        return 0.0

    #Flatten trailing axes so every point is one feature vector
    flat = points.reshape(N, -1)

    #ddof=0 (population variance) matches the original division by N**2
    return float(flat.var(axis=0).sum())

In [None]:
start = dt.datetime.now()

#Cluster counts tried; the single source of truth for the loops and the plot's x-axis
elbow_cluster_counts = range(1, 28)

all_assignments = []

for n_clusters in elbow_cluster_counts:
    #Fixed seed (same value as the sampling seed) so the elbow curve is reproducible between runs
    kmeans = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", random_state=1234)
    all_assignments.append(kmeans.fit_predict(X_sample))

#Average within-cluster variance, one entry per cluster count
awcv = []

for n_clusters, assignment in zip(elbow_cluster_counts, all_assignments):
    wcv = 0
    for cluster_id in range(n_clusters):
        members = X_sample[assignment == cluster_id]
        #A cluster with no members contributes no variance
        if len(members) > 0:
            wcv += cluster_variance(members)
    awcv.append(wcv / n_clusters)

    print('Calc complete for average within cluster variance for', n_clusters, 'clusters after', dt.datetime.now() - start)

#x-axis comes from the explicit list of cluster counts, not from a leftover loop variable
plt.plot(elbow_cluster_counts, awcv)

plt.xlabel('Number of flat clusters')
plt.ylabel('Average within-class variance')
plt.title('K-means Clustering')
plt.show()

#There's no strong elbow.
#Presumably the decrease in awcv is simply a matter of the cluster size getting bigger

In [None]:
#Pair each cluster count (1 to 27) with its average within-cluster variance
list(zip(range(1,28),awcv))

# E Clustering with KMeans

Cluster using TS version of K means on the full dataset

In [None]:
# A good rule of thumb is choosing k as the square root of the number of points in the training data set in kNN
sqrt_of_series_count = math.sqrt(len(X))
cluster_count = math.ceil(sqrt_of_series_count)
print(cluster_count)

#I'm going to override (as otherwise it will take forever)
#This is based on info from later
cluster_count = 16
print(cluster_count)

In [None]:
#Time the full-dataset fit
start = dt.datetime.now()
print(start)

#Seed the initialisation: cluster numbering is otherwise arbitrary from run to run,
#and later cells refer to clusters by number.
km = TimeSeriesKMeans(n_clusters=cluster_count, metric="dtw", random_state=1234)
#km = KernelKMeans(n_clusters=cluster_count)

#One cluster label per row of X
labels = km.fit_predict(X)

print(dt.datetime.now() - start)

In [None]:
#Distinct cluster labels and how many series landed in each
ar_unique, n = np.unique(labels, return_counts=True)
plt.bar(ar_unique, n);

#Same information as (label, count) pairs
[(cluster_id, size) for cluster_id, size in zip(ar_unique, n)]

In [None]:
#This is with DBA
#One panel per cluster: member series in gray, their DTW barycenter average in red

plot_count = math.ceil(math.sqrt(cluster_count))

#squeeze=False keeps axs two-dimensional even when the grid is 1x1
fig, axs = plt.subplots(plot_count, plot_count, figsize=(25,25), squeeze=False)
fig.suptitle('Clusters')

label_array = np.asarray(labels)

#np.unique gives the labels in ascending order; the panel position is derived from the
#running index, while the title shows the actual label (they differ if a label is absent)
for position, label in enumerate(np.unique(label_array)):
    row_i, column_j = divmod(position, plot_count)
    panel = axs[row_i, column_j]

    #Boolean mask selects all member series at once
    members = X[label_array == label]

    #Transposed so that each member series is drawn as its own line
    panel.plot(members.T, c="gray", alpha=0.4)
    panel.plot(dtw_barycenter_averaging(members), c="red")
    panel.set_title("Cluster "+str(label))

plt.show()

# This is definitely more useful than averaging

# F Forecast HW by Cluster

Has the autoclustering helped?

In [None]:
#Attach each series' cluster label, as a new 'cluster' column, to a copy of the pivoted dataframe
df_clustered = df_pivoted.assign(cluster=list(labels))

In [None]:
#220809 This is not the most sensible way to do this.
#Run HWES on everything and then add the cluster labels to df_metrics

#One percentage per cluster, in cluster-number order
percent_HWES_rmse_better_list = []

for cluster in range(cluster_count):
    #Debug alternative: for cluster in range(0, 2)

    #Keys (index values) of the series assigned to this cluster
    in_cluster = df_clustered['cluster'] == cluster
    key_list = df_clustered.loc[in_cluster].index.to_list()

    #Fetch demand, forecast and score using the project helpers
    df_demand = get_demand(key_list, engine)
    df_errors, df_hwes_forecasts = predict_using_hwes(df_demand, FCST_PERIOD)
    df_metrics = calc_prediction_metrics(df_hwes_forecasts)

    #Sum of the 'pred_rmse_lower' column as a share of the metric rows, as a percentage to 1 dp
    total_fcsts = len(df_metrics)
    num_hwes_rmse_better = df_metrics['pred_rmse_lower'].sum()
    share_better = num_hwes_rmse_better / total_fcsts
    percent_HWES_rmse_better = round(share_better * 100, 1)

    percent_HWES_rmse_better_list.append(percent_HWES_rmse_better)

    

In [None]:
#Which clusters perform best
#Each tuple is (cluster number, series count from n, value from percent_HWES_rmse_better_list)
list(zip(range(0, cluster_count),n, percent_HWES_rmse_better_list))

In [None]:
#Summary table: one row per cluster label with its size and HWES percentage
df_results = pd.DataFrame({
    'cluster': ar_unique,
    'timeseries count': n,
    '% HWES RMSE lower': percent_HWES_rmse_better_list,
})

df_results

In [None]:
#Plain-text rendering of the same results table
print(df_results)

# G Time Series Plots

Let's look at the best performing cluster

In [None]:
#Plot cluster 5 NB only 50 timeseries

#Keys of the series in cluster 5; only the first 10 are plotted
cluster_keys = df_clustered.loc[df_clustered['cluster'] == 5].index.to_list()
plot_list = cluster_keys[:10]

#Fetch demand, forecast and score just those keys using the project helpers
df_demand = get_demand(plot_list, engine)
df_errors, df_hwes_forecasts = predict_using_hwes(df_demand, FCST_PERIOD)
df_metrics = calc_prediction_metrics(df_hwes_forecasts)

plot_sample_preds(plot_list, df_demand, df_hwes_forecasts, FCST_PERIOD)

#Metric rows for the plotted keys
is_plotted = df_metrics['key'].isin(plot_list)
df_metrics.loc[is_plotted]
