# Clustering Solution Generator
### $Time$ $Series$ $3rd$ $Test$

$Vasco$ $Mergulhão$ $-$ $Jan$ $2023$

### Version 1:
This script loads a model and outputs a CSV ready to be analysed on the Dashboard

---
# Editing Note:
---
set up code so that all macro variables are defined in the first cell:<br>
- dataset
- zscore yes/no
- model name
- etc

Store results from different Sub-Samples in separate folders!

In [1]:
import os  

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import groupby
from datetime import timedelta, date
import plotly.graph_objects as go
import scipy
import math
from sklearn.cluster import KMeans

import random

import time
import datetime

import umap # UMAP library is responsible for ipywidgets warning!

import tensorflow as tf
from tensorflow import keras

import wandb
from wandb.keras import WandbCallback

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set Random Seeds
# A single seed value is shared by every random number generator used in the notebook.
SEED = 42
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
for seed_rng in (random.seed, np.random.seed, tf.random.set_seed):
    seed_rng(SEED)

---
---
## Loading and Z-scoring Data

In [3]:
# Read the W90 window table; the first two CSV columns become the row MultiIndex.
df_90 = pd.read_csv(filepath_or_buffer='Data/W90_Test2_V1.csv', index_col=[0, 1])

In [4]:
# Window columns are named d1..dN, where N is the number of columns minus one
# (every column except a single non-window column is treated as a window value).
window_len = len(df_90.columns) - 1
window_cols = [f'd{day}' for day in range(1, window_len + 1)]

In [5]:
# Raw (un-normalised) window values as a 2-D NumPy array.
data = df_90.loc[:, window_cols].to_numpy()

In [6]:
# Sanity check: array dimensions as (rows, window columns).
data.shape

(12517, 90)

In [7]:
# Preview the first rows of the loaded table.
df_90.head()

Unnamed: 0,Unnamed: 1,window_start_date,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
799,0,2018-12-22,28.559722,27.559722,26.559722,25.559722,24.559722,23.559722,22.559722,21.559722,20.559722,...,1.559722,0.559722,14.559722,13.559722,12.559722,11.559722,10.559722,9.559722,8.559722,7.559722
799,1,2019-03-22,6.559722,5.559722,4.559722,3.559722,2.559722,1.559722,25.559722,24.559722,23.559722,...,9.65162,8.65162,7.65162,6.65162,5.528588,4.65162,3.65162,2.65162,1.65162,0.65162
799,2,2019-06-20,-7.0,-7.0,-7.0,29.691308,28.691308,27.691308,26.691308,25.691308,24.691308,...,2.691308,1.691308,0.691308,11.691308,10.691308,9.691308,8.691308,7.691308,6.691308,5.691308
799,3,2019-09-18,4.691308,3.691308,2.691308,8.691308,7.691308,6.691308,5.691308,4.691308,3.691308,...,2.328368,1.328368,0.328368,7.661632,6.661632,5.661632,4.661632,3.661632,2.661632,1.661632
799,4,2019-12-17,0.661632,-7.0,12.605613,11.605613,10.605613,9.605613,8.605613,7.605613,6.605613,...,8.760648,7.760648,6.760648,5.760648,4.760648,3.760648,2.760648,1.760648,0.760648,-7.0


### Zscoring

In [8]:
# Names of the window columns (d1..dN). The length is taken from window_cols
# instead of a hard-coded 90, so this cell cannot drift out of sync with the
# columns that were actually loaded.
w_len = len(window_cols)
window_col_names = [f'd{d + 1}' for d in range(w_len)]

# ATTENTION: Z-score function likely to be INCORRECT
# Works OK here, but likely to lead to bugs and issues elsewhere
# The z-score is computed row-wise (axis=1), i.e. per window, and NaNs are
# left out of the statistics (nan_policy='omit').
df_zscore = df_90.copy(deep=True)
df_zscore[window_col_names] = scipy.stats.zscore(df_zscore[window_col_names], axis=1, nan_policy='omit')
# Any NaN still present (in any column of the frame) is replaced by -1.
# NOTE(review): presumably these come from windows with zero variance — confirm.
df_zscore = df_zscore.fillna(-1)

In [9]:
# Z-scored window values as a 2-D NumPy array (this is what the model is fed).
data_zscored = df_zscore.loc[:, window_cols].to_numpy()

### Saving Zscored Data

In [10]:
# Turn the two index levels into ordinary columns so they would be written out with the data.
index_level_names = ['local_id', 'window_id']
df_zscore_save = df_zscore.reset_index(names=index_level_names)
# Export is currently disabled; uncomment the line below to write the z-scored table.
# df_zscore_save.to_csv(f'Data/W90_Zscored.csv', index=False)


---
---
# Loading Models

In [11]:
# This loads model by name.
# Identify the best model by navigating the WandB Sweeps page.
# Find link to best model (and version)  in Artifacts -> Usage, and copy directory.
# .pb format is only accepted by Tensorflow, NOT Keras.
# To load from a local W&B run directory instead, point model_dir at that folder
# (keep the path relative/configurable rather than an absolute user path).
solution_name = 'pious-sweep-1:v9'
run = wandb.init()
try:
    model_artifact = run.use_artifact(f'vasco-phd/Sweep_Test_v2/model-{solution_name}', type='model')
    model_dir = model_artifact.download()
    loaded_autoencoder = tf.keras.models.load_model(model_dir)
finally:
    # Always close the W&B run, even when the download or the model load fails,
    # so a failed cell does not leave a run open.
    run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mvasco-mergulhao[0m ([33mvasco-phd[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [12]:
# Print the layer-by-layer architecture of the loaded autoencoder.
loaded_autoencoder.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 90)]              0         
                                                                 
 dense (Dense)               (None, 300)               27300     
                                                                 
 dense_1 (Dense)             (None, 300)               90300     
                                                                 
 dense_2 (Dense)             (None, 300)               90300     
                                                                 
 dense_3 (Dense)             (None, 300)               90300     
                                                                 
 dense_4 (Dense)             (None, 5)                 1505      
                                                                 
 dense_5 (Dense)             (None, 300)               1800  

In [13]:
# Half of the autoencoder's layer count, rounded down.
encoder_layers = len(loaded_autoencoder.layers) // 2

In [14]:
# Build the encoder by dropping the trailing half (rounded down) of the
# autoencoder's layers and stacking the remaining ones in a Sequential model.
n_layers_total = len(loaded_autoencoder.layers)
encoder_layers = n_layers_total // 2
encoder = keras.models.Sequential(layers=loaded_autoencoder.layers[:-encoder_layers])
encoder.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 300)               27300     
                                                                 
 dense_1 (Dense)             (None, 300)               90300     
                                                                 
 dense_2 (Dense)             (None, 300)               90300     
                                                                 
 dense_3 (Dense)             (None, 300)               90300     
                                                                 
 dense_4 (Dense)             (None, 5)                 1505      
                                                                 
Total params: 299,705
Trainable params: 299,705
Non-trainable params: 0
_________________________________________________________________


---
## Encoding Data

In [15]:
# Run the z-scored windows through the encoder and keep the result both as a
# NumPy array and as a DataFrame (default integer column labels).
encoded_data = encoder(data_zscored).numpy()
df_encoded = pd.DataFrame(data=encoded_data)

---
##  Reconstruction Error

In [16]:
# Reconstruct every z-scored window with the full autoencoder.
y_pred = loaded_autoencoder.predict(x=data_zscored)



In [17]:
# Reconstructed windows, indexed like df_zscore.
# (The following cell rebuilds this same frame before adding the MSE column.)
df_reconstruct = pd.DataFrame(index=df_zscore.index, columns=window_cols)
df_reconstruct[window_cols] = y_pred

In [18]:
# Per-window reconstruction error: with reduction='none' the loss returns one
# value per row instead of a single scalar.
MSE = tf.keras.losses.MeanSquaredError(reduction='none')
MSE_values = MSE(data_zscored, y_pred).numpy()

# Reconstructed profiles, indexed like df_zscore
df_reconstruct = pd.DataFrame(index=df_zscore.index, columns=window_cols)
df_reconstruct[window_cols] = y_pred

# MSE goes in as the first column, then the index levels become regular columns
df_reconstruct.insert(loc=0, column='MSE', value=MSE_values)
df_reconstruct = df_reconstruct.reset_index(names=['local_id', 'window_id'])

In [19]:
# Preview the reconstruction table (ids, MSE and reconstructed window values).
df_reconstruct.head()

Unnamed: 0,local_id,window_id,MSE,d1,d2,d3,d4,d5,d6,d7,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
0,799,0,0.24702,0.861982,0.946337,1.023229,0.913732,0.789527,0.734652,0.696375,...,-0.887788,-0.791022,-0.750865,-0.851607,-0.820415,-0.833995,-0.73711,-0.714877,-0.740061,-0.613801
1,799,1,0.390724,-1.137315,-1.118176,-1.191306,-0.667892,-0.188515,0.574962,1.183609,...,-0.754237,-0.722684,-0.742673,-0.695211,-0.707334,-0.728421,-0.729348,-0.669194,-0.698752,-0.619272
2,799,2,0.241173,-1.025704,-1.041278,-1.056806,-0.195937,0.555022,1.303572,1.828867,...,-0.798859,-0.794543,-0.818431,-0.760448,-0.760539,-0.836529,-0.787765,-0.743372,-0.760676,-0.663323
3,799,3,0.639385,0.121909,0.043181,0.092471,-0.047451,0.046363,-0.00726,0.084297,...,-0.392383,-0.358334,-0.205351,-0.27483,-0.157824,0.051694,0.030989,-0.010297,-0.041775,0.141992
4,799,4,0.606934,-1.122783,-0.575524,0.458419,1.325414,1.790534,1.874686,1.285085,...,0.339593,0.33652,0.299439,0.115216,0.017182,-0.234514,-0.411235,-0.408283,-0.338318,-0.365986


---
---
## 2D UMAP

In [20]:
# Solutions table: everything from df_zscore except the window values, with the
# two index levels exposed as regular columns.
df_sols = (
    df_zscore
    .drop(columns=window_cols)
    .reset_index(names=['local_id', 'window_id'])
)

In [21]:
# 2-D UMAP projection of the encoded data, used for visualisation only; timed.
start_time = time.time()
umap_reducer = umap.UMAP(random_state=42)
v_2D_umap = umap_reducer.fit_transform(df_encoded)
elapsed_seconds = np.round(time.time() - start_time, 2)
print(f'\nUMAP 4Viz process time: {elapsed_seconds}[s]')


UMAP 4Viz process time: 20.92[s]


In [22]:
#Adds Dims to Dataframes
df_sols['UMAP_V1'] = v_2D_umap[:, 0]
df_sols['UMAP_V2'] = v_2D_umap[:, 1]

df_reconstruct.insert(loc=2, column='UMAP_V2', value=v_2D_umap[:, 1])
df_reconstruct.insert(loc=2, column='UMAP_V1', value=v_2D_umap[:, 0])

---
---
## Clustering Solutions

In [23]:
# Display the solutions table (ids, window start date and the two UMAP coordinates).
df_sols

Unnamed: 0,local_id,window_id,window_start_date,UMAP_V1,UMAP_V2
0,799,0,2018-12-22,4.849304,2.319143
1,799,1,2019-03-22,-1.492943,3.546806
2,799,2,2019-06-20,-1.429315,3.775426
3,799,3,2019-09-18,3.234871,1.918851
4,799,4,2019-12-17,3.490935,-3.001745
...,...,...,...,...,...
12512,525,13,2021-06-03,17.192415,0.265117
12513,525,14,2021-09-01,17.281046,1.061741
12514,525,15,2021-11-30,17.704914,0.559145
12515,525,16,2022-02-28,17.343908,0.514863


In [24]:
# Cluster on every encoded (latent) dimension. df_encoded holds only the encoder
# output - the UMAP coordinates are stored in df_sols / df_reconstruct - so the
# previous `[:-2]` slice was silently discarding two latent dimensions.
clustering_cols = df_encoded.columns

### kMeans

In [25]:
def kMeans_cluster(df_en, df_s, cols, k_values=range(2, 10)):
    """Fit one k-Means partition per k and add the labels to a copy of df_s.

    Parameters
    ----------
    df_en : pandas.DataFrame
        Encoded data. Labels are assigned to df_s by position, so its rows
        must be in the same order as the rows of df_s.
    df_s : pandas.DataFrame
        Solutions table. It is not modified; a deep copy is returned.
    cols : list-like
        Columns of df_en used as clustering features.
    k_values : iterable of int, optional
        Numbers of clusters to fit. Defaults to 2..9, the range that was
        previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Copy of df_s with one extra column per k, named 'kMeans_k=<k>',
        holding 1-based cluster labels.
    """
    df_s = df_s.copy(deep=True)
    features = df_en[cols]

    for k in k_values:
        # n_init is pinned to 10 (the long-standing scikit-learn default) so the
        # partitions do not change with the library version's default.
        partition = KMeans(n_clusters=k, n_init=10, random_state=42).fit(features)
        # labels_ are 0-based; shift them so cluster ids start at 1
        df_s[f'kMeans_k={k}'] = partition.labels_ + 1

    return df_s

In [26]:
# Append the k-Means label columns to the solutions table.
df_sols = kMeans_cluster(df_en=df_encoded, df_s=df_sols, cols=clustering_cols)

### HDBSCAN
To Be Done

---
---
## Saving Outputs

### Clustering and UMAP

In [27]:
# Create the exact folder the CSV is written into. Creating only 'ModelResults'
# made to_csv fail whenever the 'ClusteringUmap' sub-folder did not exist yet.
clustering_dir = os.path.join('ModelResults', 'ClusteringUmap')
os.makedirs(clustering_dir, exist_ok=True)
# ':' in the artifact version tag is replaced by '_' for the file name
solution_fileName = solution_name.replace(":", "_")
df_sols.to_csv(os.path.join(clustering_dir, f'{solution_fileName}.csv'), index=False)


### Reconstructions & MSEs

In [28]:
# Create the folder the CSV is actually written into. The previous call created a
# top-level 'Reconstruction' folder, while the file goes to 'ModelResults/Reconstruction'.
reconstruction_dir = os.path.join('ModelResults', 'Reconstruction')
os.makedirs(reconstruction_dir, exist_ok=True)
# ':' in the artifact version tag is replaced by '_' for the file name
reconstruction_fileName = solution_name.replace(":", "_")
df_reconstruct.to_csv(os.path.join(reconstruction_dir, f'{reconstruction_fileName}.csv'), index=False)