# Clustering Solution Generator
### $Time$ $Series$ $4th$ $Test$

$Vasco$ $Mergulhão$ $-$ $April$ $2023$

### Version 1:
This script loads a model and outputs a CSV ready to be analysed on the Dashboard

In [85]:
import os  

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import groupby
from datetime import timedelta, date
import plotly.graph_objects as go
import scipy
import math
from sklearn.cluster import KMeans

import random

import time
import datetime

import umap # UMAP library is responsible for ipywidgets warning!

import tensorflow as tf
from tensorflow import keras
from keras import models

import wandb
from wandb.keras import WandbCallback

import Transform

In [2]:
# Set Random Seeds
# Seeds every random number generator used directly in this notebook
# (Python's `random`, NumPy and TensorFlow) with the same fixed value.
# NOTE(review): TF_CUDNN_DETERMINISTIC is presumably meant to request
# deterministic cuDNN kernels - confirm the installed TF version honours it.
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
random.seed(42)
np.random.seed(42)
tf.random.set_seed(42)

---
# Script Variables

In [79]:
# MANUAL INPUTS
# MAKE SURE THEY ARE CORRECT
# Everything else is automated
# dataset_name is split on '_' further down: the last token selects the scaler
# and the leading tokens give the dataset base name and data folder.
dataset_name = 'Kenya_90k_Set_1_w90_pW'
# Model_Name decides whether the windows are fed as a flat matrix (FC_N2D) or
# with an extra trailing axis (all other options).
Model_Name = 'LSTM_AE' # Options: FC_N2D, LSTM_AE, CNN_AE, CNN_ConvEmb
# Appended to 'model-' to form the W&B artifact name
# (presumably '<run name>:<version>' - verify on the W&B Artifacts page).
solution_name = 'efficient-sweep-2:v49'

# This loads model by name.
# Identify the best model by navigating the WandB Sweeps page.
# Find link to best model (and version)  in Artifacts -> Usage, and copy directory.
# .pb format is only accepted by Tensorflow, NOT Keras.
##############################################################################



In [80]:
# Uses name to navigate folders
# Transform.retrive_window_col_names is a project helper; it is unpacked into
# the window column names and the window length (presumably d1..dN and N).
window_cols, window_len = Transform.retrive_window_col_names(dataset_name)
# Folder = dataset name without its last two '_' tokens,
# e.g. 'Kenya_90k_Set_1_w90_pW' -> 'Kenya_90k_Set_1'.
dataset_folder = "_".join(dataset_name.split('_')[:-2]) 
dataset_location = f'../Data_Storage_Processing/Data/{dataset_folder}/{dataset_name}.csv'


# Project Name
# Maps the scaler suffix of dataset_name to the label used in the W&B project name.
# NOTE(review): 'Gobal' looks like a typo for 'Global', but the value is part of
# Project_Name, which is used to locate the model artifact - confirm against the
# existing W&B project name before changing it.
scaler_dict = {'Orig':'OriginalScale',
               'pW':'pWindow',
               'G':'Gobal'}
# Base = dataset name without its scaler suffix, e.g. 'Kenya_90k_Set_1_w90'.
dataset_name_base = "_".join(dataset_name.split('_')[:-1])
# Scaler suffix = last '_' token, e.g. 'pW'. Raises KeyError below if it is not
# a key of scaler_dict.
scaler_used = dataset_name.split('_')[-1]

Project_Name = f'DeepClust--{dataset_name_base}--{scaler_dict[scaler_used]}'

---
# Data Imports

In [75]:
Data = pd.read_csv(dataset_location)

In [76]:
Data.head()

Unnamed: 0,short_ID,window_ID,window_start_date,d1,d2,d3,d4,d5,d6,d7,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
0,127,0,2018-01-20,0.543805,0.503693,0.46358,0.423467,0.383355,0.343242,0.303129,...,0.906409,0.904819,0.864707,0.786444,0.784481,0.706456,0.666459,0.626384,0.58636,0.583918
1,127,1,2018-04-20,0.525359,0.522565,0.48095,0.439334,0.397718,0.317612,0.314487,...,1.019053,0.977438,0.935822,0.894206,0.852591,0.810975,0.769359,0.727744,0.686128,0.644512
2,127,2,2018-07-19,0.651208,0.606258,0.561307,0.516357,0.471407,0.426456,0.381506,...,1.013146,0.968196,0.923246,0.878295,0.833345,0.788395,0.743444,0.698494,0.653543,0.608593
3,127,3,2018-10-17,0.594276,0.546883,0.499489,0.452096,0.404703,0.357309,0.309916,...,0.499489,0.452096,0.404703,0.357309,0.309916,0.262522,0.215129,0.167736,0.120342,0.072949
4,127,4,2019-01-15,0.953922,0.853922,0.753922,0.653922,0.553922,0.453922,0.353922,...,0.453922,0.353922,0.253922,0.153922,0.753922,0.653922,0.553922,0.453922,0.353922,2.0


## Reshaping Data

In [81]:
# Start from the flat (rows, window_len) matrix of window values; every model
# except FC_N2D additionally gets a trailing axis of size 1.
predict_data = Data[window_cols].to_numpy()
if Model_Name != 'FC_N2D':
    predict_data = predict_data.reshape(-1, window_len, 1)

In [82]:
predict_data.shape

(824662, 90, 1)

---
---
# Loading Model

In [12]:
# Download the selected model artifact from W&B and load it as a Keras model.
run = wandb.init()
try:
    model_artifact = run.use_artifact(
        f'vasco-phd/{Project_Name}/model-{solution_name}', type='model')
    model_dir = model_artifact.download()
    # To load a locally stored model instead, pass its directory to load_model.
    loaded_autoencoder = tf.keras.models.load_model(model_dir)
finally:
    # Close the run even when the download or the load fails, so no W&B run
    # is left open behind a failed cell.
    run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mvasco-mergulhao[0m ([33mvasco-phd[0m). Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [49]:
loaded_autoencoder.summary()

Model: "LSTM_AE"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 90, 1)]           0         
                                                                 
 Encoder (LSTM)              (None, 90, 100)           40800     
                                                                 
 Lantent_Space (LSTM)        (None, 25)                12600     
                                                                 
 Reshape_Embeding (RepeatVec  (None, 90, 25)           0         
 tor)                                                            
                                                                 
 Decoder_1 (LSTM)            (None, 90, 25)            5100      
                                                                 
 Decoder_2 (LSTM)            (None, 90, 100)           50400     
                                                           

In [84]:
# bottleneck model
# Builds an encoder-only model that shares the autoencoder's input and stops at
# the bottleneck layer. 'Lantent_Space' (sic) is the layer name stored in the
# loaded model, as shown by its summary, so the spelling must not be corrected here.
encoder_output = loaded_autoencoder.get_layer('Lantent_Space').output
encoder = models.Model(inputs = loaded_autoencoder.input, outputs = encoder_output, name = f'{Model_Name}_encoder')
encoder.summary()

Model: "LSTM_AE_encoder"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 90, 1)]           0         
                                                                 
 Encoder (LSTM)              (None, 90, 100)           40800     
                                                                 
 Lantent_Space (LSTM)        (None, 25)                12600     
                                                                 
Total params: 53,400
Trainable params: 53,400
Non-trainable params: 0
_________________________________________________________________


---
#  Reconstruction Error

In [95]:
# Calculates the Reconstruction Profiles (Predictions)
# i.e. runs every input window through the full autoencoder.
reconstruct_data = loaded_autoencoder.predict(predict_data)



In [105]:
# Stores the reconstructed windows next to their identifiers.
# FC_N2D output is written as-is; every other model's output is flattened
# back to one row per window first.
df_reconstruct = Data[['short_ID', 'window_ID']].copy(deep=True)
df_reconstruct[window_cols] = (
    reconstruct_data
    if Model_Name == 'FC_N2D'
    else reconstruct_data.reshape(-1, window_len)
)

In [112]:
# Mean squared error per window; reduction='none' keeps one value per row
# instead of averaging over the whole dataset.
MSE = tf.keras.losses.MeanSquaredError(reduction='none')
if Model_Name != 'FC_N2D':
    MSE_values = MSE(
        predict_data.reshape(-1, window_len),
        reconstruct_data.reshape(-1, window_len),
    ).numpy()
else:
    MSE_values = MSE(predict_data, reconstruct_data).numpy()
# Put the error in the third column of the reconstruction frame.
df_reconstruct.insert(loc=2, column='MSE', value=MSE_values)

In [113]:
df_reconstruct.head()

Unnamed: 0,short_ID,window_ID,MSE,d1,d2,d3,d4,d5,d6,d7,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
0,127,0,0.036224,0.551866,0.511525,0.418921,0.39357,0.376943,0.354484,0.34285,...,0.918718,0.906462,0.890662,0.858993,0.806989,0.740205,0.668449,0.601423,0.551958,0.541089
1,127,1,0.001781,0.511137,0.535627,0.456171,0.43078,0.399154,0.356051,0.327404,...,1.020759,0.99579,0.969248,0.935301,0.888822,0.829687,0.764942,0.704779,0.660446,0.643842
2,127,2,0.008751,0.540324,0.648341,0.572467,0.494882,0.453017,0.394184,0.348231,...,0.935303,0.903605,0.875646,0.850747,0.823421,0.788953,0.749272,0.711617,0.685288,0.67995
3,127,3,0.003819,0.503273,0.541258,0.460892,0.437429,0.392647,0.366842,0.359645,...,0.443012,0.411199,0.390286,0.376721,0.359625,0.32753,0.277821,0.214182,0.147162,0.123559
4,127,4,0.057101,1.141542,0.830329,0.652757,0.551421,0.494866,0.487153,0.502427,...,0.576964,0.515441,0.45345,0.377552,0.284933,0.226393,0.501658,1.00643,0.971729,0.888564


---
---
# 2D UMAP

In [86]:
def UMAP_funct(data, dims=2, mode='visualisation'):
    """Project ``data`` with UMAP and return the embedding.

    Makes it easy to switch between UMAP settings tuned for visualisation
    and settings tuned for clustering.

    Args:
        data: Samples to embed; passed unchanged to ``umap.UMAP.fit_transform``.
        dims: Number of output dimensions (UMAP ``n_components``).
        mode: ``'visualisation'`` keeps UMAP's default neighbourhood settings;
            ``'clustering'`` uses ``n_neighbors=30`` and ``min_dist=0.0``.

    Returns:
        The embedding returned by ``fit_transform``.

    Raises:
        ValueError: If ``mode`` is not ``'visualisation'`` or ``'clustering'``.
            Previously this case printed a message and returned ``None``,
            which only failed later when the result was indexed.
    """
    # Progress label and UMAP overrides for each supported mode.
    # Clustering settings from https://umap-learn.readthedocs.io/en/latest/clustering.html
    # General idea is larger n_neighbors to capture wider relationships, and
    # smaller min_dist to keep points closer (better for density algorithms).
    mode_settings = {
        'visualisation': ('Viz', {}),
        'clustering': ('Clust', {'n_neighbors': 30, 'min_dist': 0.0}),
    }
    if mode not in mode_settings:
        raise ValueError(
            f'UMAP mode {mode!r} NOT recognized; '
            f'expected one of {sorted(mode_settings)}.'
        )
    label, overrides = mode_settings[mode]

    start_time = time.time()
    print(f'Processing UMAP {dims}D-{label}')
    umap_array = umap.UMAP(random_state=42,
                           n_components=dims,
                           **overrides).fit_transform(data)
    print(f'Time: {np.round(time.time() - start_time,2)}[s]')

    return umap_array

In [68]:
# Calculating Latent Space Projection
encoded_data = encoder.predict(predict_data)



In [90]:
 # encoded_data = encoded_data.numpy()
df_encoded = pd.DataFrame(encoded_data)

In [88]:
#Calcs UMAP for Visual Purposes  
v_2D_umap = UMAP_funct (encoded_data, dims = 2, mode = 'visualisation')

Processing UMAP 2D-Viz
Time: 1336.47[s]


In [116]:
# Creates DF for Clustering Solutions
# Drop the window columns first and copy only what is left, so the large block
# of window values is never duplicated; the trailing copy() keeps df_sols
# independent of Data.
df_sols = Data.drop(columns=window_cols).copy()

In [117]:
#Adds Dims to Dataframes
# The two visualisation UMAP components are appended to the solutions frame.
df_sols['UMAP_V1'] = v_2D_umap[:, 0]
df_sols['UMAP_V2'] = v_2D_umap[:, 1]

# Both inserts target position 2, so V2 goes in first and V1 second: the later
# insert pushes the earlier one to the right, giving the order UMAP_V1, UMAP_V2.
df_reconstruct.insert(loc=2, column='UMAP_V2', value=v_2D_umap[:, 1])
df_reconstruct.insert(loc=2, column='UMAP_V1', value=v_2D_umap[:, 0])

---
---
# Clustering Solutions

### kMeans

In [118]:
def kMeans_cluster(Encoded_Data, UMAP_mode = False):
    """Run k-Means for k = 2..10 and return one label column per k.

    Args:
        Encoded_Data: Two-dimensional table of encoded samples, one row per
            sample (DataFrame or array).
        UMAP_mode: When truthy, follow the N2D-paper recommendation: for every
            k smaller than the latent width, first reduce the encoded data to
            k dimensions with ``UMAP_funct(..., mode='clustering')`` and
            cluster that projection. When k is greater than or equal to the
            latent width no UMAP is needed and the encoded data is clustered
            directly. When falsy, the encoded data is always clustered directly.

    Returns:
        DataFrame holding only the solution columns, named ``kMeans_k=<k>``,
        with 1-based cluster labels.
    """
    # Output DF, only has solution columns
    df = pd.DataFrame()
    start_time = time.time()

    if UMAP_mode:
        print('Clustering on UMAP of Encoded Space')
        # The latent width is the number of columns of the encoded data; it is
        # read from the input rather than from an external global.
        latent_layer_size = np.shape(Encoded_Data)[1]
    else:
        print('Clustering on Encoded Space')
        latent_layer_size = None

    for k in range(2, 11):
        if UMAP_mode and k < latent_layer_size:
            # UMAP down to dims == k with cluster-friendly settings
            features = UMAP_funct(Encoded_Data, dims=k, mode='clustering')
            print(f'Processing kMeans for k={k}\n')
        else:
            if UMAP_mode:
                print(f'k={k} is >= to Latent Space:{latent_layer_size}')
                print(f'Processing kMeans for k={k}')
            features = Encoded_Data

        partition = KMeans(n_clusters=k, random_state=42).fit(features)
        # Name the column after the requested k, not the largest label seen,
        # so the name stays correct even if a cluster ends up empty.
        df[f'kMeans_k={k}'] = partition.labels_ + 1

    print(f'Total Time: {np.round(time.time() - start_time,2)}[s]')
    return df

In [119]:
kMeans_sols = kMeans_cluster(df_encoded, UMAP_mode = False)

Clustering on Encoded Space
Total Time: 115.7[s]


In [120]:
df_sols = pd.concat([df_sols, kMeans_sols], axis=1)

### HDBSCAN
TBC

---
---
# Saving Outputs

### Clustering Solutions

In [121]:
df_sols.head()

Unnamed: 0,short_ID,window_ID,window_start_date,UMAP_V1,UMAP_V2,kMeans_k=2,kMeans_k=3,kMeans_k=4,kMeans_k=5,kMeans_k=6,kMeans_k=7,kMeans_k=8,kMeans_k=9,kMeans_k=10
0,127,0,2018-01-20,-1.518637,3.296972,1,3,1,1,4,1,6,7,1
1,127,1,2018-04-20,-1.484561,3.080095,1,3,1,1,4,1,6,7,1
2,127,2,2018-07-19,-0.214835,3.931988,1,3,1,1,4,1,6,7,1
3,127,3,2018-10-17,-0.444263,3.664465,1,3,1,1,4,1,6,7,1
4,127,4,2019-01-15,-0.634308,4.234664,1,3,1,1,4,1,6,7,1


In [134]:
# Write the clustering solutions to a per-dataset folder. The file name is the
# model name plus the solution name with ':' and '-' both turned into '_'.
clustering_dir = f'../ModelResults/Clustering/{dataset_name}'
os.makedirs(clustering_dir, exist_ok=True)
solution_fileName = f"{Model_Name}-{solution_name.replace(':', '-').replace('-', '_')}.csv"
df_sols.to_csv(f'{clustering_dir}/{solution_fileName}', index=False)


In [141]:
solution_fileName

'LSTM_AE-efficient_sweep_2_v49.csv'

[34m[1mwandb[0m: While tearing down the service manager. The following error has occured: [WinError 10054] An existing connection was forcibly closed by the remote host


### Auto-Encoder Reconstruction

In [135]:
df_reconstruct.head()

Unnamed: 0,short_ID,window_ID,UMAP_V1,UMAP_V2,MSE,d1,d2,d3,d4,d5,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
0,127,0,-1.518637,3.296972,0.036224,0.551866,0.511525,0.418921,0.39357,0.376943,...,0.918718,0.906462,0.890662,0.858993,0.806989,0.740205,0.668449,0.601423,0.551958,0.541089
1,127,1,-1.484561,3.080095,0.001781,0.511137,0.535627,0.456171,0.43078,0.399154,...,1.020759,0.99579,0.969248,0.935301,0.888822,0.829687,0.764942,0.704779,0.660446,0.643842
2,127,2,-0.214835,3.931988,0.008751,0.540324,0.648341,0.572467,0.494882,0.453017,...,0.935303,0.903605,0.875646,0.850747,0.823421,0.788953,0.749272,0.711617,0.685288,0.67995
3,127,3,-0.444263,3.664465,0.003819,0.503273,0.541258,0.460892,0.437429,0.392647,...,0.443012,0.411199,0.390286,0.376721,0.359625,0.32753,0.277821,0.214182,0.147162,0.123559
4,127,4,-0.634308,4.234664,0.057101,1.141542,0.830329,0.652757,0.551421,0.494866,...,0.576964,0.515441,0.45345,0.377552,0.284933,0.226393,0.501658,1.00643,0.971729,0.888564


In [136]:
# Write the reconstructed windows to a per-dataset folder, reusing the file
# name built for the clustering solutions.
reconstruction_dir = f'../ModelResults/AE_Reconstruction/{dataset_name}'
os.makedirs(reconstruction_dir, exist_ok=True)
df_reconstruct.to_csv(f'{reconstruction_dir}/{solution_fileName}', index=False)