# Base Line - Mean Based Windows
### $Time$ $Series$ $4th$ $Test$

$Vasco$ $Mergulhão$ $-$ $June$ $2023$

### Version 2:
Here a baseline is created to compare against the Time Series Clustering methods.  
It reduces each window to its **mean value** and clusters on that value in both the original and UMAP spaces.

The reconstruction error was dropped as it can be calculated on the fly.

In [1]:
import os

In [2]:
# The notebook is considered to run on Gradient when its working directory is '/notebooks'.
directory = os.getcwd()
on_gradient = directory == '/notebooks'

In [3]:
# Set in every environment (presumably to request deterministic cuDNN kernels - confirm).
os.environ['TF_CUDNN_DETERMINISTIC'] = '1'
# Only for runs outside Gradient: cap OpenMP at 4 threads.
if not on_gradient:
    os.environ["OMP_NUM_THREADS"] = "4"

In [4]:
if on_gradient == False:
    import plotly.express as px
    import plotly.graph_objects as go
    import umap
    from umap.umap_ import nearest_neighbors

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.cluster import KMeans
import Transform

In [5]:
# Set Random Seeds
# This seeds NumPy's global RNG only; the nearest-neighbour, UMAP and KMeans calls
# further down are each given random_state=42 explicitly.
np.random.seed(42)

---
# Script Variables

In [6]:
# Dataset identifier: used to look up the window columns and to build the input CSV
# path and the output folder.
dataset_name = 'Kenya_90k_Set_1_w90_pW'
# Largest number of clusters tried; the clustering loops run k = 2 .. max_num_clusters.
max_num_clusters = 25

In [7]:
# Uses name to navigate folders
# window_cols is used later to select the window value columns from the loaded CSV.
window_cols, window_len = Transform.retrive_window_col_names(dataset_name)
if on_gradient == False:
    # Local run: the data folder is the dataset name without its last two '_'-separated tokens.
    dataset_folder = "_".join(dataset_name.split('_')[:-2]) 
    dataset_location = f'../Data_Storage_Processing/Data/{dataset_folder}/{dataset_name}.csv'
else:
    # Gradient run: list the storage folder through an IPython shell escape, then read the CSV from it.
    gradient_mountedfiles = ! ls /storage/data_w90 #! ls /datasets/s3_bucket #!ls /datasets/kenya-90k-set-1-w90
    print(f'Datasets mounted: {gradient_mountedfiles}')
    dataset_location = f'/storage/data_w90/{dataset_name}.csv' # f'/datasets/s3_bucket/{dataset_name}.csv'

---
# Data Imports

In [8]:
# Load the windowed dataset. The file carries a saved index column, which shows up as 'Unnamed: 0'.
Data = pd.read_csv(dataset_location)

In [9]:
# Preview the first rows: ID columns, window start date and the d1..d90 window values.
Data.head()

Unnamed: 0,short_ID,window_ID,window_start_date,d1,d2,d3,d4,d5,d6,d7,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
0,127,0,2018-01-20,0.543805,0.503693,0.46358,0.423467,0.383355,0.343242,0.303129,...,0.906409,0.904819,0.864707,0.786444,0.784481,0.706456,0.666459,0.626384,0.58636,0.583918
1,127,1,2018-04-20,0.525359,0.522565,0.48095,0.439334,0.397718,0.317612,0.314487,...,1.019053,0.977438,0.935822,0.894206,0.852591,0.810975,0.769359,0.727744,0.686128,0.644512
2,127,2,2018-07-19,0.651208,0.606258,0.561307,0.516357,0.471407,0.426456,0.381506,...,1.013146,0.968196,0.923246,0.878295,0.833345,0.788395,0.743444,0.698494,0.653543,0.608593
3,127,3,2018-10-17,0.594276,0.546883,0.499489,0.452096,0.404703,0.357309,0.309916,...,0.499489,0.452096,0.404703,0.357309,0.309916,0.262522,0.215129,0.167736,0.120342,0.072949
4,127,4,2019-01-15,0.953922,0.853922,0.753922,0.653922,0.553922,0.453922,0.353922,...,0.453922,0.353922,0.253922,0.153922,0.753922,0.653922,0.553922,0.453922,0.353922,2.0


In [10]:
# NumPy matrix holding only the window value columns (one row per window); input to UMAP and kMeans.
data = Data[window_cols].to_numpy()

---
# Problematic Libraries

In [11]:
if on_gradient == True:
    ! pip install plotly
    ! pip install umap-learn
    
    import plotly.express as px
    import plotly.graph_objects as go
    import umap
    from umap.umap_ import nearest_neighbors

---
---
# 2D UMAP

In [12]:
# Pre-compute the 30-nearest-neighbour result once so the UMAP fits below can reuse it
# through their precomputed_knn argument.
start_knn_time = time.time()
knn_settings = dict(
    n_neighbors=30,
    metric="euclidean",
    metric_kwds=None,
    angular=False,
    random_state=42,
)
knn_orig_precomp = nearest_neighbors(data, **knn_settings)
knn_elapsed = time.time() - start_knn_time

print(f'Precomp Time: {np.round(knn_elapsed,2)}[s]')

Precomp Time: 103.75[s]


In [13]:
# 2D UMAP embedding of the windows, stored later as the UMAP_V1 / UMAP_V2 columns.
start_time = time.time()
umap_viz_model = umap.UMAP(random_state=42,
                           n_neighbors=15,
                           min_dist=0.1,
                           n_components=2,
                           precomputed_knn=knn_orig_precomp,
                          )

# fit_transform fits the model and returns the embedding of the training rows in one call.
# Calling fit(data) and then transform(data) sends the same rows through the model a second
# time, which adds work without adding information.
v_2D_umap = umap_viz_model.fit_transform(data)

print(f'Total Time: {np.round(time.time() - start_time,2)}[s]')




KeyboardInterrupt: 

In [None]:
# The solutions table starts from the identifying columns of every window.
id_cols = ['short_ID', 'window_ID', 'window_start_date']
df_sols = Data[id_cols].copy()

In [None]:
# Store the two coordinates of the 2D UMAP embedding next to the window identifiers.
for dim, col_name in enumerate(['UMAP_V1', 'UMAP_V2']):
    df_sols[col_name] = v_2D_umap[:, dim]

---
# kMeans on Original Space

In [None]:
# Preview of the clustering input: the mean of each window, kept as a single-column
# 2D array (one row per window, one feature) as sklearn expects.
data.mean(axis=1, keepdims=True)[:5]

In [None]:
print('Clustering on Original Space\n')
start_total_time = time.time()
clust_sol_cols = []

# Each window is reduced to its mean value (single-feature dataset for sklearn).
# This does not depend on k, so it is computed once instead of on every iteration.
window_means = data.mean(axis=1).reshape(-1, 1)

for k in range(2, max_num_clusters + 1):
    start_ind_sol_time = time.time()
    print(f'Processing kMeans for k={k}')
    # n_init is pinned to 10 (the long-standing default) so results do not shift with the
    # scikit-learn version and no FutureWarning is raised about the changing default.
    partition = KMeans(n_clusters=k, n_init=10, random_state=42).fit(window_means)
    sol_name = f'kMeans_k{k}'
    clust_sol_cols.append(sol_name)
    # Cluster labels are stored 1-based.
    df_sols[sol_name] = partition.labels_ + 1
    print(f'Time: {np.round(time.time() - start_ind_sol_time,2)}[s]\n')

print(f'Total Time: {np.round(time.time() - start_total_time,2)}[s]')

In [None]:
# Check the ID columns, the 2D UMAP coordinates and the kMeans label columns added so far.
df_sols.head()

---
# KUMAP

In [None]:
print("Clustering on UMAP'ed Space\n")
start_total_time = time.time()
for k in range(2, max_num_clusters + 1):
    print(f'Processing for k={k}')
    # UMAP to a space whose dimensionality matches k, with cluster-friendly settings
    start_ind_UMAP_time = time.time()
    print('Working UMAP')
    # Settings from https://umap-learn.readthedocs.io/en/latest/clustering.html
    # General idea is larger n_neighbors to capture wider relationships, and smaller min_dist to keep points closer (better for density alg)
    umap_clust_model = umap.UMAP(random_state=42,
                                 n_neighbors=30,
                                 min_dist=0,
                                 n_components=k,
                                 precomputed_knn=knn_orig_precomp,
                                )
    # fit_transform returns the embedding of the training rows directly; a separate
    # transform(data) after fit(data) would process the same rows a second time.
    clust_umap = umap_clust_model.fit_transform(data)

    print(f'Time: {np.round(time.time() - start_ind_UMAP_time,2)}[s]')

    start_ind_sol_time = time.time()
    print('Working kMeans')
    # Here we cluster on the window Mean in the transformed UMAP space
    # (mean over the k UMAP components of each window, as a single feature).
    # n_init is pinned to 10 so results do not depend on the scikit-learn version's default.
    partition = KMeans(n_clusters=k, n_init=10, random_state=42).fit(clust_umap.mean(axis=1).reshape(-1, 1))
    sol_name = f'KUMAP_k{k}'
    # Guard against duplicate names when this cell is re-run without resetting the list.
    if sol_name not in clust_sol_cols:
        clust_sol_cols.append(sol_name)
    # Cluster labels are stored 1-based.
    df_sols[sol_name] = partition.labels_ + 1
    print(f'Time: {np.round(time.time() - start_ind_sol_time,2)}[s]\n')

print(f'Total Time: {np.round(time.time() - start_total_time,2)}[s]')

---
---
# Saving Outputs

### Solutions

In [None]:
# Final look at the solutions table before it is written to disk.
df_sols.head()

In [None]:
# The mean-based baseline is always saved under its own file name.
solution_fileName = 'Baseline-Means.csv'

if not on_gradient:
    # Local run: store the file in the per-dataset solutions folder, creating it if needed.
    solutions_dir = f'../ModelResults/Clustering/Solutions/{dataset_name}'
    os.makedirs(solutions_dir, exist_ok=True)
    df_sols.to_csv(f'{solutions_dir}/{solution_fileName}', index=False)
else:
    # Gradient run: write to the working directory. The baseline keeps its own name here
    # too, so the file cannot be mistaken for (or overwrite) the shallow kMeans solutions.
    df_sols.to_csv(solution_fileName, index=False)