# W&B Sweep Launcher
### $Time$ $Series$ $4th$ $Test$

$Vasco$ $Mergulhão$ $-$ $March$ $2023$

### Version 1:
 - Applies Weights and Biases Sweeps on Full Sample (i.e., 90k per country).
 - Imports Custom Functions and Networks


---
# Python Libraries & Custom Functions

In [1]:
# Library scripts
from networks import ann_train, fc_n2d, cnn_AE, cnn_ConvEmb, lstm_ae
import Transform

In [2]:
# Standard Libraries
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GroupShuffleSplit
import wandb
from wandb.keras import WandbCallback

In [3]:
# Seed TensorFlow, NumPy and Python's built-in `random` with one shared value
# so that repeated runs of the notebook draw the same random numbers.
seed = 42
for seed_rng in (tf.random.set_seed, np.random.seed, random.seed):
    seed_rng(seed)

---
# Gradient Checks

In [4]:
on_gradient = False
# enable memory growth for gpu devices
# source: https://stackoverflow.com/a/55541385/8849692
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    on_gradient = True
    for device in gpu_devices:
        tf.config.experimental.set_memory_growth(device, True)

if on_gradient:
    print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
    gradient_mountedfiles = ! ls /datasets/s3_bucket #!ls /datasets/kenya-90k-set-1-w90
    print(f'Datasets mounted: {gradient_mountedfiles}')
else:
    print('No GPUs. On local Machine.')


No GPUs. On local Machine.


---
# Script Variables

In [5]:
# Dataset used for the Sweep. The name is split on "_" further down: the last
# field selects the scaler label and the remaining fields name the W&B project.
dataset_name = 'Kenya_90k_Set_1_w90_pW'
# Autoencoder definition to sweep (imported from `networks`); it is used below
# through its `.model(...)` and `.sweep_config(...)` callables.
AE_Model = fc_n2d # Define Model Here
# Latent Layer Size, passed to both the model builder and the sweep configuration
latent_layer_size = 25
# Number of Sweeps: forwarded as `count` to the W&B agent
sweep_count = 5

In [6]:
# Values derived from the Script Variables

# Window column names and window length for the chosen dataset
window_cols, window_len = Transform.retrive_window_col_names(dataset_name)

# File location: mounted bucket when on Gradient, project data folder otherwise
if on_gradient:
    # (an earlier mount point was /datasets/kenya-90k-set-1-w90/)
    dataset_location = f'/datasets/s3_bucket/{dataset_name}.csv'
else:
    # The local folder is the dataset name without its last two "_" fields
    dataset_folder = "_".join(dataset_name.split('_')[:-2])
    dataset_location = f'../Data_Storage_Processing/Data/{dataset_folder}/{dataset_name}.csv'

# Model Name and Variables
# A model built with default arguments, used here only to read the architecture name
ae_model = AE_Model.model()
AE_Model_Name = ae_model.name

# Sweep Names and Configurations
# Maps the last "_" field of the dataset name to the scaler label used in the
# W&B project name.
# NOTE(review): 'Gobal' looks like a typo for 'Global'; it is left unchanged
# because the value becomes part of the W&B project name.
scaler_dict = {'Orig':'OriginalScale',
               'pW':'pWindow',
               'G':'Gobal'}
dataset_name_base = "_".join(dataset_name.split('_')[:-1])
scaler_used = dataset_name.split('_')[-1]
if scaler_used not in scaler_dict:
    # Fail early with a readable message instead of a bare KeyError below
    raise KeyError(f"Unknown scaler suffix '{scaler_used}' in dataset_name "
                   f"'{dataset_name}'; expected one of {sorted(scaler_dict)}")

Project_Name = f'DeepClust--{dataset_name_base}--{scaler_dict[scaler_used]}'
Sweep_Config = f'{AE_Model_Name}_sweepconfig'


In [7]:
Sweep_Config

'FC_N2D_sweepconfig'

In [8]:
# Sweep configuration for the selected architecture (displayed in the next cell)
sweep_config = AE_Model.sweep_config(name=Sweep_Config, window_len=window_len, latent_layer_size=latent_layer_size)
# Model instance built with the chosen window length and latent size so its
# summary can be inspected; sweep_train builds its own model for every run.
ann_network = AE_Model.model(window_length = window_len, latent_layer_size = latent_layer_size)

In [9]:
sweep_config

{'method': 'random',
 'name': 'FC_N2D_sweepconfig',
 'metric': {'name': 'mse', 'goal': 'minimize'},
 'parameters': {'optimizer': {'values': ['nadam']},
  'latent_layer_size': {'value': 25},
  'epochs': {'value': 100},
  'window_length': {'value': 90},
  'activation_fn': {'values': ['LeakyReLU']},
  'learning_rate': {'distribution': 'log_uniform_values',
   'min': 1e-05,
   'max': 0.0001},
  'batch_size': {'distribution': 'q_log_uniform_values',
   'q': 2,
   'min': 100,
   'max': 300}}}

In [10]:
ann_network.summary()

Model: "FC_N2D"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 90)]              0         
                                                                 
 l1_enc (Dense)              (None, 500)               45500     
                                                                 
 l2_enc (Dense)              (None, 500)               250500    
                                                                 
 l3_enc (Dense)              (None, 2000)              1002000   
                                                                 
 Lantent_Space (Dense)       (None, 25)                50025     
                                                                 
 l1_dec (Dense)              (None, 2000)              52000     
                                                                 
 l2_dec (Dense)              (None, 500)               10005

---
# Data Imports

In [11]:
# Load the dataset CSV from the location resolved above (local folder or mounted bucket)
Data = pd.read_csv(dataset_location)

In [12]:
Data.head()

Unnamed: 0,short_ID,window_ID,window_start_date,d1,d2,d3,d4,d5,d6,d7,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
0,127,0,2018-01-20,0.543805,0.503693,0.46358,0.423467,0.383355,0.343242,0.303129,...,0.906409,0.904819,0.864707,0.786444,0.784481,0.706456,0.666459,0.626384,0.58636,0.583918
1,127,1,2018-04-20,0.525359,0.522565,0.48095,0.439334,0.397718,0.317612,0.314487,...,1.019053,0.977438,0.935822,0.894206,0.852591,0.810975,0.769359,0.727744,0.686128,0.644512
2,127,2,2018-07-19,0.651208,0.606258,0.561307,0.516357,0.471407,0.426456,0.381506,...,1.013146,0.968196,0.923246,0.878295,0.833345,0.788395,0.743444,0.698494,0.653543,0.608593
3,127,3,2018-10-17,0.594276,0.546883,0.499489,0.452096,0.404703,0.357309,0.309916,...,0.499489,0.452096,0.404703,0.357309,0.309916,0.262522,0.215129,0.167736,0.120342,0.072949
4,127,4,2019-01-15,0.953922,0.853922,0.753922,0.653922,0.553922,0.453922,0.353922,...,0.453922,0.353922,0.253922,0.153922,0.753922,0.653922,0.553922,0.453922,0.353922,2.0


---
# Pre-Processing

---
## Train-Validation Split

### Dimitrios Sphatis Suggestion
Make sure the same IDs do not appear in both the train and the validation (test) sets.<br>
This will reduce test accuracy, but increase generalisability.

In [13]:
# Group-aware split, as suggested by Dimitrios:
# https://stackoverflow.com/questions/44007496/random-sampling-with-pandas-data-frame-disjoint-groups
# Rows are grouped by short_ID, so an ID never ends up on both sides of the split.
gss = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state= seed)

# Only one split is requested, so take the first pair of positional indexers.
idxTrain, idxTest = next(gss.split(Data, groups=Data.short_ID))

# Slice the DataFrame by position into its training and validation parts.
TrainData = Data.iloc[idxTrain]
ValidData = Data.iloc[idxTest]

# Sanity check: the Train and Valid IDs must be separate.
assert set(TrainData['short_ID'].unique()).isdisjoint(ValidData['short_ID'].unique())

# Keep only the window columns and convert them to NumPy arrays.
x_train = TrainData[window_cols].to_numpy()
x_valid = ValidData[window_cols].to_numpy()

In [14]:
x_train.shape

(741695, 90)

In [15]:
x_valid.shape

(82967, 90)

---
---
# WandB Sweep Log in
https://github.com/wandb/examples/blob/master/colabs/keras/Keras_param_opti_using_sweeps.ipynb


In [16]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mvasco-mergulhao[0m ([33mvasco-phd[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

# Sweep & Train Functions

In [17]:
def train(model, batch_size=32, epochs=100, lr=0.001, optimizer='nadam'):
    """Compile `model` and fit it as an autoencoder on the notebook's data.

    The notebook-level arrays `x_train` and `x_valid` are used as both input
    and target, so the model is trained to reconstruct its input. Training
    metrics are logged to W&B through `WandbCallback`.

    Parameters
    ----------
    model : Keras model
        Model to compile and fit; it is modified in place.
    batch_size : int
        Batch size passed to `model.fit`.
    epochs : int
        Maximum number of epochs; early stopping may end training sooner.
    lr : float
        Learning rate handed to `ann_train.get_optimizer`.
    optimizer : str
        Optimizer name handed to `ann_train.get_optimizer`.

    Returns
    -------
    History
        The object returned by `model.fit`, so callers can inspect the
        per-epoch losses and metrics. Existing callers may ignore it.
    """
    # NOTE(review): the session is cleared after `model` was already built by
    # the caller; confirm this ordering is intended.
    tf.keras.backend.clear_session()
    model.compile(loss="mse",
                  optimizer=ann_train.get_optimizer(lr, optimizer),
                  metrics=["mse", tf.keras.metrics.MeanAbsoluteError()])

    # Stop after 10 epochs without improvement and roll back to the best weights.
    early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    # Halve the learning rate after 5 epochs without improvement.
    # (An exponential-decay schedule via ann_train.exponential_decay was the
    # alternative considered here.)
    lr_scheduler_cb = tf.keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)

    return model.fit(x_train,
                     x_train,
                     batch_size=batch_size,
                     epochs=epochs,
                     validation_data=(x_valid, x_valid),
                     callbacks=[WandbCallback(), early_stopping_cb, lr_scheduler_cb])
    
    

In [18]:
def sweep_train(config_defaults=None):
    """Run a single W&B trial: open a run, build the network and train it.

    `config_defaults` is forwarded to `wandb.init` as the initial run
    configuration; when launched by a sweep agent these values are overwritten
    by the sweep. The run is closed when the `with` block exits.
    """
    with wandb.init(config=config_defaults):
        cfg = wandb.config

        # Record which architecture and dataset produced this run.
        cfg.architecture_name = AE_Model_Name
        cfg.dataset_name = dataset_name

        # Build a fresh network from this run's hyperparameters.
        trial_model = AE_Model.model(window_length=cfg.window_length,
                                     latent_layer_size=cfg.latent_layer_size,
                                     activation_fn=cfg.activation_fn)

        train(trial_model,
              batch_size=cfg.batch_size,
              epochs=cfg.epochs,
              lr=cfg.learning_rate,
              optimizer=cfg.optimizer)
 


---
---
# Run Sweep

In [19]:
# Register the sweep configuration with W&B under the project named after the
# dataset and scaler; the returned id identifies the sweep for the agent.
sweep_id = wandb.sweep(sweep_config, project = Project_Name)

Create sweep with ID: w1ohw6vt
Sweep URL: https://wandb.ai/vasco-phd/DeepClust--Kenya_90k_Set_1_w90--pWindow/sweeps/w1ohw6vt


In [20]:
# Start an agent in this kernel that calls sweep_train for each trial,
# stopping after sweep_count runs.
wandb.agent(sweep_id, function=sweep_train, count= sweep_count)

[34m[1mwandb[0m: Agent Starting Run: b2mbg7nt with config:
[34m[1mwandb[0m: 	activation_fn: LeakyReLU
[34m[1mwandb[0m: 	batch_size: 180
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	latent_layer_size: 25
[34m[1mwandb[0m: 	learning_rate: 1.1295759251618729e-05
[34m[1mwandb[0m: 	optimizer: nadam
[34m[1mwandb[0m: 	window_length: 90




Epoch 1/100



INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets


INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets
[34m[1mwandb[0m: Adding directory to artifact (C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best)... Done. 0.3s


Epoch 2/100



INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets


INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets
[34m[1mwandb[0m: Adding directory to artifact (C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best)... Done. 0.3s


Epoch 3/100
Epoch 4/100
Epoch 5/100



INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets


INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets
[34m[1mwandb[0m: Adding directory to artifact (C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best)... Done. 0.3s


Epoch 6/100



INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets


INFO:tensorflow:Assets written to: C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best\assets
[34m[1mwandb[0m: Adding directory to artifact (C:\Users\ucesvpm\OneDrive - University College London\PhD Project\Data Analytics\Time Series Clustering\Time Series Clustering\SHS-DeepClustering\wandb\run-20230331_162357-b2mbg7nt\files\model-best)... Done. 0.3s


Epoch 7/100

0,1
epoch,▁▂▄▅▇█
loss,█▂▂▁▁▁
mean_absolute_error,█▅▄▃▁▁
mse,█▂▂▁▁▁
val_loss,▃▃█▄▁▁
val_mean_absolute_error,█▆█▄▁▁
val_mse,▃▃█▄▁▁

0,1
best_epoch,5.0
best_val_loss,6006.46875
epoch,5.0
loss,2103.87988
mean_absolute_error,1.1343
mse,2103.87988
val_loss,6006.46875
val_mean_absolute_error,1.31232
val_mse,6006.46875


In [21]:
wandb.finish()