# W&B Sweep Launcher
### $Time$ $Series$ $4th$ $Test$

$Vasco$ $Mergulhão$ $-$ $March$ $2023$

### Version 1:
 - Applies Weights and Biases Sweeps on Full Sample (i.e., 90k per country).
 - Imports Custom Functions and Networks


### ANN Configurations:

- #### Architecture(s)
    - Fully Connected Auto Encoder
        - Small (Input, 200, 200, LatDim)
        - N2D [other papers orig ref] (Input, 500, 500, 2000, LatDim)
    
- #### Hyperparameters (To Be Updated)
    - Latent Space Size
    - Batch Size
        - Small test [2 - 32] and Large test [128 - 256]
    - Learning Rate
    - Learning Rate Scheduler
        - Performance Scheduling
    - Activation Functions
        - SELU and Leaky ReLU
    - Initializations
        - LeCun and He (accordingly)
    - Batch Normalization
        - With/Without tests (note: if data is not z-scored, SELU not worth it, downgrade to ELU)
    - Optimizers
        - Nadam and SGD (momentum [0.9], Nesterov)
    - Epochs
        - 100 with Early Stopping
 

---
# Python Libraries & Custom Functions

In [4]:
# Library scripts
import Transform
from networks import ann_train, fc_small, fc_n2d

In [1]:
# Standard Libraries
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GroupShuffleSplit
import wandb
from wandb.keras import WandbCallback

In [2]:
# One shared seed for every RNG in play (TensorFlow, NumPy, Python's `random`)
# so that repeated runs of the notebook are reproducible.
seed = 42
for _seed_fn in (tf.random.set_seed, np.random.seed, random.seed):
    _seed_fn(seed)

# Gradient Checks

In [28]:
import os

# Flag read by later cells to choose between the mounted dataset and the
# local copy. A visible GPU is taken to mean the notebook runs on Gradient.
on_gradient = False
gradient_dataset_dir = '/datasets/kenya-90k-set-1-w90'

# Enable memory growth for GPU devices
# source: https://stackoverflow.com/a/55541385/8849692
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    on_gradient = True
    for device in gpu_devices:
        tf.config.experimental.set_memory_growth(device, True)

if on_gradient:
    print("Num GPUs Available: ", len(gpu_devices))
    # Plain-Python replacement for the IPython-only `!ls` shell escape, so the
    # cell also runs when the notebook is exported as a script. A missing
    # mount yields an empty listing instead of an exception.
    if os.path.isdir(gradient_dataset_dir):
        gradient_mountedfiles = sorted(os.listdir(gradient_dataset_dir))
    else:
        gradient_mountedfiles = []
    print(f'Datasets mounted: {gradient_mountedfiles}')
else:
    print('No GPUs. On local Machine.')


No GPUs. On local Machine.


---
# Script Variables

In [29]:
# Dataset the sweep runs on
dataset_name = 'Kenya_90k_Set_1_w90'

if on_gradient:
    # On Gradient the dataset is read from the mounted directory
    dataset_location = f'/datasets/kenya-90k-set-1-w90/{dataset_name}.csv'
else:
    # Locally, the folder is the dataset name minus its trailing
    # window-length token (e.g. "_w90")
    name_parts = dataset_name.split('_')
    dataset_folder = "_".join(name_parts[:-1])
    dataset_location = f'../Data_Storage_Processing/Data/{dataset_folder}/{dataset_name}.csv'

# Z-score decision
zscore_data = True  # Set to: [True/False]
zscore_data_done = False  # Always start as False; guards against normalizing more than once

# Model name and variables
AE_Model_Name = 'FC_Small'  # Options: FC_Small, FC_N2D
latent_layer_size = 25

# Sweep names and configuration
Project_Name = (
    f'DeepClust-{dataset_name}-Zscored-v1'
    if zscore_data
    else f'DeepClust-{dataset_name}-NOTzscored-v1'
)
Sweep_Config = f'{AE_Model_Name}_sweepconfig'
sweep_count = 1

In [12]:
def window_col_names(dataset_name, win_prefix = 'd'):
    """Derive the window column names and window length from a dataset name.

    The last underscore-separated token of ``dataset_name`` is expected to be
    one marker character followed by the window length (e.g. ``w90``); the
    first character is skipped and the rest is parsed as an integer.

    Args:
        dataset_name: Dataset identifier such as ``'Kenya_90k_Set_1_w90'``.
        win_prefix: Prefix placed before each 1-based column index.

    Returns:
        A ``(window_cols, window_len)`` tuple, where ``window_cols`` is the
        list ``[f'{win_prefix}1', ..., f'{win_prefix}{window_len}']``.

    Raises:
        ValueError: If the last token does not end in an integer.
    """
    # Last token holds the window length, e.g. 'w90' -> 90
    window_token = dataset_name.split('_')[-1]
    try:
        window_len = int(window_token[1:])
    except ValueError as err:
        raise ValueError(
            f'Cannot read a window length from {window_token!r} in dataset '
            f'name {dataset_name!r}; expected a suffix like "_w90".'
        ) from err

    # Column names are 1-based: d1, d2, ..., d<window_len>
    window_cols = [f'{win_prefix}{i}' for i in range(1, window_len + 1)]

    return window_cols, window_len

# Column names (d1..dN) and window length N, both derived from the dataset name
window_cols, window_len = window_col_names(dataset_name)

In [13]:
# Map each supported autoencoder name to the module that provides its
# sweep configuration and model builder.
_ae_builders = {'FC_N2D': fc_n2d, 'FC_Small': fc_small}

if AE_Model_Name in _ae_builders:
    _ae_module = _ae_builders[AE_Model_Name]
    sweep_config = _ae_module.sweep_config(
        name=Sweep_Config,
        window_len=window_len,
        latent_layer_size=latent_layer_size,
    )
    # Reference network, built with SELU so its summary can be inspected below
    ann_network = _ae_module.model(
        window_length=window_len,
        latent_layer_size=latent_layer_size,
        activation_fn='SELU',
    )
else:
    print(f'ERROR: AE name {AE_Model_Name} not recognised!')

In [14]:
# Echo the sweep configuration built for the selected architecture
sweep_config

{'method': 'random',
 'name': 'FC_Small_sweepconfig',
 'metric': {'name': 'mse', 'goal': 'minimize'},
 'parameters': {'optimizer': {'values': ['nadam', 'sgd']},
  'latent_layer_size': {'value': 25},
  'epochs': {'value': 100},
  'window_length': {'value': 90},
  'activation_fn': {'values': ['SELU', 'LeakyReLU']},
  'learning_rate': {'distribution': 'log_uniform_values',
   'min': 0.001,
   'max': 0.1},
  'batch_size': {'distribution': 'q_log_uniform_values',
   'q': 2,
   'min': 2,
   'max': 256}}}

In [15]:
# Print the layer-by-layer summary of the selected network
ann_network.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 90)]              0         
                                                                 
 dense_6 (Dense)             (None, 200)               18200     
                                                                 
 dense_7 (Dense)             (None, 200)               40200     
                                                                 
 dense_8 (Dense)             (None, 25)                5025      
                                                                 
 dense_9 (Dense)             (None, 200)               5200      
                                                                 
 dense_10 (Dense)            (None, 200)               40200     
                                                                 
 dense_11 (Dense)            (None, 90)                1809

---
# Data Imports

In [16]:
# Load the windowed dataset from the location chosen in the script variables
Data = pd.read_csv(dataset_location)

In [17]:
# Preview the first rows of the loaded table
Data.head()

Unnamed: 0,short_ID,window_ID,window_start_date,d1,d2,d3,d4,d5,d6,d7,...,d81,d82,d83,d84,d85,d86,d87,d88,d89,d90
0,127,0,2018-01-20,13.556944,12.556944,11.556944,10.556944,9.556944,8.556944,7.556944,...,22.596563,22.556944,21.556944,19.605868,19.556944,17.611782,16.614676,15.615602,14.617812,14.556944
1,127,1,2018-04-20,12.624086,12.556944,11.556944,10.556944,9.556944,7.632025,7.556944,...,24.487269,23.487269,22.487269,21.487269,20.487269,19.487269,18.487269,17.487269,16.487269,15.487269
2,127,2,2018-07-19,14.487269,13.487269,12.487269,11.487269,10.487269,9.487269,8.487269,...,22.539225,21.539225,20.539225,19.539225,18.539225,17.539225,16.539225,15.539225,14.539225,13.539225
3,127,3,2018-10-17,12.539225,11.539225,10.539225,9.539225,8.539225,7.539225,6.539225,...,10.539225,9.539225,8.539225,7.539225,6.539225,5.539225,4.539225,3.539225,2.539225,1.539225
4,127,4,2019-01-15,9.539225,8.539225,7.539225,6.539225,5.539225,4.539225,3.539225,...,4.539225,3.539225,2.539225,1.539225,7.539225,6.539225,5.539225,4.539225,3.539225,32.539225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
824657,219074,1,2022-08-31,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,2.676516,1.676516,...,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000
824658,219075,0,2022-06-02,14.339271,13.339271,12.339271,11.339271,10.339271,9.339271,8.339271,...,0.690706,0.690706,-7.000000,-7.000000,-7.000000,-7.000000,3.263808,2.263808,1.263808,1.263808
824659,219075,1,2022-08-31,0.263808,2.360567,1.360567,0.360567,2.653484,1.653484,0.653484,...,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000,-7.000000
824660,219077,0,2022-06-02,14.449306,13.449306,12.449306,11.449306,10.449306,12.449306,11.449306,...,1.670833,1.670833,1.670833,7.670833,6.670833,5.670833,4.670833,3.670833,2.670833,1.670833


---
# Pre-Processing

---
## Z-Scoring Data

Z-scoring is done on a row-by-row basis:<br>
each window is normalized using its own mean and standard deviation.

In [21]:
# Normalize at most once: `zscore_data_done` guards against re-running this
# cell and z-scoring data that has already been z-scored.
if zscore_data == True:
    if zscore_data_done == False:
        Data = Transform.Zscore_Individually(Data, window_cols)
        zscore_data_done = True
        print('Data WAS Zscored')
    elif zscore_data_done == True:
        print('Already WAS Zscored')
    else:
        # `zscore_data_done` is neither True nor False
        print('Error')
elif zscore_data == False:
    print('Data NOT Zscored')
else:
    # `zscore_data` is neither True nor False
    print('Error')

Data WAS Zscored


---
## Train-Test Split

### Dimitrios Sphatis Suggestion
Make sure the same IDs do not appear in both the train and test (validation) sets.<br>
This will reduce test accuracy, but improve generalizability.

In [None]:
# Group-aware split, as suggested by Dimitrios:
# https://stackoverflow.com/questions/44007496/random-sampling-with-pandas-data-frame-disjoint-groups
# A single seeded shuffle that holds out 10% of the groups for testing.
gss = GroupShuffleSplit(n_splits=1, test_size=0.1, random_state=seed)

# Positional row indexers for the split; rows sharing a short_ID stay together.
idxTrain, idxTest = next(gss.split(Data, groups=Data['short_ID']))

# Split DataFrames.
TrainData = Data.iloc[idxTrain]
TestData = Data.iloc[idxTest]

# Ensure the train and test IDs are separate.
assert set(TrainData['short_ID'].unique()).isdisjoint(TestData['short_ID'].unique())

# Only the window columns are fed to the network, as NumPy arrays.
x_train = TrainData[window_cols].to_numpy()
x_test = TestData[window_cols].to_numpy()

In [None]:
# (rows, window columns) of the training matrix
x_train.shape

In [None]:
# (rows, window columns) of the held-out matrix
x_test.shape

---
---
# WandB Sweep Log in
https://github.com/wandb/examples/blob/master/colabs/keras/Keras_param_opti_using_sweeps.ipynb


In [None]:
# Authenticate this session with Weights & Biases before creating the sweep
wandb.login()

# Sweep & Train Functions

In [None]:
def train(model, batch_size= 32, epochs= 100, lr=0.001, optimizer='nadam'):
    """Compile ``model`` and fit it as an autoencoder on the notebook's data.

    The model is trained to reconstruct ``x_train`` (the inputs double as the
    targets), validated on ``x_test``, and logged through ``WandbCallback``.

    Args:
        model: Keras model to compile and fit.
        batch_size: Batch size passed to ``fit``.
        epochs: Maximum number of epochs; early stopping may end training sooner.
        lr: Learning rate handed to ``ann_train.get_optimizer``.
        optimizer: Optimizer name handed to ``ann_train.get_optimizer``.
    """
    keras.backend.clear_session()

    chosen_optimizer = ann_train.get_optimizer(lr, optimizer)
    tracked_metrics = ["mse", keras.metrics.MeanAbsoluteError()]
    model.compile(loss="mse", optimizer=chosen_optimizer, metrics=tracked_metrics)

    # Stop after 10 epochs without improvement and roll back to the best weights
    stop_early = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
    # Halve the learning rate after 5 epochs without improvement
    reduce_lr = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
    fit_callbacks = [WandbCallback(), stop_early, reduce_lr]

    # Autoencoder setup: inputs and targets are the same array
    model.fit(
        x_train,
        x_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(x_test, x_test),
        callbacks=fit_callbacks,
    )
    
    

In [None]:
def sweep_train(config_defaults=None):
    """Run one sweep trial: open a W&B run, build the autoencoder, train it.

    The architecture is chosen by the module-level ``AE_Model_Name``; the
    hyperparameters are read from ``wandb.config``. If the architecture name
    is not recognised, an error is printed and no training happens.

    Args:
        config_defaults: Default configuration passed to ``wandb.init``.
    """
    with wandb.init(config=config_defaults):  # this gets over-written in the Sweep
        run_cfg = wandb.config

        # Record which architecture and dataset this run used
        run_cfg.architecture_name = AE_Model_Name
        run_cfg.dataset_name = dataset_name

        # Build the model for the selected architecture
        if AE_Model_Name == 'FC_Small':
            AE_model = fc_small.model(
                window_length=run_cfg.window_length,
                latent_layer_size=run_cfg.latent_layer_size,
                activation_fn=run_cfg.activation_fn,
            )
        elif AE_Model_Name == 'FC_N2D':
            AE_model = fc_n2d.model(
                run_cfg.window_length,
                run_cfg.latent_layer_size,
                run_cfg.activation_fn,
            )
        else:
            print('ERROR: AE name not recognised!')
            return

        train(
            AE_model,
            batch_size=run_cfg.batch_size,
            epochs=run_cfg.epochs,
            lr=run_cfg.learning_rate,
            optimizer=run_cfg.optimizer,
        )
        


---
---
# Run Sweep

In [None]:
# Register the sweep configuration under the project and keep its ID for the agent
sweep_id = wandb.sweep(sweep_config, project = Project_Name)

In [None]:
# Run sweep_train for at most sweep_count trials of the sweep
wandb.agent(sweep_id, function=sweep_train, count= sweep_count)

In [None]:
# Close any W&B run that is still open
wandb.finish()