# Benchmarking notebook

- The purpose of this notebook is to run all the benchmarks in a notebook format
- The benchmarks can be run in a terminal with `just benchmarks`
- But that method doesn't work well in Google Colab notebooks
- So we adapt them here to take advantage of Google Colab's computing power


# Global Parameters of the Notebook

## Paths

- Parameters related to data / model / lib paths

In [None]:
# `os.path.join` builds every path defined in this cell
import os

# Where the notebook is being executed:
#   - "local":  in our own computer
#   - "remote": in Google Colab
RUNNING_ENV = "remote"

# Root from which the rest of the paths of the notebook are derived
# Any value of RUNNING_ENV other than "local" selects the Google Drive folder
if RUNNING_ENV == "local":
    BASE_PATH = "./src"
else:
    BASE_PATH = "/content/drive/MyDrive/Colab Notebooks/"

# Directory of our personal lib
LIB_PATH = os.path.join(BASE_PATH, "lib")

# Directory of the benchmarks
BENCHMARK_PATH = os.path.join(BASE_PATH, "benchmarks")

# Directory where training / test data is stored
DATA_PATH = os.path.join(BASE_PATH, "data")

# Directory with all the cached models
# These cached models can be loaded from disk when training is skipped
MODEL_CACHE_FOLDER = os.path.join(BASE_PATH, "cached_models")

# Cache file for the augmented dataset
AUGMENTED_DATASET_CACHE_FILE = os.path.join(BASE_PATH, "cached_augmented_dataset.pt")

# File where the logs are written
LOGGING_FILE = os.path.join(BASE_PATH, "training.log")

# Binary file where the profiling stats are saved
PROFILE_SAVE_FILE = os.path.join(BASE_PATH, "training_profile.stat")

## ML parameters

- Parameters related to machine learning
- For example, batch sizes, learning rates, ...

In [None]:
# Parameters of P-K sampling
P = 100   # Number of classes used in each minibatch
K = 2     # Number of images sampled for each selected class

# Batch size for online training
# We can use `P * K` as batch size. Thus, minibatches will be
# as we expect in P-K sampling.
#
# But we can also use `n * P * K`. Thus, minibatches will be n P-K sampled
# minibatches concatenated together
# Be careful when doing this because it can be really slow, and there is no
# clear reason to do this
ONLINE_BATCH_SIZE = P * K

# Epochs for hard triplets, online training
TRAINING_EPOCHS = 1

# Learning rate for hard triplets, online training
ONLINE_LEARNING_RATE = 0.01

# How many single elements we want to see before logging
# It has to be a multiple of P * K, otherwise `should_log` would always return
# false, as `it % LOGGING_ITERATIONS != 0` would always hold
#
# `LOGGING_ITERATIONS = P * K * n` means we log after seeing `n` P-K sampled
# minibatches
LOGGING_ITERATIONS = P * K * 20

# Which fraction of the training and validation sets we want to use for the logging
ONLINE_LOGGER_TRAIN_PERCENTAGE = 1 / 5
ONLINE_LOGGER_VALIDATION_PERCENTAGE = 1 / 3

# Choose which model we're going to use
# Can be "ResNet18", "LightModel" or "LFWResNet18"
NET_MODEL = "LFWResNet18"

# Epochs used in k-fold Cross Validation
# k-fold Cross Validation is used for parameter exploration
HYPERPARAMETER_TUNING_EPOCHS = 7

# Number of folds used in k-fold Cross Validation
NUMBER_OF_FOLDS = 4

# Margin used in the loss function
MARGIN = 1.0

# Dimension of the embedding calculated by the network
EMBEDDING_DIMENSION = 5

# Number of neighbours considered in K-NN
# K-NN is used for transforming the embedding task into a classification task
NUMBER_NEIGHBOURS = 3

# Batch Triplet Loss Function
# This way we can choose between "hard" and "all"
BATCH_TRIPLET_LOSS_FUNCTION = "hard"

# Whether or not to use the softplus loss function instead of vanilla triplet loss
USE_SOFTPLUS_LOSS = False

# Count all summands in the mean loss or only those summands greater than zero
USE_GT_ZERO_MEAN_LOSS = True

# Whether or not to use lazy computations in the data augmentation
LAZY_DATA_AUGMENTATION = True

## Section parameters

- Flags to choose if some sections will run or not
- This way we can skip some heavy computations when not needed 

In [None]:
# Skip hyperparameter tuning for online training
SKIP_HYPERPARAMTER_TUNING = True

# Skip training and use a cached model
# Useful for testing the embedding -> classifier transformation
# Thus, when True, training is not computed and a cached model
# is loaded from disk
# NOTE(review): this comment used to say "when False", which contradicts the
# flag name and the first line above -- confirm against the code reading it
# Cached models are stored in `MODEL_CACHE_FOLDER`
USE_CACHED_MODEL = False

# Skip data augmentation and use the cached augmented dataset
USE_CACHED_AUGMENTED_DATASET = False

# Most of the time we're not exploring the data, but doing
# either hyperparameter tuning or training of the model
# So if we skip this step we can start the process faster
SKIP_EXPLORATORY_DATA_ANALYSYS = True

# Whether or not to profile the training
# This should be False most of the time
# Note that profiling adds a significant overhead to the training
PROFILE_TRAINING = False

## WANDB Parameters

In [None]:
from datetime import datetime

# Project under which wandb groups the different runs
WANDB_PROJECT_NAME = "Benchmarking"

# Name of this concrete run: the timestamp at which this cell is executed
# It does not need to be descriptive, because wandb tracks the parameters we
# use in this run (see "Configuration for Weights and Biases" section)
WANDB_RUN_NAME = f"{datetime.now()}"

## Others

In [None]:
# Number of workers we want to use
# We can have fewer, the same number of, or more workers than CPUs
# In the following forum thread:
#   https://discuss.pytorch.org/t/guidelines-for-assigning-num-workers-to-dataloader/813/4
# they recommend exploring this parameter, growing it until system RAM saturates
# Using a value greater than 2 makes pytorch tell us that this value is not optimal
# So we stick with what pytorch tells us
NUM_WORKERS = 2

# Fix the random seed to make results reproducible
RANDOM_SEED = 123456789

# Auth for Google Drive

In [None]:
# Only when running in Colab: mount Google Drive at /content/drive, which is
# the root of the "remote" BASE_PATH defined in the "Paths" section
if RUNNING_ENV == "remote":
    # Imported here because `google.colab` is only needed in this branch
    from google.colab import drive
    drive.mount('/content/drive')

# Pre-installations

- Some packages are not installed in the Colab Environment
- So install them if we're running in Colab

In [None]:
# `wandb` is imported later in the notebook, so install it first when we are
# running in Colab
if RUNNING_ENV == "remote":
    !pip install wandb

# Importing the modules we are going to use

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.datasets as datasets

# For using pre-trained ResNets
import torchvision.models as models 
import torchvision.transforms as transforms

from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import numpy as np
import os
import logging
from datetime import datetime
from pprint import pprint
import gc
import functools
import math
import seaborn as sns
from collections import Counter
import time
import copy
import cProfile

import wandb

# All concrete pieces we're using from sklearn
from sklearn.metrics import roc_auc_score, accuracy_score, silhouette_score
from sklearn.model_selection import ShuffleSplit


from tqdm import tqdm
from typing import List

# Load in the notebook all .py files that make up our personal lib
# This way we keep notebook code as small as possible, and only pertinent
# to the concrete task that this notebook solves (generic and reusable code
# goes to personal lib)
# Also, the benchmark files are copied next to the lib, so both can be
# imported below as `src.lib.*` and `src.benchmarks.*`
# `$LIB_PATH` / `$BENCHMARK_PATH` refer to the variables of the "Paths" section;
# they are quoted because the remote BASE_PATH contains a space
!mkdir -p ./src/lib/
!mkdir -p ./src/benchmarks/
!cp -r "$LIB_PATH"/* ./src/lib/
!cp -r "$BENCHMARK_PATH"/* ./src/benchmarks

# Now that files are loaded, we can import pieces of code
import src.lib.core as core
import src.lib.trainers as trainers
import src.lib.board as board 
import src.lib.filesystem as filesystem
import src.lib.metrics as metrics
import src.lib.loss_functions as loss_functions
import src.lib.embedding_to_classifier as embedding_to_classifier
import src.lib.sampler as sampler
import src.lib.utils as utils
import src.lib.data_augmentation as data_augmentation

import src.benchmarks.benchmark_metrics as bb_metrics
import src.benchmarks.benchmark_loss_functions as bb_loss_functions

from src.lib.trainers import train_model_offline, train_model_online
from src.lib.train_loggers import ClassificationLogger, SilentLogger, TripletLoggerOffline, TripletLoggerOnline, TrainLogger, CompoundLogger, IntraClusterLogger, InterClusterLogger
from src.lib.models import *
from src.lib.visualizations import *
from src.lib.models import ResNet18, LFWResNet18
from src.lib.loss_functions import MeanTripletBatchTripletLoss, BatchHardTripletLoss, BatchAllTripletLoss
from src.lib.embedding_to_classifier import EmbeddingToClassifier
from src.lib.sampler import CustomSampler
from src.lib.data_augmentation import AugmentatedDataset, LazyAugmentatedDataset

# Configuration of the logger

- Here we set the configuration for all logging done 
- In lib, `logging.getLogger("MAIN_LOGGER")` is used everywhere, so we get it, configure it once, and use that config everywhere

In [None]:
# Get the logger that is used everywhere
file_logger = logging.getLogger("MAIN_LOGGER")

# Configure it
# Avoid propagating to the upper logger, which logs to the console
file_logger.propagate = False
file_logger.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(asctime)s::%(levelname)s::%(funcName)s::> %(message)s")

# `logging.getLogger` returns the same logger object on every call, so
# re-running this cell would stack one more FileHandler each time and every
# message would end up written to the file several times.
# Reuse the handler that already writes to LOGGING_FILE when there is one
# (FileHandler stores the absolute path of its file in `baseFilename`)
file_handler = next(
    (
        handler
        for handler in file_logger.handlers
        if isinstance(handler, logging.FileHandler)
        and handler.baseFilename == os.path.abspath(LOGGING_FILE)
    ),
    None,
)
if file_handler is None:
    file_handler = logging.FileHandler(LOGGING_FILE)
    file_logger.addHandler(file_handler)
file_handler.setFormatter(formatter)

# 'application' code
# First message, to check that the configuration works
file_logger.debug('debug message')


# Configuration for Weights and Biases

- We're going to use `wandb` for tracking the training of the models
- In this section, we configure `wandb`, mainly selecting which parameters of the notebook we are going to track

In [None]:
# Parameters of the notebook that we're going to track in wandb
# The dict has to exist before `wandb.init()`, because it is passed to
# `wandb.init`
#
# We could create a config dict in "Global Parameters of the Notebook" and pass
# it right away, or use wandb.config.SOMETHING directly everywhere. We don't do
# this for the following reasons:
#
# 1. We don't want to track all parameters (i.e. section parameters, dir paths...)
# 2. At this moment, we're not 100% sure that wandb is the right tool, so we are
#    looking for loose coupling
wandb_config_dict = {
    "P": P,
    "K": K,
    "ONLINE_BATCH_SIZE": ONLINE_BATCH_SIZE,
    "TRAINING_EPOCHS": TRAINING_EPOCHS,
    "ONLINE_LEARNING_RATE": ONLINE_LEARNING_RATE,
    "LOGGING_ITERATIONS": LOGGING_ITERATIONS,
    "ONLINE_LOGGER_TRAIN_PERCENTAGE": ONLINE_LOGGER_TRAIN_PERCENTAGE,
    "ONLINE_LOGGER_VALIDATION_PERCENTAGE": ONLINE_LOGGER_VALIDATION_PERCENTAGE,
    "NET_MODEL": NET_MODEL,
    "HYPERPARAMETER_TUNING_EPOCHS": HYPERPARAMETER_TUNING_EPOCHS,
    "NUMBER_OF_FOLDS": NUMBER_OF_FOLDS,
    "MARGIN": MARGIN,
    "EMBEDDING_DIMENSION": EMBEDDING_DIMENSION,
    "NUMBER_NEIGHBOURS": NUMBER_NEIGHBOURS,
    "BATCH_TRIPLET_LOSS_FUNCTION": BATCH_TRIPLET_LOSS_FUNCTION,
    "USE_SOFTPLUS_LOSS": USE_SOFTPLUS_LOSS,
    "USE_GT_ZERO_MEAN_LOSS": USE_GT_ZERO_MEAN_LOSS,
    "PROFILE_TRAINING": PROFILE_TRAINING,
}

In [None]:
# Start the wandb run
# The parameters collected in `wandb_config_dict` are passed as its config
wandb.init(project=WANDB_PROJECT_NAME, name=WANDB_RUN_NAME, config=wandb_config_dict)

In [None]:
# Set env variable to allow wandb to save the code of the notebook
# NOTE(review): `%env` only expands `$name` / `{name}`, so this stores the
# literal text "WANDB_RUN_NAME", not the value of the WANDB_RUN_NAME variable.
# wandb documents WANDB_NOTEBOOK_NAME as the name of the notebook, so this
# should probably hold the notebook's filename and be set before
# `wandb.init()` -- confirm the intended value
%env WANDB_NOTEBOOK_NAME=WANDB_RUN_NAME

# Functions that we are going to use

In [None]:
def show_learning_curve(training_history: dict):
    """Plot the training and validation loss curves and show the figure.

    `training_history` is indexed with the keys 'loss' and 'val_loss'. Every
    element of both entries must expose `.cpu()` (e.g. torch tensors), because
    each one is moved to the cpu before plotting. Nothing is returned.
    """
    # Fetch both curves up front, so a missing key fails before any plotting
    raw_training = training_history['loss']
    raw_validation = training_history['val_loss']

    # matplotlib needs the values in cpu memory
    curves_on_cpu = [
        [point.cpu() for point in curve]
        for curve in (raw_training, raw_validation)
    ]

    # Training curve first, validation curve second: same order as the legend
    for curve in curves_on_cpu:
        plt.plot(curve)
    plt.legend(['Training loss', 'Validation loss'])
    plt.show()
    
def try_to_clean_memory():
    """Best-effort release of memory that is no longer in use.

    Runs the Python garbage collector and then asks PyTorch to release the
    cached CUDA memory it holds. Returns nothing.
    """
    # Collect first: objects that are only kept alive by reference cycles are
    # freed here, and `empty_cache()` can only release memory that is already
    # unused. With the calls in the opposite order, memory freed by the
    # collector would stay in PyTorch's cache
    gc.collect()
    torch.cuda.empty_cache()

# Running the benchmarks

In [None]:
# Call the entry point of `src.benchmarks.benchmark_metrics` (imported as `bb_metrics`)
bb_metrics.main()

In [None]:
# Call the entry point of `src.benchmarks.benchmark_loss_functions` (imported as `bb_loss_functions`)
bb_loss_functions.main()