# Project - Day 4 - MLFlow training of your model

## Insert MLFlow parameters
The following cell is marked as `parameters`; you might find it useful to define here the parameters that MLFlow should be able to vary, so that you can experiment with different values for the CNN.

In [None]:
# Number of images per chunk used when rechunking the training arrays
batch_size = 10
# Number of passes over the training set performed by the training loop
n_epochs = 5

## Exercise

Based on the Training step of the project done on day 3:

- train a model and store the metrics of the training process in MLFlow. e.g.:
```python
with mlflow.start_run(tags={"mlflow.runName": "train"}) as mlrun:

    losses = []
    val_losses = []
    !pip install -q tqdm
    from tqdm import trange
    
    n_epochs = 5
    n_blocks = y_train.numblocks[0]
    
    for epoch in trange(n_epochs):
        for X, y in zip(X_train.blocks, y_train.blocks):
            losses.append(
                (len(losses)/n_blocks, classifier.train_on_batch(X.compute(), y.compute()))
            )
        ls = classifier.test_on_batch(X_valid, y_valid)
        val_losses.append(
            (len(losses)/n_blocks,ls)
            )
        mlflow.log_metric("loss", ls, step=int(len(losses)/n_blocks))

```

- store the model in MLFlow for use in the next step of the pipeline, e.g.:

```python
    classifier.save("classifier.keras")
    mlflow.log_artifact("classifier.keras")
    prds = classifier.predict(X_valid.compute())
    signature = infer_signature(X_valid.compute(), prds)
    mlflow.tensorflow.log_model(classifier, "model", registered_model_name="CYGNO_CNN", signature=signature)
```

- store any additional plot that you find useful to track as an MLFlow artifact

## SOLUTIONS

In [None]:
%%bash

## Download the training dataset from an INFN archive
## (wget -O does not create missing parent directories, so make sure the target exists)
mkdir -p "$HOME/data"
wget https://pandora.infn.it/public/269d22/dl/training_set.zip -qO "$HOME/data/training_set.zip"

## Install the unzip utility 
#apt-get -qy install unzip

## Extract the archive (-q: quiet, -n: never overwrite files that were already extracted)
cd "$HOME/data/"
unzip -qn "$HOME/data/training_set.zip"

In [None]:
import os
import warnings

# Silence every Python warning for the rest of the notebook
warnings.filterwarnings('ignore')

from glob import glob
# Search below the user's home directory, i.e. the same $HOME/data folder the
# download cell extracts the archive into, instead of a hard-coded /home/jovyan.
filenames = glob(os.path.expanduser("~/data/data/export/*/*/*/*.png"))
print (f"Found {len(filenames)} filenames")

import mlflow
from mlflow.models import infer_signature

In [None]:
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import dask, dask.array

## See Day 2
@dask.delayed 
def load_image(filename: str):
    """Read one image file into a NumPy array, wrapped as a dask.delayed task.

    Args:
        filename: path of the image file to open with PIL.

    Returns:
        A delayed object that evaluates to the pixel data as a NumPy array.
    """
    # The context manager closes the underlying file as soon as the pixel data
    # has been converted, instead of relying on garbage collection.
    with Image.open(filename) as image:
        return np.asarray(image)

## See Day 2
def load_raw_images(filenames):
    """
    Build a lazy dask array stacking, along a new first axis, the images stored
    at the paths listed in `filenames`.

    Each image is declared to dask as a (576, 576) array of uint8; nothing is
    read from disk until the array is computed.
    """
    lazy_frames = []
    for path in filenames:
        frame = dask.array.from_delayed(load_image(path), shape=(576, 576), dtype=np.uint8)
        lazy_frames.append(frame)
    return dask.array.stack(lazy_frames, axis=0)


## Discussed in Day 1, implemented in Day 2
def windowing(dask_image, x_min, x_max):
    """Linearly map pixel values from the interval [x_min, x_max] to [0, 1].

    Values below `x_min` saturate to 0 and values above `x_max` saturate to 1.

    Args:
        dask_image: dask array of pixel values (any numeric dtype).
        x_min: pixel value mapped to 0.
        x_max: pixel value mapped to 1.

    Returns:
        A dask array of floats in [0, 1] with the same shape as `dask_image`.
    """
    # Promote the bounds to float so that the subtraction is carried out in
    # floating point. With uint8 pixels and an integer x_min, `dask_image - x_min`
    # stays uint8: pixels darker than x_min wrap around to large positive values
    # and would end up clipped to 1 instead of 0.
    lower = float(x_min)
    span = float(x_max) - lower
    return dask.array.clip((dask_image - lower) / span, 0., 1.)

## Discussed in Day 1, implemented in Day 2
def crop_center(dask_image, half_win=64):
    """Crop a stack of images around their center.

    Args:
        dask_image: array whose axis 0 indexes the images and whose axes 1 and 2
            are the two image dimensions.
        half_win: half of the side of the cropped window, in pixels.

    Returns:
        The same stack restricted, on axes 1 and 2, to a window of side
        2*half_win centered on the image.
    """
    # Take the center from the actual image size rather than assuming 576 pixels;
    # for 576x576 inputs this selects exactly the same window as before.
    row_mid = dask_image.shape[1] // 2
    col_mid = dask_image.shape[2] // 2
    return dask_image[
        :,
        row_mid - half_win:row_mid + half_win,
        col_mid - half_win:col_mid + half_win,
    ]

In [None]:
import re
def energy_keV_from_path(filenames):
    """
    Parse the energy (in keV) out of each path in `filenames` and return the
    values as a dask array of floats.

    The energy is read from the first path component of the form `/<digits>_keV`.
    """
    energies = []
    for path in filenames:
        matches = re.findall(r"/([0-9]+)_keV", path)
        energies.append(float(matches[0]))
    return dask.array.from_array(energies)

def is_nuclear_from_path(filenames):
    """
    Build the classification labels from the paths in `filenames`: 1.0 when the
    path contains an `/NR/` directory (nuclear recoil), 0.0 otherwise (electron
    recoil). The labels are returned as a dask array of floats.
    """
    labels = []
    for path in filenames:
        recoil_tags = re.findall(r"/([NE]R)/", path)
        labels.append(1.0 if 'NR' in recoil_tags else 0.0)
    return dask.array.from_array(labels)

In [None]:
# glob() returns the files in arbitrary (filesystem-dependent) order: sort them
# before shuffling, so that the fixed seed yields the same permutation, and
# therefore the same train/validation split, on every run and machine.
shuffled_filenames = np.random.RandomState(seed=42).permutation(sorted(filenames))

In [None]:
# The first `n_validation` shuffled files are held out for validation,
# all the remaining ones are used for training.
n_validation = 50
validation_files, training_files = (
    shuffled_filenames[:n_validation],
    shuffled_filenames[n_validation:],
)

# Images: load lazily, window the pixel values from [60, 130] to [0, 1], crop the center
validation_set = crop_center(windowing(load_raw_images(validation_files), 60, 130))
training_set = crop_center(windowing(load_raw_images(training_files), 60, 130))

# Classification labels (1. for nuclear recoils, 0. for electron recoils)
validation_label = is_nuclear_from_path(validation_files)
training_label = is_nuclear_from_path(training_files)

# Energies in keV, parsed from the file paths
validation_energy = energy_keV_from_path(validation_files)
training_energy = energy_keV_from_path(training_files)

In [None]:
import tensorflow as tf


def _conv3x3(n_filters):
    """Return a new 3x3 ReLU Conv2D layer with L2(1e-2) kernel regularization and He-normal initialization."""
    return tf.keras.layers.Conv2D(
        filters=n_filters,
        kernel_size=(3, 3),
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.L2(1e-2),
        kernel_initializer='he_normal',
    )


# The tensor is called `image_input` (not `input`) so that the Python builtin
# `input` is not shadowed for the rest of the notebook.
image_input = tf.keras.Input(shape=(128,128), name="input")
# Add the channel axis expected by the 2D convolutions
hidden = tf.keras.layers.Reshape((128, 128, 1), name="reshape")(image_input)

# First convolutional stage: two 8-filter convolutions followed by 2x2 max pooling
hidden = _conv3x3(8)(hidden)
hidden = _conv3x3(8)(hidden)
hidden = tf.keras.layers.MaxPooling2D(2)(hidden)

# Second convolutional stage: two 4-filter convolutions followed by 2x2 max pooling
hidden = _conv3x3(4)(hidden)
hidden = _conv3x3(4)(hidden)
hidden = tf.keras.layers.MaxPooling2D(2)(hidden)

# Single sigmoid unit: binary classification output
hidden = tf.keras.layers.Flatten()(hidden)
output = tf.keras.layers.Dense(1, activation='sigmoid', kernel_initializer='he_normal')(hidden)

classifier = tf.keras.Model(image_input, output)
classifier.compile(loss=tf.keras.losses.BinaryCrossentropy(), optimizer=tf.keras.optimizers.Adam(3e-4))

# summary() prints the table itself and returns None: wrapping it in display()
# only adds a spurious "None" to the cell output.
classifier.summary()

In [None]:
# `batch_size` is defined in the `parameters` cell at the top of the notebook.
# It is deliberately not re-assigned here: doing so would silently override
# whatever value was chosen there for the current run.

# Training data: one chunk per batch, so that iterating over `.blocks` yields batches
X_train = training_set.rechunk( [batch_size, None, None] )
y_train = training_label.rechunk(batch_size)

# Validation data: a single chunk along the first axis, for images and labels alike
X_valid = validation_set.rechunk( [-1, None, None] )
y_valid = validation_label.rechunk(-1)

In [None]:
from tqdm import trange

# Materialize the validation set once, as NumPy arrays: it is evaluated at every
# epoch and reused for the model signature, so computing it a single time avoids
# handing lazy dask arrays to Keras and repeating `.compute()` on the same data.
X_valid_np = X_valid.compute()
y_valid_np = y_valid.compute()

with mlflow.start_run(tags={"mlflow.runName": "train"}) as mlrun:
    # Keep track of the hyper-parameters this run was trained with
    mlflow.log_params({"batch_size": batch_size, "n_epochs": n_epochs})

    # Lists of (epoch as a float, loss) pairs
    losses = []
    val_losses = []

    n_blocks = y_train.numblocks[0]

    for epoch in trange(n_epochs):
        # One training step per block (i.e. per batch) of the training set
        for X, y in zip(X_train.blocks, y_train.blocks):
            losses.append(
                (len(losses)/n_blocks, classifier.train_on_batch(X.compute(), y.compute()))
            )

        # Validation loss at the end of the epoch
        ls = classifier.test_on_batch(X_valid_np, y_valid_np)
        val_losses.append(
            (len(losses)/n_blocks, ls)
        )

        # "loss" is the validation loss (name kept for compatibility);
        # "train_loss" is the training loss averaged over the batches of this epoch.
        mlflow.log_metric("loss", float(ls), step=epoch + 1)
        epoch_train_loss = np.mean([batch_loss for _, batch_loss in losses[-n_blocks:]])
        mlflow.log_metric("train_loss", float(epoch_train_loss), step=epoch + 1)

    # Learning curves, stored as an artifact of the run
    fig, ax = plt.subplots()
    ax.plot(*(np.array(losses).T), label="Training data")
    ax.plot(*(np.array(val_losses).T), label="Validation data")
    ax.set_title(f"{n_epochs} epochs")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Binary cross-entropy")
    ax.legend(title="CYGNO-SIM")
    mlflow.log_figure(fig, "losses.png")

    # Store the trained model, both as a plain file and as a registered MLFlow model
    classifier.save("classifier.keras")
    mlflow.log_artifact("classifier.keras")
    prds = classifier.predict(X_valid_np)
    signature = infer_signature(X_valid_np, prds)
    mlflow.tensorflow.log_model(classifier, "model", registered_model_name="CYGNO_CNN", signature=signature)

plt.show()