### L2 regularisation experiment

This notebook runs an experiment to understand the effect of L2 regularisation on the predictions of matfact.  
The risk states are labeled with integers from 1 to 4: [1: Normal, 2: LowRisk, 3: HighRisk, 4: Cancer]  
Since the data is highly imbalanced towards the Normal and LowRisk states, the majority of the labels in the datasets are 1 and 2.  
L2 regularisation on both U and V might promote lower values (Labels 1 and 2) in M.  

This experiment logs with mlflow the matfact results with increasing regularisation parameters for U and V with a synthetic dataset.  
Then, using the same dataset, the labels are inverted so that the higher risks are represented by labels 1 and 2 and the lower risks by labels 3 and 4. This way the imbalance is also inverted, with a majority of labels 4 and 3.  
The results of the matfact are also logged to be later compared with visualisations.  


While the experiment runs, a confusion matrix is generated for each combination of regularisation parameters and saved as an image in the results directory. 
The rest of the visualisations (Matthews correlation coefficient, accuracy, recall, precision) are generated at the end and saved in the same directory.

In [None]:
import os
import pathlib
import sys
# Resolve the notebook's working directory once; the paths below derive from it.
CWD = pathlib.Path(os.getcwd())

# Make the sibling source trees (next to this directory) importable.
for source_tree in ('hmm_synthetic/', 'matfact/'):
    sys.path.append(str(CWD.parent / source_tree))

# Directory handed to the experiment runner for its outputs.
RESULTS_PATH = CWD / "results"
print(RESULTS_PATH)

In [None]:
from matfact.exp_l2_reg.experiment import run_l2_regularization_experiments
from matfact.exp_l2_reg.experiment import MATFACT_ALS, SKLEARN_NMF, SKLEARN_DL, SKLEARN_TSVD

Compare results between HMM and DGD synthetic data with varying l1 regularization rates for matrix U:

In [None]:
# Sweep the U and V L2 regularisation parameters on HMM-generated data,
# repeating the whole sweep once per L1 rate applied to U.
# Finer grid (disabled): [0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]
lambda_values = [0, 0.5, 1, 1.5, 2]

# Settings shared by every run of this sweep.
# Disabled overrides kept for reference:
#   experiment_name="l1_reg", lambda_values_l1=lambda_values_l1,
#   N=10, T=20, rank=5, sparsity=100
hmm_settings = dict(model_type=MATFACT_ALS, data_gen_method="HMM", N=50000)

for u_l1_rate in (0, 0.5, 0.99):
    run_l2_regularization_experiments(
        lambda_values, RESULTS_PATH, U_l1_rate=u_l1_rate, **hmm_settings
    )

In [None]:
# Sweep the U and V L2 regularisation parameters on DGD-generated data,
# repeating the whole sweep once per L1 rate applied to U.
lambda_values = [0, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2]

# Settings shared by every run of this sweep.
# Disabled overrides kept for reference:
#   lambda_values_l1=lambda_values_l1, N=10, T=20, rank=5, sparsity=100
dgd_settings = dict(model_type=MATFACT_ALS, data_gen_method="DGD", N=10000)

for u_l1_rate in (0, 0.5, 0.99):
    run_l2_regularization_experiments(
        lambda_values, RESULTS_PATH, U_l1_rate=u_l1_rate, **dgd_settings
    )

Compare matfact with scikit-learn NMF (matfact CMF with no convolution):

In [None]:
# Run the experiments with the scikit-learn NMF model.
# Alternative lambda grids (disabled): [int(1e12)], [0, 9, 18, 21, 63, 126, 189]
lambda_values = [0, 0.5, 1, 1.5, 2]
run_l2_regularization_experiments(
    lambda_values,
    RESULTS_PATH,
    model_type=SKLEARN_NMF,
)

In [None]:
# Run the experiments with the matfact ALS model.
# Alternative lambda grids (disabled): [int(1e12)], [0, 9, 18, 21, 63, 126, 189]
lambda_values = [0, 0.5, 1, 1.5, 2]
run_l2_regularization_experiments(
    lambda_values,
    RESULTS_PATH,
    model_type=MATFACT_ALS,
)

Delete mlflow experiment logs and figures:

In [None]:
# Delete all mlflow experiments and empty trash
import mlflow
from mlflow import MlflowClient
from mlflow.entities import ViewType
print(mlflow.get_tracking_uri())

# Check existing experiments (remove if necessary)
client = MlflowClient()
for e in client.search_experiments(ViewType.ALL):
    print(e)
    client.delete_experiment(e.experiment_id)
    ! rm -r mlruns/.trash/$e.experiment_id
# ! rm -r ../results/*


In [None]:
# Collect the entries of the experiment results folder as strings, skipping
# any whose path contains "numpy" or "DS_Store".
from pathlib import Path

results_dirs = []
for entry in Path("../experiments/results/").iterdir():
    entry_str = str(entry)
    if "numpy" in entry_str or "DS_Store" in entry_str:
        continue
    results_dirs.append(entry_str)
print(results_dirs)

In [None]:
# Remove experiment results
# rm_results_dirs = []
rm_results_dirs = results_dirs
for dir in rm_results_dirs:
    ! rm -r $dir

In [None]:
dir = "../experiments/results/l1_reg_l10.99_matfact_als_HMM_9_10"
! rm -r $dir

In [None]:
# Build a mapping of experiment id -> experiment name from the tracking store
# and print one "id: name" line per experiment.
from mlflow import MlflowClient

client = MlflowClient()
experiments_dict = {
    experiment.experiment_id: experiment.name
    for experiment in client.search_experiments()
}
for experiment_id, experiment_name in experiments_dict.items():
    print(f"{experiment_id}: {experiment_name}")

In [None]:
# Clean experiments using the experiment dictionary
exp_ids = experiments_dict.keys()  # [""]
for exp_id in exp_ids:
    client.delete_experiment(exp_id)
    ! rm -r mlruns/.trash/*$exp_id*