In [None]:
# https://www.reddit.com/r/computervision/comments/kfhc3u/how_does_one_fine_tune_cnn_hyperparameter_when/
# Look into genetic-algorithm-style approaches (as used by YOLO)
# Tuning hyperparameters in the context of large datasets can be a problem. I should investigate further.

In [1]:
from pprint import pprint
from uuid import uuid4

import numpy as np

from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter, \
    RandomTunableDiscreteParameter

# Single seed reused by every tunable parameter and by NumPy's global RNG below.
seed = 2077  # Wake up samurai
config_path = "./output/config"
config_gen = UniqueParametersConfigFsGenerator(
    patience=100,
    seen_configurations_path=config_path,
)

# --- Search space -----------------------------------------------------------
embedding_sizes = [100, 200, 300, 400]
config_gen.add_parameter(
    'embedding_size',
    RandomTunableDiscreteParameter(values_list=embedding_sizes, seed=seed),
)
config_gen.add_parameter(
    'aspect_size',
    RandomTunableOffsetParameter(value_range=(7, 20), step=1, seed=seed),
)
# Currently excluded from the search:
# config_gen.add_parameter('negative_sample_size', RandomTunableOffsetParameter(value_range=(10, 20), step=5, seed=seed))
config_gen.add_parameter(
    'epochs',
    RandomTunableOffsetParameter(value_range=(7, 15), step=2, seed=seed),
)

# Candidate learning rates: ten exponents drawn uniformly in [-5, -2), i.e.
# values spread log-uniformly between 1e-5 and 1e-2.
np.random.seed(seed)
exponents = np.random.uniform(-5, -2, 10)
learning_rates = (10 ** exponents).tolist()

print("Possible learning rates are: ")
pprint(learning_rates)

config_gen.add_parameter(
    "learning_rate",
    RandomTunableDiscreteParameter(values_list=learning_rates, seed=seed),
)
config_gen.add_parameter(
    "batch_size",
    RandomTunableDiscreteParameter(values_list=[128, 256, 512, 1024], seed=seed),
)

Possible learning rates are: 
[0.00034307925427065264,
 1.528848384756386e-05,
 0.0009812379995669137,
 2.7095598297631782e-05,
 0.007306153347321277,
 0.002043186796816812,
 0.0006166794665094792,
 0.0055237549202012005,
 0.0022376672226856378,
 0.0020646784735470726]


I'd love to use K-fold cross-validation, but time constraints make it unviable. <br>
Since the dataset is large enough, we fall back on a classic hold-out validation set.

In [2]:
import logging

# Silence log records of severity INFO and below for the whole process.
# Each logging.disable() call replaces the previous threshold, so a single
# call with the final level is all that is needed.
logging.disable(logging.INFO)

In [None]:
import json
from pathlib import Path
from main.abae.model_manager import ABAEManager
from main.abae.config import ABAEManagerConfig
import pandas as pd

corpus = pd.read_csv("../dataset/output/default/pre_processed.310k.csv")

# Hold-out split: the first of four chunks (25% of the dataset) is used to
# compute the validation metrics, the remaining three chunks are used to train.
split_dataset = np.array_split(corpus, 4)
validation_split = split_dataset[0]
train = pd.concat(split_dataset[1:])

file_path = "./output/config/abae_configurations_results.json"
results_file = Path(file_path)
# Make sure the destination exists up front, so a finished training run is
# never lost because its results could not be written.
results_file.parent.mkdir(parents=True, exist_ok=True)

# This script can be re-run as often as desired as the history is persisted:
# results of previous executions are loaded and extended.
results = []
if results_file.is_file():
    with results_file.open() as previous_results:
        results = results + json.load(previous_results)

n_configurations = 20  # How many different configurations we want to see
for i in range(n_configurations):
    config = next(config_gen)
    run_id = uuid4()

    print(f"Running configuration = {config} ({i + 1}/{n_configurations})")

    abae_config = ABAEManagerConfig.from_configuration(f"{run_id}", config)
    abae_manager = ABAEManager.from_scratch(abae_config, train, override=True)

    # Now we train:
    history, _ = abae_manager.train(train, verbose=2)
    # Evaluation with inverse order to avoid re-computing the aspects
    run_result = abae_manager.evaluate([20, 10, 5, 3], validation_split)
    run_result['history'] = history.history['loss']
    run_result['config'] = config

    results.append(run_result)
    # Persist after every run so an interrupted search keeps what it finished.
    with results_file.open('w') as results_out:
        json.dump(results, results_out)

Let's analyze the results:

In [None]:
import pandas as pd
import json

file_path = "./output/config/abae_configurations_results.json"

# Read the persisted results through a context manager so the file handle is
# closed, then display them as a table (last expression = cell output).
with open(file_path) as results_file:
    raw_results = json.load(results_file)

pd.DataFrame(raw_results)

In [None]:
# Flatten the persisted results into one row per (run, top-N cut-off) pair so
# they can be plotted.
with open(file_path) as results_file:
    data = pd.DataFrame(json.load(results_file))

# Output column order.
columns = ['cv_coh', 'npmi_coh', 'top', 'max_margin_loss', 'id',
           'plot_npmi_coh', 'embedding_size', 'aspect_size', 'learning_rate', 'batch_size',
           'epochs']

# NOTE(review): the tuning loop currently evaluates tops [20, 10, 5, 3], so
# runs produced by it have no entry for 25. Cut-offs missing from a run are
# skipped instead of raising KeyError; confirm which cut-offs should be plotted.
tops = [3, 10, 25]

records = []
for index, row in data.iterrows():
    run_config = row['config']
    for top in tops:
        key = str(top)  # JSON object keys are always strings
        if key not in row['cv_coh'] or key not in row['npmi_coh']:
            continue
        records.append({
            'cv_coh': row['cv_coh'][key],
            'npmi_coh': row['npmi_coh'][key],
            'top': top,
            'max_margin_loss': row['max_margin_loss'][1],
            'id': row['id'],
            # Shifted by +1; this column is used as the marker size in the scatter plot.
            'plot_npmi_coh': row['npmi_coh'][key] + 1,
            'embedding_size': run_config['embedding_size'],
            'aspect_size': run_config['aspect_size'],
            'learning_rate': run_config['learning_rate'],
            'batch_size': run_config['batch_size'],
            'epochs': run_config['epochs'],
        })

# Build the frame once from the collected records; `columns` keeps the schema
# stable even when no record was collected.
data = pd.DataFrame(records, columns=columns)

In [None]:
# Per-run totals: every numeric column summed over the rows sharing a run id.
data.groupby(by=['id']).sum()

In [None]:
import plotly.express as px

# Extra columns shown when hovering over a point.
hover_columns = ['npmi_coh', 'embedding_size', 'aspect_size', 'epochs', 'learning_rate', 'batch_size']

# One marker per (run, top-N) row: loss on x, CV coherence on y, marker symbol
# per top-N, colour per run id, marker size from the shifted NPMI coherence.
scatter_options = dict(
    x="max_margin_loss",
    y="cv_coh",
    symbol="top",
    color='id',
    size='plot_npmi_coh',
    hover_data=hover_columns,
    title='Hyperparameter Tuning Results',
)
fig = px.scatter(data, **scatter_options)
fig.show()

In [None]:
# The default proposed model parameters for ABAE seem to be fitting for our domain as well
# We take the default one and

In [None]:
# TODO: consider whether to also look at 537398ce-4642-4e83-8073-ee795b63f1c1,
# which has top 3 ahead of top 25.
# Run ids shortlisted so far:
configs = [
    '31479139-17f6-4f6d-aa8b-494cbc8f183b',
    '3f192c54-6623-48a7-b01b-2d5019dad186',
]

In [None]:
# Average every column per run id, then draw one path per run across its
# hyperparameter values, coloured by the run's mean max-margin loss.
per_run_means = data.groupby(['id']).mean()
hyperparameter_axes = ['learning_rate', 'batch_size', 'aspect_size', 'embedding_size', 'epochs']

fig = px.parallel_categories(
    per_run_means,
    color='max_margin_loss',
    dimensions=hyperparameter_axes,
)
fig.show()