In [None]:
# https://www.reddit.com/r/computervision/comments/kfhc3u/how_does_one_fine_tune_cnn_hyperparameter_when/
# Look into approaches such as genetic algorithms (as used by YOLO).
# Tuning hyperparameters in the context of large datasets can be a problem. I should investigate further.

In [None]:
from pprint import pprint
from uuid import uuid4

import numpy as np
from torch.utils.data import DataLoader

from main.hp_tuning import UniqueParametersConfigFsGenerator, RandomTunableOffsetParameter, \
    RandomTunableDiscreteParameter

# Single seed shared by every tunable parameter and by numpy's global RNG below.
seed = 1408
# Path handed to the generator as `seen_configurations_path`.
config_path = "./output/config"
config_gen = UniqueParametersConfigFsGenerator(
    patience=10,
    seen_configurations_path=config_path,
)

# --- Search space, part 1: discrete embedding sizes and range-based parameters ---
embedding_sizes = [70, 100, 160, 250, 340, 430]
config_gen.add_parameter(
    'embedding_size',
    RandomTunableDiscreteParameter(values_list=embedding_sizes, seed=seed),
)

# The range-based parameters share the same step and seed; only name and range differ.
# Each parameter object is built right before it is registered, in this exact order.
for _name, _value_range in (
    ('aspect_size', (7, 20)),
    ('negative_sample_size', (8, 20)),
    ('epochs', (5, 15)),
):
    config_gen.add_parameter(
        _name,
        RandomTunableOffsetParameter(value_range=_value_range, step=2, seed=seed),
    )

# --- Search space, part 2: learning rate and batch size ---
# Ten candidate learning rates, drawn log-uniformly from [1e-5, 1e-3):
# the exponent is sampled uniformly in [-5, -3) and then used as a power of 10.
np.random.seed(seed)
_lr_exponents = np.random.uniform(-5, -3, 10)
learning_rates = (10 ** _lr_exponents).tolist()

print("Possible learning rates are: ")
pprint(learning_rates)

for _name, _candidates in (
    ("learning_rate", learning_rates),
    ("batch_size", [64, 128, 256, 512, 1024]),
):
    config_gen.add_parameter(
        _name,
        RandomTunableDiscreteParameter(values_list=_candidates, seed=seed),
    )

K-fold cross-validation would be preferable, but time constraints make it infeasible. <br>
Since the dataset is large enough, we resort to a classic hold-out validation set instead.

In [None]:
from main.abae.dataset import PositiveNegativeABAEDataset
from main.abae.evaluation import ABAEEvaluationProcessor
from main.abae.model_manager import ABAEManagerConfig, ABAEManager
import pandas as pd

# Load the pre-processed corpus; the code below reads its 'comments' column.
corpus_path = "../dataset/output/pos_tagged/pre_processed.310k.noun_only.csv"
corpus = pd.read_csv(corpus_path)

# Hold-out split: the corpus is cut into 4 contiguous chunks. The first chunk
# (~25% of the rows) is the validation set, the other three form the training set.
split_dataset = np.array_split(corpus, 4)
validation_split = split_dataset[0]
train = pd.concat(split_dataset[1:])

# Tokenised validation comments handed to the coherence models as `ds`.
# This does not depend on the sampled configuration, so it is computed once,
# up front; a malformed row therefore fails before any expensive training run.
validation = validation_split['comments'].apply(lambda x: x.split(' '))

results = []  # One entry per evaluated configuration (see `run_result` below).
n_folds = 5  # NOTE(review): unused in this cell; kept in case another cell reads it.
n_configurations = 1  # How many different configurations we want to see
top_n_values = [3, 10, 25]  # `top_n` values at which coherence is measured

# This script can be re-run as often as desired as the history is persisted
for i in range(n_configurations):
    config = next(config_gen)
    run_id = uuid4()

    # The denominator is the real loop bound (it used to be a hard-coded 10).
    print(f"Running configuration = {config} ({i + 1}/{n_configurations})")
    run_result = dict(
        config=config,
        cv_coh=[],
        npmi_coh=[],
        max_margin_loss=None,   # filled with the value returned by model.evaluate(...)
        silhouette_score=None,  # filled with processor.silhouette_score()
    )
    abae_config = ABAEManagerConfig.from_configuration(str(run_id), config)
    abae_manager = ABAEManager.from_scratch(abae_config, train, override=True)

    # Train on the training split.
    abae_manager.train(train)

    # Evaluation, step 1: model.evaluate on the validation split
    # (stored as 'max_margin_loss').
    vocabulary = abae_manager.generator.emb_model.vocabulary()
    eval_ds = PositiveNegativeABAEDataset(
        validation_split,
        vocabulary,
        abae_config.max_seq_len,
        abae_config.negative_sample_size,
    )
    model = abae_manager.get_compiled_model(refresh=False)
    run_result['max_margin_loss'] = model.evaluate(
        DataLoader(eval_ds, batch_size=abae_config.batch_size)
    )

    # Evaluation, step 2: silhouette score and topic-coherence metrics.
    processor = ABAEEvaluationProcessor(abae_manager, validation_split)
    run_result['silhouette_score'] = processor.silhouette_score()

    for top in top_n_values:
        # Both coherence models use the current `top`. Previously C_v was always
        # computed with top_n=100 while being stored under the key `top`, which
        # yielded the same value three times under different labels.
        cv_coh = processor.c_v_coherence_model(top_n=top, ds=validation)
        npmi_coh = processor.c_npmi_coherence_model(top_n=top, ds=validation)
        run_result['cv_coh'].append({top: cv_coh.get_coherence()})
        run_result['npmi_coh'].append({top: npmi_coh.get_coherence()})

    # Keep the outcome of this run; without this `results` would stay empty.
    results.append(run_result)


Let's analyze the results: