In [None]:
# TODO pulisci questo file
# https://arxiv.org/pdf/1803.09820
# https://machinelearningmastery.com/learning-rate-for-deep-learning-neural-networks/

In [1]:
import os

os.environ['KERAS_BACKEND'] = "torch"

## Hands on first attempt:


In [None]:
 # Without any hp tuning we just try and see how it goes.

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset.
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] 
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors
> ~ Ruidan

We use an Orthogonal Regulizer definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/)

## Aspect Embedding Size
The aspect embedding size is what will be inferring aspects. It is closest to representative words (?). <br />
We have to identify 7 actual aspects (luck, bookkeeping, downtime...) but that does not mean our matrix should be limited to rows only! What size to search is a good question and should be studied (Which I may be doing later). 

For the first try we setup the aspect_size:
>The optimal number of rows is problem-dependent, so it’s crucial to: <br/>
> Start with a heuristic: Begin with 2–3x the number of aspects.

For **aspect extraction**, which involves identifying key aspects or topics in text, the best early stopping method depends on your approach:

### 1. Embedding-based Methods (e.g., Clustering Embeddings)
- **Silhouette Score**: Measure the separation and compactness of clusters. Stop when the score stabilizes.
- **Inertia/Distortion**: Track the sum of squared distances within clusters and stop when improvement flattens.
- **Centroid Movement**: Stop when the change in cluster centroids across iterations is minimal.

### 2. Topic Modeling (e.g., LDA)
- **Perplexity**: Monitor the perplexity on a held-out dataset and stop when it stops decreasing significantly.
- **Coherence Score**: Measure the semantic consistency of extracted topics and stop when it stabilizes.

### 3. Autoencoder-based Aspect Extraction
- **Reconstruction Loss**: Stop training when the validation reconstruction error no longer improves.

### 4. Qualitative Evaluation (if feasible)
- Periodically inspect extracted aspects for meaningfulness and diversity to decide on stopping.

For **aspect extraction**, combining an automated metric (like coherence score or silhouette score) with manual inspection often yields the best results.


## Hyperparameters Tuning
To tune our parameters we use a filtered version of the 50k ds. <br>
We filter out rows that can be found on the 200k ds.

In [None]:
import pandas as pd

# This is based on the idea that our dataset are generated with different seeds else it won't work
large = pd.read_csv("../output/dataset/pre-processed/200k.preprocessed.csv")
small = pd.read_csv("../output/dataset/pre-processed/100k.preprocessed.csv")
tuning_set = small[~small["comments"].isin(large["comments"])]

tuning_set.to_csv("../output/dataset/pre-processed/tuning.preprocessed.csv", index=False)

> The main goal of ABAE is to extract interpretable and meaningful aspects, which makes coherence the more aligned metric. Reconstruction error might help guide training but doesn’t guarantee that the extracted aspects are semantically useful.

In [2]:
from torch.utils.data import DataLoader
import torch
from core.evaluation import normalize, get_aspect_top_k_words, coherence_per_aspect
from core.hp_tuning import ABAERandomHyperparametersSelectionWrapper
from core.train import AbaeModelManager, AbaeModelConfiguration
from core.dataset import PositiveNegativeCommentGeneratorDataset

import json
import numpy as np

from uuid import uuid4

print("Starting process")
hp_wrapper = ABAERandomHyperparametersSelectionWrapper.create()
configurations = 15  # We try 15 different configurations

seen_configurations = set()
seen_configurations.add(
    frozenset({
                  # I failed on this
                  'aspect_size': 18,
                  'embedding_size': 300,
                  'epochs': 15,
                  'batch_size': 64,
                  'learning_rate': 0.01778279410038923,
                  'decay_rate': 0.9450000000000001,
                  'momentum': 0.97,
                  'negative_sample_size': 15
              }.items()))

seen_configurations.add(
    frozenset({
                  'aspect_size': 18,
                  'batch_size': 64,
                  'decay_rate': 0.9450000000000001,
                  'embedding_size': 300,
                  'epochs': 15,
                  'learning_rate': 0.01778279410038923,
                  'momentum': 0.97,
                  'negative_sample_size': 15
              }.items()))

seen_configurations.add(
    frozenset({
                  "aspect_size": 16,
                  "embedding_size": 200,
                  "epochs": 12,
                  "batch_size": 64,
                  "learning_rate": 0.0031622776601683794,
                  "decay_rate": 0.93,
                  "momentum": 0.9400000000000001,
                  "negative_sample_size": 15
              }.items())
)

scores = list()

corpus_file = "../output/dataset/pre-processed/tuning.preprocessed.csv"

for i in range(configurations):
    uuid = uuid4()
    parameters = next(hp_wrapper)

    while seen_configurations.__contains__(frozenset(parameters.items())):
        print(f"We already worked on configuration: {parameters}")
        parameters = next(hp_wrapper)  # In case we fetch the same config more than once.

    print(f"Working on configuration: {parameters}")
    seen_configurations.add(frozenset(parameters.items()))

    run_scores = []
    run_sores_per_aspect = []

    # One epoch takes approx {~128s} on my machine. Worst case (15 epochs) we have a total of {~32m} x 3.
    # At most a model takes 1.5 h to complete. It's a bit much!
    for runs in range(3):
        # Train process

        ## You should keep the same embeddings model for each iteration of the same configuration.
        # This ensures that the comparison focuses solely on the impact of the hyperparameter settings
        # and avoids introducing additional variability from reinitializing the embeddings.
        #
        # Consistency in the embeddings makes your evaluation of robustness more reliable.
        config = AbaeModelConfiguration(corpus_file=corpus_file, model_name=f"tuning_{uuid}", **parameters)
        manager = AbaeModelManager(config)

        vocab = manager.embedding_model.vocabulary()
        # The dataset generation depends on the embedding model
        ds = PositiveNegativeCommentGeneratorDataset(
            config.corpus_file, vocabulary=vocab, negative_size=config.negative_sample_size
        )

        # Ensure that the same splits are always created.
        # We use more point to test just to speed up.
        # We are not accounting the possibility that the data might be unbalanced.
        train, validation = torch.utils.data.random_split(ds, [0.75, 0.25], generator=torch.Generator().manual_seed(42))
        print("Training process in progress..")
        results, iteration_model = manager.run_train_process(train)
        # Coherence is not good enough. We also track reconstruction error.
        test_dataloader = DataLoader(dataset=validation, batch_size=config.batch_size, shuffle=True)
        evaluation_results = iteration_model.evalutate(test_dataloader)

        print(results)
        # Evaluate the model
        # We evaluate on the relative coherence between topics.
        print("Evaluating model")
        word_emb = normalize(iteration_model.get_layer('word_embedding').weights[0].value.data)

        aspect_embeddings = normalize(iteration_model.get_layer('aspect_embedding').w)
        print(f"Word embeddings shape: {word_emb.shape}")
        inv_vocab = manager.embedding_model.model.wv.index_to_key

        aspects_top_k_words = [get_aspect_top_k_words(a, word_emb, inv_vocab, top_k=50) for a in aspect_embeddings]

        aspect_words = [[word[0] for word in aspect] for aspect in aspects_top_k_words]

        coherence, coherence_model = coherence_per_aspect(aspect_words, ds.text_ds.loc[validation.indices], 10)

        run_scores.append(coherence_model.get_coherence())
        run_sores_per_aspect.append(coherence)

        if runs == 2:
            json.dump(dict(scores=run_scores, run_sores_per_aspect=run_sores_per_aspect, params=parameters),
                      open(manager.output_path + "/run_results.json", "w"))
    scores.append(dict(coherence=np.mean(run_scores), parameters=parameters))

# End done.
print(scores)

Starting process
We already worked on configuration: {'aspect_size': 18, 'embedding_size': 300, 'epochs': 15, 'batch_size': 64, 'learning_rate': 0.01778279410038923, 'decay_rate': 0.9450000000000001, 'momentum': 0.97, 'negative_sample_size': 15}
We already worked on configuration: {'aspect_size': 16, 'embedding_size': 200, 'epochs': 12, 'batch_size': 64, 'learning_rate': 0.0031622776601683794, 'decay_rate': 0.93, 'momentum': 0.9400000000000001, 'negative_sample_size': 15}
Working on configuration: {'aspect_size': 20, 'embedding_size': 200, 'epochs': 12, 'batch_size': 128, 'learning_rate': 0.1, 'decay_rate': 0.975, 'momentum': 0.98, 'negative_sample_size': 20}


Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 119085 words, keeping 4857 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 239721 words, keeping 5207 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 359702 words, keeping 5263 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 477300 words, keeping 5274 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 598508 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 721182 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 838524 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 957006 words, keeping 5280 wor

exceptions must derive from BaseException


INFO:gensim.models.word2vec:estimated required memory for 5280 words and 200 dimensions: 11088000 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.utils:Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-01-18T20:28:30.734652', 'gensim': '4.3.3', 'python': '3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'build_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training model with 8 workers on 5280 vocabulary and 200 features, using sg=1 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-01-18T20:28:30.734652', 'gensim': '4.3.3', 'python': '3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'train'}
DEBUG:gensim.models.word2vec:job loop exiting, total 101 jobs
DEBUG:gensim.models.word2vec:worker exiting, processed 12 jobs
DEBUG:gensim.

exceptions must derive from BaseException
Loading dataset from file: ../output/dataset/pre-processed/tuning.preprocessed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

Max sequence length calculation in progress...
Max sequence length is:  274 . The limit is set to 80 tokens.
We loose information on 54 points.This is 0.06439687555900066% of the dataset.
Padding sequences to length (80).
Training process in progress..


DEBUG:h5py._conv:Creating converter from 5 to 3


492/492 - 116s - 237ms/step - loss: 13.2846 - max_margin_loss: 13.2328


AttributeError: 'Functional' object has no attribute 'evalutate'

In [None]:
r = iteration_model.evaluate(test_dataloader)

In [5]:
print(scores)
# [{'coherence': -13.677789412100989, 'parameters': {'aspect_size': 17, 'embedding_size': 150, 'epochs': 20, 'batch_size': 64}}] #1st config
"""
[{'coherence': -14.44752463324073,
  'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 15, 'batch_size': 64}},
 {'coherence': -13.02996122137577,
  'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 15, 'batch_size': 128}},
 {'coherence': -13.172903114163981,
  'parameters': {'aspect_size': 18, 'embedding_size': 200, 'epochs': 10, 'batch_size': 128}},
 {'coherence': -13.961204280840835,
  'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 20, 'batch_size': 128}},
 {'coherence': -14.993132489678818,
  'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 5, 'batch_size': 64}},
 {'coherence': -13.23495325584567,
  'parameters': {'aspect_size': 15, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}},
 {'coherence': -13.501908255646923,
  'parameters': {'aspect_size': 14, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}}]
"""

[{'coherence': -14.44752463324073, 'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 15, 'batch_size': 64}}, {'coherence': -13.02996122137577, 'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 15, 'batch_size': 128}}, {'coherence': -13.172903114163981, 'parameters': {'aspect_size': 18, 'embedding_size': 200, 'epochs': 10, 'batch_size': 128}}, {'coherence': -13.961204280840835, 'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 20, 'batch_size': 128}}, {'coherence': -14.993132489678818, 'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 5, 'batch_size': 64}}, {'coherence': -13.23495325584567, 'parameters': {'aspect_size': 15, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}}, {'coherence': -13.501908255646923, 'parameters': {'aspect_size': 14, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}}]


# Focus on learning rate

In [None]:
# We fix other params and now focus entirely on lr.
# We have already a "promising" range defined.
# We look in that space so we redefine lr on ABAERandomHyperparametersSelectionWrapper

## Best found model training:

## See if the Hp tuning really improved upon our results:
We used SGD anda learned its parameters under the assumption that we would do better. <br>
Let's see if it really is the case, or we just wasted time.

For comparison we use Adam that has the advantage of being robust enough without parameter scouting.

In [None]:
#todo

In [6]:
parameters

{'aspect_size': 18,
 'embedding_size': 300,
 'epochs': 15,
 'batch_size': 64,
 'learning_rate': 0.01778279410038923,
 'decay_rate': 0.9450000000000001,
 'momentum': 0.97,
 'negative_sample_size': 15}

In [24]:
seen_configurations

{frozenset({('aspect_size', 16),
            ('batch_size', 64),
            ('decay_rate', 0.93),
            ('embedding_size', 200),
            ('epochs', 12),
            ('learning_rate', 0.0031622776601683794),
            ('momentum', 0.9400000000000001),
            ('negative_sample_size', 15)}),
 frozenset({('aspect_size', 18),
            ('batch_size', 64),
            ('decay_rate', 0.9450000000000001),
            ('embedding_size', 300),
            ('epochs', 15),
            ('learning_rate', 0.01778279410038923),
            ('momentum', 0.97),
            ('negative_sample_size', 15)}),
 frozenset({('aspect_size', 20),
            ('batch_size', 128),
            ('decay_rate', 0.975),
            ('embedding_size', 200),
            ('epochs', 12),
            ('learning_rate', 0.1),
            ('momentum', 0.98),
            ('negative_sample_size', 20)})}