In [None]:
# TODO: clean up this file
# https://arxiv.org/pdf/1803.09820
# https://machinelearningmastery.com/learning-rate-for-deep-learning-neural-networks/

In [1]:
import os

# Keras selects its backend from the KERAS_BACKEND environment variable, so this
# has to be set before Keras is imported anywhere in the notebook.
KERAS_BACKEND_NAME = "torch"
os.environ["KERAS_BACKEND"] = KERAS_BACKEND_NAME

## Hands on first attempt:


In [None]:
# First attempt: no hyperparameter tuning yet, we just train and see how it goes.

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset.
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] 
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors
> ~ Ruidan He et al., *An Unsupervised Neural Attention Model for Aspect Extraction* (ACL 2017)

We use an orthogonal regularizer; a definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/).

## Aspect Embedding Size
The aspect embedding matrix is what the model uses to infer aspects: each row is an aspect vector, and an aspect is read through the words whose embeddings are closest to it (?). <br />
We have to identify 7 actual aspects (luck, bookkeeping, downtime...), but that does not mean our matrix should be limited to 7 rows! What size to search over is a good question that deserves its own study (which I may do later).

For the first try we set up `aspect_size` following this heuristic:
>The optimal number of rows is problem-dependent, so it’s crucial to: <br/>
> Start with a heuristic: Begin with 2–3x the number of aspects.

For **aspect extraction**, which involves identifying key aspects or topics in text, the best early stopping method depends on your approach:

### 1. Embedding-based Methods (e.g., Clustering Embeddings)
- **Silhouette Score**: Measure the separation and compactness of clusters. Stop when the score stabilizes.
- **Inertia/Distortion**: Track the sum of squared distances within clusters and stop when improvement flattens.
- **Centroid Movement**: Stop when the change in cluster centroids across iterations is minimal.

### 2. Topic Modeling (e.g., LDA)
- **Perplexity**: Monitor the perplexity on a held-out dataset and stop when it stops decreasing significantly.
- **Coherence Score**: Measure the semantic consistency of extracted topics and stop when it stabilizes.

### 3. Autoencoder-based Aspect Extraction
- **Reconstruction Loss**: Stop training when the validation reconstruction error no longer improves.

### 4. Qualitative Evaluation (if feasible)
- Periodically inspect extracted aspects for meaningfulness and diversity to decide on stopping.

For **aspect extraction**, combining an automated metric (like coherence score or silhouette score) with manual inspection often yields the best results.


## Hyperparameters Tuning
To tune our hyperparameters we use a filtered version of the 100k ds. <br>
We filter out the rows whose comments also appear in the 200k ds.

In [None]:
import pandas as pd

# Build the tuning set: the rows of the 100k dataset whose comment does not
# appear in the 200k dataset.
# This relies on the two datasets having been generated with different seeds;
# otherwise the approach won't work.
large = pd.read_csv("../output/dataset/pre-processed/200k.preprocessed.csv")
small = pd.read_csv("../output/dataset/pre-processed/100k.preprocessed.csv")

# Boolean mask over `small`: True where the comment is also present in `large`.
already_in_large = small["comments"].isin(large["comments"])
tuning_set = small.loc[~already_in_large]

tuning_set.to_csv(
    "../output/dataset/pre-processed/tuning.preprocessed.csv",
    index=False,
)

> The main goal of ABAE is to extract interpretable and meaningful aspects, which makes coherence the more aligned metric. Reconstruction error might help guide training but doesn’t guarantee that the extracted aspects are semantically useful.

In [2]:
import torch
from core.evaluation import normalize, get_aspect_top_k_words, coherence_per_aspect
from core.hp_tuning import ABAERandomHyperparametersSelectionWrapper
from core.train import AbaeModelManager, AbaeModelConfiguration
from core.dataset import PositiveNegativeCommentGeneratorDataset

from uuid import uuid4

from torch.utils.data import DataLoader

# Random hyperparameter search for the ABAE model.
# For each sampled configuration we train a model on 80% of the tuning corpus and
# score it with the coherence of the top words of each aspect, computed on the
# texts of the remaining 20%. Results are accumulated in `scores`.
print("Starting process")
hp_wrapper = ABAERandomHyperparametersSelectionWrapper.create()
configurations = 15  # We try 15 different configurations

# Configurations that must not be evaluated (again), stored as frozensets of
# (name, value) pairs so that they are hashable.
# NOTE(review): this pre-seeded entry only has 4 keys. If the wrapper yields
# configurations with additional keys, this entry can never match a sampled
# configuration and is effectively dead - confirm and update it.
seen_configurations = set()
seen_configurations.add(frozenset({'aspect_size': 17, 'embedding_size': 150, 'epochs': 20, 'batch_size': 64}.items()))

scores = list()

corpus_file = "../output/dataset/pre-processed/tuning.preprocessed.csv"

for i in range(configurations):
    uuid = uuid4()  # Unique suffix so every tuning run gets its own model name.
    parameters = next(hp_wrapper)
    while frozenset(parameters.items()) in seen_configurations:
        print(f"We already worked on configuration: {parameters}")
        parameters = next(hp_wrapper)  # In case we fetch the same config more than once.
    print(f"Working on configuration: {parameters}")
    seen_configurations.add(frozenset(parameters.items()))

    # Train process
    config = AbaeModelConfiguration(corpus_file=corpus_file, model_name=f"tuning_{uuid}", **parameters)
    manager = AbaeModelManager(config)

    # The dataset generation depends on the embedding model.
    # The number of negative samples follows the sampled configuration when it
    # provides one, so the dataset cannot drift from what the model was configured
    # with; 15 is kept as the fallback (the previously hard-coded value).
    # NOTE(review): assumes 'negative_sample_size' and `negative_size` describe the
    # same quantity - confirm against core.dataset / core.train.
    ds = PositiveNegativeCommentGeneratorDataset(
        vocabulary=manager.embedding_model.vocabulary(),
        csv_dataset_path=config.corpus_file,
        negative_size=parameters.get("negative_sample_size", 15),
    )

    # Fixed generator seed: every configuration is trained/evaluated on the same split.
    train, validation = torch.utils.data.random_split(ds, [0.8, 0.2], generator=torch.Generator().manual_seed(42))
    print("Training process in progress..")
    results, iteration_model = manager.run_train_process(train)

    # Print the recorded metrics when `results` exposes them (a Keras History
    # object does, via `.history`) instead of the bare object repr.
    print(getattr(results, "history", results))

    # Evaluate the model
    # We evaluate on the relative coherence between topics.
    print("Evaluating model")
    word_emb = normalize(iteration_model.get_layer('word_embedding').weights[0].value.data)

    aspect_embeddings = normalize(iteration_model.get_layer('aspect_embedding').w)
    print(f"Word embeddings shape: {word_emb.shape}")
    inv_vocab = manager.embedding_model.model.wv.index_to_key

    # For every aspect vector, collect its 50 top words among the word embeddings.
    aspects_top_k_words = [get_aspect_top_k_words(a, word_emb, inv_vocab, top_k=50) for a in aspect_embeddings]

    # Keep only the first element of each returned entry (presumably the word
    # itself, with its score dropped - confirm against core.evaluation).
    aspect_words = [[word[0] for word in aspect] for aspect in aspects_top_k_words]

    # NOTE(review): `validation.indices` are positions inside `ds`, while `.loc` is
    # label-based; this assumes `ds.text_ds` has a default 0..n-1 index - confirm.
    # The trailing 10 is presumably the number of top words used per aspect - confirm.
    coherence, coherence_model = coherence_per_aspect(aspect_words, ds.text_ds.loc[validation.indices], 10)
    scores.append(dict(coherence=coherence_model.get_coherence(), parameters=parameters))

# End done.
print(scores)

Starting process
Working on configuration: {'aspect_size': 18, 'embedding_size': 300, 'epochs': 1, 'batch_size': 64, 'learning_rate': 0.001, 'decay_rate': 0.9450000000000001, 'momentum': 0.97, 'negative_sample_size': 15}


Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 119085 words, keeping 4857 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 239721 words, keeping 5207 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 359702 words, keeping 5263 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 477300 words, keeping 5274 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 598508 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 721182 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 838524 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 957006 words, keeping 5280 wor

exceptions must derive from BaseException


INFO:gensim.models.word2vec:estimated required memory for 5280 words and 300 dimensions: 15312000 bytes
INFO:gensim.models.word2vec:resetting layer weights
INFO:gensim.utils:Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2025-01-11T15:16:28.478121', 'gensim': '4.3.3', 'python': '3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'build_vocab'}
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'training model with 8 workers on 5280 vocabulary and 300 features, using sg=1 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2025-01-11T15:16:28.479122', 'gensim': '4.3.3', 'python': '3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'train'}
DEBUG:gensim.models.word2vec:job loop exiting, total 101 jobs
DEBUG:gensim.models.word2vec:worker exiting, processed 12 jobs
DEBUG:gensim.

exceptions must derive from BaseException
Loading dataset from file: ../output/dataset/pre-processed/tuning.preprocessed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

Max sequence length calculation in progress...
Max sequence length is:  274 . The limit is set to 80 tokens.
We loose information on 54 points.This is 0.06439687555900066% of the dataset.
Padding sequences to length (80).
Training process in progress..
[1m1048/1049[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 131ms/step - loss: 13.9122 - max_margin_loss: 13.8530

DEBUG:h5py._conv:Creating converter from 5 to 3


[1m1049/1049[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 131ms/step - loss: 13.9098 - max_margin_loss: 13.8507
<keras.src.callbacks.history.History object at 0x0000027C36BCEA20>
Evaluating model
Word embeddings shape: torch.Size([5280, 300])

Given aspect most representative words are:
Word:  distribute (0.7574232816696167)
Word:  correspond (0.7515112161636353)
Word:  ideia (0.7118360996246338)
Word:  ascend (0.7110495567321777)
Word:  cylinder (0.7081038951873779)
Word:  adjacent (0.6909685134887695)
Word:  randomize (0.6881609559059143)
Word:  coloured (0.6871917247772217)
Word:  select (0.6845979690551758)
Word:  assistant (0.6817619800567627)
Word:  landscape (0.6817315816879272)
Word:  associate (0.67988121509552)
Word:  guild (0.6798262000083923)
Word:  retrieve (0.677538275718689)
Word:  designate (0.6736943125724792)
Word:  assign (0.6718295812606812)
Word:  airship (0.6717966198921204)
Word:  slot (0.6717431545257568)
Word:  occupation (0.6701945066452026)
Word: 

INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 1000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 2000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 3000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 4000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 5000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 6000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 7000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 8000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 9000 documents
INFO:gensim.topic_coherence.text_analysis:CorpusAccumulator accumulated stats from 10000 documents
INFO:gensim.topic_c

Working on configuration: {'aspect_size': 16, 'embedding_size': 200, 'epochs': 1, 'batch_size': 64, 'learning_rate': 0.001, 'decay_rate': 0.93, 'momentum': 0.9400000000000001, 'negative_sample_size': 15}


Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 119085 words, keeping 4857 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 239721 words, keeping 5207 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 359702 words, keeping 5263 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 477300 words, keeping 5274 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 598508 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 721182 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 838524 words, keeping 5279 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 957006 words, keeping 5280 wor

exceptions must derive from BaseException


DEBUG:gensim.models.word2vec:job loop exiting, total 101 jobs
DEBUG:gensim.models.word2vec:worker exiting, processed 13 jobs
DEBUG:gensim.models.word2vec:worker thread finished; awaiting finish of 7 more threads
DEBUG:gensim.models.word2vec:worker exiting, processed 12 jobs
DEBUG:gensim.models.word2vec:worker thread finished; awaiting finish of 6 more threads
DEBUG:gensim.models.word2vec:worker exiting, processed 13 jobs
DEBUG:gensim.models.word2vec:worker exiting, processed 12 jobs
DEBUG:gensim.models.word2vec:worker thread finished; awaiting finish of 5 more threads
DEBUG:gensim.models.word2vec:worker exiting, processed 12 jobs
DEBUG:gensim.models.word2vec:worker exiting, processed 12 jobs
DEBUG:gensim.models.word2vec:worker exiting, processed 13 jobs
DEBUG:gensim.models.word2vec:worker exiting, processed 14 jobs
DEBUG:gensim.models.word2vec:worker thread finished; awaiting finish of 4 more threads
DEBUG:gensim.models.word2vec:worker thread finished; awaiting finish of 3 more threads

exceptions must derive from BaseException
Loading dataset from file: ../output/dataset/pre-processed/tuning.preprocessed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/83855 [00:00<?, ?it/s]

Max sequence length calculation in progress...
Max sequence length is:  274 . The limit is set to 80 tokens.
We loose information on 54 points.This is 0.06439687555900066% of the dataset.
Padding sequences to length (80).
Training process in progress..
[1m 113/1049[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m1:57[0m 125ms/step - loss: 15.0733 - max_margin_loss: 15.0049

KeyboardInterrupt: 

In [5]:
# Show the coherence scores collected by the tuning loop (one dict per configuration).
print(scores)
# [{'coherence': -13.677789412100989, 'parameters': {'aspect_size': 17, 'embedding_size': 150, 'epochs': 20, 'batch_size': 64}}] #1st config
# Snapshot of earlier results, kept for reference as a bare string literal
# (it takes no part in any computation).
"""
[{'coherence': -14.44752463324073,
  'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 15, 'batch_size': 64}},
 {'coherence': -13.02996122137577,
  'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 15, 'batch_size': 128}},
 {'coherence': -13.172903114163981,
  'parameters': {'aspect_size': 18, 'embedding_size': 200, 'epochs': 10, 'batch_size': 128}},
 {'coherence': -13.961204280840835,
  'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 20, 'batch_size': 128}},
 {'coherence': -14.993132489678818,
  'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 5, 'batch_size': 64}},
 {'coherence': -13.23495325584567,
  'parameters': {'aspect_size': 15, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}},
 {'coherence': -13.501908255646923,
  'parameters': {'aspect_size': 14, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}}]
"""

[{'coherence': -14.44752463324073, 'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 15, 'batch_size': 64}}, {'coherence': -13.02996122137577, 'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 15, 'batch_size': 128}}, {'coherence': -13.172903114163981, 'parameters': {'aspect_size': 18, 'embedding_size': 200, 'epochs': 10, 'batch_size': 128}}, {'coherence': -13.961204280840835, 'parameters': {'aspect_size': 19, 'embedding_size': 200, 'epochs': 20, 'batch_size': 128}}, {'coherence': -14.993132489678818, 'parameters': {'aspect_size': 16, 'embedding_size': 150, 'epochs': 5, 'batch_size': 64}}, {'coherence': -13.23495325584567, 'parameters': {'aspect_size': 15, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}}, {'coherence': -13.501908255646923, 'parameters': {'aspect_size': 14, 'embedding_size': 100, 'epochs': 15, 'batch_size': 32}}]


In [5]:
# NOTE(review): looks like a leftover smoke-test cell (it only prints "A"); candidate for removal.
print("A")

A


## Best found model training:

### 64k - Default

In [None]:
# How to Address Issues (If Any):
# Introduce Hard Negatives:
# Instead of randomly selecting negative samples, use hard negatives—examples that are more challenging to distinguish from positive pairs. This keeps the max-margin loss informative and prevents the model from converging too quickly.

# Regularization:
# Apply regularization (e.g., L2 regularization) to prevent overfitting and ensure the model generalizes well.

# Early Stopping:
# If the loss plateaus and aspect quality is satisfactory, consider using early stopping to avoid unnecessary training.