In [None]:
# TODO: clean up this file
# https://arxiv.org/pdf/1803.09820
# https://machinelearningmastery.com/learning-rate-for-deep-learning-neural-networks/

In [None]:
import os

# Select the "torch" backend for Keras through the process environment.
# NOTE(review): presumably this must run before Keras is first imported — confirm.
os.environ.update(KERAS_BACKEND="torch")

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset. <br>
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] <br>
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors <br>
> ~ Ruidan

We use an orthogonal regularizer; a definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/).

## Hands on first attempt:


In [None]:
from core.dataset import PositiveNegativeCommentGeneratorDataset
from core.train import AbaeModelManager, AbaeModelConfiguration

# Hands-on baseline: one training run on the 200k pre-processed corpus using
# the default model configuration.
corpus = "../output/dataset/pre-processed/200k.preprocessed.csv"
# The model name has no placeholders, so a plain string literal is used.
config = AbaeModelConfiguration(corpus_file=corpus, model_name="hands_on")

print(f"Running on default config:\n {config}")

# Without any hp tuning we just try and see how it goes.
manager = AbaeModelManager(config)

# The training dataset reads the same corpus file as the configuration and
# takes its vocabulary from the manager's embedding model.
train_dataset = PositiveNegativeCommentGeneratorDataset(
    vocabulary=manager.embedding_model.vocabulary(),
    csv_dataset_path=config.corpus_file,
    negative_size=config.negative_sample_size,
)

manager.run_train_process(train_dataset)

[1m2459/3284[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m3:47[0m 276ms/step - loss: 7.1087 - max_margin_loss: 7.1019

## Aspect Embedding Size
The aspect embedding size is what will be inferring aspects. It is closest to representative words (?). <br />
We have to identify 7 actual aspects (luck, bookkeeping, downtime...), but that does not mean our matrix should be limited to 7 rows only! <br>

For the first try we set up the aspect_size:
>The optimal number of rows is problem-dependent, so it’s crucial to: <br/>
> Start with a heuristic: Begin with 2–3x the number of aspects.

For **aspect extraction**, which involves identifying key aspects or topics in text, the best early stopping method depends on your approach:

### 1. Embedding-based Methods (e.g., Clustering Embeddings)
- **Silhouette Score**: Measure the separation and compactness of clusters. Stop when the score stabilizes.
- **Inertia/Distortion**: Track the sum of squared distances within clusters and stop when improvement flattens.
- **Centroid Movement**: Stop when the change in cluster centroids across iterations is minimal.

### 2. Topic Modeling (e.g., LDA)
- **Perplexity**: Monitor the perplexity on a held-out dataset and stop when it stops decreasing significantly.
- **Coherence Score**: Measure the semantic consistency of extracted topics and stop when it stabilizes.

### 3. Autoencoder-based Aspect Extraction
- **Reconstruction Loss**: Stop training when the validation reconstruction error no longer improves.

### 4. Qualitative Evaluation (if feasible)
- Periodically inspect extracted aspects for meaningfulness and diversity to decide on stopping.

For **aspect extraction**, combining an automated metric (like coherence score or silhouette score) with manual inspection often yields the best results.


## Hyperparameters Tuning
To tune our parameters we use a filtered version of the 50k dataset. <br>
We filter out the rows that also appear in the 200k dataset.

In [None]:
import pandas as pd

# Build the tuning set: rows of the 100k file whose "comments" value does not
# occur in the 200k file.
# This relies on the two datasets having been generated with different seeds;
# otherwise it won't work.
# NOTE(review): the accompanying markdown mentions a "50k" dataset while this
# cell reads the 100k file — confirm which one is intended.
large = pd.read_csv("../output/dataset/pre-processed/200k.preprocessed.csv")
small = pd.read_csv("../output/dataset/pre-processed/100k.preprocessed.csv")

# True for every row of `small` whose comment also appears in `large`.
seen_in_large = small["comments"].isin(large["comments"])
tuning_set = small.loc[~seen_in_large]

tuning_set.to_csv("../output/dataset/pre-processed/tuning.preprocessed.csv", index=False)

> The main goal of ABAE is to extract interpretable and meaningful aspects, which makes coherence the more aligned metric.<br> Reconstruction error might help guide training but doesn’t guarantee that the extracted aspects are semantically useful.

In [None]:
from core.hp_tuning import ABAERandomHyperparametersSelectionWrapper, HyperparameterTuningManager

# Inputs of the tuning run: the filtered tuning corpus and how many different
# configurations we try (15).
corpus_file = "../output/dataset/pre-processed/tuning.preprocessed.csv"
configurations = 15

print(f"Starting procedure. We try a total of {configurations}")

# The wrapper is handed to the tuning manager together with the corpus path
# and the "./output" directory.
hp_wrapper = ABAERandomHyperparametersSelectionWrapper.create()
hp_tuning_manager = HyperparameterTuningManager(
    hp_wrapper,
    corpus_file,
    "./output",
)

In [None]:
# Launch the tuning run over `configurations` different configurations.
# NOTE(review): `repeat=3` presumably repeats each configuration three times — confirm.
hp_tuning_manager(
    different_configurations=configurations,
    repeat=3,
)

# Focus on learning rate

In [None]:
# We fix other params and now focus entirely on lr.
# We already have a "promising" range defined.
# We look in that space so we redefine lr on ABAERandomHyperparametersSelectionWrapper

## Best found model training:

## See if the Hp tuning really improved upon our results:
We used SGD and tuned its parameters under the assumption that we would do better. <br>
Let's see if that really is the case, or if we just wasted time.

For comparison we use Adam, which has the advantage of being robust enough without parameter scouting.

In [None]:
#todo

## Test accuracy on small test sample we filled out

### Test set definition

In [None]:
from core.pre_processing import PreProcessingService
from core.pre_processing import DatasetGeneration

# We take around 1k records that the model has not seen yet and label them by hand.
dataset = pd.read_csv("../data/corpus.csv")
# NOTE(review): `document_game_names` is not defined in any cell visible here;
# unless it is defined elsewhere, this line raises NameError on a fresh kernel.
# `pipeline` is also not used in the rest of this cell — confirm the intended usage.
pipeline = PreProcessingService.full_pipeline(document_game_names, "../data/processed-dataset/full")

# Comments that are part of the 200k training file.
train_ds = pd.read_csv("../output/dataset/pre-processed/200k.preprocessed.csv")

# Keep only the rows whose comment is not in the training file, then take the
# top 2k candidates (we will select some good ones and reduce the number to 1k).
# NOTE(review): `dataset` is read from "../data/corpus.csv" while `train_ds`
# comes from a pre-processed file — confirm that the two `comments` columns are
# directly comparable.
unseen = ~dataset["comments"].isin(train_ds["comments"])
test_set = dataset.loc[unseen].head(2_000)

We have to use labels:

In [None]:
# Label ids (kept as string keys, counting up from '0') mapped to their aspect names.
labels = {
    str(label_id): aspect_name
    for label_id, aspect_name in enumerate(
        (
            "Luck/Alea",
            "Bookkeeping",
            "Downtime",
            "Interaction",
            "Bash",
            # I could look at the weight to see if there is a ratio relation.
            "Complicated/Complex",
            "Misc",
        )
    )
}