In [1]:
import os

os.environ['KERAS_BACKEND'] = "torch"

In [2]:
import torch
import numpy as np

torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x740dc24e6570>

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset.
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] 
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors
> ~ Ruidan

We use an Orthogonal Regulizer definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/)

In [3]:
corpus_file = "./../data/corpus.preprocessed.csv"  # It's this

## Aspect Embedding Size
The aspect embedding size is what will be inferring aspects. It is closest to representative words (?). <br />
We have to identify 7 actual aspects (luck, bookkeeping, downtime...) but that does not mean our matrix should be limited to rows only! What size to search is a good question and should be studied (Which I may be doing later). 

For the first try we setup the aspect_size:
>The optimal number of rows is problem-dependent, so it’s crucial to: <br/>
> Start with a heuristic: Begin with 2–3x the number of aspects.

In [4]:
aspect_size = 2 * 7

## Corpus Considerations
Should move where dataset ipynb is but:

## Model Setup

In [5]:
import core.embeddings as embeddings
import core.utils

embeddings_model = embeddings.WordEmbedding(
    core.utils.LoadCorpusUtility(), max_vocab_size=16000, embedding_size=128,
    target_model_file="./../data/word-embeddings.model", corpus_file=corpus_file
)

aspect_embeddings_model = embeddings.AspectEmbedding(
    aspect_size=aspect_size, embedding_size=128, base_embeddings=embeddings_model,
    target_model_file="./../data/aspects-embedding.model"
)

In [6]:
embeddings_model.load_model()
aspect_embeddings_model.load_model()

INFO:gensim.utils:loading Word2Vec object from ../data/word-embeddings.model
DEBUG:smart_open.smart_open_lib:{'uri': '../data/word-embeddings.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loading wv recursively from ../data/word-embeddings.model.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': '../data/word-embeddings.model', 'datetime': '2024-11-30T18:37:04.229569', 'gensim': '4.3.3', 'python': '3.12.3 (main, Nov  6 2024, 18:32:19) [GCC 13.2.0]', 'platform': 'Linux-6.8.0-49-generic-x86_64-with-glibc2.39', 'event': 'loaded'}


#### Load the data

In [7]:
import dataset
from torch.utils.data import DataLoader

vocabulary = embeddings_model.model.wv.key_to_index

train = dataset.PositiveNegativeCommentGeneratorDataset(
    vocabulary=vocabulary, csv_dataset_path=corpus_file, negative_size=15
)

train_dataloader = DataLoader(train, batch_size=32, shuffle=True)

Loading spacy model.
Loading dataset from file: ./../data/corpus.preprocessed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/50461 [00:00<?, ?it/s]

Max sequence length calculation in progress...
We loose information on 136 points.This is 0.2695150710449654% of the dataset.
Padding sequences to max length (256).
Max sequence length is:  1235  but we will limit sequences to 256 tokens.


In [8]:
from core.model import ABAEGenerator

generator = ABAEGenerator(256, train.negative_size, embeddings_model, aspect_embeddings_model)

## Train

In [9]:
from core import utils

## Why SGD: You know why! todo: Link the papers

In [10]:
import torch

torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070 Ti'

We have too much data for my little PC:

> Sampling: Randomly select a subset of your data that represents the overall distribution of aspects. This will help maintain diversity while reducing the size.
Filtering: Focus on the most informative or high-quality samples. For example, if certain reviews are very short, irrelevant, or don't have useful context for aspect extraction, remove them.
Focus on Diversity: If you reduce the data, make sure the remaining dataset is still representative of the diversity of aspects you're trying to capture.

In [11]:
training_model = generator.make_training_model()
training_model.compile(optimizer='SGD', loss=[utils.max_margin_loss], metrics={'max_margin': utils.max_margin_loss})
history = training_model.fit(x=train_dataloader, batch_size=32, epochs=15)

Epoch 1/15


  super(WeightedAspectEmb, self).__init__(**kwargs)


[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 42ms/step - loss: 13.7152 - max_margin_loss: 13.7152
Epoch 2/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 42ms/step - loss: 11.2962 - max_margin_loss: 11.2962
Epoch 3/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 40ms/step - loss: 9.4733 - max_margin_loss: 9.4733
Epoch 4/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 41ms/step - loss: 8.7165 - max_margin_loss: 8.7165
Epoch 5/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 41ms/step - loss: 8.3181 - max_margin_loss: 8.3181
Epoch 6/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 41ms/step - loss: 8.1459 - max_margin_loss: 8.1459
Epoch 7/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 40ms/step - loss: 7.9774 - max_margin_loss: 7.9774
Epoch 8/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 41ms/step 

In [12]:
# How to Address Issues (If Any):
# Introduce Hard Negatives:
# Instead of randomly selecting negative samples, use hard negatives—examples that are more challenging to distinguish from positive pairs. This keeps the max-margin loss informative and prevents the model from converging too quickly.

# Regularization:
# Apply regularization (e.g., L2 regularization) to prevent overfitting and ensure the model generalizes well.

# Early Stopping:
# If the loss plateaus and aspect quality is satisfactory, consider using early stopping to avoid unnecessary training.

In [13]:
training_model.save("./../data/abae.keras")

DEBUG:h5py._conv:Creating converter from 5 to 3


## Model Evaluation

In [14]:
# Load evaluation model
inference_model = generator.make_model("./../data/abae.keras")

In [15]:
out = inference_model.predict(x=train_dataloader)
np.argmax(out[2], axis=-1)  # The associated labels

[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 24ms/step


array([7, 0, 7, ..., 0, 0, 0])

## Find aspect most representative words

In [16]:
word_emb = inference_model.get_layer('word_embedding').get_weights()[0]
word_emb = torch.from_numpy(word_emb)
word_emb.shape

torch.Size([12954, 128])

In [17]:
aspect_embeddings = inference_model.get_layer('weighted_aspect_emb').W
vocab_inv = embeddings_model.model.wv.index_to_key

aspect_words = []
aspect_index = 0

for aspect in aspect_embeddings:
    aspect = aspect.cpu()
    # Calculate the cosine similarity of each word with the aspect
    word_emb = word_emb / torch.linalg.norm(word_emb, dim=-1, keepdim=True)
    aspect = aspect / torch.linalg.norm(aspect, dim=-1, keepdim=True)

    similarity = word_emb.matmul(aspect.T)

    numpy_similarity = similarity.detach().numpy()

    ordered_words = np.argsort(numpy_similarity)[::-1]
    desc_list = [(vocab_inv[w], numpy_similarity[w]) for w in ordered_words[:15]]
    aspect_words.append(desc_list)

    print("Aspect ", aspect_index)
    for i in desc_list:
        # hr][/i is not a valid word. meh.
        print("Word: ", i[0], f"({i[1]})")

    aspect_index += 1

Aspect  0
Word:  release (0.5447440147399902)
Word:  buy (0.5116141438484192)
Word:  back (0.4773868918418884)
Word:  unlock (0.477204293012619)
Word:  magic (0.4655134677886963)
Word:  ffg (0.46209990978240967)
Word:  kickstarter (0.46151912212371826)
Word:  marvel (0.459344744682312)
Word:  retail (0.45813944935798645)
Word:  glad (0.4541388750076294)
Word:  pls (0.44647061824798584)
Word:  cosmetic (0.4373928904533386)
Word:  copy (0.4348754286766052)
Word:  legendary (0.43482765555381775)
Word:  regret (0.4348164200782776)
Aspect  1
Word:  belive (0.7245154976844788)
Word:  email (0.7236083745956421)
Word:  familiarize (0.7202874422073364)
Word:  incarnation (0.7198400497436523)
Word:  clothe (0.719068169593811)
Word:  tmb (0.7155524492263794)
Word:  technically (0.7127645611763)
Word:  preset (0.7095102071762085)
Word:  kg (0.7089599370956421)
Word:  rondell (0.7079117298126221)
Word:  ppl (0.7070233821868896)
Word:  dame (0.7059849500656128)
Word:  erratas (0.7055681943893433)
Wo

  similarity = word_emb.matmul(aspect.T)


# Evaluate coherence
Topical coherence measures the semantic consistency of terms grouped under a topic or aspect. It checks whether the terms frequently co-occur in similar contexts within your dataset, reflecting a meaningful grouping. For each topic (aspect), calculate pairwise co-occurrence of terms across the dataset. Terms that co-occur frequently in the same context are considered more coherent



In [18]:
# For each word of aspect for the aspect we calculate the coherence by AVG distance between top words
for aspect_most_representative_words in aspect_words:
    coherence = []
    for word in aspect_most_representative_words:
        w, score = word
        for word2 in aspect_most_representative_words:
            w2, score = word2
            if w != w2:
                coherence.append(embeddings_model.model.wv.similarity(w, w2))
    # todo fai avgf cosi natualmente sbagliato  
    print("Aspect i has total coherence of", np.mean(coherence, axis=0))  # AVG

Aspect i has total coherence of 0.6271665
Aspect i has total coherence of 0.8668326
Aspect i has total coherence of 0.84780085
Aspect i has total coherence of 0.74030197
Aspect i has total coherence of 0.7200178
Aspect i has total coherence of 0.8413263
Aspect i has total coherence of 0.8098694
Aspect i has total coherence of 0.54224765
Aspect i has total coherence of 0.69913083
Aspect i has total coherence of 0.7870225
Aspect i has total coherence of 0.7779807
Aspect i has total coherence of 0.5372712
Aspect i has total coherence of 0.73999083
Aspect i has total coherence of 0.65727794
