In [1]:
import os

os.environ['KERAS_BACKEND'] = "torch"

In [2]:
import torch
import numpy as np

torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x737d5c256570>

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset.
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] 
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors
> ~ Ruidan

We use an Orthogonal Regulizer definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/)

In [3]:
corpus_file = "./../data/corpus.preprocessed.csv"  # It's this

## Aspect Embedding Size
The aspect embedding size is what will be inferring aspects. It is closest to representative words (?). <br />
We have to identify 7 actual aspects (luck, bookkeeping, downtime...) but that does not mean our matrix should be limited to rows only! What size to search is a good question and should be studied (Which I may be doing later). 

For the first try we setup the aspect_size:
>The optimal number of rows is problem-dependent, so it’s crucial to: <br/>
> Start with a heuristic: Begin with 2–3x the number of aspects.

In [4]:
aspect_size = 2 * 7

## Corpus Considerations
Should move where dataset ipynb is but:

## Model Setup

In [5]:
import core.embeddings as embeddings
import core.utils

embeddings_model = embeddings.WordEmbedding(
    core.utils.LoadCorpusUtility(), max_vocab_size=16000, embedding_size=128,
    target_model_file="./../data/word-embeddings.model", corpus_file=corpus_file
)

aspect_embeddings_model = embeddings.AspectEmbedding(
    aspect_size=aspect_size, embedding_size=128, base_embeddings=embeddings_model,
    target_model_file="./../data/aspects-embedding.model"
)

In [7]:
embeddings_model.load_model()
aspect_embeddings_model.load_model()

INFO:gensim.utils:loading Word2Vec object from ../data/word-embeddings.model
DEBUG:smart_open.smart_open_lib:{'uri': '../data/word-embeddings.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loading wv recursively from ../data/word-embeddings.model.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': '../data/word-embeddings.model', 'datetime': '2024-11-29T17:31:43.744425', 'gensim': '4.3.3', 'python': '3.12.3 (main, Nov  6 2024, 18:32:19) [GCC 13.2.0]', 'platform': 'Linux-6.8.0-49-generic-x86_64-with-glibc2.39', 'event': 'loaded'}


#### Load the data

In [8]:
import dataset
from torch.utils.data import DataLoader

vocabulary = embeddings_model.model.wv.key_to_index

train = dataset.PositiveNegativeCommentGeneratorDataset(
    vocabulary=vocabulary, csv_dataset_path=corpus_file, negative_size=15
)

train_dataloader = DataLoader(train, batch_size=64, shuffle=True)

Loading spacy model.
Loading dataset from file: ./../data/corpus.preprocessed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/50461 [00:00<?, ?it/s]

Max sequence length calculation in progress...
We loose information on 136 points.This is 0.2695150710449654% of the dataset.
Padding sequences to max length (256).
Max sequence length is:  1235  but we will limit sequences to 256 tokens.


In [9]:
from core.model import ABAEGenerator

generator = ABAEGenerator(256, train.negative_size, embeddings_model, aspect_embeddings_model)

## Train

In [10]:
from core import utils

## Why SGD: You know why! todo: Link the papers

In [11]:
import torch

torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070 Ti'

We have too much data for my little PC:

> Sampling: Randomly select a subset of your data that represents the overall distribution of aspects. This will help maintain diversity while reducing the size.
Filtering: Focus on the most informative or high-quality samples. For example, if certain reviews are very short, irrelevant, or don't have useful context for aspect extraction, remove them.
Focus on Diversity: If you reduce the data, make sure the remaining dataset is still representative of the diversity of aspects you're trying to capture.

In [13]:
training_model = generator.make_training_model()
training_model.compile(optimizer='SGD', loss=[utils.max_margin_loss], metrics={'max_margin': utils.max_margin_loss})
history = training_model.fit(x=train_dataloader, batch_size=128, epochs=15)

  super(WeightedAspectEmb, self).__init__(**kwargs)


Epoch 1/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 60ms/step - loss: 14.1123 - max_margin_loss: 14.1123
Epoch 2/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 61ms/step - loss: 12.5177 - max_margin_loss: 12.5177
Epoch 3/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 63ms/step - loss: 11.2422 - max_margin_loss: 11.2422
Epoch 4/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 62ms/step - loss: 10.0412 - max_margin_loss: 10.0412
Epoch 5/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 62ms/step - loss: 9.3263 - max_margin_loss: 9.3263
Epoch 6/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 64ms/step - loss: 8.9555 - max_margin_loss: 8.9555
Epoch 7/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 62ms/step - loss: 8.6766 - max_margin_loss: 8.6766
Epoch 8/15
[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 61ms/step -

In [14]:
training_model.save("./../data/abae.keras")

DEBUG:h5py._conv:Creating converter from 5 to 3


## Model Evaluation

In [15]:
# Load evaluation model
inference_model = generator.make_model("./../data/abae.keras")

In [16]:
out = inference_model.predict(x=train_dataloader)
np.argmax(out[2], axis=-1)  # The associated labels

[1m789/789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 45ms/step


array([5, 5, 5, ..., 5, 5, 5])

## Find aspect most representative words

In [61]:
word_emb = inference_model.get_layer('word_embedding').get_weights()[0]
word_emb = torch.from_numpy(word_emb)
word_emb.shape

torch.Size([12954, 128])

In [74]:
aspect_embeddings = inference_model.get_layer('weighted_aspect_emb').W
vocab_inv = embeddings_model.model.wv.index_to_key

for aspect in aspect_embeddings:
    # Calculate the cosine similarity of each word with the aspect
    aspect = aspect.cpu()
    similarity = word_emb.matmul(aspect.T)

    ordered_words = np.argsort(similarity.detach().numpy())[::-1]
    desc_list = [vocab_inv[w] for w in ordered_words[:15]]
    print(desc_list)

['sleeve', 'placement', 'mm', '2021', '2023', 'min', 'promo', '2022', '2020', '2024', '2017', '2018', '2019', 'copy', 'kickstarter']
['placement', 'sleeve', 'solo', 'edition', 'friend', 'art', 'mm', 'year', '2023', '2021', 'copy', '2022', 'co', 'group', 'people']
['min', 'minute', 'teach', 'turn', 'round', 'solo', '60', 'learn', 'count', 'placement', '2', '1', '3', '45', '4']
['action', 'opponent', 'turn', 'round', 'decision', 'resource', 'plan', 'phase', 'order', 'scoring', 'selection', 'place', 'tile', 'track', 'move']
['filler', 'min', 'kid', 'teach', 'minute', 'weight', 'hour', 'gamer', 'family', 'euro', 'co', 'medium', 'highly', 'adult', 'year']
['sleeve', '2021', 'promo', 'kickstarter', '2023', '2022', 'pledge', 'ks', 'mm', '2020', 'edition', '2019', '2018', '2024', 'metal']
['area', 'action', 'mm', 'money', 'pay', 'selection', 'sleeve', 'order', 'promo', 'resource', 'space', 'gain', 'collect', '2021', 'ship']
['group', 'co', 'people', 'gamer', 'teach', '4', 'learn', 'friend', 's

In [None]:
pred = m.predict(x=train_dataloader)

In [None]:
pred

In [None]:
import pandas as pd

pd.DataFrame(pred)