In [1]:
import os

import numpy as np

os.environ['KERAS_BACKEND'] = "torch"

In [2]:
import torch

torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x711f3d398c20>

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset.
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] 
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors
> ~ Ruidan

We use an Orthogonal Regulizer definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/)

In [3]:
corpus_file = "./../data/corpus.preprocessed.csv"  # It's this
# TODO GET MAXLEN FROM EMBEDDINGS DATASET (Which is input shape)
input_shape = (64, 1017)

## Aspect Embedding Size
The aspect embedding size is what will be inferring aspects. It is closest to representative words (?). <br />
We have to identify 7 actual aspects (luck, bookkeeping, downtime...) but that does not mean our matrix should be limited to rows only! What size to search is a good question and should be studied (Which I may be doing later). 

For the first try we setup the aspect_size:
>The optimal number of rows is problem-dependent, so it’s crucial to: <br/>
> Start with a heuristic: Begin with 2–3x the number of aspects.

In [4]:
aspect_size = 2 * 7

## Corpus Considerations
Should move where dataset ipynb is but:

In [5]:
max_vocab_size = 40000
# todo:
# Exclude words that occur fewer than 5–10 times in the entire corpus. 
# These words are often domain-specific or noisy and contribute minimally to meaningful embeddings
# + Remove the ugly words

## Model Setup

In [6]:
import core.embeddings as embeddings
import core.utils

embeddings_model = embeddings.WordEmbedding(
    core.utils.LoadCorpusUtility(), max_vocab_size=max_vocab_size, embedding_size=128,
    target_model_file="./../data/word-embeddings.model", corpus_file=corpus_file
)

aspect_embeddings_model = embeddings.AspectEmbedding(
    aspect_size=aspect_size, embedding_size=128, base_embeddings=embeddings_model,
    target_model_file="./../data/aspects-embedding.model"
)

In [7]:
embeddings_model.load_model()
aspect_embeddings_model.load_model()

INFO:gensim.utils:loading Word2Vec object from ../data/word-embeddings.model
DEBUG:smart_open.smart_open_lib:{'uri': '../data/word-embeddings.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loading wv recursively from ../data/word-embeddings.model.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': '../data/word-embeddings.model', 'datetime': '2024-11-20T17:41:16.249487', 'gensim': '4.3.3', 'python': '3.12.3 (main, Nov  6 2024, 18:32:19) [GCC 13.2.0]', 'platform': 'Linux-6.8.0-49-generic-x86_64-with-glibc2.39', 'event': 'loaded'}


#### Load the data

In [8]:
import dataset
from torch.utils.data import DataLoader

vocabulary = embeddings_model.model.wv.key_to_index
train = dataset.PositiveNegativeCommentGeneratorDataset(
    vocabulary=vocabulary, csv_dataset_path=corpus_file, negative_size=15
)

train_dataloader = DataLoader(train, batch_size=64, shuffle=True)

Pandas Apply:   0%|          | 0/1000 [00:00<?, ?it/s]

In [9]:
from core.model import ABAEGenerator

generator = ABAEGenerator(train.max_seq_length, train.negative_size, embeddings_model, aspect_embeddings_model)
model = generator.make_model()

  super(WeightedAspectEmb, self).__init__(**kwargs)


## Train

In [10]:

from core import utils

## Why SGD: You know why! todo: Link the papers

In [11]:
training_model = generator.make_training_model()
training_model.compile(optimizer='SGD', loss=[utils.max_margin_loss], metrics={'max_margin': utils.max_margin_loss})
history = training_model.fit(x=train_dataloader, batch_size=64, epochs=5)

Epoch 1/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 13.9308 - max_margin_loss: 13.9308
Epoch 2/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 57ms/step - loss: 11.7444 - max_margin_loss: 11.7444
Epoch 3/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 10.4955 - max_margin_loss: 10.4955
Epoch 4/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 10.0111 - max_margin_loss: 10.0111
Epoch 5/5
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - loss: 9.3141 - max_margin_loss: 9.3141


In [12]:
# todo fix loss as I have 3 outputs
from core import utils

# https://stackoverflow.com/questions/57048362/keras-multiple-outputs-loss-only-a-function-of-one
model.compile(optimizer='SGD', loss=[utils.max_margin_loss, ], metrics={'max_margin': utils.max_margin_loss})
model.summary()

In [13]:
# Debug: (inputs == 0).all(dim=-1) # There are some sentences with 0 words! Can it be?
# Still, if all is 0 my model should not be breaking.
history = model.fit(x=train_dataloader, batch_size=64, epochs=5)

ValueError: For a core with multiple outputs, when providing the `loss` argument as a list, it should have as many entries as the core has outputs. Received:
loss=[<function max_margin_loss at 0x711e413f1260>]
of length 1 whereas the core has 3 outputs.

## Model Evaluation

In [None]:
# The model itself is useless. We use the attention weights and aspect embeddings + emb to get
# the real result of our analysis
out = model.predict(x=train_dataloader)

In [None]:
np.argmax(out[2], axis=-1)  # The associated labels

In [14]:
training_model.save("./../data/abae.keras")

DEBUG:h5py._conv:Creating converter from 5 to 3


In [15]:
m = generator.make_model("./../data/abae.keras")

  super(WeightedAspectEmb, self).__init__(**kwargs)


In [17]:
pred = m.predict(x=train_dataloader)

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step


In [19]:
pred

KeyboardInterrupt: 

In [None]:
import pandas as pd
pd.DataFrame(pred)