In [1]:
import os

# Select PyTorch as the Keras backend. Keras reads this variable when it is
# first imported, so this cell has to run before any `import keras` below.
os.environ["KERAS_BACKEND"] = "torch"

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset.
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] 
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors
> ~ Ruidan

We use an orthogonal regularizer; a definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/)

In [2]:
from keras import ops as K
from keras import backend as B


def ortho_reg(W):
    """Orthogonality penalty for the aspect embedding matrix (after Ruidan).

    Every row of ``W`` is divided by its L2 norm (plus the backend epsilon to
    avoid a division by zero), giving ``w_n``. The penalty is the sum of the
    squared entries of ``w_n @ w_n^T - I``; it is zero when the normalised rows
    are mutually orthogonal and grows as they become more alike.

    No scaling factor is applied here; the raw sum is returned.

    Args:
        W: 2-D weight tensor whose rows are the vectors to decorrelate.

    Returns:
        Scalar tensor holding the penalty.
    """
    # L2 norm of each row, kept as a column so it broadcasts against W.
    row_norms = K.sqrt(K.sum(K.square(W), axis=-1, keepdims=True))
    denominator = K.cast(B.epsilon() + row_norms, B.floatx())
    w_n = W / denominator

    # Pairwise dot products between the normalised rows.
    gram = K.dot(w_n, K.transpose(w_n))
    deviation = gram - K.eye(w_n.shape[0])
    return K.sum(K.square(deviation))

In [3]:
# TODO: Check how Ruidan's implementation differs from the Keras one, and whether the difference actually matters. Judging by the numbers, the two really do give different results!

In [11]:
# Preprocessed review corpus, given relative to the notebook's working directory.
corpus_file = "./../data/corpus.preprocessed.csv"

# TODO: read the maximum sequence length from the embeddings dataset (it is the
# input shape) instead of hard-coding the shape here.
input_shape = (32, 256)

## Model Setup

In [12]:
import model.embeddings as embeddings

# Word embedding model: configured with a corpus loader, the corpus file and
# the file its trained model is stored in.
embeddings_model = embeddings.WordEmbedding(
    embeddings.LoadCorpusUtility(),
    max_vocab_size=10000,
    embedding_size=128,
    target_model_file="./../data/word-embeddings.model",
    corpus_file=corpus_file,
)

# Aspect embedding model: 4 aspects, based on the word embedding model above
# and given the same embedding_size (128).
aspect_embeddings_model = embeddings.AspectEmbedding(
    aspect_size=4,
    embedding_size=128,
    base_embeddings=embeddings_model,
    target_model_file="./../data/aspects-embedding.model",
)

In [13]:
# Load both embedding models; the word embeddings go first because the aspect
# model was constructed with them as its base_embeddings.
# NOTE(review): presumably each call reads the file passed as target_model_file
# when the model was constructed — confirm in model.embeddings.
embeddings_model.load_model()
aspect_embeddings_model.load_model()

In [14]:
from model.model import ABAEModelGenerator

# Build the ABAE model from the configured input shape and the two embedding
# models loaded in the previous cell.
generator = ABAEModelGenerator(input_shape, embeddings_model, aspect_embeddings_model)
# `model` is the object that gets compiled and fitted in the Train section.
model = generator.make_model()



#### Load the data

In [8]:
import dataset
from torch.utils.data import DataLoader

def _identity_collate(samples):
    """Return the list of samples drawn for one batch without modifying it."""
    return samples


train = dataset.CommentDataset(corpus_file)
# Passing a collate_fn replaces DataLoader's default collation, so every batch
# is handed over as the plain list of samples produced by the dataset.
# NOTE(review): presumably required because the default collation cannot batch
# what CommentDataset returns — confirm against dataset.CommentDataset.
train_dataloader = DataLoader(
    train,
    batch_size=64,
    shuffle=True,
    collate_fn=_identity_collate,
)

Pandas Apply:   0%|          | 0/1939904 [00:00<?, ?it/s]

477790308
<class 'pandas.core.series.Series'>
RangeIndex: 1939904 entries, 0 to 1939903
Series name: comments
Non-Null Count    Dtype 
--------------    ----- 
1939904 non-null  object
dtypes: object(1)
memory usage: 455.7 MB
None


## Train

In [15]:
from keras import ops as K


def max_margin_loss(y_true, y_pred):
    """Reduce the model output to a single scalar by averaging it.

    Args:
        y_true: Accepted to satisfy the Keras loss signature; not used.
        y_pred: Tensor produced by the model.
            NOTE(review): presumably this already holds the per-sample
            max-margin loss computed inside the model — confirm in
            ABAEModelGenerator.

    Returns:
        The mean over all elements of ``y_pred``.
    """
    averaged = K.mean(y_pred)
    return averaged

In [16]:
# max_margin_loss is used both as the training loss and as the only metric.
model.compile(
    optimizer="SGD",
    loss=max_margin_loss,
    metrics=[max_margin_loss],
)
# NOTE: in the run recorded in this notebook, this call fails with KeyError: 'game'.
model.fit(train_dataloader)

KeyError: 'game'

In [17]:
# Pull a single batch straight from the DataLoader, without going through
# model.fit (presumably a debugging aid for the failure in the previous cell).
# NOTE: in the run recorded in this notebook, this raises KeyError: 'copy'.
next(iter(train_dataloader))

KeyError: 'copy'