In [1]:
import os

os.environ['KERAS_BACKEND'] = "torch"

In [2]:
import torch
import numpy as np

torch.autograd.set_detect_anomaly(True)

<torch.autograd.anomaly_mode.set_detect_anomaly at 0x226ce9c5370>

## Regularization
>We hope to learn vector representations of the most representative aspects for a review dataset.
However, the aspect embedding matrix T may suffer from redundancy problems during training. [...] 
> The regularization term encourages orthogonality among the rows of the aspect embedding matrix T and penalizes redundancy between different aspect vectors
> ~ Ruidan

We use an Orthogonal Regulizer definition of the method can be found here: https://paperswithcode.com/method/orthogonal-regularization. <br/>
For the code we use the default implementation provided by Keras (https://keras.io/api/layers/regularizers/)

In [3]:
corpus_file = "./../data/corpus.preprocessed.csv"  # It's this

## Aspect Embedding Size
The aspect embedding size is what will be inferring aspects. It is closest to representative words (?). <br />
We have to identify 7 actual aspects (luck, bookkeeping, downtime...) but that does not mean our matrix should be limited to rows only! What size to search is a good question and should be studied (Which I may be doing later). 

For the first try we setup the aspect_size:
>The optimal number of rows is problem-dependent, so it’s crucial to: <br/>
> Start with a heuristic: Begin with 2–3x the number of aspects.

In [11]:
aspect_size = 2 * 7

## Model Setup

In [6]:
import core.embeddings as embeddings
import core.utils

embeddings_model = embeddings.WordEmbedding(
    core.utils.LoadCorpusUtility(), max_vocab_size=16000, embedding_size=128,
    target_model_file="./../data/word-embeddings.model", corpus_file=corpus_file
)

aspect_embeddings_model = embeddings.AspectEmbedding(
    aspect_size=aspect_size, embedding_size=128, base_embeddings=embeddings_model,
    target_model_file="./../data/aspects-embedding.model"
)

In [7]:
embeddings_model.load_model()
aspect_embeddings_model.load_model()

INFO:gensim.utils:loading Word2Vec object from ../data/word-embeddings.model
DEBUG:smart_open.smart_open_lib:{'uri': '../data/word-embeddings.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loading wv recursively from ../data/word-embeddings.model.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': '../data/word-embeddings.model', 'datetime': '2024-12-02T15:12:35.165483', 'gensim': '4.3.3', 'python': '3.12.3 (main, Nov  6 2024, 18:32:19) [GCC 13.2.0]', 'platform': 'Linux-6.8.0-49-generic-x86_64-with-glibc2.39', 'event': 'loaded'}


#### Load the data

In [8]:
import dataset
from torch.utils.data import DataLoader

vocabulary = embeddings_model.model.wv.key_to_index

train = dataset.PositiveNegativeCommentGeneratorDataset(
    vocabulary=vocabulary, csv_dataset_path=corpus_file, negative_size=15
)

train_dataloader = DataLoader(train, batch_size=32, shuffle=True)

Loading spacy model.
Loading dataset from file: ./../data/corpus.preprocessed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/50461 [00:00<?, ?it/s]

Max sequence length calculation in progress...
We loose information on 136 points.This is 0.2695150710449654% of the dataset.
Padding sequences to max length (256).
Max sequence length is:  1235  but we will limit sequences to 256 tokens.


In [9]:
from core.model import ABAEGenerator

generator = ABAEGenerator(256, train.negative_size, embeddings_model, aspect_embeddings_model)

## Train

In [9]:
from core import utils

In [10]:
import torch

torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070 Ti'

We have too much data for my little PC:

> Sampling: Randomly select a subset of your data that represents the overall distribution of aspects. This will help maintain diversity while reducing the size.
Filtering: Focus on the most informative or high-quality samples. For example, if certain reviews are very short, irrelevant, or don't have useful context for aspect extraction, remove them.
Focus on Diversity: If you reduce the data, make sure the remaining dataset is still representative of the diversity of aspects you're trying to capture.

In [11]:
training_model = generator.make_training_model()
training_model.compile(optimizer='SGD', loss=[utils.max_margin_loss], metrics={'max_margin': utils.max_margin_loss})
history = training_model.fit(x=train_dataloader, batch_size=32, epochs=15)

Epoch 1/15


  super(WeightedAspectEmb, self).__init__(**kwargs)


[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 42ms/step - loss: 13.7152 - max_margin_loss: 13.7152
Epoch 2/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 42ms/step - loss: 11.2962 - max_margin_loss: 11.2962
Epoch 3/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 40ms/step - loss: 9.4733 - max_margin_loss: 9.4733
Epoch 4/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 41ms/step - loss: 8.7165 - max_margin_loss: 8.7165
Epoch 5/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 41ms/step - loss: 8.3181 - max_margin_loss: 8.3181
Epoch 6/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 41ms/step - loss: 8.1459 - max_margin_loss: 8.1459
Epoch 7/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 40ms/step - loss: 7.9774 - max_margin_loss: 7.9774
Epoch 8/15
[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 41ms/step 

In [12]:
# How to Address Issues (If Any):
# Introduce Hard Negatives:
# Instead of randomly selecting negative samples, use hard negatives—examples that are more challenging to distinguish from positive pairs. This keeps the max-margin loss informative and prevents the model from converging too quickly.

# Regularization:
# Apply regularization (e.g., L2 regularization) to prevent overfitting and ensure the model generalizes well.

# Early Stopping:
# If the loss plateaus and aspect quality is satisfactory, consider using early stopping to avoid unnecessary training.

In [13]:
training_model.save("./../data/abae.keras")

DEBUG:h5py._conv:Creating converter from 5 to 3


## Model Evaluation

In [10]:
# Load evaluation model
inference_model = generator.make_model("./../data/abae.keras")

  super(WeightedAspectEmb, self).__init__(**kwargs)


In [15]:
out = inference_model.predict(x=train_dataloader)
np.argmax(out[2], axis=-1)  # The associated labels

[1m1577/1577[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 24ms/step


array([7, 0, 7, ..., 0, 0, 0])

## Find aspect most representative words

In [11]:
word_emb = inference_model.get_layer('word_embedding').get_weights()[0]
word_emb = torch.from_numpy(word_emb)
word_emb.shape

torch.Size([12954, 128])

In [14]:
aspect_embeddings = inference_model.get_layer('weighted_aspect_emb').W
vocab_inv = embeddings_model.model.wv.index_to_key

In [28]:

aspect_words = []
aspect_index = 0

for aspect in aspect_embeddings:
    aspect = aspect.cpu()
    # Calculate the cosine similarity of each word with the aspect
    word_emb = word_emb / torch.linalg.norm(word_emb, dim=-1, keepdim=True)
    aspect = aspect / torch.linalg.norm(aspect, dim=-1, keepdim=True)

    similarity = word_emb.matmul(aspect.T)

    numpy_similarity = similarity.detach().numpy()

    ordered_words = np.argsort(numpy_similarity)[::-1]
    desc_list = [(vocab_inv[w], numpy_similarity[w]) for w in ordered_words[:15]]
    aspect_words.append(desc_list)

    print("Aspect ", aspect_index)
    for i in desc_list:
        # hr][/i is not a valid word. meh.
        print("Word: ", i[0], f"({i[1]})")

    aspect_index += 1

Aspect  0
Word:  release (0.5447438955307007)
Word:  buy (0.5116141438484192)
Word:  back (0.4773869514465332)
Word:  unlock (0.477204293012619)
Word:  magic (0.4655134677886963)
Word:  ffg (0.46209990978240967)
Word:  kickstarter (0.46151912212371826)
Word:  marvel (0.459344744682312)
Word:  retail (0.45813944935798645)
Word:  glad (0.4541388750076294)
Word:  pls (0.44647061824798584)
Word:  cosmetic (0.437392920255661)
Word:  copy (0.4348754286766052)
Word:  legendary (0.43482765555381775)
Word:  regret (0.4348164200782776)
Aspect  1
Word:  belive (0.7245154976844788)
Word:  email (0.7236083745956421)
Word:  familiarize (0.7202874422073364)
Word:  incarnation (0.7198400497436523)
Word:  clothe (0.719068169593811)
Word:  tmb (0.7155524492263794)
Word:  technically (0.7127645611763)
Word:  preset (0.7095102071762085)
Word:  kg (0.7089599370956421)
Word:  rondell (0.7079117298126221)
Word:  ppl (0.7070233821868896)
Word:  dame (0.7059849500656128)
Word:  erratas (0.7055681943893433)
Wor

  similarity = word_emb.matmul(aspect.T)


# Evaluate coherence
Topical coherence measures the semantic consistency of terms grouped under a topic or aspect. It checks whether the terms frequently co-occur in similar contexts within your dataset, reflecting a meaningful grouping. For each topic (aspect), calculate pairwise co-occurrence of terms across the dataset. Terms that co-occur frequently in the same context are considered more coherent



In [29]:
# For each word of aspect for the aspect we calculate the coherence by AVG distance between top words
for aspect_most_representative_words in aspect_words:
    coherence = []
    for word in aspect_most_representative_words:
        w, score = word
        for word2 in aspect_most_representative_words:
            w2, score = word2
            if w != w2:
                coherence.append(embeddings_model.model.wv.similarity(w, w2))
    # todo fai avgf cosi natualmente sbagliato  
    print("Aspect i has total coherence of", np.mean(coherence, axis=0))  # AVG

Aspect i has total coherence of 0.6271665
Aspect i has total coherence of 0.8668326
Aspect i has total coherence of 0.84780085
Aspect i has total coherence of 0.74030197
Aspect i has total coherence of 0.7200178
Aspect i has total coherence of 0.8413263
Aspect i has total coherence of 0.8098694
Aspect i has total coherence of 0.54224765
Aspect i has total coherence of 0.69913083
Aspect i has total coherence of 0.7870225
Aspect i has total coherence of 0.7779807
Aspect i has total coherence of 0.5372712
Aspect i has total coherence of 0.73999083
Aspect i has total coherence of 0.65727794


In [18]:
# Words to remove:
"""
\13\
~
"""

In [25]:
gold_standard_topics = ['luck', 'alea', 'bookkeeping', 'downtime', 'strategy', 'interaction', 'complicated', 'complex']
counter = 0
for aspect in aspect_embeddings:
    aspect = aspect.cpu()

    aspect = aspect / torch.linalg.norm(aspect, dim=-1, keepdim=True)
    word_emb = word_emb / torch.linalg.norm(word_emb, dim=-1, keepdim=True)

    print("Aspect ", counter)
    # Calculate the cosine similarity of each word with the aspect
    for topic in gold_standard_topics:
        index = embeddings_model.model.wv.get_index(topic)
        print(f"'{topic}' similarity: ", word_emb[index].dot(aspect))
    embeddings_model.model.wv.get_vector('luck')
    counter += 1

Aspect  0
'luck' similarity:  tensor(-0.0714, grad_fn=<DotBackward0>)
'alea' similarity:  tensor(0.1196, grad_fn=<DotBackward0>)
'bookkeeping' similarity:  tensor(-0.1017, grad_fn=<DotBackward0>)
'downtime' similarity:  tensor(-0.4651, grad_fn=<DotBackward0>)
'strategy' similarity:  tensor(-0.1643, grad_fn=<DotBackward0>)
'interaction' similarity:  tensor(-0.3416, grad_fn=<DotBackward0>)
'complicated' similarity:  tensor(-0.0198, grad_fn=<DotBackward0>)
'complex' similarity:  tensor(-0.1619, grad_fn=<DotBackward0>)
Aspect  1
'luck' similarity:  tensor(0.0218, grad_fn=<DotBackward0>)
'alea' similarity:  tensor(0.5491, grad_fn=<DotBackward0>)
'bookkeeping' similarity:  tensor(0.5055, grad_fn=<DotBackward0>)
'downtime' similarity:  tensor(0.1841, grad_fn=<DotBackward0>)
'strategy' similarity:  tensor(0.1607, grad_fn=<DotBackward0>)
'interaction' similarity:  tensor(0.0652, grad_fn=<DotBackward0>)
'complicated' similarity:  tensor(0.2053, grad_fn=<DotBackward0>)
'complex' similarity:  tens

In [33]:
embeddings_model.model.wv.get_vector('4p')

array([ 0.06464096,  0.09093551,  0.0028071 ,  0.29852405,  0.602484  ,
       -0.7610173 , -0.3652719 ,  0.27201098,  0.15539584, -0.23070729,
        0.85078335, -0.14241563, -0.3791329 , -0.62965673, -0.28053308,
       -0.21491268, -0.12933089,  0.6480796 ,  0.4191215 , -0.7608188 ,
       -0.23891445,  0.14498414,  0.8824801 ,  0.1812332 ,  0.0582756 ,
        0.10913613, -0.5965477 , -0.06786239, -0.42944986,  0.11144818,
        0.01937047, -0.48778266, -0.03961428,  1.133933  , -0.3447342 ,
        0.5208978 ,  0.42408213,  0.21653399,  0.8246546 ,  0.21316518,
       -0.12819354,  0.23317572, -0.04644729,  0.22567058, -0.32728085,
        0.2677897 , -0.2627684 ,  0.24090816, -0.16617578,  0.21517266,
        0.16364598, -0.2701411 ,  0.46338764,  0.6053269 , -0.06960589,
        0.33252728,  0.2450767 , -0.2168208 , -0.9041195 ,  0.1471062 ,
        0.16257447, -0.51723665,  0.20917365, -0.44860741,  0.6085149 ,
       -0.26305255,  0.6510224 ,  0.26636028,  0.12356767, -0.17

# Kickstarter Less Dataset (64K)

File paths (Embeddings and corpus)

In [4]:
corpus_file = "./../data/corpus.preprocessed.kickstarter_removed.csv"
embeddings_file = "./../data/word-embeddings.kickstarter_removed.model"
aspects_file = "./../data/aspects-embedding.kickstarter_removed.model"

### Hyper-parameters
These should have been discussed earlier. <br>
We could do hyperparmeter optimization, but how do we 'validate' our model? <br>

In [5]:
max_vocab_size = 16000  # Maximum amount of different words in vocabulary

In [6]:
word_embedding_size = 128  # Size of the word embeddings
aspect_embedding_size = 128  # Size of the aspect embeddings

## Data Loading

In [9]:
import core.utils

corpus_load_utility = core.utils.LoadCorpusUtility()

In [12]:
import core.embeddings as embeddings

emb_model = embeddings.WordEmbedding(
    corpus_loader_utility=corpus_load_utility, max_vocab_size=max_vocab_size,
    embedding_size=word_embedding_size, target_model_file=embeddings_file, corpus_file=corpus_file
)

aspect_emb_model = embeddings.AspectEmbedding(
    aspect_size=aspect_size, embedding_size=aspect_embedding_size,
    base_embeddings=emb_model, target_model_file=aspects_file
)

Load models

In [13]:
emb_model.load_model()
aspect_emb_model.load_model()

Pandas Apply:   0%|          | 0/50115 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/50115 [00:00<?, ?it/s]

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 196913 words, keeping 8892 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 427194 words, keeping 11177 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 648339 words, keeping 12220 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 884135 words, keeping 12706 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 1127436 words, keeping 12920 word types
INFO:gensim.models.word2vec:collected 12922 word types from a corpus of 1130593 raw words and 50115 sentences
INFO:gensim.models.word2vec:Creating a fresh vocabulary
DEBUG:gensim.utils:starting a new internal lifecycle event log for Word2Vec
INFO:gensim.utils:Word2Vec lifecycle event {'msg': 'effective_m

In [14]:
import dataset
from torch.utils.data import DataLoader

vocabulary = emb_model.model.wv.key_to_index

train = dataset.PositiveNegativeCommentGeneratorDataset(
    vocabulary=vocabulary, csv_dataset_path=corpus_file, negative_size=15
)

train_dataloader = DataLoader(train, batch_size=64, shuffle=True)

Loading spacy model.
Loading dataset from file: ./../data/corpus.preprocessed.kickstarter_removed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/50115 [00:00<?, ?it/s]

Max sequence length calculation in progress...
We loose information on 136 points.This is 0.2713758355781702% of the dataset.
Padding sequences to max length (256).
Max sequence length is:  1235  but we will limit sequences to 256 tokens.


## Model Setup

In [15]:
from core.model import ABAEGenerator

generator = ABAEGenerator(train.max_seq_length, train.negative_size, emb_model, aspect_emb_model)

In [16]:
import torch

# Asses that we have GPU working
torch.cuda.get_device_name(0)

'NVIDIA GeForce RTX 3070 Ti'

## Train Phase

In [17]:
optimizer = 'adam'  # We try Adam as it converges faster
batch_size = 64
epochs = 20

In [18]:
from core import utils

training_model = generator.make_training_model()
training_model.compile(optimizer=optimizer, loss=[utils.max_margin_loss], metrics={'max_margin': utils.max_margin_loss})
history = training_model.fit(x=train_dataloader, batch_size=64, epochs=15)

Epoch 1/15


  super(WeightedAspectEmb, self).__init__(**kwargs)


[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 48ms/step - loss: 9.5550 - max_margin_loss: 9.5550
Epoch 2/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 48ms/step - loss: 4.8051 - max_margin_loss: 4.8051
Epoch 3/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 47ms/step - loss: 4.2061 - max_margin_loss: 4.2061
Epoch 4/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 48ms/step - loss: 4.0448 - max_margin_loss: 4.0448
Epoch 5/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 53ms/step - loss: 3.9617 - max_margin_loss: 3.9617
Epoch 6/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 54ms/step - loss: 3.9426 - max_margin_loss: 3.9426
Epoch 7/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 53ms/step - loss: 3.8991 - max_margin_loss: 3.8991
Epoch 8/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 54ms/step - loss: 3.8871 - max

## Persist model

In [38]:
training_model.save("./../data/abae.kickstarter_removed_64K.keras")

## Model Evaluation

In [39]:
# Load evaluation model
inference_model = generator.make_model("./../data/abae.kickstarter_removed_64K.keras")

In [40]:
word_emb = inference_model.get_layer('word_embedding').get_weights()[0]
word_emb = torch.from_numpy(word_emb)

word_emb.shape

torch.Size([12922, 128])

In [41]:
aspect_embeddings = inference_model.get_layer('weighted_aspect_emb').W
vocab_inv = emb_model.model.wv.index_to_key

In [42]:
import numpy as np

#todo method
aspect_words = []
aspect_index = 0

for aspect in aspect_embeddings:
    aspect = aspect.cpu()
    # Calculate the cosine similarity of each word with the aspect
    word_emb = word_emb / torch.linalg.norm(word_emb, dim=-1, keepdim=True)
    aspect = aspect / torch.linalg.norm(aspect, dim=-1, keepdim=True)

    similarity = word_emb.matmul(aspect.T)

    numpy_similarity = similarity.detach().numpy()

    ordered_words = np.argsort(numpy_similarity)[::-1]
    desc_list = [(vocab_inv[w], numpy_similarity[w]) for w in ordered_words[:15]]
    aspect_words.append(desc_list)

    print("Aspect ", aspect_index)
    for i in desc_list:
        # hr][/i is not a valid word. meh.
        print("Word: ", i[0], f"({i[1]})")

    aspect_index += 1

Aspect  0
Word:  not (0.719325065612793)
Word:  will (0.681557297706604)
Word:  play (0.6385544538497925)
Word:  soon (0.6287999749183655)
Word:  say (0.6282198429107666)
Word:  excited (0.6232393980026245)
Word:  having (0.6223326325416565)
Word:  know (0.6069502830505371)
Word:  hear (0.598689079284668)
Word:  anymore (0.5897776484489441)
Word:  try (0.5886120796203613)
Word:  finally (0.5843713283538818)
Word:  exactly (0.5788878202438354)
Word:  sure (0.5717058181762695)
Word:  continue (0.5636522769927979)
Aspect  1
Word:  carte (0.4393817186355591)
Word:  paddle (0.31424713134765625)
Word:  11/17 (0.26404044032096863)
Word:  maxing (0.22924785315990448)
Word:  players:2 (0.18000350892543793)
Word:  steph (0.176033154129982)
Word:  dice (0.17405401170253754)
Word:  rule (0.15369495749473572)
Word:  65x100 (0.1498761773109436)
Word:  upd (0.1420460045337677)
Word:  boni (0.13681727647781372)
Word:  card (0.13198794424533844)
Word:  worker (0.1316823959350586)
Word:  lv (0.130222275

In [43]:
# For each word of aspect for the aspect we calculate the coherence by AVG distance between top words
counter = 0
for aspect_most_representative_words in aspect_words:
    coherence = []
    for word in aspect_most_representative_words:
        w, score = word
        for word2 in aspect_most_representative_words:
            w2, score = word2
            if w != w2:
                coherence.append(emb_model.model.wv.similarity(w, w2))
    # todo fai avgf cosi natualmente sbagliato  
    print(f"Aspect {counter}  has total coherence of", np.mean(coherence, axis=0))  # AVG
    counter += 1

Aspect 0  has total coherence of 0.6825146
Aspect 1  has total coherence of 0.22206149
Aspect 2  has total coherence of 0.8343714
Aspect 3  has total coherence of 0.615195
Aspect 4  has total coherence of 0.58994347
Aspect 5  has total coherence of 0.66871715
Aspect 6  has total coherence of 0.69296455
Aspect 7  has total coherence of 0.6968302
Aspect 8  has total coherence of 0.73674154
Aspect 9  has total coherence of 0.7429665
Aspect 10  has total coherence of 0.7725564
Aspect 11  has total coherence of 0.9686056
Aspect 12  has total coherence of 0.654336
Aspect 13  has total coherence of 0.5454725


In [45]:
gold_standard_topics = ['luck', 'alea', 'bookkeeping', 'downtime', 'strategy', 'interaction', 'complicated', 'complex',
                        '<UNK>']
counter = 0
for aspect in aspect_embeddings:
    aspect = aspect.cpu()

    aspect = aspect / torch.linalg.norm(aspect, dim=-1, keepdim=True)
    word_emb = word_emb / torch.linalg.norm(word_emb, dim=-1, keepdim=True)

    print("Aspect ", counter)
    # Calculate the cosine similarity of each word with the aspect
    for topic in gold_standard_topics:
        index = emb_model.model.wv.get_index(topic)
        print(f"'{topic}' similarity: ", word_emb[index].dot(aspect))
    emb_model.model.wv.get_vector('luck')
    counter += 1

Aspect  0
'luck' similarity:  tensor(-0.2443, grad_fn=<DotBackward0>)
'alea' similarity:  tensor(-0.0075, grad_fn=<DotBackward0>)
'bookkeeping' similarity:  tensor(-0.0400, grad_fn=<DotBackward0>)
'downtime' similarity:  tensor(0.0263, grad_fn=<DotBackward0>)
'strategy' similarity:  tensor(-0.0364, grad_fn=<DotBackward0>)
'interaction' similarity:  tensor(-0.2552, grad_fn=<DotBackward0>)
'complicated' similarity:  tensor(0.1602, grad_fn=<DotBackward0>)
'complex' similarity:  tensor(-0.0380, grad_fn=<DotBackward0>)
'<UNK>' similarity:  tensor(-0.1207, grad_fn=<DotBackward0>)
Aspect  1
'luck' similarity:  tensor(-0.0703, grad_fn=<DotBackward0>)
'alea' similarity:  tensor(-0.3540, grad_fn=<DotBackward0>)
'bookkeeping' similarity:  tensor(-0.4647, grad_fn=<DotBackward0>)
'downtime' similarity:  tensor(-0.2009, grad_fn=<DotBackward0>)
'strategy' similarity:  tensor(-0.1533, grad_fn=<DotBackward0>)
'interaction' similarity:  tensor(-0.2276, grad_fn=<DotBackward0>)
'complicated' similarity:  

In [37]:
# train more and see
history = training_model.fit(x=train_dataloader, batch_size=64, epochs=15)

Epoch 1/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 51ms/step - loss: 3.8152 - max_margin_loss: 3.8152
Epoch 2/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 50ms/step - loss: 3.8249 - max_margin_loss: 3.8249
Epoch 3/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 50ms/step - loss: 3.8182 - max_margin_loss: 3.8182
Epoch 4/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 49ms/step - loss: 3.7828 - max_margin_loss: 3.7828
Epoch 5/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 52ms/step - loss: 3.8130 - max_margin_loss: 3.8130
Epoch 6/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 52ms/step - loss: 3.7875 - max_margin_loss: 3.7875
Epoch 7/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 50ms/step - loss: 3.7854 - max_margin_loss: 3.7854
Epoch 8/15
[1m784/784[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 50ms/step - loss: 3

# Kickstarter More Dataset (256K)

In [2]:
from train import AbaeModelConfiguration

config = AbaeModelConfiguration(
    corpus_file="../data/processed-dataset/kickstarter-filtered-game-name-filtered/k256_longest.preprocessed.csv",
    model_name="abae.kickstarter_removed.256k",
    aspect_size=16,
    max_vocab_size=40000,
)


In [3]:
from core.train import AbaeModelManager

manager = AbaeModelManager(config)

INFO:gensim.utils:loading Word2Vec object from output\abae.kickstarter_removed.256k.embeddings.model
DEBUG:smart_open.smart_open_lib:{'uri': 'output\\abae.kickstarter_removed.256k.embeddings.model', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'compression': 'infer_from_extension', 'transport_params': None}
INFO:gensim.utils:loading wv recursively from output\abae.kickstarter_removed.256k.embeddings.model.wv.* with mmap=None
INFO:gensim.utils:setting ignored attribute cum_table to None
INFO:gensim.utils:Word2Vec lifecycle event {'fname': 'output\\abae.kickstarter_removed.256k.embeddings.model', 'datetime': '2024-12-09T16:50:21.872806', 'gensim': '4.3.3', 'python': '3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]', 'platform': 'Windows-11-10.0.26100-SP0', 'event': 'loaded'}


In [4]:
import dataset
from torch.utils.data import DataLoader

vocabulary = manager.embedding_model.model.wv.key_to_index
train = dataset.PositiveNegativeCommentGeneratorDataset(
    vocabulary=vocabulary, csv_dataset_path="../data/processed-dataset/kickstarter-filtered-game-name-filtered/k256_longest.preprocessed.csv", negative_size=15
)

train_dataloader = DataLoader(train, batch_size=64, shuffle=True)

Loading spacy model.
Loading dataset from file: ../data/processed-dataset/kickstarter-filtered-game-name-filtered/k256_longest.preprocessed.csv
Generating numeric representation for each word of ds.


Pandas Apply:   0%|          | 0/121600 [00:00<?, ?it/s]

Max sequence length calculation in progress...
We loose information on 3405 points.This is 2.80016447368421% of the dataset.
Padding sequences to max length (256).
Max sequence length is:  2384  but we will limit sequences to 256 tokens.


In [5]:
train_model = manager.prepare_training_model(optimizer='adam')

  super(WeightedAspectEmb, self).__init__(**kwargs)


In [6]:
train_model.fit(train_dataloader, epochs=7, batch_size=64)

Epoch 1/7
[1m1900/1900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 175ms/step - loss: 5.0903 - max_margin_loss: 5.0903
Epoch 2/7
[1m1900/1900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 179ms/step - loss: 2.6581 - max_margin_loss: 2.6581
Epoch 3/7
[1m1900/1900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 178ms/step - loss: 2.5164 - max_margin_loss: 2.5164
Epoch 4/7
[1m1900/1900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m333s[0m 175ms/step - loss: 2.4657 - max_margin_loss: 2.4657
Epoch 5/7
[1m1900/1900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m332s[0m 175ms/step - loss: 2.4425 - max_margin_loss: 2.4425
Epoch 6/7
[1m1900/1900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m335s[0m 176ms/step - loss: 2.4356 - max_margin_loss: 2.4356
Epoch 7/7
[1m1900/1900[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m334s[0m 176ms/step - loss: 2.4117 - max_margin_loss: 2.4117


<keras.src.callbacks.history.History at 0x25427ddc680>

In [7]:
manager.persist_model()

DEBUG:h5py._conv:Creating converter from 5 to 3


In [8]:
#manager.persist_model()
inference_model = manager.prepare_evaluation_model()

In [9]:
import torch

In [10]:
word_emb = inference_model.get_layer('word_embedding').get_weights()[0]
word_emb = torch.from_numpy(word_emb)

word_emb.shape

torch.Size([32315, 128])

In [11]:
aspect_embeddings = inference_model.get_layer('weighted_aspect_emb').W
vocab_inv = manager.embedding_model.model.wv.index_to_key

In [12]:
type(aspect_embeddings[0])

torch.Tensor

In [13]:
import numpy as np

#todo method
aspect_words = []
aspect_index = 0
aspect_embeddings = aspect_embeddings / torch.linalg.norm(aspect_embeddings, dim=-1, keepdim=True)
for aspect in aspect_embeddings:
    aspect = aspect.cpu()
    # Calculate the cosine similarity of each word with the aspect
    word_emb = word_emb / torch.linalg.norm(word_emb, dim=-1, keepdim=True)
    #aspect = aspect / torch.linalg.norm(aspect, dim=-1, keepdim=True)

    similarity = word_emb.matmul(aspect.T)

    numpy_similarity = similarity.detach().numpy()

    ordered_words = np.argsort(numpy_similarity)[::-1]
    desc_list = [(vocab_inv[w], numpy_similarity[w]) for w in ordered_words[:15]]
    aspect_words.append(desc_list)

    print("Aspect ", aspect_index)
    for i in desc_list:
        # hr][/i is not a valid word. meh.
        print("Word: ", i[0], f"({i[1]})")

    aspect_index += 1

Aspect  0
Word:  British (0.8389521241188049)
Word:  invasion (0.8321045637130737)
Word:  british (0.820457398891449)
Word:  russian (0.817176103591919)
Word:  Austrians (0.8170284032821655)
Word:  Turkey (0.816960334777832)
Word:  Italy (0.8125898838043213)
Word:  Moscow (0.8124603033065796)
Word:  Spain (0.8119088411331177)
Word:  Americans (0.8112927675247192)
Word:  warlord (0.8055378198623657)
Word:  Russia (0.8051453828811646)
Word:  Britain (0.8017892837524414)
Word:  Ottoman (0.8010320067405701)
Word:  northern (0.8008626699447632)
Aspect  1
Word:  Resources (0.5151026248931885)
Word:  postage (0.5071635246276855)
Word:  PP (0.5059881210327148)
Word:  ○ (0.49518734216690063)
Word:  handsize (0.4864172339439392)
Word:  Tax (0.48563623428344727)
Word:  resouce (0.4825304448604584)
Word:  ingot (0.4771419167518616)
Word:  Points (0.47412967681884766)
Word:  drm (0.47392070293426514)
Word:  Income (0.47262609004974365)
Word:  Rating (0.4663636088371277)
Word:  us$ (0.46594294905662

  similarity = word_emb.matmul(aspect.T)


In [14]:
# For each word of aspect for the aspect we calculate the coherence by AVG distance between top words
counter = 0
for aspect_most_representative_words in aspect_words:
    coherence = []
    for word in aspect_most_representative_words:
        w, score = word
        for word2 in aspect_most_representative_words:
            w2, score = word2
            if w != w2:
                coherence.append(manager.embedding_model.model.wv.similarity(w, w2))
    # todo fai avgf cosi natualmente sbagliato  
    print(f"Aspect {counter}  has total coherence of", np.mean(coherence, axis=0))  # AVG
    counter += 1

Aspect 0  has total coherence of 0.79174244
Aspect 1  has total coherence of 0.53386927
Aspect 2  has total coherence of 0.33116376
Aspect 3  has total coherence of 0.4914523
Aspect 4  has total coherence of 0.4800929
Aspect 5  has total coherence of 0.89680785
Aspect 6  has total coherence of 0.4224422
Aspect 7  has total coherence of 0.5595991
Aspect 8  has total coherence of 0.6626675
Aspect 9  has total coherence of 0.708104
Aspect 10  has total coherence of 0.5779117
Aspect 11  has total coherence of 0.6887438
Aspect 12  has total coherence of 0.71894604
Aspect 13  has total coherence of 0.44459954
Aspect 14  has total coherence of 0.65887916
Aspect 15  has total coherence of 0.851783
