In [4]:
import os

# Select the Keras backend. Keras reads this variable when it is first imported,
# so this cell has to run before any cell that imports `keras`.
os.environ["KERAS_BACKEND"] = "torch"

In [5]:
import embeddings
import keras

# Paths are relative to the notebook's working directory.
# `corpus_file` is the preprocessed corpus CSV; `target_file` is the path handed to
# `embeddings.WordEmbedding` as the embedding model file.
corpus_file = "./../data/corpus.preprocessed.csv"
target_file = "./../data/embeddings-target.model"

## Vocabulary size and relevant words
It is beneficial to keep in the model only words with a minimum number of occurrences, because:

> Yes, generally, removing very infrequent words when building embeddings models (like Word2Vec or GloVe) is standard practice. Rare words, appearing only a few times in a large corpus, don’t have enough context to generate meaningful embeddings, which can dilute the quality of the learned representations.

> However, if your corpus is specialized and every rare word holds unique domain significance, you might choose a lower frequency threshold. But typically, for more general embeddings, filtering out low-frequency words leads to cleaner, more efficient embeddings without much loss in semantic quality.

Therefore we keep a minimum frequency threshold of 4. This is a small value (5 is usually a good
starting point), but we go slightly lower because our task is very specialized.






# Gensim Word2Vec

In [6]:

# TODO: study which parameters work best instead of hard-coding them.
# NOTE(review): the meaning of the two positional arguments (7200, 128) is defined by
# `embeddings.WordEmbedding` and is not visible here — confirm before changing them.
emb_model = embeddings.WordEmbedding(
    7200,
    128,
    target_file=target_file,
    corpus_file=corpus_file,
)
emb_model.load_model()

Pandas Apply:   0%|          | 0/1939904 [00:00<?, ?it/s]

In [7]:
# Ten nearest neighbours of "agency" in the embedding space.
emb_model.model.wv.most_similar(positive="agency", topn=10)

[('interaction', 0.6038694977760315),
 ('randomness', 0.5666834115982056),
 ('freedom', 0.5644192695617676),
 ('choice', 0.5464454889297485),
 ('interactivity', 0.5412836074829102),
 ('tension', 0.524446427822113),
 ('option', 0.5115113258361816),
 ('variance', 0.5098146796226501),
 ('uncertainty', 0.5078426599502563),
 ('drama', 0.49698948860168457)]

In [8]:
# Number of distinct words kept in the model's vocabulary.
vocabulary_size = len(emb_model.model.wv.key_to_index)
print(f"We have a total of words: {vocabulary_size}")

We have a total of words: 65055


In [9]:
# Embedding vector stored at vocabulary index 0.
emb_model.model.wv.get_vector(0)

array([ 1.5029165 ,  2.3624861 ,  2.2939138 , -0.13856713,  0.18078902,
       -1.0472201 , -0.56067294,  1.1422796 , -0.49811956, -0.2217462 ,
       -1.1803151 ,  0.15797569,  0.31748328,  1.8089781 , -1.3316301 ,
       -0.7281514 ,  0.22855756, -0.5512726 , -0.24345373, -0.65646636,
        0.4133309 , -0.66919994,  2.3773947 , -0.22472824,  0.13201578,
       -0.4081477 ,  0.70655   , -0.14592205, -1.2804753 ,  0.09212334,
        0.8463506 , -0.04296552,  1.5006573 ,  0.45384318,  0.30417955,
       -0.9204467 ,  1.1173109 ,  0.16690648,  1.4310523 ,  0.5637191 ,
        0.4042798 , -0.5923682 , -1.0711784 , -0.7060842 , -0.41067874,
        1.0436958 , -0.1179394 ,  1.968886  , -0.47020712,  1.7497754 ,
       -0.88896435,  0.47999144,  0.28804758, -1.5539659 ,  1.3963755 ,
        0.66093767,  0.43946657,  1.8425758 ,  0.00559428,  0.02443716,
       -0.48243096, -0.24423185, -0.896593  ,  0.34101382, -0.45089987,
       -0.29079002, -0.88843846, -1.3316691 , -0.20916806,  2.70

In [10]:
# Do NOT call `sort_by_descending_frequency()` on the vectors: re-sorting seems to
# change the `most_similar` results (looks like a bug in the Word2Vec model), see
# https://stackoverflow.com/questions/68451937/gensim-sort-by-descending-frequency-changes-most-similar-results

# NOTE(review): this assumes the vocabulary is already ordered by descending frequency,
# so index 0 is the most frequent word and the last index is among the rarest — confirm.
word_vectors = emb_model.model.wv

last_index = len(word_vectors.key_to_index) - 1
most_frequency = word_vectors.get_vecattr(0, 'count')
frequency = word_vectors.get_vecattr(last_index, 'count')

rarest_word = word_vectors.index_to_key[last_index]
commonest_word = word_vectors.index_to_key[0]
print(f"One of the least frequent word is '{rarest_word}'x({frequency})")
print(f"The most frequent word is '{commonest_word}' x({most_frequency})")

One of the least frequent word is 'redundantly'x(4)
The most frequent word is 'game' x(2241403)


Because the reviews describe the games and give some insight into them, the vocabulary also contains words that have little to do with gameplay, such as "Darth Vader".

In [11]:
# Occurrence count of a token that looks like a leftover `[b]...[/b]` markup fragment;
# a non-zero count means some markup survived preprocessing and reached the vocabulary.
emb_model.model.wv.get_vecattr("b]complexity:[/b", "count")

49

In [17]:
# Analogy-style query: words close to "eagle" and "vagabond" but far from "root".
emb_model.model.wv.most_similar(
    positive=["eagle", "vagabond"],
    negative=["root"],
    topn=20,
)

[('fakir', 0.6225370764732361),
 ('buffalo', 0.596053421497345),
 ('dlp', 0.5904033780097961),
 ('pegasus', 0.5874199867248535),
 ('picket', 0.5857054591178894),
 ('inklusive', 0.5801913738250732),
 ('cholm', 0.5785760879516602),
 ('fred', 0.5751404166221619),
 ('krystallium', 0.5749009251594543),
 ('xo', 0.5703404545783997),
 ('fuego', 0.5692890286445618),
 ('spearman', 0.5690612196922302),
 ('preorder', 0.5670077800750732),
 ('talon', 0.5667649507522583),
 ('12/2023', 0.5640817284584045),
 ('moldy', 0.5608451962471008),
 ('ortskarten', 0.5601773858070374),
 ('claude', 0.5599327683448792),
 ('saxophone', 0.5594304800033569),
 ('sergeant', 0.5591108798980713)]

### Should I keep title names?
Yes, I should. Game names carry meaning, because reviews frequently reference games with similar mechanics or a similar general feel. By keeping them in the corpus, I make sure these comparisons contribute meaning to the embeddings.

# My Word2Vec Implementation
But is it worth it?

In [None]:
# Instantiate the custom Word2Vec implementation from the local `embeddings` module.
# NOTE(review): the meaning of the positional arguments (25000, 128, "") is not visible
# here — presumably vocabulary size, embedding dimension and a file path; confirm
# against `embeddings.MyWord2Vec`.
my_word2vec = embeddings.MyWord2Vec(25000, 128, "")

In [None]:
import dataset
from torch.utils.data import DataLoader

ds = dataset.CommentDataset(corpus_file)

# The identity collate function hands each batch over as a plain list of samples.
training_dataloader = DataLoader(
    ds,
    batch_size=32,
    shuffle=True,
    collate_fn=lambda batch: batch,
)

# TODO: convert the samples to numbers (vocabulary indices) first, as the model
# does not do that by itself.
my_word2vec.compile(
    optimizer="SGD",
    loss="categorical_crossentropy",
    metrics=["acc"],
)
my_word2vec.fit(training_dataloader, epochs=10)

Pandas Apply:   0%|          | 0/1939904 [00:00<?, ?it/s]

In [7]:
# Depends on `ds`, which is created in the training cell; the recorded NameError is
# presumably from running this cell in a kernel where that cell had not been executed.
ds.dataset.itertuples()

NameError: name 'ds' is not defined