In [1]:
import os

# Select the Keras backend. Keras reads KERAS_BACKEND when it is first imported,
# so this assignment has to run before the cell that does `import keras`.
os.environ['KERAS_BACKEND'] = "torch"

In [3]:
import embeddings
import keras

# Relative paths handed to embeddings.WordEmbedding via its `target_file` and
# `corpus_file` keyword arguments; corpus_file is also given to dataset.CommentDataset.
# They are resolved against the kernel's working directory.
target_file = "./../data/embeddings-target.model"
corpus_file = "./../data/corpus.preprocessed.csv"

## Vocabulary size and relevant words
It is beneficial to keep in the model only words that reach a minimum number of occurrences:

> Yes, generally, removing very infrequent words when building embeddings models (like Word2Vec or GloVe) is standard practice. Rare words, appearing only a few times in a large corpus, don’t have enough context to generate meaningful embeddings, which can dilute the quality of the learned representations.

> However, if your corpus is specialized and every rare word holds unique domain significance, you might choose a lower frequency threshold. But typically, for more general embeddings, filtering out low-frequency words leads to cleaner, more efficient embeddings without much loss in semantic quality.

Therefore we use a minimum frequency threshold of 4. This is a small value (5 is the usual starting point),
but because our task is very specialized we lower it slightly so that more domain-specific words are kept.






# Gensim Word2Vec

In [4]:

# TODO: tune these hyperparameters (study which values work best).
# NOTE(review): the first positional argument is 10000, yet the loaded model
# reports 65055 words further down — confirm what this argument controls.
# NOTE(review): 128 is presumably the embedding dimension — confirm against
# embeddings.WordEmbedding.
emb_model = embeddings.WordEmbedding(10000, 128, target_file=target_file, corpus_file=corpus_file)
# Presumably loads a previously trained model (from target_file?) — confirm in the embeddings module.
emb_model.load_model()

In [18]:
# Nearest neighbours of the combined "wingspan" + "dragon" query (library default number of results).
emb_model.model.wv.most_similar(positive=["wingspan", "dragon"])

[('wyrmspan', 0.5756521224975586),
 ('everdell', 0.5665486454963684),
 ('bird', 0.4801858365535736),
 ('photosynthesis', 0.47118669748306274),
 ('viticulture', 0.46620768308639526),
 ('splendor', 0.46151286363601685),
 ('scythe', 0.4478973150253296),
 ('enchantment', 0.44542452692985535),
 ('bitoku', 0.44291630387306213),
 ('lords', 0.43769797682762146)]

In [6]:
# Report the vocabulary size: the number of keys in the model's key -> index mapping.
print(f"We have a total of words: {len(emb_model.model.wv.key_to_index)}")

We have a total of words: 65055


In [7]:
# Embedding vector stored at index 0 of the keyed vectors.
emb_model.model.wv[0]

array([ 1.5029165 ,  2.3624861 ,  2.2939138 , -0.13856713,  0.18078902,
       -1.0472201 , -0.56067294,  1.1422796 , -0.49811956, -0.2217462 ,
       -1.1803151 ,  0.15797569,  0.31748328,  1.8089781 , -1.3316301 ,
       -0.7281514 ,  0.22855756, -0.5512726 , -0.24345373, -0.65646636,
        0.4133309 , -0.66919994,  2.3773947 , -0.22472824,  0.13201578,
       -0.4081477 ,  0.70655   , -0.14592205, -1.2804753 ,  0.09212334,
        0.8463506 , -0.04296552,  1.5006573 ,  0.45384318,  0.30417955,
       -0.9204467 ,  1.1173109 ,  0.16690648,  1.4310523 ,  0.5637191 ,
        0.4042798 , -0.5923682 , -1.0711784 , -0.7060842 , -0.41067874,
        1.0436958 , -0.1179394 ,  1.968886  , -0.47020712,  1.7497754 ,
       -0.88896435,  0.47999144,  0.28804758, -1.5539659 ,  1.3963755 ,
        0.66093767,  0.43946657,  1.8425758 ,  0.00559428,  0.02443716,
       -0.48243096, -0.24423185, -0.896593  ,  0.34101382, -0.45089987,
       -0.29079002, -0.88843846, -1.3316691 , -0.20916806,  2.70

In [8]:
# Do NOT re-sort the vocabulary: sort_by_descending_frequency() appears to change
# the most_similar() results (possibly a gensim bug), see
# https://stackoverflow.com/questions/68451937/gensim-sort-by-descending-frequency-changes-most-similar-results
# emb_model.model.wv.sort_by_descending_frequency() # Simpler tests

# Inspect both ends of the vocabulary index: position 0 and the final position.
word_vectors = emb_model.model.wv
last_index = len(word_vectors.key_to_index) - 1
most_frequency = word_vectors.get_vecattr(0, 'count')
frequency = word_vectors.get_vecattr(last_index, 'count')

print(f"One of the least frequent word is '{word_vectors.index_to_key[last_index]}'x({frequency})")
print(f"The most frequent word is '{word_vectors.index_to_key[0]}' x({most_frequency})")

One of the least frequent word is 'redundantly'x(4)
The most frequent word is 'game' x(2241403)


Because the reviews describe the games and give some insight into them, the vocabulary also contains words that have little to do with gameplay, such as "Darth Vader".

In [11]:
# Occurrence count of a leftover markup token (it looks like a fragment of the
# BBCode "[b]complexity:[/b]").
# NOTE(review): tokens like this probably should be stripped during preprocessing — confirm.
emb_model.model.wv.get_vecattr("b]complexity:[/b", 'count')

49

In [9]:
# Root + Pirate -> "ahoy": both games are designed and published by the same
# company and are often referenced together in reviews.
emb_model.model.wv.most_similar(positive=["root", "pirate"], negative=[], topn=20)

[('ahoy', 0.5687939524650574),
 ('marauder', 0.5133765339851379),
 ('caribbean', 0.5094255208969116),
 ('swashbuckling', 0.5034607648849487),
 ('oath', 0.4972757399082184),
 ('xia', 0.4971827268600464),
 ('cove', 0.4962853193283081),
 ('firefly', 0.4872472882270813),
 ('shark', 0.47869935631752014),
 ('jamaica', 0.4708572030067444),
 ('plunder', 0.47066861391067505),
 ('godfather', 0.46718424558639526),
 ('merchant', 0.46024590730667114),
 ('smuggler', 0.45988139510154724),
 ('blackbeard', 0.45532387495040894),
 ('gangster', 0.45510271191596985),
 ('mobster', 0.4525229036808014),
 ('explorer', 0.44635242223739624),
 ('colonial', 0.4418928921222687),
 ('spartacus', 0.4414701461791992)]

### Should I keep title names?
Yes, I should. Game names carry meaning, because reviews frequently reference other games with similar mechanics or a similar general feel. By keeping the titles in my corpus, I make sure the embeddings capture the significance of these comparisons.

In [16]:
# Word2Vec vector at index 0, displayed again as the reference for the
# embedding-layer lookup in the following cell.
emb_model.model.wv[0]

array([ 1.5029165 ,  2.3624861 ,  2.2939138 , -0.13856713,  0.18078902,
       -1.0472201 , -0.56067294,  1.1422796 , -0.49811956, -0.2217462 ,
       -1.1803151 ,  0.15797569,  0.31748328,  1.8089781 , -1.3316301 ,
       -0.7281514 ,  0.22855756, -0.5512726 , -0.24345373, -0.65646636,
        0.4133309 , -0.66919994,  2.3773947 , -0.22472824,  0.13201578,
       -0.4081477 ,  0.70655   , -0.14592205, -1.2804753 ,  0.09212334,
        0.8463506 , -0.04296552,  1.5006573 ,  0.45384318,  0.30417955,
       -0.9204467 ,  1.1173109 ,  0.16690648,  1.4310523 ,  0.5637191 ,
        0.4042798 , -0.5923682 , -1.0711784 , -0.7060842 , -0.41067874,
        1.0436958 , -0.1179394 ,  1.968886  , -0.47020712,  1.7497754 ,
       -0.88896435,  0.47999144,  0.28804758, -1.5539659 ,  1.3963755 ,
        0.66093767,  0.43946657,  1.8425758 ,  0.00559428,  0.02443716,
       -0.48243096, -0.24423185, -0.896593  ,  0.34101382, -0.45089987,
       -0.29079002, -0.88843846, -1.3316691 , -0.20916806,  2.70

In [15]:
import torch
# Build the embedding layer through the project helper.
# NOTE(review): presumably initialised from the Word2Vec weights — confirm in the embeddings module.
layer = emb_model.build_embedding_layer()
# Look up index 0; the displayed values should match emb_model.model.wv[0].
layer(torch.tensor([0]))

tensor([[ 1.5029,  2.3625,  2.2939, -0.1386,  0.1808, -1.0472, -0.5607,  1.1423,
         -0.4981, -0.2217, -1.1803,  0.1580,  0.3175,  1.8090, -1.3316, -0.7282,
          0.2286, -0.5513, -0.2435, -0.6565,  0.4133, -0.6692,  2.3774, -0.2247,
          0.1320, -0.4081,  0.7066, -0.1459, -1.2805,  0.0921,  0.8464, -0.0430,
          1.5007,  0.4538,  0.3042, -0.9204,  1.1173,  0.1669,  1.4311,  0.5637,
          0.4043, -0.5924, -1.0712, -0.7061, -0.4107,  1.0437, -0.1179,  1.9689,
         -0.4702,  1.7498, -0.8890,  0.4800,  0.2880, -1.5540,  1.3964,  0.6609,
          0.4395,  1.8426,  0.0056,  0.0244, -0.4824, -0.2442, -0.8966,  0.3410,
         -0.4509, -0.2908, -0.8884, -1.3317, -0.2092,  2.7050,  0.8188,  0.6476,
          2.4784, -1.3069,  1.6553, -0.9248,  1.3477,  0.4510, -0.1565, -0.5708,
         -0.2060,  0.1317, -1.3148,  0.3667, -1.7870,  1.3845,  0.0983,  0.9706,
         -1.4111,  0.7468,  0.3823,  0.9607,  0.0943, -0.8555, -0.6475, -0.7499,
         -0.7993, -3.0169,  

The embedding layer is loaded correctly with the weights I got from Word2Vec: its output for index 0 matches `emb_model.model.wv[0]` shown above (up to display rounding).

# My Word2Vec Implementation
But is it worth it?

In [None]:
# Instantiate the hand-written Word2Vec model.
# NOTE(review): the arguments are presumably (vocabulary size, embedding dimension, <path>);
# the meaning of the empty string is not visible here — confirm against embeddings.MyWord2Vec.
my_word2vec = embeddings.MyWord2Vec(25000, 128, "")

In [None]:
import dataset
from torch.utils.data import DataLoader

# Dataset built from the preprocessed corpus file.
ds = dataset.CommentDataset(corpus_file)

# collate_fn is the identity, so every batch is the plain list of (up to) 32
# dataset items; no stacking into tensors takes place.
training_dataloader = DataLoader(ds, batch_size=32, shuffle=True, collate_fn=lambda x: x)
# TODO: Convert tokens to numbers (indices of a vocabulary), as the model doesn't do that by itself
my_word2vec.compile(optimizer="SGD", loss="categorical_crossentropy", metrics=["acc"])
my_word2vec.fit(training_dataloader, epochs=10)

Pandas Apply:   0%|          | 0/1939904 [00:00<?, ?it/s]

In [7]:
# NOTE(review): scratch cell — it needs `ds` from the dataset cell and raises a
# NameError on a kernel where that cell has not been run.
ds.dataset.itertuples()

NameError: name 'ds' is not defined