In [1]:
import os

# Select the PyTorch backend for Keras. Keras picks the backend up from the
# KERAS_BACKEND environment variable when it is first imported, so this cell
# has to run before anything imports keras.
os.environ["KERAS_BACKEND"] = "torch"

# Embeddings Model
We use word2vec to generate the embeddings model. <br>
What algorithm should we choose? CBOW or Skipgram? There are some tradeoffs:

> CBOW is faster and does very well on frequent terms. <br> Skipgram requires more training time but
captures the context of rare words better

For the embedding size we start with 128 as a first value.

In [2]:
# Experiments from here on use the 64k dataset.
from core.embeddings import WordEmbedding
from core.utils import LoadCorpusUtility

# Load the preprocessed 64k corpus.
# NOTE(review): the loader is built with min_word_count=4 while the embedding
# below uses min_word_count=5 - confirm the mismatch is intended.
corpus_csv = '../data/processed-dataset/full/64k.preprocessed.csv'
loader = LoadCorpusUtility(min_word_count=4)
corpus = loader.load_data(data_file_path=corpus_csv)

# Generate embeddings of size 128. No `sg` flag is passed here, so this is
# presumably the CBOW run that the skip-gram cell is compared against.
cbow_model_path = "../output/64k-full.embeddings.model"
emb_model = WordEmbedding(128, cbow_model_path, min_word_count=5)
emb_model.generate(corpus)

Pandas Apply:   0%|          | 0/80286 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/80286 [00:00<?, ?it/s]

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 69874 words, keeping 4753 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 144092 words, keeping 5790 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 218736 words, keeping 6198 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 295254 words, keeping 6372 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 372434 words, keeping 6413 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 445937 words, keeping 6429 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 517713 words, keeping 6433 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 593358 words, keeping 6435 word

In [11]:
# Experiments from here on use the 64k dataset.
from core.embeddings import WordEmbedding
from core.utils import LoadCorpusUtility

# Load the preprocessed 64k corpus.
# NOTE(review): the loader is built with min_word_count=4 while the embedding
# below uses min_word_count=5 - confirm the mismatch is intended.
corpus_csv = '../data/processed-dataset/full/64k.preprocessed.csv'
loader = LoadCorpusUtility(min_word_count=4)
corpus = loader.load_data(data_file_path=corpus_csv)

# Generate embeddings of size 128 with `sg=True` (the skip-gram variant, stored
# under the "-sg" model path). This rebinds `emb_model`, so later cells query
# whichever model was generated last.
sg_model_path = "../output/64k-full.embeddings-sg.model"
emb_model = WordEmbedding(128, sg_model_path, min_word_count=5)
emb_model.generate(corpus, sg=True)

Pandas Apply:   0%|          | 0/80286 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/80286 [00:00<?, ?it/s]

INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 69874 words, keeping 4753 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 144092 words, keeping 5790 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 218736 words, keeping 6198 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 295254 words, keeping 6372 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #50000, processed 372434 words, keeping 6413 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #60000, processed 445937 words, keeping 6429 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #70000, processed 517713 words, keeping 6433 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #80000, processed 593358 words, keeping 6435 word

We can see that generating with CBOW is noticeably faster. <br>
However, we still have very few records in our dataset, so the difference is irrelevant. <br>

We opt for SkipGram for the smaller datasets; for the 1M dataset (which might be the last test) we will probably go for CBOW.

In [13]:
# Sanity check: nearest neighbours of "random" in the embedding space.
emb_model.model.wv.most_similar(positive=["random"])

[('swingy', 0.8061403632164001),
 ('randomness', 0.7774782180786133),
 ('deterministic', 0.7737147808074951),
 ('mercy', 0.7505597472190857),
 ('outcome', 0.74383145570755),
 ('unbalanced', 0.7373806238174438),
 ('rely', 0.7349971532821655),
 ('lucky', 0.7327756881713867),
 ('pointless', 0.7256832122802734),
 ('chaos', 0.7223337888717651)]

In [8]:
# Vocabulary size, counted as the number of entries in the word vectors'
# key -> index mapping.
# NOTE(review): the saved output of this cell (6790) does not match the 6436
# shown for `len(emb_model.vocabulary())` at the end of the notebook, and the
# cells were executed out of order - re-run top to bottom to confirm both.
print(f"We have a total of words: {len(emb_model.model.wv.key_to_index)}")

We have a total of words: 6790


In [10]:
# Never re-sort the vocabulary: doing so appears to change the `most_similar`
# results (looks like a bug in the Word2Vec model, see the link below).
# https://stackoverflow.com/questions/68451937/gensim-sort-by-descending-frequency-changes-most-similar-results
# Do NOT call: emb_model.model.wv.sort_by_descending_frequency() # Simpler tests

# Read the 'count' attribute stored on the first and last vocabulary entries.
# Assumes the vocabulary is ordered by descending frequency, so that index 0 is
# the most frequent word and the last index is among the least frequent ones
# - TODO confirm that WordEmbedding keeps that ordering.
last_index = len(emb_model.model.wv.key_to_index) - 1
frequency = emb_model.model.wv.get_vecattr(last_index, 'count')
most_frequency = emb_model.model.wv.get_vecattr(0, 'count')

print(f"One of the least frequent word is '{emb_model.model.wv.index_to_key[last_index]}'({frequency})")
print(f"The most frequent word is '{emb_model.model.wv.index_to_key[0]}' ({most_frequency})")

One of the least frequent word is '☆'(5)
The most frequent word is 'game' (33514)


In [12]:
# Top-20 nearest neighbours of "ap". The closest hits include 'paralysis' and
# 'analysis', which suggests the token is shorthand for "analysis paralysis".
# NOTE(review): the comment previously attached to this cell read "Root +
# Pirates = Ahoy as they are manufactured and designed by the same company and
# often referenced together"; it describes a different query than the one run
# here.
emb_model.model.wv.most_similar(positive=["ap"], negative=[], topn=20)

[('prone', 0.9203250408172607),
 ('paralysis', 0.8648532629013062),
 ('analysis', 0.8551996350288391),
 ('induce', 0.8508877158164978),
 ('downtime', 0.8423551917076111),
 ('concerned', 0.7912846207618713),
 ('drag', 0.7892791628837585),
 ('consideration', 0.7807860970497131),
 ('tend', 0.7693648934364319),
 ('unforgiving', 0.7632957696914673),
 ('practice', 0.7603340744972229),
 ('affair', 0.7601995468139648),
 ('tendency', 0.7552022337913513),
 ('grind', 0.7545714378356934),
 ('slog', 0.7543879151344299),
 ('dramatic', 0.7520149946212769),
 ('commitment', 0.7518084049224854),
 ('burner', 0.7504880428314209),
 ('extreme', 0.7493808269500732),
 ('curve', 0.7487448453903198)]

In [9]:
# Number of entries returned by the project's `WordEmbedding.vocabulary()` helper.
len(emb_model.vocabulary())

6436