In [1]:
import argparse
import logging
import multiprocessing

from gensim.models import KeyedVectors
from gensim.models import Word2Vec

### Gensim Word2Vec
_Documentation: https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec_
* `sentences` _(iterable of iterables, optional)_: The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See `BrownCorpus`, `Text8Corpus` or `LineSentence` in word2vec module for such examples. See also the tutorial on data streaming in Python. If you don’t supply sentences, the model is left uninitialized – use if you plan to initialize it in some other way.
* `vector_size` _(int, optional)_: Dimensionality of the word vectors.
* `window` _(int, optional)_: Maximum distance between the current and predicted word within a sentence.
* `min_count` _(int, optional)_: Ignores all words with total frequency lower than this.
* `workers` _(int, optional)_: Use these many worker threads to train the model (=faster training with multicore machines).
* `sg` _({0, 1}, optional)_: Training algorithm: 1 for skip-gram; otherwise CBOW.
* `hs` _({0, 1}, optional)_: If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
* `negative` _(int, optional)_: If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.

In short:

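| SG | HS | Negative | Training Algorithm |
|----|----|----------|-------------------|
| 1  | 1  | = 0      | Skip-Gram Hierarchical Softmax |
| 1  | 1  | $\neq$ 0 | Skip-Gram with Hierarchical Softmax and Negative Sampling combined |
| 1  | 0  | $\neq$ 0 | Skip-Gram Negative Sampling |
| 1  | 0  | = 0      | No training (neither objective is active) |
| 0  | 1  | = 0      | CBOW Hierarchical Softmax |
| 0  | 1  | $\neq$ 0 | CBOW with Hierarchical Softmax and Negative Sampling combined |
| 0  | 0  | $\neq$ 0 | CBOW Negative Sampling |
| 0  | 0  | = 0      | No training (neither objective is active) |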


In [5]:
# Training configuration for gensim's Word2Vec.
# Documentation: https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
# Assumption: every input is a txt file with one sentence per line.
INPUT = ["../corpus/turkish-texts-tokenized.txt", "../corpus/bounwebcorpus.txt"]
MIN_COUNT = 10  # ignore all words with total frequency lower than this
EMB = 300       # dimensionality of the word vectors
WINDOW = 5      # maximum distance between the target and context word within a sentence
EPOCH = 10      # number of iterations (epochs) over the corpus
SG = 1          # training algorithm: 1 for skip-gram; otherwise CBOW
HS = 0          # if 1, hierarchical softmax is used for training; if 0 and NEGATIVE is non-zero, negative sampling is used
NEGATIVE = 5    # if > 0, negative sampling is used and this is the number of "noise words" drawn (usually between 5-20); 0 disables it

# Output path for the trained vectors. The epoch count is taken from EPOCH so
# that the file name cannot drift away from the setting actually used.
OUTPUT = f"word2vec_{EPOCH}epoch.model"

# Keep at least one of HS / NEGATIVE enabled: if both are 0, neither training
# objective is active, so the run does not learn useful embeddings.

In [3]:
class LineSentences(object):
    """Re-iterable stream of tokenized sentences read from one or more text files.

    Each line of each file is treated as one sentence and split on whitespace.
    Files are read lazily, line by line, so the corpus never has to fit in
    memory. Every call to ``iter()`` reopens the files, so the object can be
    iterated more than once.

    Args:
        filenames: An iterable of paths to UTF-8 text files, read in the given
            order. A single path given as a ``str`` is accepted as well.

    Yields:
        list[str]: The whitespace-separated tokens of one line. A blank line
        yields an empty list.
    """

    def __init__(self, filenames):
        # A bare string would otherwise be iterated character by character,
        # with every character treated as a file name.
        if isinstance(filenames, str):
            filenames = [filenames]
        self.filenames = filenames

    # memory-friendly iterator
    def __iter__(self):
        for filename in self.filenames:
            # The context manager closes each file as soon as it is exhausted
            # (or the generator is closed) instead of leaking the handle.
            with open(filename, "r", encoding="utf-8") as corpus_file:
                for line in corpus_file:
                    yield line.strip().split()

In [4]:
# Show gensim's INFO-level progress messages (vocabulary scan, training) in the cell output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Build the vocabulary and train in a single call; every hyper-parameter comes
# from the configuration cell.
model = Word2Vec(
    sentences=LineSentences(INPUT),
    # vocabulary / vector shape
    min_count=MIN_COUNT,
    vector_size=EMB,
    window=WINDOW,
    # training objective
    sg=SG,
    hs=HS,
    negative=NEGATIVE,
    # training schedule
    epochs=EPOCH,
    workers=multiprocessing.cpu_count(),
    compute_loss=True,
)

# Only the word vectors (model.wv) are exported, in binary word2vec format.
model.wv.save_word2vec_format(OUTPUT, binary=True)

# NOTE(review): gensim appears to report this loss as a running total over the
# whole training run rather than a per-epoch value — confirm before comparing runs.
training_loss = model.get_latest_training_loss()
print(training_loss)

2023-03-02 14:58:01,828 : INFO : collecting all words and their counts
2023-03-02 14:58:01,834 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-03-02 14:58:02,177 : INFO : PROGRESS: at sentence #10000, processed 710941 words, keeping 93662 word types
2023-03-02 14:58:02,440 : INFO : PROGRESS: at sentence #20000, processed 1311255 words, keeping 151396 word types
2023-03-02 14:58:02,695 : INFO : PROGRESS: at sentence #30000, processed 1877457 words, keeping 189998 word types
2023-03-02 14:58:02,973 : INFO : PROGRESS: at sentence #40000, processed 2440067 words, keeping 221873 word types
2023-03-02 14:58:03,214 : INFO : PROGRESS: at sentence #50000, processed 2994606 words, keeping 248961 word types
2023-03-02 14:58:03,437 : INFO : PROGRESS: at sentence #60000, processed 3535180 words, keeping 273321 word types
2023-03-02 14:58:03,689 : INFO : PROGRESS: at sentence #70000, processed 4122139 words, keeping 299176 word types
2023-03-02 14:58:03,932 : INFO : P

In [6]:
# Reload the exported vectors from disk (binary word2vec format) so the queries
# below can run without retraining. KeyedVectors is imported in the import cell
# at the top of the notebook.
word_vectors = KeyedVectors.load_word2vec_format(OUTPUT, binary=True)

2023-03-03 10:05:08,964 : INFO : loading projection weights from word2vec_10epoch.model
2023-03-03 10:05:26,536 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (1438847, 300) matrix of type float32 from word2vec_10epoch.model', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-03-03T10:05:26.526493', 'gensim': '4.3.0', 'python': '3.9.2 (tags/v3.9.2:1a79785, Feb 19 2021, 13:44:55) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'load_word2vec_format'}


In [7]:
# Analogy sanity check: "kral" (king) - "adam" (man) + "kadın" (woman).
# Left as the last expression of the cell so the ranked neighbours are displayed.
word_vectors.most_similar_cosmul(
    positive=["kadın", "kral"],
    negative=["adam"],
)

[('kraliçe', 0.8642388582229614),
 ('erkek', 0.8368889689445496),
 ('birendra', 0.8171851634979248),
 ('kraliyet', 0.8053973317146301),
 ('kralın', 0.804827868938446),
 ('arap', 0.7999547123908997),
 ('antiokhos', 0.7965826392173767),
 ('kommegene', 0.7955273389816284),
 ("şah'ın", 0.7921311855316162),
 ("leto'nun", 0.7907087206840515)]