In [1]:
import argparse
import multiprocessing
import logging

from gensim.models import Word2Vec

### Gensim Word2Vec
_Documentation: https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec_
* `sentences` _(iterable of iterables, optional)_: The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See `BrownCorpus`, `Text8Corpus` or `LineSentence` in word2vec module for such examples. See also the tutorial on data streaming in Python. If you don’t supply sentences, the model is left uninitialized – use if you plan to initialize it in some other way.
* `vector_size` _(int, optional)_: Dimensionality of the word vectors.
* `window` _(int, optional)_: Maximum distance between the current and predicted word within a sentence.
* `min_count` _(int, optional)_: Ignores all words with total frequency lower than this.
* `workers` _(int, optional)_: Use these many worker threads to train the model (=faster training with multicore machines).
* `sg` _({0, 1}, optional)_: Training algorithm: 1 for skip-gram; otherwise CBOW.
* `hs` _({0, 1}, optional)_: If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
* `negative` _(int, optional)_: If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.

In short:

| SG | HS | Negative | Training Algorithm |
|----|----|----------|-------------------|
| 1  | 1  |          | Skip-Gram Hierarchical Softmax |
| 1  | 0  | $\neq$ 0 | Skip-Gram Negative Sampling |
| 1  | 0  | = 0 | No training |
| 0  | 1  |          | CBOW Hierarchical Softmax |
| 0  | 0  | $\neq$ 0 | CBOW Negative Sampling |
| 0 | 0  | = 0 | No training |


In [7]:
# Gensim Word2Vec reference:
# https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec
#
# Assumption: every input file is plain text with one sentence per line.
INPUT = [
    "../corpus/turkish-texts-tokenized.txt",
    "../corpus/bounwebcorpus.txt",
]

# File the trained vectors are written to.
OUTPUT = "word2vec_10epoch.model"

# Words whose total frequency is lower than this are ignored.
MIN_COUNT = 10

# Dimensionality of the word vectors.
EMB = 300

# Maximum distance between the target and a context word within a sentence.
WINDOW = 5

# Number of iterations (epochs) over the corpus.
EPOCH = 10

# Training algorithm: 1 selects skip-gram; any other value selects CBOW.
SG = 1

# 1 enables hierarchical softmax. With 0 and a non-zero NEGATIVE, negative
# sampling is used instead. If both are 0, no training algorithm is used.
HS = 0

# If > 0, negative sampling is used and this is the number of "noise words"
# drawn (usually between 5 and 20). 0 disables negative sampling.
NEGATIVE = 5

# Note: if both hs and negative are set to 0, no training algorithm is used, so the word embeddings are not learned. In that case, you would have to supply pre-trained word embeddings for the model to use.

In [None]:
class LineSentences(object):
    """Restartable, memory-friendly iterable over the lines of several text files.

    Each iteration pass reads the files in the given order and yields one
    list of whitespace-separated tokens per line. Lines are streamed from
    disk rather than loaded up front, and a new pass can be started at any
    time by iterating over the object again.
    """

    def __init__(self, filenames):
        """Remember the files to read.

        Args:
            filenames: iterable of paths to UTF-8 encoded text files.
        """
        self.filenames = filenames

    def __iter__(self):
        """Yield the token list of every line of every file, in order.

        A blank line yields an empty list.
        """
        for filename in self.filenames:
            # The context manager guarantees the handle is closed once the
            # file is exhausted, and also when the consumer abandons the
            # generator early or an exception propagates through it.
            with open(filename, "r", encoding="utf-8") as handle:
                for line in handle:
                    yield line.strip().split()

In [None]:
# Emit timestamped INFO-level log records while the cell runs.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Hyper-parameters are taken from the configuration constants of this notebook.
training_options = dict(
    vector_size=EMB,
    window=WINDOW,
    min_count=MIN_COUNT,
    epochs=EPOCH,
    sg=SG,
    hs=HS,
    negative=NEGATIVE,
    compute_loss=True,  # required for get_latest_training_loss() below
    workers=multiprocessing.cpu_count(),  # one worker per CPU reported by the OS
)

# The corpus is streamed line by line from the INPUT files.
model = Word2Vec(sentences=LineSentences(INPUT), **training_options)

# Store only the word vectors, in binary word2vec format.
model.wv.save_word2vec_format(OUTPUT, binary=True)

training_loss = model.get_latest_training_loss()
print(training_loss)

In [8]:
from gensim.models import KeyedVectors

# Read the binary word2vec-format vectors back from the OUTPUT file.
word_vectors = KeyedVectors.load_word2vec_format(fname=OUTPUT, binary=True)

In [None]:
# Analogy-style sanity check: 'kadın' (woman) + 'kral' (king) - 'adam' (man).
word_vectors.most_similar_cosmul(
    positive=['kadın', 'kral'],
    negative=['adam'],
)

In [None]:
# Collect the vocabulary as a plain list, in the order stored by the loaded vectors.
vocab = [*word_vectors.index_to_key]

In [24]:
# The vocabulary should already be ordered by frequency; it is re-sorted here
# by the stored 'count' attribute just to be sure.
# NOTE(review): when vectors are loaded from the word2vec format without a
# separate vocab file, 'count' may be a placeholder value rather than the real
# corpus frequency — confirm before relying on these numbers.
word_counts = [word_vectors.get_vecattr(token, 'count') for token in vocab]

# Order (count, word) pairs from largest to smallest; equal counts fall back to
# reverse alphabetical order of the word.
ranked_pairs = list(zip(word_counts, vocab))
ranked_pairs.sort(reverse=True)
sorted_vocab = [token for _count, token in ranked_pairs]
print(sorted_vocab[:10])

[',', '.', 've', 'bir', 'bu', 'da', 'de', 'için', 'iki', 'ile']


In [25]:
# Dump the sorted vocabulary to corpus/vocab.txt, one word per line.
with open("../corpus/vocab.txt", "w", encoding="utf-8") as vocab_file:
    vocab_file.writelines(word + "\n" for word in sorted_vocab)