In [1]:
import argparse
import multiprocessing
import logging

from gensim.models.fasttext import FastText
from pathlib import Path
import sys
sys.path.append(str(Path.cwd().parent))

from utils.utils import LineSentences
from utils.utils import callback

### Gensim FastText
Documentation: https://radimrehurek.com/gensim/models/fasttext.html
* `sentences` _(iterable of iterables, optional)_: The sentences iterable can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See `BrownCorpus`, `Text8Corpus` or `LineSentence` in word2vec module for such examples. See also the tutorial on data streaming in Python. If you don’t supply sentences, the model is left uninitialized – use if you plan to initialize it in some other way.
* `vector_size` _(int, optional)_: Dimensionality of the word vectors.
* `window` _(int, optional)_: Maximum distance between the current and predicted word within a sentence.
* `min_count` _(int, optional)_: Ignores all words with total frequency lower than this.
* `workers` _(int, optional)_: Use these many worker threads to train the model (=faster training with multicore machines).
* `sg` _({0, 1}, optional)_: Training algorithm: 1 for skip-gram; otherwise CBOW.
* `hs` _({0, 1}, optional)_: If 1, hierarchical softmax will be used for model training. If 0, and negative is non-zero, negative sampling will be used.
* `negative` _(int, optional)_: If > 0, negative sampling will be used, the int for negative specifies how many “noise words” should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
* `min_n` _(int, optional)_: Minimum length of the character n-grams used for training word representations.
* `max_n` _(int, optional)_: Maximum length of the character n-grams used for training word representations.
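* `word_ngrams` _(int, optional)_: In Facebook’s FastText this is the “max length of word ngram”, but gensim 4.x only supports the default of 1 (regular unigram word handling). In gensim 3.x the meaning was different: 1 enriched word vectors with subword (n-gram) information, and 0 was equivalent to Word2Vec.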

In [4]:
# Training configuration: every value a reader might want to tune lives in this cell.
# documentation: https://radimrehurek.com/gensim/models/fasttext.html#gensim.models.fasttext.FastText
# Assumption: Provided input is a txt file with one sentence per line.
INPUT = ["../corpus/turkish-texts-tokenized.txt", "../corpus/bounwebcorpus.txt"]
MIN_COUNT = 10   # ignore all words with total frequency lower than this
EMB = 300        # dimensionality of word vectors
WINDOW = 5       # maximum distance between the target and context word within a sentence
EPOCH = 5        # number of iterations (epochs) over the corpus
SG = 1           # training algorithm: 1 for skip-gram; otherwise CBOW
HS = 0           # if 1, hierarchical softmax is used for training; if 0 and NEGATIVE is non-zero, negative sampling is used
NEGATIVE = 5     # if > 0, negative sampling is used and this is the number of "noise words" drawn (usually between 5-20); 0 disables it
MINN = 3         # min length of char ngram
MAXN = 6         # max length of char ngram
WNG = 1          # In Facebook's FastText, "max length of word ngram" - but gensim only supports the default of 1 (regular unigram word handling).

# Path the trained model is saved to and loaded from. It is defined exactly once
# and derived from EPOCH so the file name cannot drift from the configuration
# (an earlier "fasttext.model" assignment was dead code, silently overridden).
OUTPUT = f"fasttext-{EPOCH}epoch.model"

# NOTE: HS and NEGATIVE must not both be 0. That would leave the model without a
# training objective, so the word vectors would not be learned from the corpus.

In [5]:
# Train FastText embeddings on the corpora listed in INPUT.
# gensim reports vocabulary-building and training progress through `logging` at INFO level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Create an untrained model. No corpus is passed to the constructor, so vocabulary
# building and training are triggered explicitly below.
model = FastText(
    vector_size=EMB,
    window=WINDOW,
    min_count=MIN_COUNT,
    sg=SG,
    hs=HS,
    negative=NEGATIVE,
    min_n=MINN,
    max_n=MAXN,
    word_ngrams=WNG,
    # Forward the configured epoch count; previously EPOCH was never used and the
    # library default applied silently.
    epochs=EPOCH,
    workers=multiprocessing.cpu_count(),
)

# First pass over the corpus: collects the vocabulary and sets model.corpus_count.
model.build_vocab(corpus_iterable=LineSentences(INPUT))

# Training passes. A fresh LineSentences is created so iteration starts from the
# beginning of the input files again.
# Callbacks are supplied here rather than to the constructor: gensim 4.x does not
# retain callbacks given to a constructor that is not also handed a corpus, so
# they would never fire.
# NOTE(review): compute_loss=True is kept from the original call; confirm that
# gensim's FastText actually tracks a training loss for `callback` to report.
model.train(
    corpus_iterable=LineSentences(INPUT),
    epochs=model.epochs,
    total_examples=model.corpus_count,
    compute_loss=True,
    callbacks=[callback()],
)

2023-03-12 17:09:08,004 : INFO : FastText lifecycle event {'params': 'FastText<vocab=0, vector_size=300, alpha=0.025>', 'datetime': '2023-03-12T17:09:08.004987', 'gensim': '4.3.0', 'python': '3.9.2 (tags/v3.9.2:1a79785, Feb 19 2021, 13:44:55) [MSC v.1928 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'created'}
2023-03-12 17:09:08,005 : INFO : collecting all words and their counts
2023-03-12 17:09:08,006 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-03-12 17:09:08,098 : INFO : PROGRESS: at sentence #10000, processed 140967 words, keeping 31034 word types
2023-03-12 17:09:08,188 : INFO : PROGRESS: at sentence #20000, processed 291342 words, keeping 48763 word types
2023-03-12 17:09:08,266 : INFO : PROGRESS: at sentence #30000, processed 445293 words, keeping 61864 word types
2023-03-12 17:09:08,344 : INFO : PROGRESS: at sentence #40000, processed 594077 words, keeping 73545 word types
2023-03-12 17:09:08,421 : INFO : PROGRESS: at se

KeyboardInterrupt: 

In [None]:
import tempfile
# Persist the trained model at OUTPUT, the same path the loading cell reads from.
# The previous NamedTemporaryFile(prefix=OUTPUT, delete=False) approach wrote the
# model to a randomly suffixed file in the system temp directory, so
# FastText.load(OUTPUT) could not find it (and the temp file was never cleaned up).
# separately=[] is kept from the original call: no attributes are listed for
# storage in separate side files.
model.save(OUTPUT, separately=[])

In [None]:
# Load the model stored at OUTPUT and keep only its `.wv` attribute (the word
# vectors) for the similarity queries; requires that a model file exists at OUTPUT.
word_vectors = FastText.load(OUTPUT).wv

In [None]:
# Analogy query on Turkish words: "kral" (king) - "adam" (man) + "kadın" (woman).
# A well-trained model would presumably rank "kraliçe" (queen) near the top.
# The result is the cell's last expression, so the notebook displays it directly.
word_vectors.most_similar_cosmul(positive=['kadın', 'kral'], negative=['adam'])