In [None]:
# source: https://fasttext.cc/docs/en/unsupervised-tutorial.html
# paper: https://arxiv.org/pdf/1607.04606.pdf
import fasttext

In [None]:
# --- Configuration for fasttext.train_unsupervised ---

# File locations
INPUT = "../corpus/Turkish-English Parallel Corpus.txt"  # training corpus
OUTPUT = "fasttext.model"                                # where the trained model is saved

# Model and objective
MODEL = "skipgram"  # unsupervised architecture: "skipgram" or "cbow"
LOSS = "ns"         # loss function {ns, hs, softmax, ova}:
                    #   ns = negative sampling, hs = hierarchical softmax,
                    #   softmax = full softmax, ova = one-vs-all
NEG = 5             # number of negative examples sampled

# Embedding shape and context
DIM = 120           # dimension of the word embeddings
WS = 5              # size of the context window
WNG = 2             # max length of word n-grams

# Vocabulary and subword settings
MINCOUNT = 5        # minimal number of word occurrences
MINN = 3            # min length of char n-grams
MAXN = 6            # max length of char n-grams

# Optimisation schedule
EPOCH = 5           # number of epochs
LR = 0.05           # learning rate

## Parameters
```
input             # training file path (required)
model             # unsupervised fasttext model {cbow, skipgram} [skipgram]
lr                # learning rate [0.05]
dim               # size of word vectors [100]
ws                # size of the context window [5]
epoch             # number of epochs [5]
minCount          # minimal number of word occurrences [5]
minn              # min length of char ngram [3]
maxn              # max length of char ngram [6]
neg               # number of negatives sampled [5]
wordNgrams        # max length of word ngram [1]
loss              # loss function {ns, hs, softmax, ova} [ns] 
bucket            # number of buckets [2000000]
thread            # number of threads [number of cpus]
lrUpdateRate      # change the rate of updates for the learning rate [100]
t                 # sampling threshold [0.0001]
verbose           # verbose [2]
```

In [None]:
# Train unsupervised fastText embeddings on the INPUT corpus, using the
# hyperparameter constants defined in the configuration cell.
model = fasttext.train_unsupervised(
    input=INPUT,
    model=MODEL,
    loss=LOSS,
    neg=NEG,
    dim=DIM,
    ws=WS,
    wordNgrams=WNG,
    minCount=MINCOUNT,
    minn=MINN,
    maxn=MAXN,
    epoch=EPOCH,
    lr=LR,
)

In [None]:
# Save the trained model to the path stored in OUTPUT.
model.save_model(OUTPUT)

In [None]:
# Load the fastText model back from OUTPUT.
# Note: this rebinds `model` to the loaded instance, replacing the in-memory one.
model = fasttext.load_model(OUTPUT)

```
get_dimension           # Get the dimension (size) of a lookup vector (hidden layer).
                        # This is equivalent to `dim` property.
get_input_vector        # Given an index, get the corresponding vector of the Input Matrix.
get_input_matrix        # Get a copy of the full input matrix of a Model.
get_labels              # Get the entire list of labels of the dictionary
                        # This is equivalent to `labels` property.
get_line                # Split a line of text into words and labels.
get_output_matrix       # Get a copy of the full output matrix of a Model.
get_sentence_vector     # Given a string, get a single vector representation. This function
                        # assumes to be given a single line of text. We split words on
                        # whitespace (space, newline, tab, vertical tab) and the control
                        # characters carriage return, formfeed and the null character.
get_subword_id          # Given a subword, return the index (within input matrix) it hashes to.
get_subwords            # Given a word, get the subwords and their indices.
get_word_id             # Given a word, get the word id within the dictionary.
get_word_vector         # Get the vector representation of word.
get_words               # Get the entire list of words of the dictionary
                        # This is equivalent to `words` property.
is_quantized            # whether the model has been quantized
predict                 # Given a string, get a list of labels and a list of corresponding probabilities.
quantize                # Quantize the model, reducing the size of the model and its memory footprint.
save_model              # Save the model to the given path
test                    # Evaluate supervised model using file given by path
test_label              # Return the precision and recall score for each label. 
```

In [None]:
# Display the model vocabulary (`words` is the property equivalent of get_words()).
model.words   # TODO(bug): get rid of phrases like \xa0 (presumably non-breaking spaces from the corpus; verify)

In [None]:
# Print the embedding of the probe word, but only if it is in the model vocabulary.
probe_word = 'kral'
vocabulary = model.get_words()
if probe_word in vocabulary:
    print(model.get_word_vector(probe_word))

In [None]:
# Query the nearest neighbors of the word 'adam'.
model.get_nearest_neighbors('adam')

In [None]:
# Word analogies
# Predict what is to Z as X is to Y: model.get_analogies(X, Y, Z)
# Here: which word relates to "Amerika" the way "İstanbul" relates to "Türkiye"?
model.get_analogies("İstanbul", "Türkiye", "Amerika")

In [None]:
# Query the nearest neighbors of the word 'İngiltere'.
model.get_nearest_neighbors('İngiltere')