#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Authors: Gensim Contributors
# Copyright (C) 2018 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
Introduction
------------
Learn word representations via fastText: `Enriching Word Vectors with Subword Information
<https://arxiv.org/abs/1607.04606>`_.
This module allows training word embeddings from a training corpus with the additional ability to obtain word vectors
for out-of-vocabulary words.
This module contains a fast native C implementation of fastText with Python interfaces. It is **not** only a wrapper
around Facebook's implementation.
This module supports loading models trained with Facebook's fastText implementation.
It also supports continuing training from such models.
For a tutorial see :ref:`sphx_glr_auto_examples_tutorials_run_fasttext.py`.
Usage examples
--------------
Initialize and train a model:
.. sourcecode:: pycon
>>> from gensim.models import FastText
>>> from gensim.test.utils import common_texts # some example sentences
>>>
>>> print(common_texts[0])
['human', 'interface', 'computer']
>>> print(len(common_texts))
9
>>> model = FastText(vector_size=4, window=3, min_count=1) # instantiate
>>> model.build_vocab(sentences=common_texts)
>>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10) # train
Once you have a model, you can access its keyed vectors via the `model.wv` attribute.
The keyed vectors instance is quite powerful: it can perform a wide range of NLP tasks.
For a full list of examples, see :class:`~gensim.models.keyedvectors.KeyedVectors`.
You can also pass all the above parameters to the constructor to do everything
in a single line:
.. sourcecode:: pycon
>>> model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, epochs=10)
The two models above are instantiated differently, but behave identically.
For example, we can compare the embeddings they've calculated for the word "computer":
.. sourcecode:: pycon
>>> import numpy as np
>>>
>>> np.allclose(model.wv['computer'], model2.wv['computer'])
True
In the above examples, we trained the model from sentences (lists of words) loaded into memory.
This is OK for smaller datasets, but for larger datasets, we recommend streaming the file,
for example from disk or the network.
In Gensim, we refer to such datasets as "corpora" (singular "corpus"), and keep them
in the format described in :class:`~gensim.models.word2vec.LineSentence`.
Passing a corpus is simple:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> corpus_file = datapath('lee_background.cor') # absolute path to corpus
>>> model3 = FastText(vector_size=4, window=3, min_count=1)
>>> model3.build_vocab(corpus_file=corpus_file) # scan over corpus to build the vocabulary
>>>
>>> total_words = model3.corpus_total_words # number of words in the corpus
>>> model3.train(corpus_file=corpus_file, total_words=total_words, epochs=5)
The model needs the `total_words` parameter in order to
manage the training rate (alpha) correctly, and to give accurate progress estimates.
The above example relies on an implementation detail: the
:meth:`~gensim.models.fasttext.FastText.build_vocab` method
sets the `corpus_total_words` (and also `corpus_count`) model attributes.
You may calculate them by scanning over the corpus yourself, too.
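For illustration, here is a rough sketch of computing such a count yourself (this assumes
the corpus is in the :class:`~gensim.models.word2vec.LineSentence` format used above):
.. sourcecode:: pycon
>>> from gensim.models.word2vec import LineSentence
>>>
>>> # sum of sentence lengths over one pass through the corpus
>>> manual_total_words = sum(len(sentence) for sentence in LineSentence(corpus_file))
>>> # this should agree with model3.corpus_total_words set by build_vocab()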
If you have a corpus in a different format, then you can use it by wrapping it
in an `iterator <https://wiki.python.org/moin/Iterator>`_.
Your iterator should yield a list of strings each time, where each string should be a separate word.
Gensim will take care of the rest:
.. sourcecode:: pycon
>>> from gensim.utils import tokenize
>>> from gensim import utils
>>>
>>>
>>> class MyIter:
... def __iter__(self):
... path = datapath('crime-and-punishment.txt')
... with utils.open(path, 'r', encoding='utf-8') as fin:
... for line in fin:
... yield list(tokenize(line))
>>>
>>>
>>> model4 = FastText(vector_size=4, window=3, min_count=1)
>>> model4.build_vocab(sentences=MyIter())
>>> total_examples = model4.corpus_count
>>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5)
Persist a model to disk with:
.. sourcecode:: pycon
>>> from gensim.test.utils import get_tmpfile
>>>
>>> fname = get_tmpfile("fasttext.model")
>>>
>>> model.save(fname)
>>> model = FastText.load(fname)
Once loaded, such models behave identically to those created from scratch.
For example, you can continue training the loaded model:
.. sourcecode:: pycon
>>> import numpy as np
>>>
>>> 'computation' in model.wv.key_to_index # New word, currently out of vocab
False
>>> old_vector = np.copy(model.wv['computation']) # Grab the existing vector
>>> new_sentences = [
... ['computer', 'aided', 'design'],
... ['computer', 'science'],
... ['computational', 'complexity'],
... ['military', 'supercomputer'],
... ['central', 'processing', 'unit'],
... ['onboard', 'car', 'computer'],
... ]
>>>
>>> model.build_vocab(new_sentences, update=True) # Update the vocabulary
>>> model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>>
>>> new_vector = model.wv['computation']
>>> np.allclose(old_vector, new_vector, atol=1e-4) # Vector has changed, model has learnt something
False
>>> 'computation' in model.wv.key_to_index # Word is still out of vocab
False
.. Important::
Be sure to call the :meth:`~gensim.models.fasttext.FastText.build_vocab`
method with `update=True` before the :meth:`~gensim.models.fasttext.FastText.train` method
when continuing training. Without this call, previously unseen terms
will not be added to the vocabulary.
You can also load models trained with Facebook's fastText implementation:
.. sourcecode:: pycon
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fb_model = load_facebook_model(cap_path)
Once loaded, such models behave identically to those trained from scratch.
You may continue training them on new data:
.. sourcecode:: pycon
>>> 'computer' in fb_model.wv.key_to_index # New word, currently out of vocab
False
>>> old_computer = np.copy(fb_model.wv['computer']) # Calculate current vectors
>>> fb_model.build_vocab(new_sentences, update=True)
>>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=fb_model.epochs)
>>> new_computer = fb_model.wv['computer']
>>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something
False
>>> 'computer' in fb_model.wv.key_to_index # New word is now in the vocabulary
True
If you do not intend to continue training the model, consider using the
:func:`gensim.models.fasttext.load_facebook_vectors` function instead.
That function only loads the word embeddings (keyed vectors), consuming much less CPU and RAM:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> wv = load_facebook_vectors(cap_path)
>>>
>>> 'landlord' in wv.key_to_index # Word is out of vocabulary
False
>>> oov_vector = wv['landlord'] # Even OOV words have vectors in FastText
>>>
>>> 'landlady' in wv.key_to_index # Word is in the vocabulary
True
>>> iv_vector = wv['landlady']
Retrieve word vectors for an in-vocabulary and an out-of-vocabulary word:
.. sourcecode:: pycon
>>> existent_word = "computer"
>>> existent_word in model.wv.key_to_index
True
>>> computer_vec = model.wv[existent_word] # numpy vector of a word
>>>
>>> oov_word = "graph-out-of-vocab"
>>> oov_word in model.wv.key_to_index
False
>>> oov_vec = model.wv[oov_word] # numpy vector for OOV word
You can perform various NLP word tasks with the model; some of them are already built in:
.. sourcecode:: pycon
>>> similarities = model.wv.most_similar(positive=['computer', 'human'], negative=['interface'])
>>> most_similar = similarities[0]
>>>
>>> similarities = model.wv.most_similar_cosmul(positive=['computer', 'human'], negative=['interface'])
>>> most_similar = similarities[0]
>>>
>>> not_matching = model.wv.doesnt_match("human computer interface tree".split())
>>>
>>> sim_score = model.wv.similarity('computer', 'human')
Correlation with human opinion on word similarity:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> similarities = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv'))
And on word analogies:
.. sourcecode:: pycon
>>> analogies_result = model.wv.evaluate_word_analogies(datapath('questions-words.txt'))
"""
import logging
import numpy as np
from numpy import ones, vstack, float32 as REAL
import gensim.models._fasttext_bin
from gensim.models.word2vec import Word2Vec
from gensim.models.keyedvectors import KeyedVectors, prep_vectors
from gensim import utils
from gensim.utils import deprecated
try:
from gensim.models.fasttext_inner import ( # noqa: F401
train_batch_any,
MAX_WORDS_IN_BATCH,
compute_ngrams,
compute_ngrams_bytes,
ft_hash_bytes,
)
from gensim.models.fasttext_corpusfile import train_epoch_sg, train_epoch_cbow
except ImportError:
raise utils.NO_CYTHON
logger = logging.getLogger(__name__)
class FastText(Word2Vec):
def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100, alpha=0.025,
window=5, min_count=5,
max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6,
sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(),
max_final_vocab=None):
"""Train, use and evaluate word representations learned using the method
described in `Enriching Word Vectors with Subword Information <https://arxiv.org/abs/1607.04606>`_,
aka FastText.
The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and
:meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the
original Fasttext implementation via :func:`~gensim.models.fasttext.load_facebook_model`.
Parameters
----------
sentences : iterable of list of str, optional
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such
examples. If you don't supply `sentences`, the model is left uninitialized -- use if you plan to
initialize it in some other way.
corpus_file : str, optional
Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
You may use this argument instead of `sentences` to get a performance boost. Only one of the `sentences` or
`corpus_file` arguments needs to be passed (or neither, in which case the model is left
uninitialized).
min_count : int, optional
The model ignores all words with total frequency lower than this.
vector_size : int, optional
Dimensionality of the word vectors.
window : int, optional
The maximum distance between the current and predicted word within a sentence.
workers : int, optional
Use this many worker threads to train the model (=faster training with multicore machines).
alpha : float, optional
The initial learning rate.
min_alpha : float, optional
Learning rate will linearly drop to `min_alpha` as training progresses.
sg : {1, 0}, optional
Training algorithm: skip-gram if `sg=1`, otherwise CBOW.
hs : {1,0}, optional
If 1, hierarchical softmax will be used for model training.
If set to 0, and `negative` is non-zero, negative sampling will be used.
seed : int, optional
Seed for the random number generator. Initial vectors for each word are seeded with a hash of
the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
use of the `PYTHONHASHSEED` environment variable to control hash randomization).
max_vocab_size : int, optional
Limits the RAM during vocabulary building; if there are more unique
words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
Set to `None` for no limit.
sample : float, optional
The threshold for configuring which higher-frequency words are randomly downsampled,
useful range is (0, 1e-5).
negative : int, optional
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
ns_exponent : float, optional
The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper.
More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that
other values may perform better for recommendation applications.
cbow_mean : {1,0}, optional
If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used.
hashfxn : function, optional
Hash function to use to randomly initialize weights, for increased training reproducibility.
epochs : int, optional
Number of iterations (epochs) over the corpus.
trim_rule : function, optional
Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary,
be trimmed away, or handled using the default (discard if word count < min_count).
Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`),
or a callable that accepts parameters (word, count, min_count) and returns either
:attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`.
The rule, if given, is only used to prune vocabulary during
:meth:`~gensim.models.fasttext.FastText.build_vocab` and is not stored as part of the model.
The input parameters are of the following types:
* `word` (str) - the word we are examining
* `count` (int) - the word's frequency count in the corpus
* `min_count` (int) - the minimum count threshold.
sorted_vocab : {1,0}, optional
If 1, sort the vocabulary by descending frequency before assigning word indices.
batch_words : int, optional
Target size (in words) for batches of examples passed to worker threads (and
thus cython routines). (Larger batches will be passed if individual
texts are longer than 10000 words, but the standard cython code truncates to that maximum.)
min_n : int, optional
Minimum length of char n-grams to be used for training word representations.
max_n : int, optional
Maximum length of char ngrams to be used for training word representations. Set `max_n` to be
less than `min_n` to avoid char ngrams being used.
word_ngrams : int, optional
In Facebook's FastText, "max length of word ngram" - but gensim only supports the
default of 1 (regular unigram word handling).
bucket : int, optional
Character ngrams are hashed into a fixed number of buckets, in order to limit the
memory usage of the model. This option specifies the number of buckets used by the model.
The default value of 2000000 consumes as much memory as having 2000000 more in-vocabulary
words in your model.
callbacks : :obj:`list` of :obj:`~gensim.models.callbacks.CallbackAny2Vec`, optional
List of callbacks that need to be executed/run at specific stages during training.
max_final_vocab : int, optional
Limits the vocab to a target vocab size by automatically selecting
``min_count``. If the specified ``min_count`` is more than the
automatically calculated ``min_count``, the former will be used.
Set to ``None`` if not required.
Examples
--------
Initialize and train a `FastText` model:
.. sourcecode:: pycon
>>> from gensim.models import FastText
>>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
>>>
>>> model = FastText(sentences, min_count=1)
>>> say_vector = model.wv['say'] # get vector for word
>>> of_vector = model.wv['of'] # get vector for out-of-vocab word
Attributes
----------
wv : :class:`~gensim.models.fasttext.FastTextKeyedVectors`
This object essentially contains the mapping between words and embeddings. These are similar to
the embedding computed in the :class:`~gensim.models.word2vec.Word2Vec`, however here we also
include vectors for n-grams. This allows the model to compute embeddings even for **unseen**
words (that do not exist in the vocabulary), as the aggregate of the n-grams included in the word.
After training the model, this attribute can be used directly to query those embeddings in various
ways. Check the module level docstring for some examples.
"""
self.load = utils.call_on_class_only
self.load_fasttext_format = utils.call_on_class_only
self.callbacks = callbacks
if word_ngrams != 1:
raise NotImplementedError("Gensim's FastText implementation does not yet support word_ngrams != 1.")
self.word_ngrams = word_ngrams
if max_n < min_n:
# with no eligible char-ngram lengths, no buckets need be allocated
bucket = 0
self.wv = FastTextKeyedVectors(vector_size, min_n, max_n, bucket)
# EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
# advanced users should directly resize/adjust as desired after any vocab growth
self.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
self.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
super(FastText, self).__init__(
sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=vector_size, epochs=epochs,
callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window,
max_vocab_size=max_vocab_size, max_final_vocab=max_final_vocab,
min_count=min_count, sample=sample, sorted_vocab=sorted_vocab,
null_word=null_word, ns_exponent=ns_exponent, hashfxn=hashfxn,
seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha)
def _init_post_load(self, hidden_output):
num_vectors = len(self.wv.vectors)
vocab_size = len(self.wv)
vector_size = self.wv.vector_size
assert num_vectors > 0, 'expected num_vectors to be initialized already'
assert vocab_size > 0, 'expected vocab_size to be initialized already'
# EXPERIMENTAL lockf feature; create minimal no-op lockf arrays (1 element of 1.0)
# advanced users should directly resize/adjust as necessary
self.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
self.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
if self.hs:
self.syn1 = hidden_output
if self.negative:
self.syn1neg = hidden_output
self.layer1_size = vector_size
def _clear_post_train(self):
"""Clear any cached values that training may have invalidated."""
super(FastText, self)._clear_post_train()
self.wv.adjust_vectors() # ensure composite-word vecs reflect latest training
def estimate_memory(self, vocab_size=None, report=None):
"""Estimate memory that will be needed to train a model, and print the estimates to log."""
vocab_size = vocab_size or len(self.wv)
vec_size = self.vector_size * np.dtype(np.float32).itemsize
l1_size = self.layer1_size * np.dtype(np.float32).itemsize
report = report or {}
report['vocab'] = len(self.wv) * (700 if self.hs else 500)
report['syn0_vocab'] = len(self.wv) * vec_size
num_buckets = self.wv.bucket
if self.hs:
report['syn1'] = len(self.wv) * l1_size
if self.negative:
report['syn1neg'] = len(self.wv) * l1_size
if self.wv.bucket:
report['syn0_ngrams'] = self.wv.bucket * vec_size
num_ngrams = 0
for word in self.wv.key_to_index:
hashes = ft_ngram_hashes(word, self.wv.min_n, self.wv.max_n, self.wv.bucket)
num_ngrams += len(hashes)
# A list (64 bytes) with one np.array (100 bytes) per key, with a total of
# num_ngrams uint32s (4 bytes) amongst them.
# Only used during training, not stored with the model.
report['buckets_word'] = 64 + (100 * len(self.wv)) + (4 * num_ngrams) # TODO: caching & calc sensible?
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words, %i buckets and %i dimensions: %i bytes",
len(self.wv), num_buckets, self.vector_size, report['total'],
)
return report
def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
total_examples=None, total_words=None, **kwargs):
work, neu1 = thread_private_mem
if self.sg:
examples, tally, raw_tally = train_epoch_sg(self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1)
else:
examples, tally, raw_tally = train_epoch_cbow(self, corpus_file, offset, cython_vocab, cur_epoch,
total_examples, total_words, work, neu1)
return examples, tally, raw_tally
def _do_train_job(self, sentences, alpha, inits):
"""Train a single batch of sentences. Return 2-tuple `(effective word count after
ignoring unknown words and sentence length trimming, total word count)`.
Parameters
----------
sentences : iterable of list of str
Can be simply a list of lists of tokens, but for larger corpora,
consider an iterable that streams the sentences directly from disk/network.
See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.
alpha : float
The current learning rate.
inits : tuple of (:class:`numpy.ndarray`, :class:`numpy.ndarray`)
Each worker's private work memory.
Returns
-------
(int, int)
Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count)
"""
work, neu1 = inits
tally = train_batch_any(self, sentences, alpha, work, neu1)
return tally, self._raw_word_count(sentences)
@deprecated(
"Gensim 4.0.0 implemented internal optimizations that make calls to init_sims() unnecessary. "
"init_sims() is now obsoleted and will be completely removed in future versions. "
"See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4"
)
def init_sims(self, replace=False):
"""
Precompute L2-normalized vectors. Obsoleted.
If you need a single unit-normalized vector for some key, call
:meth:`~gensim.models.keyedvectors.KeyedVectors.get_vector` instead:
``fasttext_model.wv.get_vector(key, norm=True)``.
To refresh the norms after you have performed some atypical out-of-band vector tampering,
call :meth:`~gensim.models.keyedvectors.KeyedVectors.fill_norms` instead.
Parameters
----------
replace : bool
If True, forget the original trained vectors and only keep the normalized ones.
You lose information if you do this.
"""
self.wv.init_sims(replace=replace)
@classmethod
@utils.deprecated(
'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model '
'(to continue training with the loaded full model, more RAM) instead'
)
def load_fasttext_format(cls, model_file, encoding='utf8'):
"""Deprecated.
Use :func:`gensim.models.fasttext.load_facebook_model` or
:func:`gensim.models.fasttext.load_facebook_vectors` instead.
"""
return load_facebook_model(model_file, encoding=encoding)
@utils.deprecated(
'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model '
'(to continue training with the loaded full model, more RAM) instead'
)
def load_binary_data(self, encoding='utf8'):
"""Load data from a binary file created by Facebook's native FastText.
Parameters
----------
encoding : str, optional
Specifies the encoding.
"""
m = _load_fasttext_format(self.file_name, encoding=encoding)
for attr, val in m.__dict__.items():
setattr(self, attr, val)
def save(self, *args, **kwargs):
"""Save the Fasttext model. This saved model can be loaded again using
:meth:`~gensim.models.fasttext.FastText.load`, which supports incremental training
and getting vectors for out-of-vocabulary words.
Parameters
----------
fname : str
Store the model to this file.
See Also
--------
:meth:`~gensim.models.fasttext.FastText.load`
Load :class:`~gensim.models.fasttext.FastText` model.
"""
super(FastText, self).save(*args, **kwargs)
@classmethod
def load(cls, *args, **kwargs):
"""Load a previously saved `FastText` model.
Parameters
----------
fname : str
Path to the saved file.
Returns
-------
:class:`~gensim.models.fasttext.FastText`
Loaded model.
See Also
--------
:meth:`~gensim.models.fasttext.FastText.save`
Save :class:`~gensim.models.fasttext.FastText` model.
"""
return super(FastText, cls).load(*args, rethrow=True, **kwargs)
def _load_specials(self, *args, **kwargs):
"""Handle special requirements of `.load()` protocol, usually up-converting older versions."""
super(FastText, self)._load_specials(*args, **kwargs)
if hasattr(self, 'bucket'):
# should only exist in one place: the wv subcomponent
self.wv.bucket = self.bucket
del self.bucket
class FastTextVocab(utils.SaveLoad):
"""This is a redundant class. It exists only to maintain backwards compatibility
with older gensim versions."""
class FastTextTrainables(utils.SaveLoad):
"""Obsolete class retained for backward-compatible load()s"""
def _pad_ones(m, new_len):
"""Pad array with additional entries filled with ones."""
if len(m) > new_len:
raise ValueError('the new number of rows %i must not be smaller than the old number %i' % (new_len, len(m)))
new_arr = np.ones(new_len, dtype=REAL)
new_arr[:len(m)] = m
return new_arr
def load_facebook_model(path, encoding='utf-8'):
"""Load the model from Facebook's native fasttext `.bin` output file.
Notes
------
Facebook provides both `.vec` and `.bin` files with their models.
The former contains human-readable vectors.
The latter contains machine-readable vectors along with other model parameters.
This function requires you to **provide the full path to the .bin file**.
It effectively ignores the `.vec` output file, since it is redundant.
This function uses the smart_open library to open the path.
The path may be on a remote host (e.g. HTTP, S3, etc).
It may also be gzip or bz2 compressed (i.e. end in `.bin.gz` or `.bin.bz2`).
For details, see `<https://github.com/RaRe-Technologies/smart_open>`__.
Parameters
----------
path : str
Path to the FastText output files.
FastText outputs two model files: `/path/to/model.vec` and `/path/to/model.bin`.
Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
as Gensim requires only the `.bin` file to load the entire fastText model.
encoding : str, optional
Specifies the file encoding.
Examples
--------
Load, infer, continue training:
.. sourcecode:: pycon
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fb_model = load_facebook_model(cap_path)
>>>
>>> 'landlord' in fb_model.wv.key_to_index # Word is out of vocabulary
False
>>> oov_term = fb_model.wv['landlord']
>>>
>>> 'landlady' in fb_model.wv.key_to_index # Word is in the vocabulary
True
>>> iv_term = fb_model.wv['landlady']
>>>
>>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
>>> fb_model.build_vocab(new_sent, update=True)
>>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)
Returns
-------
gensim.models.fasttext.FastText
The loaded model.
See Also
--------
:func:`~gensim.models.fasttext.load_facebook_vectors` loads
the word embeddings only. It's faster, but does not enable you to continue
training.
"""
return _load_fasttext_format(path, encoding=encoding, full_model=True)
def load_facebook_vectors(path, encoding='utf-8'):
"""Load word embeddings from a model saved in Facebook's native fasttext `.bin` format.
Notes
------
Facebook provides both `.vec` and `.bin` files with their models.
The former contains human-readable vectors.
The latter contains machine-readable vectors along with other model parameters.
This function requires you to **provide the full path to the .bin file**.
It effectively ignores the `.vec` output file, since it is redundant.
This function uses the smart_open library to open the path.
The path may be on a remote host (e.g. HTTP, S3, etc).
It may also be gzip or bz2 compressed.
For details, see `<https://github.com/RaRe-Technologies/smart_open>`__.
Parameters
----------
path : str
The location of the model file.
encoding : str, optional
Specifies the file encoding.
Returns
-------
gensim.models.fasttext.FastTextKeyedVectors
The word embeddings.
Examples
--------
Load and infer:
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fbkv = load_facebook_vectors(cap_path)
>>>
>>> 'landlord' in fbkv.key_to_index # Word is out of vocabulary
False
>>> oov_vector = fbkv['landlord']
>>>
>>> 'landlady' in fbkv.key_to_index # Word is in the vocabulary
True
>>> iv_vector = fbkv['landlady']
See Also
--------
:func:`~gensim.models.fasttext.load_facebook_model` loads
the full model, not just word embeddings, and enables you to continue
model training.
"""
full_model = _load_fasttext_format(path, encoding=encoding, full_model=False)
return full_model.wv
def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output files.
Parameters
----------
model_file : str
Full path to the FastText model file.
encoding : str, optional
Specifies the file encoding.
full_model : boolean, optional
If False, skips loading the hidden output matrix. This saves a fair bit
of CPU time and RAM, but prevents training continuation.
Returns
-------
:class:`~gensim.models.fasttext.FastText`
The loaded model.
"""
with utils.open(model_file, 'rb') as fin:
m = gensim.models._fasttext_bin.load(fin, encoding=encoding, full_model=full_model)
model = FastText(
vector_size=m.dim,
window=m.ws,
epochs=m.epoch,
negative=m.neg,
hs=int(m.loss == 1),
sg=int(m.model == 2),
bucket=m.bucket,
min_count=m.min_count,
sample=m.t,
min_n=m.minn,
max_n=m.maxn,
)
model.corpus_total_words = m.ntokens
model.raw_vocab = m.raw_vocab
model.nwords = m.nwords
model.vocab_size = m.vocab_size
#
# This is here to fix https://github.com/RaRe-Technologies/gensim/pull/2373.
#
# We explicitly set min_count=1 regardless of the model's parameters to
# ignore the trim rule when building the vocabulary. We do this in order
# to support loading native models that were trained with pretrained vectors.
# Such models will contain vectors for _all_ encountered words, not only
# those occurring more frequently than min_count.
#
# Native models trained _without_ pretrained vectors already contain the
# trimmed raw_vocab, so this change does not affect them.
#
model.prepare_vocab(update=True, min_count=1)
model.num_original_vectors = m.vectors_ngrams.shape[0]
model.wv.init_post_load(m.vectors_ngrams)
model._init_post_load(m.hidden_output)
_check_model(model)
model.add_lifecycle_event(
"load_fasttext_format",
msg=f"loaded {m.vectors_ngrams.shape} weight matrix for fastText model from {fin.name}",
)
return model
def _check_model(m):
"""Model sanity checks. Run after everything has been completely initialized."""
if m.wv.vector_size != m.wv.vectors_ngrams.shape[1]:
raise ValueError(
'mismatch between vector size in model params (%s) and model vectors (%s)' % (
m.wv.vector_size, m.wv.vectors_ngrams.shape[1],
)
)
if hasattr(m, 'syn1neg') and m.syn1neg is not None:
if m.wv.vector_size != m.syn1neg.shape[1]:
raise ValueError(
'mismatch between vector size in model params (%s) and trainables (%s)' % (
m.wv.vector_size, m.syn1neg.shape[1],
)
)
if len(m.wv) != m.nwords:
raise ValueError(
'mismatch between final vocab size (%s words), and expected number of words (%s words)' % (
len(m.wv), m.nwords,
)
)
if len(m.wv) != m.vocab_size:
# expecting to log this warning only for pretrained french vector, wiki.fr
logger.warning(
"mismatch between final vocab size (%s words), and expected vocab size (%s words)",
len(m.wv), m.vocab_size,
)
def save_facebook_model(model, path, encoding="utf-8", lr_update_rate=100, word_ngrams=1):
Saves word embeddings to Facebook's native fasttext `.bin` format.
Notes
------
Facebook provides both `.vec` and `.bin` files with their models.
The former contains human-readable vectors.
The latter contains machine-readable vectors along with other model parameters.
**This function saves only the .bin file**.
Parameters
----------
model : gensim.models.fasttext.FastText
FastText model to be saved.
path : str
Output path and filename (including `.bin` extension)
encoding : str, optional
Specifies the file encoding. Defaults to utf-8.
lr_update_rate : int
This parameter is used by the Facebook fasttext tool, but is unused by Gensim.
It defaults to the Facebook fasttext default value `100`.
In very rare circumstances you might wish to fiddle with it.
word_ngrams : int
This parameter is used by the Facebook fasttext tool, but is unused by Gensim.
It defaults to the Facebook fasttext default value `1`.
In very rare circumstances you might wish to fiddle with it.
Returns
-------
None
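Examples
--------
A minimal save/reload sketch (it reuses the toy `model` trained in the module-level
examples above; the temporary filename is illustrative only):
.. sourcecode:: pycon
>>> from gensim.test.utils import get_tmpfile
>>>
>>> fb_out = get_tmpfile("toy_fasttext.bin")
>>> save_facebook_model(model, fb_out)
>>> reloaded = load_facebook_model(fb_out)  # hyperparameters and vectors round-trip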
"""
fb_fasttext_parameters = {"lr_update_rate": lr_update_rate, "word_ngrams": word_ngrams}
gensim.models._fasttext_bin.save(model, path, fb_fasttext_parameters, encoding)
class FastTextKeyedVectors(KeyedVectors):
def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL):
"""Vectors and vocab for :class:`~gensim.models.fasttext.FastText`.
Implements significant parts of the FastText algorithm. For example,
the :func:`word_vec` calculates vectors for out-of-vocabulary (OOV)
entities. FastText achieves this by keeping vectors for ngrams:
adding the vectors for the ngrams of an entity yields the vector for the
entity.
Similar to a hashmap, this class keeps a fixed number of buckets, and
maps all ngrams to buckets using a hash function.
Parameters
----------
vector_size : int
The dimensionality of all vectors.
min_n : int
The minimum number of characters in an ngram
max_n : int
The maximum number of characters in an ngram
bucket : int
The number of buckets.
count : int, optional
If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
they can be added later.)
dtype : type, optional
Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
another type is provided here.
Attributes
----------
vectors_vocab : np.array
Each row corresponds to a vector for an entity in the vocabulary.
Columns correspond to vector dimensions. When embedded in a full
FastText model, these are the full-word-token vectors updated
by training, whereas the inherited vectors are the actual per-word
vectors synthesized from the full-word-token and all subword (ngram)
vectors.
vectors_ngrams : np.array
A vector for each ngram across all entities in the vocabulary.
Each row is a vector that corresponds to a bucket.
Columns correspond to vector dimensions.
buckets_word : list of np.array
For each key (by its index), the bucket slots that its subwords map to.
When used in training, FastTextKeyedVectors may be decorated with
extra attributes that closely associate with its core attributes,
such as the experimental vectors_vocab_lockf and vectors_ngrams_lockf
training-update-dampening factors.
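Examples
--------
A rough sketch of how an out-of-vocabulary vector is assembled from the bucketed
ngram vectors (this assumes a trained instance, such as `model.wv` from the
module-level examples; `ft_ngram_hashes` is the same module-level helper used by
:meth:`~gensim.models.fasttext.FastText.estimate_memory`):
.. sourcecode:: pycon
>>> import numpy as np
>>> from gensim.models.fasttext import ft_ngram_hashes
>>>
>>> wv = model.wv
>>> hashes = ft_ngram_hashes("computation", wv.min_n, wv.max_n, wv.bucket)
>>> composed = np.sum(wv.vectors_ngrams[hashes], axis=0) / len(hashes)
>>> # expect `composed` to closely match wv['computation'] for this OOV word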
"""
super(FastTextKeyedVectors, self).__init__(vector_size=vector_size, count=count, dtype=dtype)
self.min_n = min_n
self.max_n = max_n
self.bucket = bucket # count of buckets, fka num_ngram_vectors
self.buckets_word = None # precalculated cache of buckets for each word's ngrams
self.vectors_vocab = np.zeros((count, vector_size), dtype=dtype) # fka (formerly known as) syn0_vocab
self.vectors_ngrams = None # must be initialized later
self.compatible_hash = True
@classmethod
def load(cls, fname_or_handle, **kwargs):
"""Load a previously saved `FastTextKeyedVectors` model.
Parameters
----------
fname : str
Path to the saved file.
Returns
-------
:class:`~gensim.models.fasttext.FastTextKeyedVectors`
Loaded model.
See Also
--------
:meth:`~gensim.models.fasttext.FastTextKeyedVectors.save`
Save :class:`~gensim.models.fasttext.FastTextKeyedVectors` model.
"""
return super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs)
def _load_specials(self, *args, **kwargs):
"""Handle special requirements of `.load()` protocol, usually up-converting older versions."""
super(FastTextKeyedVectors, self)._load_specials(*args, **kwargs)
if not isinstance(self, FastTextKeyedVectors):