# Building fastText based models

### Importing the libraries and data

In [1]:
from gensim.models import FastText
from gensim.test.utils import common_texts

In [2]:
# Display the toy corpus shipped with gensim: a list of tokenized sentences.
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

### Building a basic model

In [3]:
# Untrained fastText model: 5-dimensional vectors, a context window of 3,
# and min_count=1 so that no word is discarded for being too rare.
model = FastText(
    size=5,
    window=3,
    min_count=1,
)

In [4]:
# Pass 1: scan the corpus to build the vocabulary.
model.build_vocab(sentences=common_texts)

# Pass 2: train for 10 epochs; total_examples is the number of sentences
# in the corpus being trained on.
model.train(
    sentences=common_texts,
    total_examples=len(common_texts),
    epochs=10,
)

### Check the vocabulary

In [5]:
# Inspect the learned vocabulary: a mapping from each word to its Vocab entry.
model.wv.vocab

{'human': <gensim.models.keyedvectors.Vocab at 0x1103db780>,
 'interface': <gensim.models.keyedvectors.Vocab at 0x1103db7f0>,
 'computer': <gensim.models.keyedvectors.Vocab at 0x1274b84a8>,
 'survey': <gensim.models.keyedvectors.Vocab at 0x1274b8710>,
 'user': <gensim.models.keyedvectors.Vocab at 0x1274b8748>,
 'system': <gensim.models.keyedvectors.Vocab at 0x1274b8780>,
 'response': <gensim.models.keyedvectors.Vocab at 0x1274b87b8>,
 'time': <gensim.models.keyedvectors.Vocab at 0x1274b87f0>,
 'eps': <gensim.models.keyedvectors.Vocab at 0x1274b8828>,
 'trees': <gensim.models.keyedvectors.Vocab at 0x1274b8860>,
 'graph': <gensim.models.keyedvectors.Vocab at 0x1274b8898>,
 'minors': <gensim.models.keyedvectors.Vocab at 0x1274b88d0>}

In [6]:
# Look up the 5-dimensional vector of an in-vocabulary word.
model.wv['human']

array([ 0.03953331, -0.02951075,  0.02039873,  0.00304991, -0.00968183],
      dtype=float32)

### Check out the most similar words

In [7]:
# Analogy-style query: rank vocabulary words by similarity to
# (computer + interface - human).
model.wv.most_similar(
    positive=['computer', 'interface'],
    negative=['human'],
)

[('system', 0.908109724521637),
 ('eps', 0.886881947517395),
 ('response', 0.6286922097206116),
 ('user', 0.38861846923828125),
 ('minors', 0.24753454327583313),
 ('time', 0.06086184084415436),
 ('survey', -0.0791618824005127),
 ('trees', -0.40337082743644714),
 ('graph', -0.46148836612701416)]

### min_n and max_n parameters

In [8]:
# Same settings as before, but additionally set min_n / max_n, which
# bound the length of the character n-grams fastText uses (here 1 to 5).
model = FastText(
    size=5,
    window=3,
    min_count=1,
    min_n=1,
    max_n=5,
)

In [9]:
# Build the vocabulary for the new model from the toy corpus.
model.build_vocab(sentences=common_texts)

# Train for 10 epochs over the same corpus.
model.train(
    sentences=common_texts,
    total_examples=len(common_texts),
    epochs=10,
)

### Let's try to fetch a representation for an out-of-vocabulary word

In [10]:
# 'rubber' never appears in common_texts, yet a vector is still returned:
# fastText builds it from the word's character n-grams.
model.wv['rubber']

array([-0.01671136, -0.01868909, -0.03945312, -0.01389101, -0.0250267 ],
      dtype=float32)

### Check out the most similar words using an out-of-vocabulary term

In [11]:
# An out-of-vocabulary word ('rubber') can take part in a similarity query
# just like a known word: here, (computer + human - rubber).
model.wv.most_similar(
    positive=['computer', 'human'],
    negative=['rubber'],
)

[('time', 0.5615436434745789),
 ('system', 0.4772699475288391),
 ('minors', 0.3850055932998657),
 ('eps', 0.15983597934246063),
 ('user', -0.2565014064311981),
 ('graph', -0.411243200302124),
 ('response', -0.4405473470687866),
 ('trees', -0.6079868078231812),
 ('interface', -0.6381739377975464),
 ('survey', -0.8393087387084961)]

### Extending the built model to incorporate words from new sentences

In [12]:
# Two extra tokenized sentences whose words are not in the original corpus.
sentences_to_be_added = [
    ["I", "am", "learning", "Natural", "Language", "Processing"],
    ["Natural", "Language", "Processing", "is", "cool"],
]

In [13]:
# Extend the existing vocabulary with the words from the new sentences;
# update=True adds to the vocabulary instead of rebuilding it from scratch.
model.build_vocab(sentences_to_be_added, update=True)

# Continue training on the NEW sentences. The corpus passed as `sentences`
# and the count passed as `total_examples` must describe the same data;
# previously this trained on `common_texts` with the new sentences' count,
# so the newly added words were never actually trained on.
model.train(
    sentences=sentences_to_be_added,
    total_examples=len(sentences_to_be_added),
    epochs=10,
)

In [14]:
# Inspect the vocabulary again: it now also contains the words from
# sentences_to_be_added.
model.wv.vocab

{'human': <gensim.models.keyedvectors.Vocab at 0x1103db908>,
 'interface': <gensim.models.keyedvectors.Vocab at 0x1274cbcf8>,
 'computer': <gensim.models.keyedvectors.Vocab at 0x1274cb9e8>,
 'survey': <gensim.models.keyedvectors.Vocab at 0x1274cba20>,
 'user': <gensim.models.keyedvectors.Vocab at 0x1274cba58>,
 'system': <gensim.models.keyedvectors.Vocab at 0x1274cba90>,
 'response': <gensim.models.keyedvectors.Vocab at 0x1274cbac8>,
 'time': <gensim.models.keyedvectors.Vocab at 0x1274cbdd8>,
 'eps': <gensim.models.keyedvectors.Vocab at 0x1274cbcc0>,
 'trees': <gensim.models.keyedvectors.Vocab at 0x1274cbe10>,
 'graph': <gensim.models.keyedvectors.Vocab at 0x1274cbb38>,
 'minors': <gensim.models.keyedvectors.Vocab at 0x1274cbef0>,
 'I': <gensim.models.keyedvectors.Vocab at 0x1274cb320>,
 'am': <gensim.models.keyedvectors.Vocab at 0x1274cb240>,
 'learning': <gensim.models.keyedvectors.Vocab at 0x1274cb2b0>,
 'Natural': <gensim.models.keyedvectors.Vocab at 0x1274cbf28>,
 'Language': <gen