In [1]:
# imports
import pandas as pd
from gensim.utils import simple_preprocess
from gensim.models import FastText

In [2]:
# Read the IMDB reviews CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute path (looks like a Colab-style location);
# adjust DATA_PATH when running the notebook elsewhere.
DATA_PATH = '/content/imdb_dataset.csv'
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,text,label
0,i always wrote this series off as being a comp...,0
1,st watched out of dir steve purcell typical ma...,0
2,this movie was so poorly written and directed ...,0
3,the most interesting thing about miryang secre...,1
4,when i first read about berlin am meer i didn ...,0


In [3]:
# Preprocessing: tokenize every review with gensim's simple_preprocess, which
# lowercases the text and, with its default settings, discards tokens shorter
# than 2 or longer than 15 characters (e.g. the leading "i" of the first review
# is gone after this step).
def _tokenize(text):
    """Return the token list for one review; pass already-tokenized rows through."""
    # This cell overwrites df['text'] in place, so running it a second time
    # would feed the token lists from the previous run back into
    # simple_preprocess, which only accepts strings. Passing lists through
    # unchanged makes the cell safe to re-run.
    if isinstance(text, list):
        return text
    return simple_preprocess(text)


df['text'] = df['text'].apply(_tokenize)
df.head()

Unnamed: 0,text,label
0,"[always, wrote, this, series, off, as, being, ...",0
1,"[st, watched, out, of, dir, steve, purcell, ty...",0
2,"[this, movie, was, so, poorly, written, and, d...",0
3,"[the, most, interesting, thing, about, miryang...",1
4,"[when, first, read, about, berlin, am, meer, d...",0


In [4]:
# Model initialization. Only two settings are overridden here; every other
# hyperparameter keeps the library default.
# NOTE(review): per the gensim docs a fixed seed alone does not guarantee
# bit-identical results when training uses several worker threads — confirm
# whether strict reproducibility is needed.
model = FastText(
    sg=1,     # 1 selects the skip-gram training algorithm
    seed=18,  # fixed seed for the random number generator
)

In [5]:
# Vocabulary pass: scan the tokenized reviews once so the model has its
# vocabulary in place before any training is run.
model.build_vocab(
    corpus_iterable=df['text'],
)

In [6]:
# Model training over the tokenized reviews.
# The epoch count is read from the model instead of being repeated as a
# literal, so it cannot drift from the model's own configuration
# (model.epochs is 5 here because FastText was created without an explicit
# `epochs` argument).
# The call returns a tuple of word counts, shown as the cell output.
model.train(
    corpus_iterable=df['text'],
    total_examples=model.corpus_count,  # sentence count recorded on the model
    epochs=model.epochs,
)

(4037772, 5604935)

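#### Another way to build the vocab and train the model in a single call:
```python
# gensim 4.x: the number of epochs is passed as `epochs` (it was `iter` in 3.x)
model = FastText(sentences=df['text'], epochs=10, window=10, sg=1, vector_size=100, seed=18)
```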

In [7]:
# Look up the words whose vectors are closest to 'cinema'; the result is a
# list of (word, similarity score) pairs. topn=10 is the library default,
# spelled out so the length of the output is explicit.
model.wv.most_similar('cinema', topn=10)

[('cinemax', 0.9708149433135986),
 ('cinemas', 0.9668000936508179),
 ('cinematic', 0.8616501688957214),
 ('korean', 0.773828387260437),
 ('document', 0.7659663558006287),
 ('historian', 0.7640450596809387),
 ('history', 0.7634275555610657),
 ('historic', 0.7578696608543396),
 ('mythology', 0.7575593590736389),
 ('industry', 0.7556672096252441)]

In [8]:
# Similarity score between the vectors of two individual words.
model.wv.similarity(w1='good', w2='nice')

0.75753874

In [9]:
# Saving the trained model to 'MyFastText.model'. The path is relative, so the
# file is written to the current working directory.
model.save('MyFastText.model')

In [10]:
# Loading the model back from 'MyFastText.model' into a separate variable, so
# the restored copy can be checked against the in-memory `model`.
# NOTE(review): gensim's save/load is pickle-based — only load model files
# from a trusted source.
loaded_model = FastText.load('MyFastText.model')

In [11]:
# Repeat the 'good' / 'nice' similarity query on the reloaded model; the score
# should equal the one obtained from the original model before saving.
loaded_model.wv.similarity(w1='good', w2='nice')

0.75753874