In [1]:
# installing fasttext
!pip install -Uq fasttext

In [2]:
# imports
import pandas as pd
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_multiple_whitespaces
from gensim.utils import deaccent
import fasttext

In [3]:
# loading data
# Read the review CSV into `df`; later cells use its `text` and `label` columns.
# NOTE(review): the absolute /content/... path assumes a Colab-style filesystem —
# confirm the dataset location before running anywhere else.
df = pd.read_csv('/content/imdb_dataset.csv')
# Bare last expression: the notebook renders the first five rows.
df.head()

Unnamed: 0,text,label
0,i always wrote this series off as being a comp...,0
1,st watched out of dir steve purcell typical ma...,0
2,this movie was so poorly written and directed ...,0
3,the most interesting thing about miryang secre...,1
4,when i first read about berlin am meer i didn ...,0


In [4]:
# getting info of dataset
# Prints the index range, per-column non-null counts and dtypes, and memory usage;
# a quick check that neither `text` nor `label` contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [5]:
# preprocessing text
# Clean the review column with the gensim helpers, applied in this fixed order:
# punctuation -> digits -> accents -> runs of whitespace.
df['text'] = (
    df['text']
    .apply(strip_punctuation)
    .apply(strip_numeric)
    .apply(deaccent)
    .apply(strip_multiple_whitespaces)
)
# Show the first rows of the cleaned frame.
df.head()

Unnamed: 0,text,label
0,i always wrote this series off as being a comp...,0
1,st watched out of dir steve purcell typical ma...,0
2,this movie was so poorly written and directed ...,0
3,the most interesting thing about miryang secre...,1
4,when i first read about berlin am meer i didn ...,0


In [6]:
# saving data for learning word embeddings
# Write only the `text` column to a plain-text file, with no header row and no
# index column, so the file contains nothing but review text for fastText to read.
# DataFrame.to_csv returns None when given a path, so the result is deliberately
# not bound to a name (the previous `data = ...` only ever held None).
df.to_csv(path_or_buf='data.txt', columns=['text'], header=False, index=False)

In [7]:
# training model
# Use the same relative path the corpus was written to ('data.txt') so training
# does not silently depend on the working directory being /content.
input_path = 'data.txt'
# Learn unsupervised word vectors from the corpus file; only two passes (epochs)
# over the data are made here, everything else is left at fastText defaults.
# NOTE(review): wordNgrams is primarily a supervised-mode option — confirm it has
# any effect on train_unsupervised before relying on it.
model = fasttext.train_unsupervised(input=input_path, epoch=2, wordNgrams=2)

In [8]:
# querying model
# Sanity-check the embeddings: list the 10 closest words to 'cinema' as
# (similarity, word) pairs. k=10 is the library default, spelled out here.
model.get_nearest_neighbors('cinema', k=10)

[(0.9939069747924805, 'cinemax'),
 (0.9901336431503296, 'classic'),
 (0.9899078011512756, 'cinemas'),
 (0.977889358997345, 'classical'),
 (0.9745604395866394, 'masterpiece'),
 (0.9737415909767151, 'product'),
 (0.972750186920166, 'genre'),
 (0.9724975228309631, 'greatest'),
 (0.9721636176109314, 'feature'),
 (0.9697199463844299, 'film')]

In [9]:
# saving model
# Persist the trained model to 'MyModel.bin', relative to the current working
# directory, so it can be reloaded without retraining.
model.save_model('MyModel.bin')

In [None]:
# loading model
# Load from the same relative path the model was saved to ('MyModel.bin'), so the
# round trip works from any working directory rather than only from /content.
loaded_model = fasttext.load_model('MyModel.bin')

In [11]:
# requerying model
# Repeat the earlier 'cinema' query on the reloaded model; matching neighbours
# and scores confirm the save/load round trip preserved the embeddings.
# k=10 is the library default, spelled out here.
loaded_model.get_nearest_neighbors('cinema', k=10)

[(0.9939069747924805, 'cinemax'),
 (0.9901336431503296, 'classic'),
 (0.9899078011512756, 'cinemas'),
 (0.977889358997345, 'classical'),
 (0.9745604395866394, 'masterpiece'),
 (0.9737415909767151, 'product'),
 (0.972750186920166, 'genre'),
 (0.9724975228309631, 'greatest'),
 (0.9721636176109314, 'feature'),
 (0.9697199463844299, 'film')]