In [1]:
# installing fasttext
!pip install -Uq fasttext

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone


In [2]:
# imports
import pandas as pd
from gensim.parsing.preprocessing import strip_punctuation, strip_numeric, strip_multiple_whitespaces
from gensim.utils import deaccent
import fasttext
from sklearn.model_selection import train_test_split

In [3]:
# loading data; 0 = neg & 1 = pos
df = pd.read_csv('/content/imdb_dataset.csv')
df.head()

Unnamed: 0,text,label
0,i always wrote this series off as being a comp...,0
1,st watched out of dir steve purcell typical ma...,0
2,this movie was so poorly written and directed ...,0
3,the most interesting thing about miryang secre...,1
4,when i first read about berlin am meer i didn ...,0


In [4]:
# getting info of dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [5]:
# looking at data
df['text'][0]

'i always wrote this series off as being a complete stink fest because jim belushi was involved in it and heavily but then one day a tragic happenstance occurred after a white sox game ended i realized that the remote was all the way on the other side of the room somehow now i could have just gotten up and walked across the room to get the remote or even to the tv to turn the channel but then why not just get up and walk across the country to watch tv in another state nuts to that i said so i decided to just hang tight on the couch and take whatever fate had in store for me what fate had in store was an episode of this show an episode about which i remember very little except that i had once again made a very broad general sweeping blanket judgment based on zero objective or experiential evidence with nothing whatsoever to back my opinions up with and once again i was completely right this show is a total crud pie belushi has all the comedic delivery of a hairy lighthouse foghorn the w

In [6]:
# preprocessing text
df['text'] = df['text'].apply(strip_punctuation)
df['text'] = df['text'].apply(strip_numeric)
df['text'] = df['text'].apply(deaccent)
df['text'] = df['text'].apply(strip_multiple_whitespaces)
df.head()

Unnamed: 0,text,label
0,i always wrote this series off as being a comp...,0
1,st watched out of dir steve purcell typical ma...,0
2,this movie was so poorly written and directed ...,0
3,the most interesting thing about miryang secre...,1
4,when i first read about berlin am meer i didn ...,0


In [7]:
# putting data into desired format
df['desired_format'] = '__label__' + df['label'].astype('str') + ' ' + df['text']
df.head()

Unnamed: 0,text,label,desired_format
0,i always wrote this series off as being a comp...,0,__label__0 i always wrote this series off as b...
1,st watched out of dir steve purcell typical ma...,0,__label__0 st watched out of dir steve purcell...
2,this movie was so poorly written and directed ...,0,__label__0 this movie was so poorly written an...
3,the most interesting thing about miryang secre...,1,__label__1 the most interesting thing about mi...
4,when i first read about berlin am meer i didn ...,0,__label__0 when i first read about berlin am m...


In [8]:
# splitting dataset into training, testing sets
train, test = train_test_split(df['desired_format'], test_size=0.2, shuffle=True, random_state=22)
print('Training shape:', train.shape)
print('Testing shape:', test.shape)

Training shape: (4000,)
Testing shape: (1000,)


In [9]:
# saving training, testing sets into seperate files
train.to_csv('train_data.txt', index=False, header=None)
test.to_csv('test_data.txt', index=False, header=None)

In [10]:
# model training; with hyperparameter tuning
model = fasttext.train_supervised(input='/content/train_data.txt', autotuneValidationFile='/content/test_data.txt')

In [11]:
# model evaluation
_, precision, recall = model.test('/content/test_data.txt')
print(f'Precision: {precision} & Recall: {recall}')

Precision: 0.882 & Recall: 0.882


In [12]:
# making prediction
model.predict('the movie was great')

(('__label__1',), array([1.00000954]))

In [13]:
# making prediction
model.predict(text='pathetic movie')

(('__label__0',), array([1.00000989]))