In [None]:
# In this notebook, we will train our own Word2Vec model.

In [37]:
import numpy as np
import pandas as pd

In [38]:
df = pd.read_csv("/content/IMDB_Dataset.csv")

In [39]:
df.shape

(50000, 2)

In [40]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [41]:
df = df.iloc[:10000]   # sirf 10k rows hi le rahe hain as we are just seeing how to perform text classification task and won't focus on accuracy that much.
df.shape

(10000, 2)

In [42]:
df.duplicated().sum()

np.int64(17)

In [43]:
df.drop_duplicates(inplace = True)

In [44]:
# lowercasing
df['review'] = df['review'].str.lower()

In [45]:
# removing html tags
import re
def remove_html_tags(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_html_tags)

In [46]:
# removing URLs
def remove_url(text):
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub(r'', text)

df['review'] = df['review'].apply(remove_url)

In [47]:
# remove punctuations
import string

exclude = string.punctuation

def remove_punc(text):
    return text.translate(str.maketrans('', '', exclude))

df['review'] = df['review'].apply(remove_punc)

In [48]:
# # spelling correction
# from textblob import TextBlob

# df['review'] = df['review'].apply(lambda text: TextBlob(text = text).correct().string)

# Above code is taking a lot of time to execute, so leaving that

In [49]:
import nltk

In [50]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [51]:
# remove stopwords
from nltk.corpus import stopwords

sw_list = stopwords.words("english")
df['review'] = df['review'].apply(lambda text: [word for word in text.split() if word not in sw_list]).apply(lambda x: " ".join(x))

In [53]:
# !pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [54]:
import gensim

In [55]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess

In [57]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [58]:
story = []
for doc in df['review']:
  raw_sent = nltk.sent_tokenize(doc)
  for sent in raw_sent:
    story.append(simple_preprocess(sent))

In [61]:
story[:2]

[['one',
  'reviewers',
  'mentioned',
  'watching',
  'oz',
  'episode',
  'youll',
  'hooked',
  'right',
  'exactly',
  'happened',
  'methe',
  'first',
  'thing',
  'struck',
  'oz',
  'brutality',
  'unflinching',
  'scenes',
  'violence',
  'set',
  'right',
  'word',
  'go',
  'trust',
  'show',
  'faint',
  'hearted',
  'timid',
  'show',
  'pulls',
  'punches',
  'regards',
  'drugs',
  'sex',
  'violence',
  'hardcore',
  'classic',
  'use',
  'wordit',
  'called',
  'oz',
  'nickname',
  'given',
  'oswald',
  'maximum',
  'security',
  'state',
  'penitentary',
  'focuses',
  'mainly',
  'emerald',
  'city',
  'experimental',
  'section',
  'prison',
  'cells',
  'glass',
  'fronts',
  'face',
  'inwards',
  'privacy',
  'high',
  'agenda',
  'em',
  'city',
  'home',
  'manyaryans',
  'muslims',
  'gangstas',
  'latinos',
  'christians',
  'italians',
  'irish',
  'moreso',
  'scuffles',
  'death',
  'stares',
  'dodgy',
  'dealings',
  'shady',
  'agreements',
  'never',

In [62]:
model = gensim.models.Word2Vec(
      window = 10,
      vector_size = 100,
      min_count = 2
)

In [64]:
model.build_vocab(story)

In [65]:
model.train(story,
            total_examples = model.corpus_count,
            epochs = model.epochs
            )

(5473469, 5901340)

In [66]:
len(model.wv.index_to_key)

35178

In [67]:
# Now, our model is trained to generate embeddings for the given word in the review.
# But the model generates embeddings for a particular word only, not for a full document i.e., a full review.
# So, we will apply the concept of average Word2Vec here to generate embedding for a given document.
# The concept of avg Word2Vec is that the embedding for the given document is the average of the embeddings
# of each word in the given document.

# Example:
# Suppose we have the following review:
# "The movie was amazing"

# After preprocessing (lowercasing, removing stopwords, etc.), we get:
# ["movie", "amazing"]

# Assume our trained Word2Vec model gives the following embeddings:
# movie   -> [0.2, 0.4, 0.6]
# amazing -> [0.8, 0.6, 0.4]

# Then the document embedding is computed as:
# ([0.2, 0.4, 0.6] + [0.8, 0.6, 0.4]) / 2
# = [0.5, 0.5, 0.5]

# So, the final vector representation of the review
# "The movie was amazing"
# becomes: [0.5, 0.5, 0.5]

In [68]:
# Now, let's implement average Word2Vec

In [89]:
def document_vector(doc):
  # remove out of vocabulary words
  doc = [word for word in doc.split() if word in model.wv.index_to_key]

  # return average Word2Vec embedding
  return np.mean(model.wv[doc], axis = 0)

In [90]:
from tqdm import tqdm

In [91]:
X = []

for doc in tqdm(df['review'].values):
  X.append(document_vector(doc))

100%|██████████| 9983/9983 [04:12<00:00, 39.55it/s]


In [92]:
X[0]

array([-0.3039474 ,  0.11244334,  0.00875816,  0.23504525,  0.06876833,
       -0.7090007 ,  0.1607138 ,  0.82827675, -0.2526992 , -0.15914614,
       -0.16849516, -0.48640466,  0.06422277,  0.46162722,  0.27216166,
       -0.1550123 ,  0.2146622 , -0.34711307, -0.09683871, -0.73546326,
        0.35148782,  0.09383149,  0.23062906, -0.13547203, -0.03946864,
        0.14654909, -0.2782778 , -0.15497229, -0.35722232,  0.18791288,
        0.37646988,  0.13527153,  0.14739709, -0.20369454, -0.05219167,
        0.42232588, -0.08362305, -0.45649353, -0.22989792, -0.7188727 ,
        0.08921194, -0.29003742, -0.168483  , -0.00529674,  0.3056745 ,
       -0.00574183, -0.29378873, -0.11844075,  0.23970681,  0.09398975,
        0.1020093 , -0.29465684, -0.26720798, -0.05458615, -0.20183271,
        0.0768586 ,  0.21378063, -0.15533896, -0.2769907 ,  0.08617414,
        0.12921672,  0.08066856,  0.12351014,  0.08897415, -0.54073524,
        0.31932613,  0.1567122 ,  0.20188688, -0.61053485,  0.41

In [93]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(df['sentiment'])

In [94]:
y

array([1, 1, 1, ..., 0, 0, 1])

In [95]:
from sklearn.model_selection import train_test_split

In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

In [97]:
from sklearn.naive_bayes import GaussianNB

In [98]:
gnb = GaussianNB()

gnb.fit(X_train, y_train)

In [99]:
y_pred = gnb.predict(X_test)

In [100]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.7411116675012519

In [None]:
# Some practical advice:
# Waise abhi toh humara ocus accuracy improve karne pe nahi tha but if you want to improve accuracy, then:
# 1. Use ensemble techniques like RandomForestClassifier
# 2. Add heuristic features too
# 3. Don't directly jump to DL models. Start from ML models and then gradually go to DL models. Because working with ML models makes us more aware about the nature of our data. And also in some cases, DL models provide only marginal improvement from ML models. But we know that deploying a DL model is way harder than an ML model.
# 4. Make sure the data is balanced. If unbalanced, use the standard techniques to handle the imbalance.