In [20]:
import gensim.downloader as api
from gensim.models import Word2Vec

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np

In [2]:
wv_pretrained = api.load("word2vec-google-news-300")

In [3]:
wv_pretrained.most_similar(positive=["king", "woman"], negative=["man"])

[('queen', 0.7118192911148071),
 ('monarch', 0.6189674735069275),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133),
 ('kings', 0.5236844420433044),
 ('Queen_Consort', 0.5235945582389832),
 ('queens', 0.5181134343147278),
 ('sultan', 0.5098593235015869),
 ('monarchy', 0.5087411403656006)]

In [4]:
wv_pretrained.most_similar(positive=["India", "Delhi"], negative=["Mumbai"])

[('Nepal', 0.619372546672821),
 ('Delhi_Oct.##_ANI', 0.6125845909118652),
 ('Delhi_Mar.##', 0.5995363593101501),
 ('Indias', 0.5982224345207214),
 ('Himachal_Pradesh', 0.5855835676193237),
 ('Delhi_Jan.##_ANI', 0.5768283605575562),
 ('Indiaâ_€_™', 0.5768230557441711),
 ('Delhi_Nov.##_ANI', 0.5760971903800964),
 ('Delhi_Aug.##_ANI', 0.575374186038971),
 ('NEW_DELHI', 0.5662563443183899)]

In [5]:
wv_pretrained.most_similar(positive=["mango", "banana"], negative=["apple"])

[('coconut', 0.6269199252128601),
 ('bananas', 0.6094670295715332),
 ('pineapple', 0.5931852459907532),
 ('mangoes', 0.581656277179718),
 ('cashew', 0.5697671175003052),
 ('papaya', 0.5613257884979248),
 ('cashew_nuts', 0.5515928864479065),
 ('pineapples', 0.5468831658363342),
 ('mangos', 0.5464771389961243),
 ('cashew_nut', 0.5408104658126831)]

In [6]:
# Find most similar words
word_pairs = [
    ("king", "man", "woman"),       
    ("India", "Mumbai", "Delhi"), 
    ("banana", "apple", "mango"),      
]

for words in word_pairs:
    similar_words = wv_pretrained.most_similar(positive=[words[0], words[2]], negative=[words[1]])
    print(f"{words[0]} - {words[1]} + {words[2]} ~= {similar_words[0][0]}")


king - man + woman ~= queen
India - Mumbai + Delhi ~= Nepal
banana - apple + mango ~= coconut


In [7]:
from nltk.corpus import stopwords
import pandas as pd
import string

In [8]:
imdb = pd.read_csv("IMDB Dataset.csv")

In [9]:
imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
def remove_stopwords(text):
    if isinstance(text, str):
        text = text.translate(str.maketrans('', '', string.punctuation))
        words = text.split()
        filtered_words = [word for word in words if word.lower() not in stop_words]
        return ' '.join(filtered_words)
    else:
        return text

imdb['review'] = imdb['review'].apply(remove_stopwords)  

In [12]:
imdb

Unnamed: 0,review,sentiment
0,One reviewers mentioned watching 1 Oz episode ...,positive
1,wonderful little production br br filming tech...,positive
2,thought wonderful way spend time hot summer we...,positive
3,Basically theres family little boy Jake thinks...,negative
4,Petter Matteis Love Time Money visually stunni...,positive
...,...,...
49995,thought movie right good job wasnt creative or...,positive
49996,Bad plot bad dialogue bad acting idiotic direc...,negative
49997,Catholic taught parochial elementary schools n...,negative
49998,Im going disagree previous comment side Maltin...,negative


In [13]:
tokenized_data = imdb['review'].apply(word_tokenize)

In [16]:
skipgram = Word2Vec(
    sentences= tokenized_data,
    sg=1,  
    vector_size=50,  
    window=5,  
    min_count=5,  
)


In [17]:
cbow = Word2Vec(
    sentences=tokenized_data,
    sg=0,  
    vector_size=50,  
    window=5,  
    min_count=5,  
)


In [21]:
def get_average_vector(sentence, model):
    words = sentence.split()
    vectors = [model.wv[word] for word in words if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

imdb['skipgram_vector'] = imdb['review'].apply(lambda x: get_average_vector(x, skipgram))
imdb['cbow_vector'] = imdb['review'].apply(lambda x: get_average_vector(x, cbow))

In [22]:
def regression(x_train, x_test, y_train, y_test):
    clf = LogisticRegression(max_iter=1000)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    return classification_report(y_test, y_pred)


In [24]:
from sklearn.model_selection import train_test_split

skip = np.array(imdb['skipgram_vector'].tolist())
cbow = np.array(imdb['cbow_vector'].tolist())
y = imdb['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

sx_train, sx_test, y_train, y_test = train_test_split(skip, y, test_size=0.2, random_state=42)
cx_train, cx_test, _, _ = train_test_split(cbow, y, test_size=0.2, random_state=42)


In [31]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

print("Skip-gram Model:")
print(regression(sx_train, sx_test, y_train, y_test))

Skip-gram Model:
              precision    recall  f1-score   support

           0       0.87      0.86      0.87      4961
           1       0.87      0.87      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [32]:
print("CBoW Model:")
print(regression(cx_train, cx_test, y_train, y_test))

CBoW Model:
              precision    recall  f1-score   support

           0       0.85      0.84      0.85      4961
           1       0.84      0.86      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000



In [36]:
from gensim.models import KeyedVectors

model_path = 'GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(model_path, binary=True)


Pretrained Word2Vec Model

In [37]:
def get_average_vector(sentence, model):
    words = sentence.split()
    vectors = [model[word] for word in words if word in model]
    if len(vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

imdb['new_vector'] = imdb['review'].apply(lambda x: get_average_vector(x, model))


new_x = np.array(imdb['new_vector'].tolist())
y = imdb['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

new_x_train, new_x_test, y_train, y_test = train_test_split(new_x, y, test_size=0.2, random_state=42)

print("Pretrained Word2Vec Model:")
print(regression(new_x_train, new_x_test, y_train, y_test))

Pretrained Word2Vec Model:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85      4961
           1       0.85      0.86      0.85      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

