In [1]:
import gensim.downloader as api

# Load pretrained Google News word2vec model
wv_pretrained = api.load("word2vec-google-news-300")



In [2]:
words = ["technology", "movie", "king", "music", "education"]

for word in words:
    print(f"\nTop similar words to '{word}':")
    similar_words = wv_pretrained.most_similar(word, topn=5)
    for sim_word, score in similar_words:
        print(f"{sim_word} -> {score:.4f}")


Top similar words to 'technology':
technologies -> 0.8332
innovations -> 0.6231
technological_innovations -> 0.6102
technol -> 0.6047
technological_advancement -> 0.6036

Top similar words to 'movie':
film -> 0.8677
movies -> 0.8013
films -> 0.7363
moive -> 0.6830
Movie -> 0.6694

Top similar words to 'king':
kings -> 0.7138
queen -> 0.6511
monarch -> 0.6413
crown_prince -> 0.6204
prince -> 0.6160

Top similar words to 'music':
classical_music -> 0.7198
jazz -> 0.6835
Music -> 0.6596
Without_Donny_Kirshner -> 0.6416
songs -> 0.6396

Top similar words to 'education':
eduction -> 0.7980
eduation -> 0.7176
LISA_MICHALS_covers -> 0.6817
Matt_Krupnick_covers -> 0.6798
educational -> 0.6780


In [3]:
# Example 1: king - man + woman ≈ queen
print(wv_pretrained.most_similar(positive=["king", "woman"], negative=["man"], topn=1))

# Example 2: Paris - France + Italy ≈ Rome
print(wv_pretrained.most_similar(positive=["Paris", "Italy"], negative=["France"], topn=1))

# Example 3: doctor - man + woman ≈ nurse (or female doctor)
print(wv_pretrained.most_similar(positive=["doctor", "woman"], negative=["man"], topn=1))


[('queen', 0.7118193507194519)]
[('Milan', 0.7222141623497009)]
[('gynecologist', 0.7093892097473145)]


In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [5]:
import pandas as pd
import csv

df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
df


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [6]:
print(df['sentiment'].value_counts())

sentiment
positive    25000
negative    25000
Name: count, dtype: int64


In [7]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

df['tokens'] = df['review'].apply(clean_text)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [8]:
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})


In [9]:
import numpy as np

def get_avg_vector(tokens, model):
    vectors = [model[word] for word in tokens if word in model]
    return np.mean(vectors, axis=0) if vectors else np.zeros(model.vector_size)


In [10]:
import gensim.downloader as api

wv_google = api.load("word2vec-google-news-300")

X_google = np.array([get_avg_vector(tokens, wv_google) for tokens in df['tokens']])


In [11]:
from gensim.models import Word2Vec

w2v_skipgram = Word2Vec(
    sentences=df['tokens'], vector_size=100, window=5, sg=1, min_count=2, workers=4
).wv

X_skipgram = np.array([get_avg_vector(tokens, w2v_skipgram) for tokens in df['tokens']])


In [12]:
w2v_cbow = Word2Vec(
    sentences=df['tokens'], vector_size=100, window=5, sg=0, min_count=2, workers=4
).wv

X_cbow = np.array([get_avg_vector(tokens, w2v_cbow) for tokens in df['tokens']])


In [13]:
from gensim.models import FastText

ft_model = FastText(
    sentences=df['tokens'], vector_size=100, window=5, sg=1, min_count=2, workers=4
).wv

X_fasttext = np.array([get_avg_vector(tokens, ft_model) for tokens in df['tokens']])


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

def evaluate_model(X, y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc, classification_report(y_test, y_pred, output_dict=True)


In [15]:
y = df['label']

results = {}

results['Pretrained Word2Vec'] = evaluate_model(X_google, y)
results['Custom Skip-gram'] = evaluate_model(X_skipgram, y)
results['Custom CBOW'] = evaluate_model(X_cbow, y)
results['Custom FastText'] = evaluate_model(X_fasttext, y)


In [16]:
import pandas as pd

# Create a list of dicts for each model's performance
performance_data = []

for model_name, (acc, report) in results.items():
    performance_data.append({
        "Model": model_name,
        "Accuracy": round(acc, 4),
        "Precision (Pos)": round(report['1']['precision'], 4),
        "Recall (Pos)": round(report['1']['recall'], 4),
        "F1-score (Pos)": round(report['1']['f1-score'], 4),
    })

# Convert to DataFrame
performance_df = pd.DataFrame(performance_data)

# Display table
print(performance_df.to_string(index=False))


              Model  Accuracy  Precision (Pos)  Recall (Pos)  F1-score (Pos)
Pretrained Word2Vec    0.8318           0.8343        0.8313          0.8328
   Custom Skip-gram    0.8718           0.8696        0.8772          0.8733
        Custom CBOW    0.8419           0.8380        0.8508          0.8443
    Custom FastText    0.8706           0.8694        0.8746          0.8720
