In [1]:
!pip install gensim



In [2]:
import gensim.downloader as api

# Load Google News pretrained model
model = api.load('word2vec-google-news-300')




In [3]:
words = ['apple', 'car', 'university', 'computer', 'music']
for word in words:
    print(f"\nTop similar words for '{word}':")
    similar = model.most_similar(word, topn=5)
    for sim_word, score in similar:
        print(f"{sim_word}: {score:.4f}")



Top similar words for 'apple':
apples: 0.7204
pear: 0.6451
fruit: 0.6410
berry: 0.6302
pears: 0.6134

Top similar words for 'car':
vehicle: 0.7821
cars: 0.7424
SUV: 0.7161
minivan: 0.6907
truck: 0.6736

Top similar words for 'university':
universities: 0.7004
faculty: 0.6781
unversity: 0.6758
undergraduate: 0.6587
univeristy: 0.6585

Top similar words for 'computer':
computers: 0.7979
laptop: 0.6640
laptop_computer: 0.6549
Computer: 0.6473
com_puter: 0.6082

Top similar words for 'music':
classical_music: 0.7198
jazz: 0.6835
Music: 0.6596
Without_Donny_Kirshner: 0.6416
songs: 0.6396


In [4]:
print(model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1))  # ≈ queen
print(model.most_similar(positive=['paris', 'italy'], negative=['france'], topn=1))  # ≈ rome
print(model.most_similar(positive=['walk', 'swimming'], negative=['walked'], topn=1))  # ≈ swim


[('queen', 0.7118193507194519)]
[('lohan', 0.5069674849510193)]
[('swim', 0.7891075611114502)]


In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("lakshmi25npathi/imdb-dataset-of-50k-movie-reviews")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/imdb-dataset-of-50k-movie-reviews


In [11]:
import pandas as pd
df = pd.read_csv("/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")


In [12]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab') # Download the missing resource

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = re.sub(r'<.*?>', '', text)  # remove HTML tags
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words and len(w) > 1]
    return tokens

df['cleaned'] = df['review'].apply(clean_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
import numpy as np

def get_avg_vector(tokens, model, dim=300):
    valid = [token for token in tokens if token in model]
    if valid:
        return np.mean([model[token] for token in valid], axis=0)
    else:
        return np.zeros(dim)

X = np.array([get_avg_vector(tokens, model) for tokens in df['cleaned']])
y = (df['sentiment'] == 'positive').astype(int)


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(classification_report(y_test, clf.predict(X_test)))


              precision    recall  f1-score   support

           0       0.81      0.81      0.81      4961
           1       0.81      0.82      0.81      5039

    accuracy                           0.81     10000
   macro avg       0.81      0.81      0.81     10000
weighted avg       0.81      0.81      0.81     10000



In [15]:
from gensim.models import Word2Vec
skipgram_model = Word2Vec(sentences=df['cleaned'], vector_size=100, window=5, sg=1, min_count=5, workers=4)


In [16]:
cbow_model = Word2Vec(sentences=df['cleaned'], vector_size=100, window=5, sg=0, min_count=5, workers=4)


In [17]:
from gensim.models import FastText
fasttext_model = FastText(sentences=df['cleaned'], vector_size=100, window=5, min_count=5, workers=4)


In [18]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# 1. Function to get average word vectors per document
def get_avg_vector(tokens, wv_model, dim):
    valid_tokens = [word for word in tokens if word in wv_model]
    if not valid_tokens:
        return np.zeros(dim)
    return np.mean([wv_model[word] for word in valid_tokens], axis=0)

# 2. Evaluation function
def evaluate_model(wv_model, model_name, dim=100):
    print(f"Evaluating: {model_name}")

    X = np.array([get_avg_vector(tokens, wv_model.wv, dim) for tokens in df['cleaned']])
    y = (df['sentiment'] == 'positive').astype(int)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    clf = RandomForestClassifier()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True)

    return {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': report['weighted avg']['precision'],
        'Recall': report['weighted avg']['recall'],
        'F1 Score': report['weighted avg']['f1-score']
    }

# 3. Run for all 3 models
results = []

results.append(evaluate_model(skipgram_model, "Custom Skip-gram"))
results.append(evaluate_model(cbow_model, "Custom CBOW"))
results.append(evaluate_model(fasttext_model, "Custom FastText"))

# 4. Create summary table
results_df = pd.DataFrame(results)
print("\n📊 Summary Table:")
print(results_df)

# Optional: Save to CSV
results_df.to_csv("word_vector_model_summary.csv", index=False)


Evaluating: Custom Skip-gram
Evaluating: Custom CBOW
Evaluating: Custom FastText

📊 Summary Table:
              Model  Accuracy  Precision  Recall  F1 Score
0  Custom Skip-gram    0.8470   0.847232  0.8470  0.846954
1       Custom CBOW    0.8375   0.837985  0.8375  0.837410
2   Custom FastText    0.8235   0.824073  0.8235  0.823383
