In [1]:
import gensim.downloader as api

In [2]:
model = api.load('word2vec-google-news-300') #loading the model

In [3]:
# List of words to find similar words for
words = ['mobile','dog','king','water','fire']

for word in words:
    similar_words = model.most_similar(word, topn=5)
    print(f"Similar words for '{word}':")
    for similar_word, similarity in similar_words:
        print(f"  {similar_word} ({similarity})")
    print()

Similar words for 'mobile':
  mobile_phones (0.7054648399353027)
  Mobile (0.6691668629646301)
  smartphone (0.6600653529167175)
  smartphones (0.6404396891593933)
  handsets (0.6404278874397278)

Similar words for 'dog':
  dogs (0.8680490851402283)
  puppy (0.8106428384780884)
  pit_bull (0.780396044254303)
  pooch (0.762737512588501)
  cat (0.7609456777572632)

Similar words for 'king':
  kings (0.7138045430183411)
  queen (0.6510956287384033)
  monarch (0.6413194537162781)
  crown_prince (0.6204219460487366)
  prince (0.6159993410110474)

Similar words for 'water':
  potable_water (0.6799107193946838)
  Water (0.670687198638916)
  sewage (0.6619378328323364)
  groundwater (0.65883469581604)
  Floridan_aquifer (0.6422534584999084)

Similar words for 'fire':
  blaze (0.7516718506813049)
  fires (0.7222490906715393)
  Fire (0.69910728931427)
  flames (0.638767421245575)
  carelessly_discarded_cigarette (0.6215550303459167)



In [5]:
# Test analogies: most_similar method used to perform the analogy tests by specifying positive and negative words.
analogies = [
    ('king', 'man', 'woman'),
    ('Delhi', 'India', 'France'),
    ('apple', 'fruit', 'vegetable'),
    #checks if the model can find a word related to "vegetable" in a similar way "apple" is related to "fruit"
    ('doctor', 'hospital', 'school'),
    ('bird', 'fly', 'swim')
   
]

# analogy tests
for analogy in analogies:
    result = model.most_similar(positive=[analogy[0], analogy[2]], negative=[analogy[1]], topn=1)
    print(f"{analogy[0]} - {analogy[1]} + {analogy[2]} ≈ {result[0][0]} ({result[0][1]})")

king - man + woman ≈ queen (0.7118192911148071)
Delhi - India + France ≈ Paris (0.6828771233558655)
apple - fruit + vegetable ≈ potato (0.5865278244018555)
doctor - hospital + school ≈ guidance_counselor (0.5969594717025757)
bird - fly + swim ≈ swimming (0.557131826877594)


# IMDB Dataset

In [6]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from wordcloud import WordCloud


In [7]:
import pandas as pd
df=pd.read_csv("D:\IMDB Dataset.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [8]:

print(df['sentiment'].value_counts())

positive    25000
negative    25000
Name: sentiment, dtype: int64


In [10]:
# Function to clean text
def clean_text(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text) # Remove punctuation and numbers
    text = text.lower() # Convert to lowercase
    tokens = word_tokenize(text) # Tokenize the text
    cleaned_tokens = [word for word in tokens if word not in stopwords.words('english')] # Remove stopwords
    cleaned_text = ' '.join(cleaned_tokens) # Join tokens back to string
    return cleaned_text

In [11]:
subset_df = df.sample(frac=0.1, random_state=42)

In [12]:
subset_df['cleaned_review'] = subset_df['review'].apply(clean_text)

In [15]:
print("\nFirst few rows of the cleaned subset dataset:")
print(subset_df[['review', 'cleaned_review']].head())



First few rows of the cleaned subset dataset:
                                                  review  \
33553  I really liked this Summerslam due to the look...   
9427   Not many television shows appeal to quite as m...   
199    The film quickly gets to a major chase scene w...   
12447  Jane Austen would definitely approve of this o...   
39489  Expectations were somewhat high for me when I ...   

                                          cleaned_review  
33553  really liked summerslam due look arena curtain...  
9427   many television shows appeal quite many differ...  
199    film quickly gets major chase scene ever incre...  
12447  jane austen would definitely approve onebr br ...  
39489  expectations somewhat high went see movie thou...  


# Custom Skip-gram Vectors

In [19]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import pandas as pd

# Subset the data
subset_df = df.sample(frac=0.1, random_state=42)
subset_df['cleaned_review'] = subset_df['review'].apply(clean_text)

# Train custom Skip-gram model
skipgram_model = Word2Vec(subset_df['cleaned_review'], vector_size=100, window=5, sg=1, min_count=1, workers=4)

# Function to create document vectors by averaging word vectors
def get_avg_word2vecs(model, tokens_list):
    vector_size = model.wv.vector_size
    doc_vectors = []
    for tokens in tokens_list:
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            avg_vector = np.mean(vectors, axis=0)
        else:
            avg_vector = np.zeros(vector_size)
        doc_vectors.append(avg_vector)
    return np.array(doc_vectors)

# Create document vectors
X_skipgram = get_avg_word2vecs(skipgram_model, subset_df['cleaned_review'])

# Labels
y = subset_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Split data
X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(X_skipgram, y, test_size=0.2, random_state=42)

# Train and evaluate model
rf_sg = RandomForestClassifier(n_estimators=100, random_state=42)
rf_sg.fit(X_train_sg, y_train_sg)
y_pred_sg = rf_sg.predict(X_test_sg)
print("Skip-gram Model Performance:")
print(classification_report(y_test_sg, y_pred_sg))




Skip-gram Model Performance:
              precision    recall  f1-score   support

           0       0.55      0.52      0.53       506
           1       0.53      0.56      0.55       494

    accuracy                           0.54      1000
   macro avg       0.54      0.54      0.54      1000
weighted avg       0.54      0.54      0.54      1000



# Custom CBoW Vectors

In [20]:
# Subset the data
subset_df = df.sample(frac=0.1, random_state=42)
subset_df['cleaned_review'] = subset_df['review'].apply(clean_text)

# Train custom CBoW model
cbow_model = Word2Vec(subset_df['cleaned_review'], vector_size=100, window=5, sg=0, min_count=1, workers=4)

# Create document vectors
X_cbow = get_avg_word2vecs(cbow_model, subset_df['cleaned_review'])

# Labels
y = subset_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Split data
X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cbow, y, test_size=0.2, random_state=42)

# Train and evaluate model
rf_cb = RandomForestClassifier(n_estimators=100, random_state=42)
rf_cb.fit(X_train_cb, y_train_cb)
y_pred_cb = rf_cb.predict(X_test_cb)
print("CBoW Model Performance:")
print(classification_report(y_test_cb, y_pred_cb))


Exception ignored in: <function SeekableUnicodeStreamReader.__del__ at 0x7f4f27b468c0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/nltk/data.py", line 1160, in __del__
    if not self.closed:
  File "/usr/local/lib/python3.10/dist-packages/nltk/data.py", line 1180, in closed
    return self.stream.closed
AttributeError: 'SeekableUnicodeStreamReader' object has no attribute 'stream'


CBoW Model Performance:
              precision    recall  f1-score   support

           0       0.58      0.54      0.56       506
           1       0.56      0.59      0.57       494

    accuracy                           0.57      1000
   macro avg       0.57      0.57      0.57      1000
weighted avg       0.57      0.57      0.57      1000



# Word2Vec SkipGram

In [21]:
import gensim.downloader as api

# Subset the data
subset_df = df.sample(frac=0.1, random_state=42)
subset_df['cleaned_review'] = subset_df['review'].apply(clean_text)

# Load the pretrained word2vec model
pretrained_model = api.load('word2vec-google-news-300')

# Function to create document vectors using pretrained word2vec model
def get_avg_pretrained_word2vecs(model, tokens_list):
    vector_size = model.vector_size
    doc_vectors = []
    for tokens in tokens_list:
        vectors = [model[token] for token in tokens if token in model]
        if vectors:
            avg_vector = np.mean(vectors, axis=0)
        else:
            avg_vector = np.zeros(vector_size)
        doc_vectors.append(avg_vector)
    return np.array(doc_vectors)

# Create document vectors
X_pretrained = get_avg_pretrained_word2vecs(pretrained_model, subset_df['cleaned_review'])

# Labels
y = subset_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Split data
X_train_pt, X_test_pt, y_train_pt, y_test_pt = train_test_split(X_pretrained, y, test_size=0.2, random_state=42)

# Train and evaluate model
rf_pt = RandomForestClassifier(n_estimators=100, random_state=42)
rf_pt.fit(X_train_pt, y_train_pt)
y_pred_pt = rf_pt.predict(X_test_pt)
print("Pretrained Word2Vec Model Performance:")
print(classification_report(y_test_pt, y_pred_pt))


Pretrained Word2Vec Model Performance:
              precision    recall  f1-score   support

           0       0.59      0.56      0.58       506
           1       0.57      0.60      0.59       494

    accuracy                           0.58      1000
   macro avg       0.58      0.58      0.58      1000
weighted avg       0.58      0.58      0.58      1000



# Skip-gram Vectors with Hyperparameter Tuning

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

# Subset the data
subset_df = df.sample(frac=0.1, random_state=42)
subset_df['cleaned_review'] = subset_df['review'].apply(clean_text)

# Function to create document vectors by averaging word vectors
def get_avg_word2vecs(model, tokens_list):
    vector_size = model.wv.vector_size
    doc_vectors = []
    for tokens in tokens_list:
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            avg_vector = np.mean(vectors, axis=0)
        else:
            avg_vector = np.zeros(vector_size)
        doc_vectors.append(avg_vector)
    return np.array(doc_vectors)

# Labels
y = subset_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Experiment with different hyperparameters for Skip-gram model
hyperparameters = [
    {'vector_size': 100, 'window': 5, 'min_count': 1},
    {'vector_size': 150, 'window': 10, 'min_count': 2},
    {'vector_size': 200, 'window': 5, 'min_count': 1},
]

for params in hyperparameters:
    skipgram_model = Word2Vec(subset_df['cleaned_review'], vector_size=params['vector_size'], window=params['window'], sg=1, min_count=params['min_count'], workers=4)
    X_skipgram = get_avg_word2vecs(skipgram_model, subset_df['cleaned_review'])

    X_train_sg, X_test_sg, y_train_sg, y_test_sg = train_test_split(X_skipgram, y, test_size=0.2, random_state=42)
    rf_sg = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_sg.fit(X_train_sg, y_train_sg)
    y_pred_sg = rf_sg.predict(X_test_sg)
    print(f"Skip-gram Model Performance with {params}:")
    print(classification_report(y_test_sg, y_pred_sg))




Skip-gram Model Performance with {'vector_size': 100, 'window': 5, 'min_count': 1}:
              precision    recall  f1-score   support

           0       0.56      0.53      0.54       506
           1       0.54      0.58      0.56       494

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.55      0.55      0.55      1000





Skip-gram Model Performance with {'vector_size': 150, 'window': 10, 'min_count': 2}:
              precision    recall  f1-score   support

           0       0.59      0.56      0.57       506
           1       0.57      0.60      0.59       494

    accuracy                           0.58      1000
   macro avg       0.58      0.58      0.58      1000
weighted avg       0.58      0.58      0.58      1000

Skip-gram Model Performance with {'vector_size': 200, 'window': 5, 'min_count': 1}:
              precision    recall  f1-score   support

           0       0.55      0.54      0.55       506
           1       0.54      0.56      0.55       494

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.55      0.55      0.55      1000



# CBoW Vectors with Hyperparameter Tuning

In [23]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

# Subset the data
subset_df = df.sample(frac=0.1, random_state=42)
subset_df['cleaned_review'] = subset_df['review'].apply(clean_text)

# Function to create document vectors by averaging word vectors
def get_avg_word2vecs(model, tokens_list):
    vector_size = model.wv.vector_size
    doc_vectors = []
    for tokens in tokens_list:
        vectors = [model.wv[token] for token in tokens if token in model.wv]
        if vectors:
            avg_vector = np.mean(vectors, axis=0)
        else:
            avg_vector = np.zeros(vector_size)
        doc_vectors.append(avg_vector)
    return np.array(doc_vectors)

# Labels
y = subset_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Experiment with different hyperparameters for CBoW model
hyperparameters = [
    {'vector_size': 100, 'window': 5, 'min_count': 1},
    {'vector_size': 150, 'window': 10, 'min_count': 2},
    {'vector_size': 200, 'window': 5, 'min_count': 1},
]

for params in hyperparameters:
    cbow_model = Word2Vec(subset_df['cleaned_review'], vector_size=params['vector_size'], window=params['window'], sg=0, min_count=params['min_count'], workers=4)
    X_cbow = get_avg_word2vecs(cbow_model, subset_df['cleaned_review'])

    X_train_cb, X_test_cb, y_train_cb, y_test_cb = train_test_split(X_cbow, y, test_size=0.2, random_state=42)
    rf_cb = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_cb.fit(X_train_cb, y_train_cb)
    y_pred_cb = rf_cb.predict(X_test_cb)
    print(f"CBoW Model Performance with {params}:")
    print(classification_report(y_test_cb, y_pred_cb))




CBoW Model Performance with {'vector_size': 100, 'window': 5, 'min_count': 1}:
              precision    recall  f1-score   support

           0       0.57      0.56      0.56       506
           1       0.56      0.58      0.57       494

    accuracy                           0.57      1000
   macro avg       0.57      0.57      0.57      1000
weighted avg       0.57      0.57      0.57      1000





CBoW Model Performance with {'vector_size': 150, 'window': 10, 'min_count': 2}:
              precision    recall  f1-score   support

           0       0.56      0.53      0.55       506
           1       0.54      0.57      0.55       494

    accuracy                           0.55      1000
   macro avg       0.55      0.55      0.55      1000
weighted avg       0.55      0.55      0.55      1000

CBoW Model Performance with {'vector_size': 200, 'window': 5, 'min_count': 1}:
              precision    recall  f1-score   support

           0       0.57      0.55      0.56       506
           1       0.55      0.57      0.56       494

    accuracy                           0.56      1000
   macro avg       0.56      0.56      0.56      1000
weighted avg       0.56      0.56      0.56      1000

