# WORD2VEC SIMILAR WORDS

In [1]:
import gensim.downloader as api

# Load the pretrained word2vec-google-news-300 model
pretrained_model = api.load('word2vec-google-news-300')

# List of chosen words
words = ['computer', 'science', 'music', 'happy', 'river']

# Finding similar words for each chosen word
similar_words = {word: pretrained_model.most_similar(word, topn=5) for word in words}

# Performing analogy tests
analogies = {
    "Paris - France + Germany": pretrained_model.most_similar(positive=['Germany', 'Paris'], negative=['France'], topn=1),
    "boy - girl + queen": pretrained_model.most_similar(positive=['boy', 'queen'], negative=['girl'], topn=1),
    "strong - strength + weak": pretrained_model.most_similar(positive=['weak', 'strong'], negative=['strength'], topn=1)
}

similar_words, analogies

({'computer': [('computers', 0.7979379296302795),
   ('laptop', 0.6640492677688599),
   ('laptop_computer', 0.6548868417739868),
   ('Computer', 0.6473336219787598),
   ('com_puter', 0.6082081198692322)],
  'science': [('faith_Jezierski', 0.6965421438217163),
   ('sciences', 0.6821076273918152),
   ('biology', 0.6775783896446228),
   ('scientific', 0.6535003185272217),
   ('mathematics', 0.6300910115242004)],
  'music': [('classical_music', 0.7197794318199158),
   ('jazz', 0.6834639310836792),
   ('Music', 0.6595720648765564),
   ('Without_Donny_Kirshner', 0.6416223645210266),
   ('songs', 0.6396344304084778)],
  'happy': [('glad', 0.7408890724182129),
   ('pleased', 0.6632170677185059),
   ('ecstatic', 0.6626912951469421),
   ('overjoyed', 0.6599285006523132),
   ('thrilled', 0.6514049172401428)],
  'river': [('creek', 0.7994444370269775),
   ('lake', 0.7919586300849915),
   ('rivers', 0.7777559757232666),
   ('riverbank', 0.7283666729927063),
   ('canal', 0.722176194190979)]},
 {'Par

# MOVIE REVIEW SENTIMENT ANALYSIS

### DATA PREPROCESSING AND EDA

In [2]:
import pandas as pd

# Load the dataset
imdb = pd.read_csv('IMDB Dataset.csv')

# Display basic information and statistics
imdb.info()
imdb.describe()
imdb['sentiment'].value_counts()

# Display some sample reviews
imdb.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
print("Number of identical rows =",len(imdb[imdb.duplicated()]))

Number of identical rows = 418


In [4]:
imdb = imdb.drop_duplicates().reset_index(drop=True)
print("Number of identical rows =",len(imdb[imdb.duplicated()]))
imdb

Number of identical rows = 0


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49577,I thought this movie did a down right good job...,positive
49578,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49579,I am a Catholic taught in parochial elementary...,negative
49580,I'm going to have to disagree with the previou...,negative


In [9]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

stop_words = set(stopwords.words('english'))

# Define the text preprocessing function
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    
    # Remove HTML tags using html.parser explicitly
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

# Apply preprocessing to the text data
imdb['preprocessed_text'] = imdb['review'].apply(preprocess_text)

# Display dataframe with added column
imdb.head(10)

  text = BeautifulSoup(text, "html.parser").get_text()


Unnamed: 0,review,sentiment,preprocessed_text
0,One of the other reviewers has mentioned that ...,positive,one reviewers mentioned watching oz episode yo...
1,A wonderful little production. <br /><br />The...,positive,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter matteis love time money visually stunni...
5,"Probably my all-time favorite movie, a story o...",positive,probably alltime favorite movie story selfless...
6,I sure would like to see a resurrection of a u...,positive,sure would like see resurrection dated seahunt...
7,"This show was an amazing, fresh & innovative i...",negative,show amazing fresh innovative idea first aired...
8,Encouraged by the positive comments about this...,negative,encouraged positive comments film looking forw...
9,If you like original gut wrenching laughter yo...,positive,like original gut wrenching laughter like movi...


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

# Tokenize cleaned reviews
tokenized_reviews = [review.split() for review in imdb['preprocessed_text']]

# Train Word2Vec model using Skip-gram
skipgram_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=2, sg=1)

# Train Word2Vec model using CBoW
cbow_model = Word2Vec(sentences=tokenized_reviews, vector_size=100, window=5, min_count=2, sg=0)

# pretrained_model (word2vec-google-news-300) is already loaded in notebook

def get_vector_representation(reviews, model):
    vectors = []
    for review in reviews:
        tokens = review.split()
        vector = sum([model.wv[word] for word in tokens if word in model.wv])
        vectors.append(vector)
    return vectors

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(imdb['preprocessed_text'], imdb['sentiment'], test_size=0.2, random_state=42)

# Train and evaluate models
def train_and_evaluate(model):
    X_train_vectors = get_vector_representation(X_train, model)
    X_test_vectors = get_vector_representation(X_test, model)
    
    clf = RandomForestClassifier()
    clf.fit(X_train_vectors, y_train)
    y_pred = clf.predict(X_test_vectors)
    
    return classification_report(y_test, y_pred, output_dict=True)

# Report metrics for each model
skipgram_metrics = train_and_evaluate(skipgram_model)
cbow_metrics = train_and_evaluate(cbow_model)
pretrained_metrics = train_and_evaluate(pretrained_model)
# optimized_skipgram_metrics = train_and_evaluate(optimized_skipgram_model)
# optimized_cbow_metrics = train_and_evaluate(optimized_cbow_model)

# Create a summary table
metrics_summary = pd.DataFrame({
    'Skip-gram': skipgram_metrics['weighted avg'],
    'CBoW': cbow_metrics['weighted avg'],
    'Pretrained': pretrained_metrics['weighted avg']
    # 'Optimized Skip-gram': optimized_skipgram_metrics['weighted avg'],
    # 'Optimized CBoW': optimized_cbow_metrics['weighted avg'],
})

metrics_summary

AttributeError: 'KeyedVectors' object has no attribute 'wv'