CSCI 544 - Homework 2 <br>
Neural Networks for Sentiment Analysis <br>
Python Version: 3.12 <br>
Library: PyTorch <br>

In [36]:
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings("ignore")
import multiprocessing

# NLTK
import nltk
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, bigrams

# Gensim for Word2Vec
import gensim.downloader as api
from gensim.models import Word2Vec

# Scikit-learn
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Set random seeds for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_STATE)

print("All imports successful!")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

All imports successful!
PyTorch version: 2.10.0+cu128
CUDA available: True


In [2]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz
#          https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz

"""
downloaded the dataset locally through the above links using terminal wget command    
"""



'\ndownloaded the dataset locally through the above links using terminal wget command    \n'

# Question 1: Dataset Generation

# Dataset Preparation

## Read Data

In [3]:
df = pd.read_csv(r'data/amazon_reviews_us_Office_Products_v1_00.tsv.gz', sep='\t', on_bad_lines='skip', low_memory=False)

In [4]:
df.shape

(2640254, 15)

In [5]:
df.head(3)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31


## Keep Reviews and Ratings

In [6]:
df.columns

Index(['marketplace', 'customer_id', 'review_id', 'product_id',
       'product_parent', 'product_title', 'product_category', 'star_rating',
       'helpful_votes', 'total_votes', 'vine', 'verified_purchase',
       'review_headline', 'review_body', 'review_date'],
      dtype='str')

In [7]:
print(df['star_rating'].dtype)
print(df['star_rating'].unique())

str
<StringArray>
['5', '1', '4', '2', '3', '2015-06-05', '2015-02-11', nan, '2014-02-14']
Length: 9, dtype: str


In [8]:
df['star_rating'] = pd.to_numeric(df['star_rating'], errors='coerce')
print(df['star_rating'].dtype)
print(df['star_rating'].unique())

float64
[ 5.  1.  4.  2.  3. nan]


In [9]:
df.dropna(subset=['review_body', 'star_rating'], inplace=True)

In [10]:
df.shape

(2640080, 15)

In [11]:
df = df[['review_body', 'star_rating']]  # selecting only relevant columns
print(df.head(3))
print(df['star_rating'].value_counts().sort_index())  # checking distribution of classes

                                         review_body  star_rating
0                                     Great product.          5.0
1  What's to say about this commodity item except...          5.0
2    Haven't used yet, but I am sure I will like it.          5.0
star_rating
1.0     306967
2.0     138381
3.0     193680
4.0     418348
5.0    1582704
Name: count, dtype: int64




 ## Relabeling and Sampling
 
First form three classes and print their statistics. Then randomly select 250,000 reviews.



In [12]:
balanced_dfs = []

for rating in [1, 2, 3, 4, 5]:
    rating_df = df[df['star_rating'] == rating]
    
    if len(rating_df) >= 50000:
        sampled = rating_df.sample(n=50000, random_state=RANDOM_STATE)
    else:
        print(f"Warning: Only {len(rating_df)} reviews available for rating {rating}")
        sampled = rating_df
    
    balanced_dfs.append(sampled)
    print(f"Rating {rating}: {len(sampled)} reviews sampled")

Rating 1: 50000 reviews sampled
Rating 2: 50000 reviews sampled
Rating 3: 50000 reviews sampled
Rating 4: 50000 reviews sampled
Rating 5: 50000 reviews sampled


In [13]:
# Concatenate all
df_balanced = pd.concat(balanced_dfs, ignore_index=True)
print(df_balanced.shape)
df_balanced['star_rating'].value_counts()

(250000, 2)


star_rating
1.0    50000
2.0    50000
3.0    50000
4.0    50000
5.0    50000
Name: count, dtype: int64

In [14]:
def create_ternary_label(rating):
    """
    rating > 3 → class 1 (Positive)
    rating < 3 → class 2 (Negative)
    rating = 3 → class 3 (Neutral)
    """
    if rating > 3:
        return 1  # Positive
    elif rating < 3:
        return 2  # Negative
    else:
        return 3  # Neutral

# Fix your labels
df_balanced['label'] = df_balanced['star_rating'].apply(create_ternary_label)
print(df_balanced['label'].value_counts().sort_index())

label
1    100000
2    100000
3     50000
Name: count, dtype: int64


# Data Cleaning



In [15]:
CONTRACTIONS_MAP = {
    "ain't": "is not",
    "amn't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "daren't": "dare not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "everyone's": "everyone is",
    "gimme": "give me",
    "gonna": "going to",
    "gotta": "got to",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "I would",
    "i'd've": "I would have",
    "i'll": "I will",
    "i'll've": "I will have",
    "i'm": "I am",
    "i've": "I have",
    "innit": "is it not",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "kinda": "kind of",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "ne'er": "never",
    "o'clock": "of the clock",
    "o'er": "over",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "outta": "out of",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so is",
    "somebody's": "somebody is",
    "someone's": "someone is",
    "something's": "something is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "tis": "it is",
    "twas": "it was",
    "to've": "to have",
    "wanna": "want to",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "whatcha": "what are you",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who'll've": "who will have",
    "who're": "who are",
    "who's": "who is",
    "why's": "why is",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
}

In [16]:
def remove_contractions(text):
    # Sort contractions by length (longest first) to handle compound contractions
    contractions_sorted = sorted(CONTRACTIONS_MAP.keys(), key=len, reverse=True)
    
    # Build pattern with word boundaries
    pattern = re.compile(r'\b(' + '|'.join(re.escape(key) for key in contractions_sorted) + r')\b', 
                        flags=re.IGNORECASE)
    
    def expand_match(contraction):
        match = contraction.group(0)
        match_lower = match.lower()
        
        if match_lower in CONTRACTIONS_MAP:
            expanded = CONTRACTIONS_MAP[match_lower]
            
            # Preserve original capitalization
            if match[0].isupper():
                expanded = expanded[0].upper() + expanded[1:]
            
            return expanded
        
        return match
    
    # Keep expanding until no more contractions found
    prev_text = ""
    while prev_text != text:
        prev_text = text
        text = pattern.sub(expand_match, text)
    
    return text

sample_text = "I can't do this. She's going to the market. Y'all've been great!"
print("Original Text: ", sample_text)
expanded_text = remove_contractions(sample_text)
print("Expanded Text: ", expanded_text)

Original Text:  I can't do this. She's going to the market. Y'all've been great!
Expanded Text:  I cannot do this. She is going to the market. You all have been great!


In [17]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Expand contractions
    text = remove_contractions(text)
    
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

In [18]:
avg_length_before = df_balanced['review_body'].str.len().mean()
avg_length_before

np.float64(341.193312)

In [19]:
df_balanced['review_body'] = df_balanced['review_body'].apply(preprocess_text)

In [20]:
avg_length_after = df_balanced['review_body'].str.len().mean()
avg_length_after

np.float64(324.048708)

In [21]:
print(df_balanced['review_body'].head(3))

0    i purchased these tabs on a whim to put up som...
1       returned it too much garbage involved in setup
2    my upholstered living room chairs are not part...
Name: review_body, dtype: str


# Pre-processing

In [22]:
from nltk import pos_tag
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    """Convert treebank POS tags to WordNet POS tags"""
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # default to noun

In [23]:
from nltk.stem import WordNetLemmatizer
def lemmatize_with_pos(text):
    """
    Enhanced lemmatization that tries multiple POS tags for better results
    """
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    
    if not words:
        return ""
    
    # POS tag the available text
    pos_tags = pos_tag(words)
    
    lemmatized = []
    for word, pos in pos_tags:
        # Get primary WordNet POS
        primary_pos = get_wordnet_pos(pos)
        
        # Try lemmatizing with the detected POS
        lemmatized_word = lemmatizer.lemmatize(word, primary_pos)
        
        # If word didn't change and it might be a verb, try verb lemmatization
        if lemmatized_word == word and primary_pos != wordnet.VERB:
            verb_form = lemmatizer.lemmatize(word, wordnet.VERB)
            # Use verb form if it's different (likely was actually a verb)
            if verb_form != word:
                lemmatized_word = verb_form
        
        lemmatized.append(lemmatized_word)
    
    return ' '.join(lemmatized)

## remove the stop words 

In [24]:
from nltk.corpus import stopwords

def remove_stopwords(text):
    """Remove stopwords but keep negation words"""
    stop_words = set(stopwords.words('english'))
    
    # CRITICAL: Keep negation words for sentiment analysis
    negations = {
        'no', 'not', 'nor', 'never', 'neither', 'nobody', 'nothing', 
        'nowhere', 'none', 'hardly', 'scarcely', 'barely'
    }
    # Remove negation words from stopwords list
    stop_words = stop_words - negations
    
    words = text.split()
    filtered = [word for word in words if word not in stop_words]
    return ' '.join(filtered)

# Test it on a sample
sample_text = "this is not a good product and i do not recommend it"
print("Original:", sample_text)
print("After stopword removal:", remove_stopwords(sample_text))
# Should keep "not" in the output

Original: this is not a good product and i do not recommend it
After stopword removal: not good product not recommend


In [25]:
samples_before_preprocessing = df_balanced['review_body'].head(3).copy()
print("Samples before preprocessing:")
print(samples_before_preprocessing)
avg_length_before_preprocessing = df_balanced['review_body'].str.len().mean()
print(f"Average length before preprocessing:{ avg_length_before_preprocessing: .4f}" )

Samples before preprocessing:
0    i purchased these tabs on a whim to put up som...
1       returned it too much garbage involved in setup
2    my upholstered living room chairs are not part...
Name: review_body, dtype: str
Average length before preprocessing: 324.0487


In [26]:
from nltk.corpus import stopwords
# Save before preprocessing
samples_before_stopwords_removal = df_balanced['review_body'].head(3).copy()
print("Samples before removing stop words:")
print(samples_before_stopwords_removal)
avg_length_before_stopwords_removal = df_balanced['review_body'].str.len().mean()
print("Average length before removing stop words:", avg_length_before_stopwords_removal)
# Now remove stop words
# Apply stopword removal (keeping negations)
df_balanced['review_body'] = df_balanced['review_body'].apply(remove_stopwords)

# After all preprocessing
samples_after_stopwords_removal = df_balanced['review_body'].head(3).copy()
print("Samples after removing stop words:")
print(samples_after_stopwords_removal)
avg_length_after_stopwords_removal = df_balanced['review_body'].str.len().mean()
print("Average length after removing stop words:", avg_length_after_stopwords_removal)

Samples before removing stop words:
0    i purchased these tabs on a whim to put up som...
1       returned it too much garbage involved in setup
2    my upholstered living room chairs are not part...
Name: review_body, dtype: str
Average length before removing stop words: 324.048708
Samples after removing stop words:
0    purchased tabs whim put movie posters used fou...
1                 returned much garbage involved setup
2    upholstered living room chairs not particularl...
Name: review_body, dtype: str
Average length after removing stop words: 209.308632


## perform lemmatization  

In [27]:
#save before lemmatization
samples_before_lemmatization = df_balanced['review_body'].head(3).copy()
print("Samples before lemmatization:")
print(samples_before_lemmatization)
avg_length_before_lemmatization = df_balanced['review_body'].str.len().mean()
print("Average length before lemmatization:", avg_length_before_lemmatization)

# Apply lemmatization
df_balanced['review_body'] = df_balanced['review_body'].apply(lemmatize_with_pos)

# After lemmatization
samples_after_lemmatization = df_balanced['review_body'].head(3).copy()
print("Samples after lemmatization:")
print(samples_after_lemmatization)
avg_length_after_lemmatization = df_balanced['review_body'].str.len().mean()
print("Average length after lemmatization:", avg_length_after_lemmatization)

Samples before lemmatization:
0    purchased tabs whim put movie posters used fou...
1                 returned much garbage involved setup
2    upholstered living room chairs not particularl...
Name: review_body, dtype: str
Average length before lemmatization: 209.308632


Samples after lemmatization:
0    purchase tab whim put movie poster use four in...
1                    return much garbage involve setup
2    upholster live room chair not particularly big...
Name: review_body, dtype: str
Average length after lemmatization: 197.9258


In [28]:
samples_after_preprocessing = df_balanced['review_body'].head(3).copy()
print("Samples after preprocessing:")
print(samples_after_preprocessing)
avg_length_after_preprocessing = df_balanced['review_body'].str.len().mean()
print(f"Average length after preprocessing: {avg_length_after_preprocessing: .4f}")

Samples after preprocessing:
0    purchase tab whim put movie poster use four in...
1                    return much garbage involve setup
2    upholster live room chair not particularly big...
Name: review_body, dtype: str
Average length after preprocessing:  197.9258


# Question 2: Word Embeddings

#### (a) loading pretrained "word2vec-google-news-300” Word2Vec model

In [29]:
pretrained_w2v = api.load('word2vec-google-news-300')



In [30]:
print(f"Vocabulary size: {len(pretrained_w2v.key_to_index):,}")
print(f"Vector dimensionality: {pretrained_w2v.vector_size}")

Vocabulary size: 3,000,000
Vector dimensionality: 300


In [32]:
# SEMANTIC SIMILARITY TEST 1: King - Man + Woman
try:
    result = pretrained_w2v.most_similar(
        positive=['king', 'woman'], 
        negative=['man'], 
        topn=5
    )
    print("\nTop 5 results:")
    for word, score in result:
        print(f"  {word:15} similarity: {score:.4f}")
except KeyError as e:
    print(f"Error: {e}")


Top 5 results:
  queen           similarity: 0.7118
  monarch         similarity: 0.6190
  princess        similarity: 0.5902
  crown_prince    similarity: 0.5499
  prince          similarity: 0.5377


In [33]:
# SEMANTIC SIMILARITY TEST 2: excellent ~ outstanding
try:
    similarity = pretrained_w2v.similarity('excellent', 'outstanding')
    print(f"\nSimilarity(excellent, outstanding) = {similarity:.4f}")
    
    # Show most similar words to 'excellent'
    print("\nWords most similar to 'excellent':")
    similar_words = pretrained_w2v.most_similar('excellent', topn=5)
    for word, score in similar_words:
        print(f"  {word:15} similarity: {score:.4f}")
        
except KeyError as e:
    print(f"Error: {e}")


Similarity(excellent, outstanding) = 0.5567

Words most similar to 'excellent':
  terrific        similarity: 0.7410
  superb          similarity: 0.7063
  exceptional     similarity: 0.6815
  fantastic       similarity: 0.6803
  good            similarity: 0.6443


#### (b) Training custom Word2Vec on our dataset

In [34]:
# Prepare tokenized reviews for Word2Vec training
print("\nPreparing tokenized reviews...")
# Use your preprocessed reviews (already cleaned and lemmatized)
tokenized_reviews = [review.split() for review in df_balanced['review_body']]

print(f"Number of reviews: {len(tokenized_reviews):,}")
print(f"Sample tokenized review:")
print(f"  {tokenized_reviews[0][:20]}...")  # Show first 20 tokens


Preparing tokenized reviews...
Number of reviews: 250,000
Sample tokenized review:
  ['purchase', 'tab', 'whim', 'put', 'movie', 'poster', 'use', 'four', 'inch', 'tab', 'poster', 'not', 'job', 'poster', 'stay', 'day', 'one', 'right', 'start', 'fall']...


In [37]:
custom_w2v = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=300,      # embedding size = 300
    window=11,            # window size = 11
    min_count=10,         # minimum word count = 10
    workers= multiprocessing.cpu_count(),            # use all available CPU cores
    seed=RANDOM_STATE,    # for reproducibility
    epochs=10,            # training epochs
    sg=0,                 # CBOW (0) or Skip-gram (1)
    negative=5            # negative sampling
)

print("\nTraining complete!")
print(f"Vocabulary size: {len(custom_w2v.wv.key_to_index):,}")
print(f"Vector dimensionality: {custom_w2v.wv.vector_size}")

# Save the model
custom_w2v.save('custom_word2vec.model')
print("\nModel saved to 'custom_word2vec.model'")


Training complete!
Vocabulary size: 13,116
Vector dimensionality: 300

Model saved to 'custom_word2vec.model'


In [39]:
# Compare vocabulary sizes
print("\n1. VOCABULARY SIZE:")
print(f"   Pretrained (Google News): {len(pretrained_w2v.key_to_index):,} words")
print(f"   Custom (Office Reviews):  {len(custom_w2v.wv.key_to_index):,} words")
print(f"   Ratio: {len(pretrained_w2v.key_to_index) / len(custom_w2v.wv.key_to_index):.1f}x larger")


1. VOCABULARY SIZE:
   Pretrained (Google News): 3,000,000 words
   Custom (Office Reviews):  13,116 words
   Ratio: 228.7x larger


In [40]:
# Compare training corpus
print("\n2. TRAINING DATA:")
print("   Pretrained: ~100 billion words from Google News")
print("   Custom: 250,000 Amazon office product reviews")


2. TRAINING DATA:
   Pretrained: ~100 billion words from Google News
   Custom: 250,000 Amazon office product reviews


In [41]:


# Test domain-specific words
print("\n3. DOMAIN-SPECIFIC VOCABULARY:")
test_words = ['product', 'quality', 'price', 'shipping', 'recommend', 
              'excellent', 'terrible', 'refund', 'packaging', 'defective']

print(f"\n   {'Word':<15} {'Pretrained':<15} {'Custom':<15}")
print("   " + "-" * 45)

for word in test_words:
    pretrained_exists = word in pretrained_w2v
    custom_exists = word in custom_w2v.wv
    print(f"   {word:<15} {'✓' if pretrained_exists else '✗':<15} {'✓' if custom_exists else '✗':<15}")


3. DOMAIN-SPECIFIC VOCABULARY:

   Word            Pretrained      Custom         
   ---------------------------------------------
   product         ✓               ✓              
   quality         ✓               ✓              
   price           ✓               ✓              
   shipping        ✓               ✗              
   recommend       ✓               ✓              
   excellent       ✓               ✓              
   terrible        ✓               ✓              
   refund          ✓               ✓              
   packaging       ✓               ✗              
   defective       ✓               ✓              


In [42]:

# Compare semantic similarities
print("\n4. SEMANTIC SIMILARITY COMPARISON:")
print("   Testing word pairs from our domain:")

word_pairs = [
    ('good', 'excellent'),
    ('bad', 'terrible'),
    ('buy', 'purchase'),
    ('product', 'item'),
]

print(f"\n   {'Word Pair':<25} {'Pretrained':<15} {'Custom':<15}")
print("   " + "-" * 55)

for word1, word2 in word_pairs:
    try:
        sim_pre = pretrained_w2v.similarity(word1, word2)
    except KeyError:
        sim_pre = None
    
    try:
        sim_cust = custom_w2v.wv.similarity(word1, word2)
    except KeyError:
        sim_cust = None
    
    pre_str = f"{sim_pre:.4f}" if sim_pre else "N/A"
    cust_str = f"{sim_cust:.4f}" if sim_cust else "N/A"
    
    print(f"   {word1} - {word2:<20} {pre_str:<15} {cust_str:<15}")


4. SEMANTIC SIMILARITY COMPARISON:
   Testing word pairs from our domain:

   Word Pair                 Pretrained      Custom         
   -------------------------------------------------------
   good - excellent            0.6443          0.5739         
   bad - terrible             0.6829          0.4758         
   buy - purchase             0.7640          0.7492         
   product - item                 0.2570          0.4883         


In [43]:
# Test Out-of-Vocabulary (OOV) rate
print("\n5. OUT-OF-VOCABULARY (OOV) ANALYSIS:")
print("   Analyzing how many words from our reviews are missing from each model...")

# Sample 1000 reviews
sample_reviews = df_balanced['review_body'].sample(1000, random_state=RANDOM_STATE)

oov_pretrained = 0
oov_custom = 0
total_words = 0

for review in sample_reviews:
    words = review.split()
    for word in words:
        total_words += 1
        if word not in pretrained_w2v:
            oov_pretrained += 1
        if word not in custom_w2v.wv:
            oov_custom += 1

print(f"\n   Total words analyzed: {total_words:,}")
print(f"   Pretrained OOV rate: {oov_pretrained/total_words*100:.2f}%")
print(f"   Custom OOV rate: {oov_custom/total_words*100:.2f}%")


5. OUT-OF-VOCABULARY (OOV) ANALYSIS:
   Analyzing how many words from our reviews are missing from each model...

   Total words analyzed: 34,605
   Pretrained OOV rate: 3.13%
   Custom OOV rate: 2.95%
