In [2]:
import nltk 
import random
import numpy as np
from nltk.tokenize import word_tokenize
import string
import re
from nltk.corpus import stopwords

In [3]:
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/omkarjadhav/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Load GloVe word embeddings
def load_glove_embeddings(file_path):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector
    return embeddings_index

# Replace with the correct path on your machine
glove_path = "./glove.6B.100d.txt"
glove_embeddings = load_glove_embeddings(glove_path)

print(f"Loaded {len(glove_embeddings)} word vectors.")


Loaded 400000 word vectors.


In [5]:
def get_review_vector(review, glove_embeddings, embedding_dim=100):
    review = review.lower()
    review = re.sub(f"[{re.escape(string.punctuation)}]", "", review)

    words = word_tokenize(review)

    stop_words = set(stopwords.words("english"))
    words = [w for w in words if not w in stop_words]
    
    word_vectors = [glove_embeddings[word] for word in words if word in glove_embeddings]
    # print("\n\n word_vectors: ", word_vectors)

    if not word_vectors:
        return np.zeros(embedding_dim)
    # print(len(np.mean(word_vectors, axis=0)))
    # print("mean: ", np.mean(word_vectors, axis=0))
    return np.mean(word_vectors, axis=0)


### Model Training

In [6]:
from nltk.corpus import movie_reviews
import random
import pandas as pd

# Prepare dataset
documents = [(movie_reviews.raw(fileid), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

# Convert to DataFrame
df = pd.DataFrame(documents, columns=['text', 'label'])
df["vector"] = df["text"].apply(lambda x: get_review_vector(x, glove_embeddings))

In [7]:
df.head()

Unnamed: 0,text,label,vector
0,"armageddon , in itself , symbolizes everything...",pos,"[-0.019514998, 0.2530898, 0.3066087, -0.257927..."
1,anastasia contains something that has been lac...,pos,"[0.02257081, 0.22111125, 0.30900228, -0.128211..."
2,"seen at : amc old pasadena 8 , pasadena , ca (...",neg,"[-0.029053003, 0.13240016, 0.29211634, -0.1305..."
3,capsule : five friends at a stag party are inv...,neg,"[0.030033395, 0.12874076, 0.26184192, -0.22171..."
4,the marvelous british actor derek jacobi stars...,neg,"[-0.0025800485, 0.19706295, 0.2026982, -0.1328..."


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

x = np.stack(df["vector"].values)
y = df["label"].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

# Initiate the model
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# # Predict and Evaluate
y_pred = model.predict(x_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

Accuracy:  0.722
Classification report: 
               precision    recall  f1-score   support

         neg       0.71      0.74      0.73       501
         pos       0.73      0.70      0.72       499

    accuracy                           0.72      1000
   macro avg       0.72      0.72      0.72      1000
weighted avg       0.72      0.72      0.72      1000



### TF-IDF Weighted Averaging of Embeddings
* It gives more importance to rare but meaningful words.

In [9]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

# 1. Fit TF-IDF Vectorizer
tfidf = TfidfVectorizer()
tfidf.fit(df["text"])

# 2. Get IDF scores as a dictionary
idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))
# print("idf_dict: ", idf_dict)

# 3. Function to get weighted average of GloVe embeddings
def get_weighted_vector(text, glove_embeddings, idf_dict, embedding_dim=100):
    words = word_tokenize(text.lower())
    word_vectors = []

    for word in words:
        if word in glove_embeddings and word in idf_dict:
            weight = idf_dict[word]
            # print(":::::: ", glove_embeddings[word] * weight)
            word_vectors.append(glove_embeddings[word] * weight)

    if not word_vectors:
        return np.zeros(embedding_dim)
    return np.mean(word_vectors, axis=0)

# 4. Apply to all reviews
df["vector"] = df["text"].apply(lambda x: get_weighted_vector(x, glove_embeddings, idf_dict))


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

x = np.stack(df["vector"].values)
y = df["label"].values

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=42)

# Initiate the model
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

# # Predict and Evaluate
y_pred = model.predict(x_test)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification report: \n", classification_report(y_test, y_pred))

Accuracy:  0.739
Classification report: 
               precision    recall  f1-score   support

         neg       0.73      0.76      0.75       501
         pos       0.75      0.72      0.73       499

    accuracy                           0.74      1000
   macro avg       0.74      0.74      0.74      1000
weighted avg       0.74      0.74      0.74      1000

