In [1]:
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score



In [2]:
# Load the review dataset; later cells read its 'clean_review' and 'sentiment' columns.
df = pd.read_csv(filepath_or_buffer='IMDB_reviews_cleaned.csv')

In [3]:
# Work on a fixed 1000-row random subset; the seed keeps the draw repeatable.
sample_df = df.sample(n=1000, random_state=42)

In [9]:
# Class balance of the full dataset (not just the 1000-row sample).
df.loc[:, 'sentiment'].value_counts()

0    24882
1    24698
Name: sentiment, dtype: int64

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the sampled reviews for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sample_df['clean_review'],
    sample_df['sentiment'],
    test_size=0.3,
    random_state=42,
)


## Embedding Techniques

#### TF-IDF Embedding with SVM model

In [26]:

# Learn the TF-IDF vocabulary (capped at 1500 features) on the training text only,
# then project the held-out text with the same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer(max_features=1500)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# SVC with default hyperparameters; fit() hands back the fitted estimator.
svm_tfidf = SVC().fit(X_train_tfidf, y_train)
y_pred_tfidf = svm_tfidf.predict(X_test_tfidf)

# Report per-class metrics and overall accuracy on the held-out split.
print("SVM with TF-IDF Embedding:")
print(classification_report(y_true=y_test, y_pred=y_pred_tfidf))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_tfidf))


SVM with TF-IDF Embedding:
              precision    recall  f1-score   support

           0       0.75      0.85      0.80       151
           1       0.82      0.72      0.77       149

    accuracy                           0.78       300
   macro avg       0.79      0.78      0.78       300
weighted avg       0.79      0.78      0.78       300

Accuracy: 0.7833333333333333


#### Word2Vec Embedding with SVM model

In [34]:
from gensim.models import Word2Vec
import numpy as np

# Whitespace-tokenize the training reviews; Word2Vec expects a list of token lists.
tokenized_reviews = [review.split() for review in X_train]

# Train the embedding on the training split only, so no test text is seen.
# seed=42 matches the random_state used elsewhere in this notebook, and
# workers=1 removes thread-scheduling nondeterminism so reruns give the same
# vectors. NOTE(review): gensim documents that reproducibility across
# interpreter launches additionally needs PYTHONHASHSEED to be set.
word2vec = Word2Vec(
    sentences=tokenized_reviews,
    vector_size=500,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)


def document_embedding(doc, model):
    """Return the mean word vector of a tokenized document.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.
    model : object exposing ``wv``
        Trained embedding model; ``model.wv`` must support ``in`` lookup,
        indexing by token, and a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``.
        If no token is found, a zero vector of length ``model.wv.vector_size``.
    """
    vectors = [model.wv[word] for word in doc if word in model.wv]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the model itself instead of a hard-coded 500, so
    # every row has the same length whatever vector_size the model was trained with.
    return np.zeros(model.wv.vector_size)


# Represent every review by the mean of its word vectors, using the same
# trained model for the training and the held-out split.
X_train_w2v = np.array([
    document_embedding(review.split(), word2vec)
    for review in X_train
])
X_test_w2v = np.array([
    document_embedding(review.split(), word2vec)
    for review in X_test
])

# SVC with default hyperparameters; fit() hands back the fitted estimator.
svm_w2v = SVC().fit(X_train_w2v, y_train)
y_pred_w2v = svm_w2v.predict(X_test_w2v)

# Report per-class metrics and overall accuracy on the held-out split.
print("SVM with Word2Vec Embedding:")
print(classification_report(y_true=y_test, y_pred=y_pred_w2v))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_w2v))



SVM with Word2Vec Embedding:
              precision    recall  f1-score   support

           0       0.62      0.13      0.22       151
           1       0.51      0.92      0.66       149

    accuracy                           0.52       300
   macro avg       0.57      0.53      0.44       300
weighted avg       0.57      0.52      0.44       300

Accuracy: 0.5233333333333333


#### GloVe Embedding with SVM model

In [35]:

# Parse the GloVe text file into a {token: float32 vector} lookup table.
# Each line is read as: first whitespace-separated field = token, the rest = coefficients.
glove_file = "glove.6B.100d.txt"
glove_embeddings = {}
with open(glove_file, mode='r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        glove_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

def document_embedding_glove(doc, embeddings):
    """Return the mean pre-trained vector of a tokenized document.

    Parameters
    ----------
    doc : iterable of str
        Tokens of one document.
    embeddings : dict
        Mapping from token to its embedding vector (all of equal length).

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens present in
        ``embeddings``. If none is present, a zero vector whose length matches
        the vectors in ``embeddings`` (100 when ``embeddings`` is empty).
    """
    vectors = [embeddings[word] for word in doc if word in embeddings]
    if vectors:
        return np.mean(vectors, axis=0)
    # Take the fallback size from the table itself rather than assuming the
    # 100-d file, so any GloVe dimensionality yields rows of equal length.
    dim = len(next(iter(embeddings.values()))) if embeddings else 100
    return np.zeros(dim)

# Represent every review by the mean of its GloVe vectors, using the same
# lookup table for the training and the held-out split.
X_train_glove = np.array([
    document_embedding_glove(review.split(), glove_embeddings)
    for review in X_train
])
X_test_glove = np.array([
    document_embedding_glove(review.split(), glove_embeddings)
    for review in X_test
])

# SVC with default hyperparameters; fit() hands back the fitted estimator.
svm_glove = SVC().fit(X_train_glove, y_train)
y_pred_glove = svm_glove.predict(X_test_glove)

# Report per-class metrics and overall accuracy on the held-out split.
print("SVM with GloVe Embedding:")
print(classification_report(y_true=y_test, y_pred=y_pred_glove))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_glove))


SVM with GloVe Embedding:
              precision    recall  f1-score   support

           0       0.73      0.75      0.74       151
           1       0.74      0.71      0.73       149

    accuracy                           0.73       300
   macro avg       0.73      0.73      0.73       300
weighted avg       0.73      0.73      0.73       300

Accuracy: 0.7333333333333333


#### Conclusion:
From the results above, the TF-IDF representation performs best on this sample (accuracy 0.78, versus 0.73 for GloVe and 0.52 for Word2Vec), so we will go ahead and train our models using TF-IDF features.