In [1]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

from tqdm import tqdm

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
# Fetch the NLTK data packages this notebook relies on: the stopword lists read
# via stopwords.words(...) and the 'punkt' package (presumably the tokenizer data
# behind nltk.word_tokenize -- confirm for the installed NLTK version).
# The return value of the last call is what the cell displays as its output.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\PALAK
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\PALAK
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the dataset
import numpy as np
import pandas as pd
# Read the review CSV (path relative to the notebook's working directory)
# and preview its first rows as the cell output.
data = pd.read_csv('all_kindle_review.csv')
data.head(n=5)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [4]:
# Preprocessing function
_ENGLISH_STOPWORDS = None  # lazily-built frozenset, filled by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built once and reused."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """Turn one raw review into a list of lowercase, stopword-free tokens.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. NaN for a missing review)
        is treated as an empty review.

    Returns
    -------
    list of str
        Tokens that remain after replacing every non-letter character with a
        space, lowercasing, tokenizing with ``nltk.word_tokenize`` and dropping
        English stopwords. Empty list for non-string input.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty token list is a
        # valid (empty) sentence for the downstream steps.
        return []
    text = re.sub(r'[^a-zA-Z]', ' ', text)  # Remove special characters
    tokens = nltk.word_tokenize(text.lower())  # Tokenization and lowercase
    # Look the stopwords up once instead of calling stopwords.words() for every
    # token; a frozenset also gives constant-time membership tests.
    stop_words = _english_stopwords()
    return [word for word in tokens if word not in stop_words]  # Remove stopwords

In [5]:
# Tokenize every review; each cell of 'cleaned_text' holds that review's token list
raw_reviews = data['reviewText']
data['cleaned_text'] = raw_reviews.apply(preprocess_text)

In [6]:
# Train Word2Vec embeddings on the tokenized reviews
w2v_model = Word2Vec(
    sentences=data['cleaned_text'],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

In [7]:
# Generate Word Vectors for each review
def get_word2vec_vectors(text, model=None):
    """Average the Word2Vec vectors of a review's in-vocabulary tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.
    model : optional
        Object exposing ``.wv``, which must support ``in``, indexing by token
        and a ``vector_size`` attribute. Defaults to the notebook-level
        ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        1-D mean vector of length ``model.wv.vector_size``. When no token is in
        the vocabulary (including an empty review) a zero vector of that length
        is returned, so every review yields a row of the same shape.
    """
    if model is None:
        model = w2v_model
    keyed_vectors = model.wv
    known_vectors = [keyed_vectors[word] for word in text if word in keyed_vectors]
    if not known_vectors:
        # np.mean over an empty list yields a NaN scalar (plus a RuntimeWarning),
        # and an identity test against np.nan never matches it, so the empty
        # case has to be detected explicitly.
        # float32 is assumed to match the dtype of gensim's vectors; a mismatch
        # would only cause an upcast when the rows are stacked.
        return np.zeros(keyed_vectors.vector_size, dtype=np.float32)
    return np.mean(known_vectors, axis=0)

In [8]:
# One averaged embedding per review, computed from its token list
token_lists = data['cleaned_text']
data['word2vec_features'] = token_lists.apply(get_word2vec_vectors)

In [10]:
# Assemble the feature matrix and the binary target
feature_rows = data['word2vec_features'].to_numpy()
X = np.vstack(feature_rows)  # one row per review

is_positive = data['rating'] >= 4
data['sentiment'] = np.where(is_positive, 1, 0)  # Positive = 1, Negative = 0
y = data['sentiment']

In [11]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [12]:
# Fit a seeded 100-tree random forest on the training split
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)
model.fit(X_train, y_train)

In [13]:
# Score the held-out split: overall accuracy first, then the per-class report
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(report)

Accuracy: 75.21%
              precision    recall  f1-score   support

           0       0.74      0.76      0.75      1190
           1       0.76      0.74      0.75      1210

    accuracy                           0.75      2400
   macro avg       0.75      0.75      0.75      2400
weighted avg       0.75      0.75      0.75      2400

