In [1]:
import gensim
from gensim.models import Word2Vec, KeyedVectors

from tqdm import tqdm

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\PALAK
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\PALAK
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\PALAK
[nltk_data]     GUPTA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the review dataset from a CSV file located relative to the working directory.
import numpy as np
import pandas as pd
data=pd.read_csv('all_kindle_review.csv')
# Preview the first rows. Only 'reviewText' and 'rating' are used by the later cells.
# NOTE(review): the 'Unnamed: 0*' columns in the preview look like index columns left
# over from earlier CSV round-trips -- confirm before relying on them.
data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [None]:
# Check whether the data is imbalanced by counting the reviews per star rating.
data['rating'].value_counts()    # roughly balanced across the star ratings

rating
5    3000
4    3000
3    2000
2    2000
1    2000
Name: count, dtype: int64

In [4]:

lemmatizer = WordNetLemmatizer()

# Build the stop-word lookup once. stopwords.words() returns a list, so calling it
# for every token repeats the load and turns each membership test into a linear scan.
STOP_WORDS = frozenset(stopwords.words('english'))

def preprocess_text(text):
    """Turn one raw review into a list of cleaned, lemmatized tokens.

    Steps: replace every non-letter character with a space, lower-case,
    tokenize with NLTK, drop English stop words, lemmatize what is left.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example a NaN coming from a
        missing CSV field) is treated as an empty review.

    Returns
    -------
    list of str
        The cleaned tokens; may be empty.
    """
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; an empty token list lets the
        # rest of the pipeline decide how to handle a missing review.
        return []
    text = re.sub(r'[^a-zA-Z]', ' ', text).lower()  # Remove special characters
    tokens = nltk.word_tokenize(text)
    return [lemmatizer.lemmatize(word) for word in tokens if word not in STOP_WORDS]

In [5]:
# Apply preprocessing: run preprocess_text on every review.
# The result goes into a new 'cleaned_text' column, so the raw 'reviewText' is left untouched.
data['cleaned_text'] = data['reviewText'].apply(preprocess_text)

In [6]:
# Train Word2Vec embeddings on the tokenized reviews.
# NOTE(review): the embeddings are fitted on every review, before the train/test
# split is made, so test-set text influences the features -- confirm this is acceptable.
w2v_model = Word2Vec(
    sentences=data['cleaned_text'],
    vector_size=200,  # Embedding dimensionality
    window=7,         # Context words considered on each side
    min_count=3,      # Ignore words seen fewer than 3 times
    workers=4,
    epochs=20,        # Training passes over the corpus
    # Fixed seed so the embeddings (and every metric derived from them) do not
    # change arbitrarily between runs. With workers > 1 thread scheduling can still
    # cause small run-to-run differences; use workers=1 (and a fixed PYTHONHASHSEED)
    # when exact reproducibility is required.
    seed=42,
)

In [7]:
# Generate Word Vectors for each review
def get_word2vec_vectors(text, model=None):
    """Average the Word2Vec vectors of the in-vocabulary tokens of one review.

    Parameters
    ----------
    text : iterable of str
        Tokens of a single review.
    model : optional
        Object exposing a ``wv`` attribute that supports ``word in wv``,
        ``wv[word]`` and ``wv.vector_size``. Defaults to the notebook-level
        ``w2v_model``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``wv.vector_size``. All zeros when none of the
        tokens is in the vocabulary (or the review is empty).
    """
    keyed_vectors = (w2v_model if model is None else model).wv
    word_vectors = [keyed_vectors[word] for word in text if word in keyed_vectors]
    if not word_vectors:
        # np.mean([]) yields a scalar NaN (and an identity test against np.nan never
        # catches it), which would break the later np.vstack. Fall back to a zero
        # vector whose length matches the trained embeddings.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(word_vectors, axis=0)

In [8]:
# Compute one feature vector per review from its token list and keep it next to the review.
data['word2vec_features'] = data['cleaned_text'].apply(get_word2vec_vectors)

In [9]:
# Prepare data for training
# Stack the per-review vectors row by row into a single feature matrix.
X = np.vstack(data['word2vec_features'].values)
# Binarize the star rating: 4 or more becomes 1, everything else becomes 0.
data['sentiment'] = np.where(data['rating'] >= 4, 1, 0)  # Positive = 1, Negative = 0
y = data['sentiment']

In [10]:
# Train-test split: hold out 20% of the reviews for evaluation.
# random_state is fixed so the same rows end up in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
from xgboost import XGBClassifier

# Baseline XGBoost classifier with hand-picked hyperparameters, fitted on the training split only.
model = XGBClassifier(n_estimators=200, max_depth=7, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

In [12]:
# Evaluate the baseline model on the held-out test split.
y_pred = model.predict(X_test)
# Overall accuracy as a percentage, followed by per-class precision/recall/F1.
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print(classification_report(y_test, y_pred))

Accuracy: 80.79%
              precision    recall  f1-score   support

           0       0.82      0.79      0.80      1190
           1       0.80      0.83      0.81      1210

    accuracy                           0.81      2400
   macro avg       0.81      0.81      0.81      2400
weighted avg       0.81      0.81      0.81      2400



In [14]:

# Hyperparameter tuning for the XGBoost classifier
from sklearn.model_selection import RandomizedSearchCV
# Define the XGBoost classifier (unfitted; used as the template estimator for the search)
xgb_model = XGBClassifier(random_state=42)

# Define hyperparameters to tune: each key maps to the candidate values the search samples from.
param_dist = {
    'n_estimators': [100, 200, 300],      # Number of trees
    'max_depth': [3, 5, 7],               # Tree depth
    'learning_rate': np.linspace(0.01, 0.3, 10),  # Step size shrinkage
    'subsample': [0.7, 0.8, 1.0],         # Fraction of samples used for fitting
    'colsample_bytree': [0.7, 0.8, 1.0],  # Fraction of features used for fitting
}

In [15]:
# Configure RandomizedSearchCV for hyperparameter tuning.
# This cell only builds the search object; nothing is fitted until .fit() is called.
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    scoring='accuracy',
    cv=5,  # 5-fold cross-validation
    n_iter=20,  # Number of random combinations to try
    random_state=42,
    n_jobs=-1  # Use all CPU cores
)

In [16]:
# Run the search on the training split only, so the test split stays unseen until the final evaluation.
random_search.fit(X_train, y_train)

In [18]:
# Best parameters and model: keep the estimator the search selected and report its parameters.
tuned_model = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

Best Parameters: {'subsample': 1.0, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.042222222222222223, 'colsample_bytree': 0.8}


In [19]:
# Evaluate the tuned model on the same held-out test split as the baseline.
# NOTE: this rebinds y_pred, replacing the baseline model's predictions from the earlier evaluation cell.
y_pred = tuned_model.predict(X_test)
# Accuracy is printed as a raw fraction here (the baseline cell prints a percentage).
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8045833333333333
              precision    recall  f1-score   support

           0       0.81      0.79      0.80      1190
           1       0.80      0.82      0.81      1210

    accuracy                           0.80      2400
   macro avg       0.80      0.80      0.80      2400
weighted avg       0.80      0.80      0.80      2400

