In [64]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.cluster import KMeans,DBSCAN,AgglomerativeClustering
from sklearn.metrics import silhouette_score,accuracy_score,precision_score,recall_score,f1_score,confusion_matrix

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Amazon_reviews - Sheet1.csv')


In [5]:
train_df = df.sample(frac=0.8, random_state=42)
test_df = df.drop(train_df.index)

In [6]:
train_df.shape

(4824, 3)

In [7]:
test_df.shape

(1206, 3)

In [9]:
train_df.columns = ['score', 'summary', 'text']
test_df.columns = ['score', 'summary', 'text']

In [12]:
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

In [14]:
display(train_df.head())
display(test_df.head())

Unnamed: 0,score,summary,text
0,2,may be a bit overrated,"I actually enjoyed this movie quite a bit, but..."
1,2,Hookah,The hookah is great. The delivery was fast and...
2,2,What! What! No Limits on the rise again!,"Easily one of the dopest cds of the year 2000,..."
3,2,Keep one in your trunk,I purchased this product to use primarily in c...
4,2,This guy was evil,"This book shows you that Bob was a womanizer, ..."


Unnamed: 0,score,summary,text
0,2,Great for the non-audiophile,Reviewed quite a bit of the combo players and ...
1,1,DVD Player crapped out after one year,I also began having the incorrect disc problem...
2,1,Incorrect Disc,"I love the style of this, but after a couple y..."
3,2,Great book for travelling Europe,"I currently live in Europe, and this is the bo..."
4,2,Review of Kelly Club for Toddlers,"For the price of 7.99, this PC game is WELL wo..."


In [16]:
train_df.dropna(subset=['summary'], inplace=True)
train_df.reset_index(drop=True, inplace=True)
print('New shape of train_df:', train_df.shape)
print('Null values after dropping:', train_df.isnull().sum())

New shape of train_df: (4822, 3)
Null values after dropping: score      0
summary    0
text       0
dtype: int64


In [18]:
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [19]:
train_df['clean_text'] = train_df['text'].apply(clean_text)
test_df['clean_text'] = test_df['text'].apply(clean_text)

In [22]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    return " ".join([word for word in text.split() if word not in stop_words])
train_df['clean_text'] = train_df['clean_text'].apply(remove_stopwords)
test_df['clean_text'] = test_df['clean_text'].apply(remove_stopwords)




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [25]:
neg_text = " ".join(train_df[train_df['score']==1]['clean_text'])
pos_text = " ".join(train_df[train_df['score']==2]['clean_text'])

In [26]:
from collections import Counter

neg_counter = Counter(neg_text.split())
pos_counter = Counter(pos_text.split())

In [35]:
neg_top10 = pd.DataFrame(neg_counter.most_common(10), columns=['word', 'count'])
pos_top10 = pd.DataFrame(pos_counter.most_common(10), columns=['word', 'count'])

In [37]:
neg_top10

Unnamed: 0,word,count
0,book,1460
1,one,820
2,movie,743
3,like,719
4,would,649
5,dont,503
6,read,498
7,good,496
8,get,458
9,time,436


In [38]:
pos_top10

Unnamed: 0,word,count
0,book,1378
1,one,900
2,great,791
3,good,660
4,read,660
5,like,651
6,movie,606
7,love,451
8,well,423
9,would,423


In [40]:
s1 = set(neg_top10['word'])
s2 = set(pos_top10['word'])


In [42]:
custom_stop_words1 = s1.union(s2)
print("Custom Stop Words (Union of s1 and s2):")
print(custom_stop_words)

Custom Stop Words (Union of s1 and s2):
{'like', 'love', 'book', 'would', 'get', 'one', 'movie', 'time', 'great', 'read', 'good', 'well', 'dont'}


In [47]:
custom_stop_words = list(custom_stop_words)
display(custom_stop_words)

['like',
 'love',
 'book',
 'would',
 'get',
 'one',
 'movie',
 'time',
 'great',
 'read',
 'good',
 'well',
 'dont']

In [49]:
def remove_custom_stopwords(text):
    return " ".join([word for word in text.split() if word not in custom_stop_words])

In [51]:
train_df['clean_text_final'] = train_df['clean_text'].apply(remove_custom_stopwords)
test_df['clean_text_final'] = test_df['clean_text'].apply(remove_custom_stopwords)

In [52]:
train_df.head()

Unnamed: 0,score,summary,text,clean_text,clean_text_final
0,2,may be a bit overrated,"I actually enjoyed this movie quite a bit, but...",actually enjoyed movie quite bit may touch ove...,actually enjoyed quite bit may touch overrated...
1,2,Hookah,The hookah is great. The delivery was fast and...,hookah great delivery fast company helpful cal...,hookah delivery fast company helpful called ex...
2,2,What! What! No Limits on the rise again!,"Easily one of the dopest cds of the year 2000,...",easily one dopest cds year itll probably get s...,easily dopest cds year itll probably slepped h...
3,2,Keep one in your trunk,I purchased this product to use primarily in c...,purchased product use primarily case battery d...,purchased product use primarily case battery d...
4,2,This guy was evil,"This book shows you that Bob was a womanizer, ...",book shows bob womanizer bob way worse book sa...,shows bob womanizer bob way worse says check a...


In [53]:
neg_text_final = " ".join(train_df[train_df['score']==1]['clean_text_final'])
pos_text_final = " ".join(train_df[train_df['score']==2]['clean_text_final'])

In [54]:
neg_counter_final = Counter(neg_text_final.split())
pos_counter_final = Counter(pos_text_final.split())

In [62]:
tfidf = TfidfVectorizer(max_features=1200)
X_train_sub = tfidf.fit_transform(train_df['clean_text_final'])
X_test_sub = tfidf.transform(test_df['clean_text_final'])


In [60]:
y_train_sub = train_df['score'].map({1:0, 2:1}).values
y_test_sub = test_df['score'].map({1:0, 2:1}).values


In [63]:
print("X_train_sub shape:", X_train_sub.shape)
print("X_test_sub shape:", X_test_sub.shape)

X_train_sub shape: (4822, 1200)
X_test_sub shape: (1206, 1200)


In [65]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    start_time = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_time

    start_time = time.time()
    y_pred = model.predict(X_test)
    pred_time = time.time() - start_time

    metrics = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1-Score": f1_score(y_test, y_pred),
        "Train Time (s)": train_time,
        "Prediction Time (s)": pred_time
    }
    return y_pred, metrics

In [67]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=200),
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
}

In [68]:
results = {}

In [69]:
for name, model in models.items():
    print(f"Training {name} ...")
    y_pred, metrics = evaluate_model(model, X_train_sub, y_train_sub, X_test_sub, y_test_sub)
    results[name] = metrics

Training Logistic Regression ...
Training Naive Bayes ...
Training SVM ...
Training Random Forest ...
Training XGBoost ...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [70]:
results_df = pd.DataFrame(results).T  # transpose for readability
results_df = results_df.sort_values(by="Accuracy", ascending=False)

print("\nAll Model Metrics & Time Comparison on Subset:")
print(results_df)


All Model Metrics & Time Comparison on Subset:
                     Accuracy  Precision    Recall  F1-Score  Train Time (s)  \
SVM                  0.785240   0.766467  0.832520  0.798129        6.509380   
Logistic Regression  0.776949   0.758982  0.824390  0.790335        0.038746   
Random Forest        0.774461   0.765891  0.803252  0.784127        4.516728   
XGBoost              0.768657   0.744186  0.832520  0.785879        8.145959   
Naive Bayes          0.763682   0.747748  0.809756  0.777518        0.004596   

                     Prediction Time (s)  
SVM                             1.127758  
Logistic Regression             0.000712  
Random Forest                   0.052115  
XGBoost                         0.007644  
Naive Bayes                     0.000820  
