In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("bharadwaj6/kindle-reviews")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\kuchh\.cache\kagglehub\datasets\bharadwaj6\kindle-reviews\versions\3


In [3]:
import pandas as pd
import numpy as np
import os
import  warnings
warnings.filterwarnings('ignore')

In [4]:
print(os.listdir(path))

['kindle_reviews.csv', 'kindle_reviews.json']


In [5]:
df = pd.read_csv(os.path.join(path, 'kindle_reviews.csv'))

In [6]:
df

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200
...,...,...,...,...,...,...,...,...,...,...
982614,982614,B00M13FNSS,"[2, 2]",5,Yasss hunny! This is a great read. That Dre is...,"07 23, 2014",A2Y66HD4J5S7QZ,Candi,A Hot Read Indeed!!,1406073600
982615,982615,B00M13FNSS,"[0, 0]",5,I ENJOYED THIS BOOK FROM BEGINNING TO END NOW ...,"07 23, 2014",A17YHECC8H9NEY,Margie,VERY GOOD BOOK,1406073600
982616,982616,B00M13FNSS,"[1, 1]",5,Great book! Cherika was a fool. She let that m...,"07 23, 2014",A20KO0BPMNREJL,Nicki,Great Read,1406073600
982617,982617,B00M13FNSS,"[0, 0]",5,When I say this was an excellent book please b...,"07 23, 2014",A1BQO66R6OLCCW,Nikey,Wow!!,1406073600


In [7]:
df = df[['reviewText', 'overall']]

In [8]:
df['overall'].value_counts()

overall
5    575264
4    254013
3     96194
2     34130
1     23018
Name: count, dtype: int64

In [9]:
df['overall'] = df['overall'].apply(lambda x: 0 if x<=3 else 1)

In [10]:
df['overall'].value_counts()

overall
1    829277
0    153342
Name: count, dtype: int64

In [11]:
from imblearn.under_sampling import RandomUnderSampler

x = df['reviewText']
y = df['overall']

x = pd.DataFrame(x)


# Apply downsampling
rus = RandomUnderSampler(random_state=42)
x, y = rus.fit_resample(x, y)


In [12]:
df1 = x.copy()
df1['rating'] = y

In [13]:
df = df1

In [14]:
df.isna().sum()
df.dropna(inplace=True)

In [15]:
df['rating'].value_counts()

rating
0    153341
1    153338
Name: count, dtype: int64

In [16]:
import re
from bs4 import BeautifulSoup

# remove html tags
df['reviewText'] = df['reviewText'].apply(lambda x: BeautifulSoup(str(x), "html.parser").get_text())

# remove URLs
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'http\S+|www\S+|https\S+', '', x))

# remove emails
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'\S+@\S+', '', x))

# remove special characters
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'[^a-zA-Z0-9 ]+', '', x))

# make lowercase
df['reviewText'] = df['reviewText'].apply(lambda x: x.lower())

# remove extra whitespace
df['reviewText'] = df['reviewText'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())
df

Unnamed: 0,reviewText,rating
12,well written interesting to see sideous throug...,0
13,troy dennings novella recovery was originally ...,0
17,another well written ebook by troy denning but...,0
21,with ylesia a novella originally published in ...,0
25,the events of ylesia take place during destiny...,0
...,...,...
789840,omg this book was great it was the first book ...,1
649909,jointly reviewed for jenny well g after being ...,1
6899,sin is a novella which includes three stories ...,1
32147,the book was well written the story flows smoo...,1


In [17]:
import nltk
from nltk.stem import WordNetLemmatizer

In [18]:
lemmatizer = WordNetLemmatizer()
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word,pos='v') for word in text.split()])

In [19]:
df['reviewText'] = df['reviewText'].apply(lambda x: lemmatize_words(str(x)))
df

Unnamed: 0,reviewText,rating
12,well write interest to see sideous through mau...,0
13,troy dennings novella recovery be originally p...,0
17,another well write ebook by troy denning but w...,0
21,with ylesia a novella originally publish in eb...,0
25,the events of ylesia take place during destiny...,0
...,...,...
789840,omg this book be great it be the first book th...,1
649909,jointly review for jenny well g after be leave...,1
6899,sin be a novella which include three stories b...,1
32147,the book be well write the story flow smoothly...,1


In [20]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(df['reviewText'],df['rating'],test_size=0.2,random_state=42)

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=3000, ngram_range=(1,2))

In [22]:
x_train_vec = tfidf.fit_transform(x_train)
x_test_vec = tfidf.transform(x_test)

In [23]:
np.set_printoptions(threshold=np.inf)
tfidf.get_feature_names_out()

array(['10', '12', '20', '35', '99', 'ability', 'ability to', 'able',
       'able to', 'about', 'about her', 'about his', 'about how',
       'about it', 'about the', 'about them', 'about this', 'about to',
       'about what', 'above', 'absolutely', 'absolutely love', 'abuse',
       'accept', 'accident', 'across', 'act', 'action', 'action and',
       'actual', 'actually', 'adam', 'add', 'add to', 'addition', 'admit',
       'adore', 'adult', 'adults', 'adventure', 'advice', 'affect',
       'afraid', 'after', 'after all', 'after read', 'after the', 'again',
       'again and', 'against', 'age', 'ago', 'agree', 'ahead', 'alex',
       'alien', 'alive', 'all', 'all about', 'all be', 'all in', 'all it',
       'all of', 'all over', 'all that', 'all the', 'all this', 'allow',
       'almost', 'alone', 'along', 'along the', 'along with', 'alpha',
       'already', 'also', 'also have', 'also the', 'although', 'always',
       'always be', 'amaze', 'amazon', 'american', 'amount', 'amount 

In [24]:
feature_names = tfidf.get_feature_names_out()

x_train_vec = pd.DataFrame.sparse.from_spmatrix(x_train_vec, columns=feature_names)
x_test_vec = pd.DataFrame.sparse.from_spmatrix(x_test_vec, columns=feature_names)

In [25]:
x_train_vec

Unnamed: 0,10,12,20,35,99,ability,ability to,able,able to,about,...,you wont,you would,youll,young,younger,your,youre,yourself,youve,zombie
0,0,0,0,0,0,0,0,0,0,0,...,0,0.065104,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0.035872,0,0,0,0,0,0,0,0.014515,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0.080935,...,0,0,0,0,0,0,0.158577,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245338,0,0,0,0,0,0,0,0,0,0.156824,...,0,0,0,0,0,0,0,0,0,0
245339,0,0,0,0,0,0,0,0,0,0.040449,...,0,0,0,0,0,0,0,0,0,0
245340,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
245341,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
x_test_vec

Unnamed: 0,10,12,20,35,99,ability,ability to,able,able to,about,...,you wont,you would,youll,young,younger,your,youre,yourself,youve,zombie
0,0,0,0,0,0,0,0,0,0,0.07373,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.090556,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0.095031,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61331,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61333,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
61334,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0.10093,0,0,0,0


In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
import xgboost
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression


rf = RandomForestClassifier()
nb = MultinomialNB()
xgb = XGBClassifier()
lr = LogisticRegression()

In [28]:
rf.fit(x_train_vec,y_train)

In [29]:
nb.fit(x_train_vec,y_train)

In [30]:
xgb.fit(x_train_vec,y_train)

In [31]:
lr.fit(x_train_vec,y_train)

In [33]:
from sklearn.metrics import classification_report, accuracy_score

In [34]:
test_pred = rf.predict(x_test_vec)
print(accuracy_score(y_test, test_pred))
print(classification_report(y_test, test_pred))

0.8260890830833442
              precision    recall  f1-score   support

           0       0.81      0.85      0.83     30535
           1       0.84      0.80      0.82     30801

    accuracy                           0.83     61336
   macro avg       0.83      0.83      0.83     61336
weighted avg       0.83      0.83      0.83     61336



In [35]:
test_pred = nb.predict(x_test_vec)
print(accuracy_score(y_test, test_pred))
print(classification_report(y_test, test_pred))

0.8370614321116473
              precision    recall  f1-score   support

           0       0.83      0.84      0.84     30535
           1       0.84      0.83      0.84     30801

    accuracy                           0.84     61336
   macro avg       0.84      0.84      0.84     61336
weighted avg       0.84      0.84      0.84     61336



In [36]:
test_pred = xgb.predict(x_test_vec)
print(accuracy_score(y_test, test_pred))
print(classification_report(y_test, test_pred))

0.8386428850919525
              precision    recall  f1-score   support

           0       0.83      0.84      0.84     30535
           1       0.84      0.83      0.84     30801

    accuracy                           0.84     61336
   macro avg       0.84      0.84      0.84     61336
weighted avg       0.84      0.84      0.84     61336



In [37]:
test_pred = lr.predict(x_test_vec)
print(accuracy_score(y_test, test_pred))
print(classification_report(y_test, test_pred))

0.8533487674448937
              precision    recall  f1-score   support

           0       0.85      0.85      0.85     30535
           1       0.85      0.86      0.85     30801

    accuracy                           0.85     61336
   macro avg       0.85      0.85      0.85     61336
weighted avg       0.85      0.85      0.85     61336

