## Load The Dataset

In [54]:
import pandas as pd
df = pd.read_csv('all_kindle_review.csv')

In [55]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,asin,helpful,rating,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,11539,B0033UV8HI,"[8, 10]",3,"Jace Rankin may be short, but he's nothing to ...","09 2, 2010",A3HHXRELK8BHQG,Ridley,Entertaining But Average,1283385600
1,1,5957,B002HJV4DE,"[1, 1]",5,Great short read. I didn't want to put it dow...,"10 8, 2013",A2RGNZ0TRF578I,Holly Butler,Terrific menage scenes!,1381190400
2,2,9146,B002ZG96I4,"[0, 0]",3,I'll start by saying this is the first of four...,"04 11, 2014",A3S0H2HV6U1I7F,Merissa,Snapdragon Alley,1397174400
3,3,7038,B002QHWOEU,"[1, 3]",3,Aggie is Angela Lansbury who carries pocketboo...,"07 5, 2014",AC4OQW3GZ919J,Cleargrace,very light murder cozy,1404518400
4,4,1776,B001A06VJ8,"[0, 1]",4,I did not expect this type of book to be in li...,"12 31, 2012",A3C9V987IQHOQD,Rjostler,Book,1356912000


In [56]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Unnamed: 0.1    12000 non-null  int64 
 1   Unnamed: 0      12000 non-null  int64 
 2   asin            12000 non-null  object
 3   helpful         12000 non-null  object
 4   rating          12000 non-null  int64 
 5   reviewText      12000 non-null  object
 6   reviewTime      12000 non-null  object
 7   reviewerID      12000 non-null  object
 8   reviewerName    11962 non-null  object
 9   summary         11998 non-null  object
 10  unixReviewTime  12000 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 1.0+ MB


In [57]:
df.describe()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,rating,unixReviewTime
count,12000.0,12000.0,12000.0,12000.0
mean,5999.5,10024.275667,3.25,1344537000.0
std,3464.24595,10502.233123,1.421619,43693740.0
min,0.0,0.0,1.0,960249600.0
25%,2999.75,2999.75,2.0,1316218000.0
50%,5999.5,5999.5,3.5,1356826000.0
75%,8999.25,12475.75,4.25,1376870000.0
max,11999.0,47770.0,5.0,1405814000.0


In [58]:
## required data
data = df[['reviewText', 'rating']].copy()
# data.shape
# data.isnull().sum()
data['rating'].unique()

array([3, 5, 4, 2, 1])

## Preprocessing and Cleaning

In [59]:
## Convert scale of reviews from 1-5 to 0 and 1s

data['rating'] = data['rating'].apply(lambda x:0 if x < 3 else 1)


In [60]:
## Lower case everything
# data['reviewText'] = data['reviewText'].apply(lambda x:x.lower())
data['reviewText'] = data['reviewText'].str.lower()
data

Unnamed: 0,reviewText,rating
0,"jace rankin may be short, but he's nothing to ...",1
1,great short read. i didn't want to put it dow...,1
2,i'll start by saying this is the first of four...,1
3,aggie is angela lansbury who carries pocketboo...,1
4,i did not expect this type of book to be in li...,1
...,...,...
11995,valentine cupid is a vampire- jena and ian ano...,1
11996,i have read all seven books in this series. ap...,1
11997,this book really just wasn't my cuppa. the si...,1
11998,"tried to use it to charge my kindle, it didn't...",0


In [61]:
pip install lxml

Note: you may need to restart the kernel to use updated packages.


In [62]:
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

In [63]:
## Remove special characters
data['reviewText'] = data['reviewText'].apply(lambda x : re.sub('[^a-z 0-9 ]', "",x))

## Remove stopwords
data['reviewText'] = data['reviewText'].apply(lambda x : " ".join([word for word in x.split() if word not in stopwords.words('english')]))

In [64]:
## Remove url 
df['reviewText']=df['reviewText'].apply(lambda x: re.sub(r'(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?', '' , str(x)))
## Remove html tags
df['reviewText']=df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())
## Remove any additional spaces
df['reviewText']=df['reviewText'].apply(lambda x: " ".join(x.split()))

data

  df['reviewText']=df['reviewText'].apply(lambda x: BeautifulSoup(x, 'lxml').get_text())


Unnamed: 0,reviewText,rating
0,jace rankin may short hes nothing mess man hau...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four books wasnt expect...,1
3,aggie angela lansbury carries pocketbooks inst...,1
4,expect type book library pleased find price right,1
...,...,...
11995,valentine cupid vampire jena ian another vampi...,1
11996,read seven books series apocalypticadventure o...,1
11997,book really wasnt cuppa situation man capturin...,1
11998,tried use charge kindle didnt even register ch...,0


In [65]:
## Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [66]:
def lemmatize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

In [67]:
data['reviewText']=data['reviewText'].apply(lambda x:lemmatize_words(x))

In [68]:
data.head()

Unnamed: 0,reviewText,rating
0,jace rankin may short he nothing mess man haul...,1
1,great short read didnt want put read one sitti...,1
2,ill start saying first four book wasnt expecti...,1
3,aggie angela lansbury carry pocketbook instead...,1
4,expect type book library pleased find price right,1


In [77]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(data['reviewText'],data['rating'],
                                              test_size=0.20)

In [78]:
## Counvectorizer
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer()
X_train_bow = bow.fit_transform(X_train).toarray()
X_test_bow = bow.transform(X_test).toarray()

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [80]:
from sklearn.naive_bayes import GaussianNB
nb_model_bow = GaussianNB().fit(X_train_bow, y_train)
nb_model_tfidf = GaussianNB().fit(X_train_tfidf, y_train)

In [81]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [82]:
y_pred_bow = nb_model_bow.predict(X_test_bow)
y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)

In [83]:
confusion_matrix(y_test, y_pred_tfidf)

array([[492, 300],
       [748, 860]])

In [84]:
accuracy_score(y_test, y_pred_bow)

0.5645833333333333