# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
data=pd.read_csv('kindle_reviews.csv')

In [2]:
data=data.head(50000)
data

Unnamed: 0.1,Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,0,B000F83SZQ,"[0, 0]",5,I enjoy vintage books and movies so I enjoyed ...,"05 5, 2014",A1F6404F1VG29J,Avidreader,Nice vintage story,1399248000
1,1,B000F83SZQ,"[2, 2]",4,This book is a reissue of an old one; the auth...,"01 6, 2014",AN0N05A9LIJEQ,critters,Different...,1388966400
2,2,B000F83SZQ,"[2, 2]",4,This was a fairly interesting read. It had ol...,"04 4, 2014",A795DMNCJILA6,dot,Oldie,1396569600
3,3,B000F83SZQ,"[1, 1]",5,I'd never read any of the Amy Brewster mysteri...,"02 19, 2014",A1FV0SX13TWVXQ,"Elaine H. Turley ""Montana Songbird""",I really liked it.,1392768000
4,4,B000F83SZQ,"[0, 1]",4,"If you like period pieces - clothing, lingo, y...","03 19, 2014",A3SPTOKDG7WBLN,Father Dowling Fan,Period Mystery,1395187200
...,...,...,...,...,...,...,...,...,...,...
49995,49995,B004PLNP2K,"[0, 0]",5,Anne Marie Novark has done it again. Book 3 in...,"05 27, 2013",A3P31Y2T3HBUS6,Jobanga,A Match Made In Texas,1369612800
49996,49996,B004PLNP2K,"[0, 0]",4,This goes to show just how much hurt losing yo...,"05 17, 2013",A1CPJD87PMPJQJ,KareBear1965,worth the time and money,1368748800
49997,49997,B004PLNP2K,"[0, 0]",4,Despite the physical beauty of Novark's charac...,"03 30, 2014",A1K1P8Y3LO20AZ,K. Pilon,I'm a sucker for a sensitive handsome man,1396137600
49998,49998,B004PLNP2K,"[0, 0]",4,"I have enjoyed this series, and will continue ...","08 16, 2013",AK3MABTV8TK8M,"lcj ""jcl623""",Not the best in the series....,1376611200


In [3]:
data.shape

(50000, 10)

In [4]:
data.isna().sum()

Unnamed: 0          0
asin                0
helpful             0
overall             0
reviewText          1
reviewTime          0
reviewerID          0
reviewerName      149
summary             0
unixReviewTime      0
dtype: int64

In [5]:
data.dtypes

Unnamed: 0         int64
asin              object
helpful           object
overall            int64
reviewText        object
reviewTime        object
reviewerID        object
reviewerName      object
summary           object
unixReviewTime     int64
dtype: object

In [6]:
#deleting the unwanted columns from the dataset
del data['Unnamed: 0']
del data['asin']
del data['helpful']
del data['reviewTime']
del data['reviewerID']
del data['reviewerName']
del data['unixReviewTime']

In [7]:
data.head(10)

Unnamed: 0,overall,reviewText,summary
0,5,I enjoy vintage books and movies so I enjoyed ...,Nice vintage story
1,4,This book is a reissue of an old one; the auth...,Different...
2,4,This was a fairly interesting read. It had ol...,Oldie
3,5,I'd never read any of the Amy Brewster mysteri...,I really liked it.
4,4,"If you like period pieces - clothing, lingo, y...",Period Mystery
5,4,A beautiful in-depth character description mak...,Review
6,4,I enjoyed this one tho I'm not sure why it's c...,Nice old fashioned story
7,4,Never heard of Amy Brewster. But I don't need ...,Enjoyable reading and reminding the old times
8,5,Darth Maul working under cloak of darkness com...,Darth Maul
9,4,This is a short story focused on Darth Maul's ...,"Not bad, not exceptional"


In [8]:
data.overall.value_counts()

5    23090
4    14980
3     7013
2     2832
1     2085
Name: overall, dtype: int64

In [9]:
data.isna().sum()

overall       0
reviewText    1
summary       0
dtype: int64

In [10]:
#joining review description and summary into one col
data['reviewText']=data['reviewText']+" "+data['summary']

In [11]:
del data['summary']

In [12]:
data.isna().sum()


overall       0
reviewText    1
dtype: int64

In [13]:
#since there is only one null value, replace it with blank space
data['reviewText'].fillna("",inplace = True)

In [14]:
#Grouping the overall rating of scale 1-5 to 2 categories
def review_sentiment(rating):
    #0(positive) and  with 1(negative) 
    if(rating == 5 or rating == 4 or rating==3):
        return 0
    else:
        return 1

In [15]:
data.overall = data.overall.apply(review_sentiment)

In [16]:
data.overall.value_counts()

0    45083
1     4917
Name: overall, dtype: int64

In [17]:
data.head(50)

Unnamed: 0,overall,reviewText
0,0,I enjoy vintage books and movies so I enjoyed ...
1,0,This book is a reissue of an old one; the auth...
2,0,This was a fairly interesting read. It had ol...
3,0,I'd never read any of the Amy Brewster mysteri...
4,0,"If you like period pieces - clothing, lingo, y..."
5,0,A beautiful in-depth character description mak...
6,0,I enjoyed this one tho I'm not sure why it's c...
7,0,Never heard of Amy Brewster. But I don't need ...
8,0,Darth Maul working under cloak of darkness com...
9,0,This is a short story focused on Darth Maul's ...


# NLP

In [18]:
import re,string
import nltk
#nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer


In [19]:
from string import punctuation

In [23]:
#data['reviewText']

In [24]:
import nltk
nltk.download('punkt')
def cleaning(text):
    import nltk
    
    #Remove special characters
    letters = re.sub("[^a-zA-Z]", " ", text)
    
    #Convert to lower case
    letters = letters.lower()
    
    # Tokenize
    tokens = nltk.word_tokenize(letters)
    
    #Convering stopwords list to set data type
    stops = set(nltk.corpus.stopwords.words("english"))
    
    #Removing stopwords 
    words = [w for w in tokens if not w in stops]
    
    #Stemming
    words = [nltk.stem.SnowballStemmer('english').stem(w) for w in words]
    
    #Joining the words back into one string separated by space
    return " ".join(words)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
data['reviewText'] = data['reviewText'].apply(cleaning)


In [26]:
data['reviewText'][10]

'think one book audio good stori either way good ol maul audio book'

In [27]:
d=[]
for i in range(50000):
    d.append(data['reviewText'][i])

In [28]:
d

['enjoy vintag book movi enjoy read book plot unusu think kill someon self defens leav scene bodi without notifi polic hit someon jaw knock would wash today still good read nice vintag stori',
 'book reissu old one author born era say nero wolf introduct quit interest explain author forgotten never heard languag littl date time like call gun heater also made good use fire dictionari look word like deshabill canarsi still well worth look see differ',
 'fair interest read old style terminolog glad get read stori coars crasslanguag read fun relax like free ebooksbecaus check writer decid intrigu innov enough command englishthat convey stori without crude languag oldi',
 'never read ami brewster mysteri one realli hook realli like',
 'like period piec cloth lingo enjoy mysteri author guess least way period mysteri',
 'beauti depth charact descript make like fast pace movi piti mr merwin write instead ami brewster mysteri review',
 'enjoy one tho sure call ami brewster mysteri much clean we

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv= CountVectorizer(max_features=1500)
x=cv.fit_transform(d).toarray()
y=data.iloc[:,:1].values

In [30]:
y

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]], dtype=int64)

In [31]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)


In [32]:
x_train.shape

(40000, 1500)

In [33]:
y_train.shape

(40000, 1)

In [34]:
x_test.shape

(10000, 1500)

# ANN

In [35]:
from tensorflow.keras.models import Sequential #to initialize models
from tensorflow.keras.layers import Dense #adding layers
from tensorflow.keras.layers import Conv2D #convolution layer
from tensorflow.keras.layers import MaxPool2D #maxpooling
from tensorflow.keras.layers import Flatten 

In [59]:
model=Sequential()
model.add(Dense(input_dim=1500,kernel_initializer='random_uniform',activation='relu',units=1600))
model.add(Dense(units=100,kernel_initializer='random_uniform',activation='relu'))
model.add(Dense(units=1,kernel_initializer='random_uniform',activation='sigmoid'))
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
model.fit(x_train,y_train,epochs=5,batch_size=16)

Train on 40000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1b71c7189c8>

In [60]:
model.save("amazo.h5")

In [61]:
ypred=model.predict(x_test)

In [62]:
ypred

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]], dtype=float32)

In [63]:
import joblib
joblib.dump(cv.vocabulary_,"amazo.save")

['amazo.save']

In [64]:
loaded=CountVectorizer(decode_error='replace',vocabulary=joblib.load('amazo.save'))

In [84]:
d="Writing was good"
d=d.split('delimiter')
result=model.predict(loaded.transform(d))
print(result)
prediction=result>0.5
#print(prediction)
if prediction[0] == False:
    print("Positive review")
elif prediction[0] == True:
    print("Negative review")

[[0.01901091]]
Positive review
