In [1]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [4]:
data = pd.read_csv('goodreads.tsv' , sep='\t')
print(data)

                                                  title          date  rating  \
0                       The Body: A Guide for Occupants  Oct 11, 2019     5.0   
1                                     Resistance Reborn  Feb 28, 2020     4.0   
2                                  The Book of Two Ways  Jul 13, 2020     4.0   
3     How to Write One Song: Loving the Things We Cr...  Feb 21, 2021     4.0   
4                                    The Giver of Stars  Sep 08, 2020     4.0   
...                                                 ...           ...     ...   
8428                                 The Giver of Stars  Feb 12, 2020     5.0   
8429        Minor Feelings: An Asian American Reckoning  May 21, 2020     4.0   
8430       Trixie and Katya's Guide to Modern Womanhood  Oct 29, 2020     4.0   
8431         To Wake the Giant: A Novel of Pearl Harbor  May 22, 2020     4.0   
8432                                     Finding Ashley  Apr 14, 2021     4.0   

                           

In [5]:
english_stops = set(stopwords.words('english'))
     

In [6]:
df=data.dropna()
df.tail()

Unnamed: 0,title,date,rating,body
8428,The Giver of Stars,"Feb 12, 2020",5.0,I loved this book! Let me start by saying that...
8429,Minor Feelings: An Asian American Reckoning,"May 21, 2020",4.0,What did i just read?I chose this title becaus...
8430,Trixie and Katya's Guide to Modern Womanhood,"Oct 29, 2020",4.0,**3.45 stars ( if we were using a 10/10 scale ...
8431,To Wake the Giant: A Novel of Pearl Harbor,"May 22, 2020",4.0,"Thanks to Netgalley, Random House and Ballenti..."
8432,Finding Ashley,"Apr 14, 2021",4.0,Finding Ashley starts with Melissa working har...


In [7]:
for i in range(0,len(df)-1):
    if type(df.iloc[i]['body']) != str:
        df.iloc[i]['body'] = str(df.iloc[i]['body'])
#function to represent sentiment -1(negetive);0(neutral);1(positive)
def sentiment(n):
    return 1 if n>=4 else (-1 if n<=2 else 0)
#Applying Sentiment Function 
df['sent'] = df['rating'].apply(sentiment)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,title,date,rating,body,sent
0,The Body: A Guide for Occupants,"Oct 11, 2019",5.0,Lovingly presented with humour and kindness an...,1
1,Resistance Reborn,"Feb 28, 2020",4.0,I read through this book quicker than anticip...,1
2,The Book of Two Ways,"Jul 13, 2020",4.0,Jodi Picoult does it again in this new novel. ...,1
3,How to Write One Song: Loving the Things We Cr...,"Feb 21, 2021",4.0,I’m a music freak without a shred of musical a...,1
4,The Giver of Stars,"Sep 08, 2020",4.0,The Giver of Stars by Jojo Moyes won the Book ...,1


In [8]:
# Cleaning Text

def clean_text(text):
    """
    Steps :
    - Removing HTML tags
    - Removing punctuation
    - Lowering text
    """
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)    
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

In [10]:
x1 = []
sentences = list(df['body'])
for sen in sentences:
    x1.append(clean_text(sen))



In [13]:
df['Review']=x1 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [17]:
def load_dataset():
    df1 = df
    x_data = df['Review']       # Reviews/Input
    y_data = df['sent']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sent')
print(y_data)

Reviews
0       [lovingly, presented, humour, kindness, great,...
1       [read, book, quicker, anticipated, begun, yest...
2       [jodi, picoult, new, novel, plane, crash, main...
3       [music, freak, without, shred, musical, abilit...
4       [giver, stars, jojo, moyes, book, bucket, list...
                              ...                        
8428    [loved, book, let, start, saying, really, read...
8429    [read, chose, title, unenlightened, asian, exp...
8430    [stars, using, scale, itd, trixie, katya, real...
8431    [thanks, netgalley, random, house, ballentine,...
8432    [finding, ashley, starts, melissa, working, ha...
Name: Review, Length: 8139, dtype: object 

Sent
0       1
1       1
2       1
3       1
4       1
       ..
8428    1
8429    1
8430    1
8431    1
8432    1
Name: sent, Length: 8139, dtype: int64


In [18]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
2803    [milly, aubrey, jonah, cousins, really, know, ...
5995    [good, book, kind, predictable, areas, less, e...
3711    [received, arc, random, house, publishing, gro...
4538    [wanting, read, book, since, first, heard, alw...
5025    [good, book, desolation, road, really, good, b...
                              ...                        
2271    [collection, essays, asian, american, resonate...
796     [say, second, instalment, aurora, cycle, well,...
7508     [stars, think, great, beginners, feminist, book]
1812    [stars, simulation, become, indistinguishable,...
3412    [stars, aboard, strick, family, dysfunction, t...
Name: Review, Length: 6511, dtype: object 

1983    [holy, shit, book, exquisite, challenge, sooth...
4619    [essay, excellent, starting, usually, quick, p...
3545    [liked, plot, teenage, girls, sent, convents, ...
4292    [thought, provoking, charged, collection, pers...
4162                       [overall, underwhelming, read]
                  

In [21]:
def get_max_length():
    review_length = []
    for Review in x_train:
        review_length.append(len(Review))

    return int(np.ceil(np.mean(review_length)))

In [22]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[ 733  782  916 ...    0    0    0]
 [  17    1  143 ...    0    0    0]
 [ 558  274  622 ...    0    0    0]
 ...
 [  69   21   31 ...    0    0    0]
 [  69 8770  247 ...  696   41    2]
 [  69 4301 1697 ... 2089 1605 1785]] 

Encoded X Test
 [[2650 1567    1 ...    0    0    0]
 [ 866  377  908 ...   82    0    0]
 [  98   58 1785 ...    0    0    0]
 ...
 [   6  372    3 ...    0    0    0]
 [   6  240  800 ...    0    0    0]
 [  61  490 1762 ... 7025  106 4747]] 

Maximum review length:  82


In [23]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 82, 32)            888224    
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dense (Dense)                (None, 1)                 65        
Total params: 913,121
Trainable params: 913,121
Non-trainable params: 0
_________________________________________________________________
None


In [24]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [None]:
#finding the vocabulary size and then perform padding 
vocab_size = len(token.word_index) + 1
maxlen = 50
X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)
X_train
X_test

In [25]:
model.fit(x_train, y_train, batch_size = 128, epochs = 5, callbacks=[checkpoint])

Epoch 1/5

Epoch 00001: accuracy improved from -inf to 0.70174, saving model to models\LSTM.h5
Epoch 2/5

Epoch 00002: accuracy did not improve from 0.70174
Epoch 3/5

Epoch 00003: accuracy did not improve from 0.70174
Epoch 4/5

Epoch 00004: accuracy did not improve from 0.70174
Epoch 5/5

Epoch 00005: accuracy improved from 0.70174 to 0.74551, saving model to models\LSTM.h5


<tensorflow.python.keras.callbacks.History at 0x1aa2fdce148>

In [26]:
y_pred = model.predict_classes(x_test, batch_size = 128)

true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1

print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))



Correct Prediction: 1096
Wrong Prediction: 532
Accuracy: 67.32186732186733


In [27]:
loaded_model = load_model('models/LSTM.h5')

In [32]:
review = str(input('Books Review: '))

Books Review: book is awesome


In [33]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  book is awesome
Filtered:  ['book awesome']


In [34]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0]]


In [36]:
result = loaded_model.predict(tokenize_words)
print(result*100)

[[81.09067]]


In [37]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
