# Sentiment Analysis on Kindle Book Review
***

In [1]:
# Importing library

import numpy as np
import pandas as pd
import keras
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense, Dropout, SpatialDropout1D
from tensorflow.keras.layers import Embedding
from tensorflow.keras.utils import to_categorical
import keras_tuner as kt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, log_loss
from sklearn.metrics import plot_confusion_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

import nltk
from nltk import tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from collections import Counter
import string
import re

# 1. Read database

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wongj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\wongj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\wongj\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [3]:
# loading  dataset

data = pd.read_csv('train.csv')
data_realtest = pd.read_csv('test.csv')

# 2. Data Cleaning

In [4]:
# Training constants for padded_sequence, maxlen will be rounded off to 400
data['reviewText'].apply(lambda x : len(x.split(' '))).quantile(0.95)

369.0

In [5]:
# Show most frequent 80 words, used for manual pre-processing
data_pre = data
stop = set(stopwords.words('english'))
data_pre['reviewText'] = data_pre['reviewText'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

from collections import Counter
Counter(" ".join(data_pre['reviewText']).split()).most_common(80)

[('I', 23419),
 ('book', 6382),
 ('The', 5047),
 ('story', 5018),
 ('read', 4328),
 ('one', 3465),
 ('like', 3431),
 ('This', 3002),
 ('would', 2913),
 ('really', 2655),
 ('It', 2574),
 ('good', 2518),
 ('love', 2222),
 ('get', 2207),
 ('characters', 2172),
 ('reading', 1766),
 ('first', 1747),
 ('much', 1736),
 ('books', 1682),
 ('author', 1668),
 ('even', 1643),
 ('-', 1605),
 ('could', 1605),
 ('time', 1571),
 ('little', 1543),
 ('book.', 1511),
 ('it.', 1360),
 ('short', 1318),
 ('well', 1304),
 ('two', 1301),
 ('great', 1271),
 ('know', 1260),
 ('way', 1232),
 ('think', 1210),
 ("I'm", 1185),
 ('story.', 1179),
 ('sex', 1138),
 ('enjoyed', 1097),
 ('series', 1087),
 ('find', 1069),
 ('never', 1069),
 ('also', 1069),
 ('make', 1058),
 ('see', 1022),
 ('want', 991),
 ('There', 975),
 ('She', 970),
 ('many', 964),
 ('character', 959),
 ('found', 955),
 ('read.', 949),
 ('He', 946),
 ('A', 941),
 ('plot', 938),
 ('going', 918),
 ('liked', 900),
 ('But', 881),
 ('got', 875),
 ('bit', 8

In [6]:
review = []
my_stopwords = set(nltk.corpus.stopwords.words('english'))
my_stopwords.remove('no')
my_stopwords.remove('not')
my_stopwords.remove('very')
my_stopwords.add('book')
my_stopwords.add('story')
my_stopwords.add('author')
my_stopwords.add('read')
my_stopwords.add('reading')
my_stopwords.add('character')
my_stopwords.add('I')
my_stopwords.add('The')
my_stopwords.add('This')
my_stopwords.add('It')

lemmatizer = WordNetLemmatizer()
vocabulary = {}
review_size = []

def tokenize(text):
    text = re.sub(r"[^\x00-\x7F]+", " ", text)
    regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]') # remove punctuation and numbers
    nopunct = regex.sub(" ", text.lower())
    return nltk.tokenize.word_tokenize(nopunct)


for i in range(len(data['rating'])):
    review.append(data['summary'][i] +' '+ data['reviewText'][i])

counts = Counter()

for i in range(len(review)):
    r = []
    tokens = tokenize(review[i])
    for t in tokens:
        if t not in my_stopwords and len(t) > 1:
            r.append(lemmatizer.lemmatize(t))
            
    review[i] = r
    counts.update(r)
    review_size.append(len(r))
    
data['review'] = review

review = []
review_size = []

for i in range(len(data_realtest['Id'])):
    review.append(data_realtest['summary'][i] +' '+ data_realtest['reviewText'][i])

counts = Counter()

for i in range(len(review)):
    r = []
    tokens = tokenize(review[i])
    for t in tokens:
        if t not in my_stopwords and len(t) > 1:
            r.append(lemmatizer.lemmatize(t))
            
    review[i] = r
    counts.update(r)
    review_size.append(len(r))
    
data_realtest['review'] = review

In [7]:
data.isnull().sum()

rating        0
reviewText    0
summary       0
review        0
dtype: int64

In [8]:
data.rating.value_counts()

4    2400
5    2200
1    1700
2    1500
3    1200
Name: rating, dtype: int64

# 3. Preprocessing

In [9]:
# split data into X and y
X = data.review
y = data.rating
X_realtrain = X
y_realtrain = y
X_realtest = data_realtest.review


# 60:20:20 split
X_tv, X_test, y_tv, y_test = train_test_split(X,y,test_size=0.20,random_state=0)
X_train, X_vali, y_train, y_vali = train_test_split(X_tv, y_tv, test_size = 1/4,random_state=0)

y_train_array = np.array(y_train)
y_vali_array = np.array(y_vali)
y_tv_array = np.array(y_tv)
y_test_array = np.array(y_test)
y_realtrain_array = np.array(y_realtrain)

X_train.shape, X_test.shape

((5400,), (1800,))

In [10]:
from sklearn.preprocessing import LabelEncoder
# One Hot Encode Y values:
encoder = LabelEncoder()

y_train = encoder.fit_transform(y_train.values)
y_train = to_categorical(y_train) 

y_vali = encoder.fit_transform(y_vali.values)
y_vali = to_categorical(y_vali) 

y_tv = encoder.fit_transform(y_tv.values)
y_tv = to_categorical(y_tv) 

y_test = encoder.fit_transform(y_test.values)
y_test = to_categorical(y_test) 

y_realtrain = encoder.fit_transform(y_realtrain.values)
y_realtrain = to_categorical(y_realtrain) 

In [11]:
tokenizer = Tokenizer(num_words=10000, lower=True) # num_words:the maximum number of words to keep, based on word frequency
tokenizer.fit_on_texts(X)
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_vali = tokenizer.texts_to_sequences(X_vali)
sequences_tv = tokenizer.texts_to_sequences(X_tv)
sequences_test = tokenizer.texts_to_sequences(X_test)
sequences_realtrain = tokenizer.texts_to_sequences(X)
sequences_realtest = tokenizer.texts_to_sequences(X_realtest)

vocab_size = len(tokenizer.word_index) + 1

padded_sequence_train = pad_sequences(sequences_train, maxlen=400) # maxlen, higher num takes longer to run
padded_sequence_vali = pad_sequences(sequences_vali, maxlen=400)
padded_sequence_tv = pad_sequences(sequences_tv, maxlen=400)
padded_sequence_test = pad_sequences(sequences_test, maxlen=400)
padded_sequence_realtrain = pad_sequences(sequences_realtrain, maxlen=400)
padded_sequence_realtest = pad_sequences(sequences_realtest, maxlen=400)

print('Number of Tokens:', len(tokenizer.word_index))
print("Max Token Index:", padded_sequence_train.max(), "\n")

print('Sample Before Processing:', X_train.values[0])
print('Sample After Processing:', tokenizer.sequences_to_texts([padded_sequence_train[0]]), '\n')

print('What the model will interpret:', padded_sequence_train[0].tolist())

Number of Tokens: 23727
Max Token Index: 9999 

Sample Before Processing: ['seriously', 'short', 'even', 'qualify', 'novella', 'guess', 'get', 'pay', 'free']
Sample After Processing: ['seriously short even qualify novella guess get pay free'] 

What the model will interpret: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

# 4. Hyperparameter Tuning

## 4.1 Tuning

In [12]:
def build_model(hp):
    model = Sequential()
    model.add(Embedding(input_dim = vocab_size, output_dim = 128, input_length=400))
    model.add(LSTM(units=hp.Int('units_LSTM',min_value=16,max_value=256,step=16)))
    model.add(Dropout(hp.Float('rate', min_value=0.0, max_value=0.5, step=0.1)))
    # Tune the number of dense layers
    for i in range(hp.Int('num_layers', 0, 3)):
        model.add(Dense(units=hp.Int('units_'+str(i), min_value=16, max_value=256, step=16), activation="relu"))    
    model.add(Dense(5, activation='softmax'))
    hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3])
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'],optimizer='adam')

    return model

tuner = kt.RandomSearch(build_model,objective="val_accuracy", max_trials=3,executions_per_trial=2,overwrite=True)

tuner.search(padded_sequence_train, y_train, validation_data=(padded_sequence_vali, y_vali),
                    epochs = 5,
                    batch_size=64)

# Get the optimal hyperparameters
best_hps_1=tuner.get_best_hyperparameters(num_trials=1)[0]


Trial 3 Complete [00h 27m 51s]
val_accuracy: 0.4750000089406967

Best val_accuracy So Far: 0.5016666650772095
Total elapsed time: 00h 48m 30s
INFO:tensorflow:Oracle triggered exit


In [13]:
# Build the model with the optimal hyperparameters and train it on the data for 5 epochs
model = tuner.hypermodel.build(best_hps_1)
history = model.fit(padded_sequence_train, y_train, epochs=5,batch_size=64, validation_data=(padded_sequence_vali, y_vali))

val_acc_per_epoch = history.history['val_accuracy']
best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
print('Best epoch: %d' % (best_epoch,))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Best epoch: 3


In [15]:
# Add units_1,units_2 etc depending on the num_layers
print(f"""
{best_hps_1.get('units_LSTM')}
{best_hps_1.get('rate')}
{best_hps_1.get('num_layers')}
{best_hps_1.get('learning_rate')}
""")



128
0.30000000000000004
0
0.001



# 5. Models

## 5.1 Baseline

In [16]:
# Naive Bayse Baseline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(tokenizer.sequences_to_texts_generator(padded_sequence_tv), y_tv_array)
predictions = text_clf.predict(tokenizer.sequences_to_texts_generator(padded_sequence_test)) 
print('Accuracy Using Naive Bayes: ', (predictions == y_test_array).mean())
print('F1 Score:', f1_score(y_test_array, predictions, average='weighted'))

Accuracy Using Naive Bayes:  0.4816666666666667
F1 Score: 0.4289962919683218


## 5.2 Model

In [17]:
model = Sequential() 
model.add(Embedding(input_dim = vocab_size, output_dim = 128, input_length=400))
model.add(LSTM(144))
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu'))
model.add(Dense(5, activation='softmax')) 

model.compile(loss='categorical_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])  

print(model.summary())

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 400, 128)          3037184   
                                                                 
 lstm_2 (LSTM)               (None, 144)               157248    
                                                                 
 dropout_2 (Dropout)         (None, 144)               0         
                                                                 
 dense_3 (Dense)             (None, 128)               18560     
                                                                 
 dense_4 (Dense)             (None, 5)                 645       
                                                                 
Total params: 3,213,637
Trainable params: 3,213,637
Non-trainable params: 0
_________________________________________________________________
None


In [18]:
history = model.fit(padded_sequence_tv, y_tv, validation_data=(padded_sequence_test, y_test),
                    epochs = 3,
                    batch_size=64)

Epoch 1/3
Epoch 2/3
Epoch 3/3


# 6. Predictions

In [19]:
model_best = Sequential() 
model_best.add(Embedding(input_dim = vocab_size, output_dim = 128, input_length=400))
model_best.add(LSTM(144))
model_best.add(Dropout(0.1))
model_best.add(Dense(128, activation='relu'))
model_best.add(Dense(5, activation='softmax')) 

model_best.compile(loss='categorical_crossentropy',optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), metrics=['accuracy'])  

print(model_best.summary())


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 400, 128)          3037184   
                                                                 
 lstm_3 (LSTM)               (None, 144)               157248    
                                                                 
 dropout_3 (Dropout)         (None, 144)               0         
                                                                 
 dense_5 (Dense)             (None, 128)               18560     
                                                                 
 dense_6 (Dense)             (None, 5)                 645       
                                                                 
Total params: 3,213,637
Trainable params: 3,213,637
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
history_best = model_best.fit(padded_sequence_realtrain, y_realtrain,
                    epochs = 2,
                    batch_size=64)

Epoch 1/2
Epoch 2/2


In [21]:
y_pred = model_best.predict(padded_sequence_realtest)
df_pred = pd.DataFrame(y_pred, columns = [1,2,3,4,5])
df_pred = df_pred.idxmax(axis=1)

df_pred

0       3
1       3
2       4
3       5
4       4
       ..
2995    3
2996    4
2997    1
2998    5
2999    5
Length: 3000, dtype: int64