In [1]:
# Importing essential libraries and functions

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from numpy import array
import tensorflow as tf

from keras.preprocessing.text import one_hot, Tokenizer
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split

from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
reviews = pd.read_csv(r".\Dataset\yelpReviewsDataset.csv")

In [3]:
def sentiment_to_int(sentiment):
    sentiment_map = {
        1: 0,
        2: 1,
        3: 2,
        4: 3,
        5: 4
    }
    return sentiment_map.get(sentiment, -1)

In [4]:
texts = reviews['Review'].values
labels = reviews['Rating'].apply(sentiment_to_int).values

In [5]:
import pickle
with open(r".\TokenizedDataset\yelpReviewsDatasetTokens.pkl", "rb") as file:
    tokenized_Reviews = pickle.load(file)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(tokenized_Reviews, labels, test_size=0.20, random_state=42)

In [7]:
# One-hot encode labels
y_train = to_categorical(y_train, num_classes=5)
y_test = to_categorical(y_test, num_classes=5)

In [8]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [10]:
import io
import json
tokenizer_json = word_tokenizer.to_json()
with io.open('rnn_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [11]:
# Adding 1 to store dimensions for words for which no pretrained word embeddings exist
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

75957

In [12]:
max_length = 1000

In [13]:
# Padding all reviews to fixed length
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)

In [14]:
# Load GloVe word embeddings and create an Embeddings Dictionary
from numpy import asarray
from numpy import zeros
embeddings_dictionary = dict()
glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [15]:
# Create Embedding Matrix having 100 columns 
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.

embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [16]:
embedding_matrix.shape

(75957, 100)

Recurrent Neural Network (LSTM)

In [17]:
from keras.layers import LSTM
# Neural Network architecture
lstm_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(5, activation='softmax'))

In [18]:
# Model compiling
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(lstm_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         7595700   
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense (Dense)               (None, 5)                 645       
                                                                 
Total params: 7,713,593
Trainable params: 117,893
Non-trainable params: 7,595,700
_________________________________________________________________
None


In [19]:
# Model Training
lstm_model_history = lstm_model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [20]:
# Predictions on the Test Set
score_lstm = lstm_model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_lstm[0])
print("Test Accuracy:", score_lstm[1])

Test Score: 1.6094543933868408
Test Accuracy: 0.19900000095367432


In [21]:
#Predict
y_prediction = lstm_model.predict(X_test)
y_prediction = np.argmax(y_prediction, axis = 1)
y_test=np.argmax(y_test, axis=1)
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_test, y_prediction , normalize='pred')
print(result)

[[0.      0.      0.      0.      0.20246]
 [0.      0.      0.      0.      0.19986]
 [0.      0.      0.      0.      0.1988 ]
 [0.      0.      0.      0.      0.19988]
 [0.      0.      0.      0.      0.199  ]]


In [22]:
accuracy_score = metrics.accuracy_score(y_prediction, y_test)

print('RNN accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, y_prediction)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, y_prediction))

RNN accuracy is 19.90%
------------------------------------------------
Confusion Matrix:
   0  1  2  3      4
0  0  0  0  0  10123
1  0  0  0  0   9993
2  0  0  0  0   9940
3  0  0  0  0   9994
4  0  0  0  0   9950
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     10123
           1       0.00      0.00      0.00      9993
           2       0.00      0.00      0.00      9940
           3       0.00      0.00      0.00      9994
           4       0.20      1.00      0.33      9950

    accuracy                           0.20     50000
   macro avg       0.04      0.20      0.07     50000
weighted avg       0.04      0.20      0.07     50000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
