In [1]:
# Importing essential libraries and functions

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from numpy import array
import tensorflow as tf

from keras.preprocessing.text import one_hot, Tokenizer
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split

from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
reviews = pd.read_csv(r".\Dataset\yelpReviewsDataset.csv")

In [3]:
def sentiment_to_int(sentiment):
    sentiment_map = {
        1: 0,
        2: 1,
        3: 2,
        4: 3,
        5: 4
    }
    return sentiment_map.get(sentiment, -1)

In [4]:
texts = reviews['Review'].values
labels = reviews['Rating'].apply(sentiment_to_int).values

In [5]:
import pickle
with open(r".\TokenizedDataset\yelpReviewsDatasetTokens.pkl", "rb") as file:
    tokenized_Reviews = pickle.load(file)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(tokenized_Reviews, labels, test_size=0.20, random_state=42)

In [7]:
# One-hot encode labels
y_train = to_categorical(y_train, num_classes=5)
y_test = to_categorical(y_test, num_classes=5)

In [8]:
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [9]:
import io
import json
tokenizer_json = word_tokenizer.to_json()
with io.open('cnn_tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer_json, ensure_ascii=False))

In [10]:
# Adding 1 to store dimensions for words for which no pretrained word embeddings exist
vocab_length = len(word_tokenizer.word_index) + 1
vocab_length

75957

In [11]:
max_length = 1000

In [12]:
# Padding all reviews to fixed length
X_train = pad_sequences(X_train, padding='post', maxlen=max_length)
X_test = pad_sequences(X_test, padding='post', maxlen=max_length)

In [13]:
# Load GloVe word embeddings and create an Embeddings Dictionary
from numpy import asarray
from numpy import zeros
embeddings_dictionary = dict()
glove_file = open('glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
glove_file.close()

In [14]:
# Create Embedding Matrix having 100 columns 
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.

embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [15]:
embedding_matrix.shape

(75957, 100)

Convolutional Neural Network

In [16]:
from keras.layers import Conv1D
# Neural Network architecture
cnn_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
cnn_model.add(embedding_layer)
cnn_model.add(Conv1D(128, 5, activation='relu'))
cnn_model.add(GlobalMaxPooling1D())
cnn_model.add(Dense(5, activation='softmax'))

In [17]:
# Model compiling
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(cnn_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1000, 100)         7595700   
                                                                 
 conv1d (Conv1D)             (None, 996, 128)          64128     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 5)                 645       
                                                                 
Total params: 7,660,473
Trainable params: 64,773
Non-trainable params: 7,595,700
_________________________________________________________________
None


In [18]:
# Model training
cnn_model_history = cnn_model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# Predictions on the Test Set
score_cnn = cnn_model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_cnn[0])
print("Test Accuracy:", score_cnn[1])

Test Score: 1.0913615226745605
Test Accuracy: 0.5415999889373779


In [20]:
#Predict
y_prediction = cnn_model.predict(X_test)
y_prediction = np.argmax(y_prediction, axis = 1)
y_test=np.argmax(y_test, axis=1)
#Create confusion matrix and normalizes it over predicted (columns)
result = confusion_matrix(y_test, y_prediction , normalize='pred')
print(result)

[[0.68280061 0.26037377 0.06423841 0.01247401 0.01226636]
 [0.23632674 0.46721571 0.23774834 0.05032918 0.01693925]
 [0.05611365 0.21169887 0.44692526 0.20045045 0.04018692]
 [0.01349569 0.04307887 0.19441816 0.46058559 0.24310748]
 [0.01126332 0.01763277 0.05666982 0.27616078 0.6875    ]]


In [22]:
accuracy_score = metrics.accuracy_score(y_prediction, y_test)

print('CNN accuracy is',str('{:04.2f}'.format(accuracy_score*100))+'%')
print('------------------------------------------------')
print('Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test, y_prediction)))
print('------------------------------------------------')
print('Classification Report:')
print(classification_report(y_test, y_prediction))

CNN accuracy is 54.16%
------------------------------------------------
Confusion Matrix:
      0     1     2     3     4
0  6729  2466   679   144   105
1  2329  4425  2513   581   145
2   553  2005  4724  2314   344
3   133   408  2055  5317  2081
4   111   167   599  3188  5885
------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.66      0.67     10123
           1       0.47      0.44      0.45      9993
           2       0.45      0.48      0.46      9940
           3       0.46      0.53      0.49      9994
           4       0.69      0.59      0.64      9950

    accuracy                           0.54     50000
   macro avg       0.55      0.54      0.54     50000
weighted avg       0.55      0.54      0.54     50000



Recurrent Neural Network (LSTM)

In [23]:
from keras.layers import LSTM
# Neural Network architecture
lstm_model = Sequential()
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=max_length, trainable=False)
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))
lstm_model.add(Dense(5, activation='softmax'))

In [24]:
# Model compiling
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(lstm_model.summary())

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 459, 100)          10533700  
                                                                 
 lstm (LSTM)                 (None, 128)               117248    
                                                                 
 dense_1 (Dense)             (None, 5)                 645       
                                                                 
Total params: 10,651,593
Trainable params: 117,893
Non-trainable params: 10,533,700
_________________________________________________________________
None


In [25]:
# Model Training
lstm_model_history = lstm_model.fit(X_train, y_train, batch_size=128, epochs=10, verbose=1, validation_split=0.2)

Epoch 1/10

KeyboardInterrupt: 

In [51]:
# Predictions on the Test Set
score_lstm = lstm_model.evaluate(X_test, y_test, verbose=1)
print("Test Score:", score_lstm[0])
print("Test Accuracy:", score_lstm[1])

Test Score: 1.3516192436218262
Test Accuracy: 0.48896461725234985


In [53]:
len(X_test)

6162

In [69]:
# Passing tokenised instance to the LSTM model for predictions
unseen_sentiments = lstm_model.predict(X_test)
unseen_sentiments



array([[0.08029749, 0.06125883, 0.10466078, 0.21865538, 0.5351275 ],
       [0.0802975 , 0.06125883, 0.10466078, 0.21865535, 0.5351275 ],
       [0.08029749, 0.06125883, 0.10466078, 0.21865538, 0.5351275 ],
       ...,
       [0.08029749, 0.06125883, 0.10466078, 0.21865538, 0.5351275 ],
       [0.08029749, 0.06125883, 0.10466078, 0.21865538, 0.5351275 ],
       [0.08029749, 0.06125883, 0.10466078, 0.21865538, 0.5351275 ]],
      dtype=float32)

In [70]:
y_test

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)