<a href="https://colab.research.google.com/github/Terry-Migwi/Amazon_Reviews_Sentiment_Analysis/blob/main/NN_CDs_Vinyl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Defining the question

The objective of this notebook is to classify the sentiment of Amazon reviews of CDs and Vinyl products using `Neural Networks` for `Natural Language Processing`. The network was trained for 10 epochs and reached an overall test accuracy of about 90%.



In [None]:
# import necessary libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# for deep learning
from keras.utils import to_categorical
from keras import models
from keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [None]:
# for nlp
# Use regular expression
import re

# Get a bunch of tools from nltk for nlp
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

# Download the punkt, stopwords and wordnet resources, in that order
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests
en_stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Load the gzipped JSON-lines review file in chunks and stitch them together.
# NOTE(review): the path points at Google Drive; no drive mount step is visible
# in this notebook, so the Drive must already be mounted before running this cell.

# Number of rows per chunk
chunksize = 1000

# Materialise every chunk yielded by the reader.
# nrows is set higher than the total number of rows in the file.
myList = list(
    pd.read_json(
        '/content/drive/MyDrive/Colab Notebooks/CDs_and_Vinyl_5.json.gz',
        compression='gzip',
        nrows=10000000,
        lines=True,
        chunksize=chunksize,
    )
)

# Stack the chunks row-wise into a single dataframe
myData = pd.concat(myList, axis=0)

In [None]:
# Show the first three rows of the loaded data
myData.head(n=3)

Unnamed: 0,reviewerID,asin,reviewerName,verified,reviewText,overall,reviewTime,summary,unixReviewTime,style,vote,image
0,A1H1DL4K669VQ9,1393774,Judith Paladino,True,Love it!! Great seller!,5,"04 29, 2016",Five Stars,1461888000,,,
1,A3V5XBBT7OZG5G,1393774,gflady,True,One of my very favourite albums from one of my...,5,"02 23, 2016",One of my very favourite albums from one of my...,1456185600,,,
2,A3SNL7UJY7GWBI,1393774,Lady Leatherneck,True,"THank you Jesus Lord God, that brother Green's...",5,"02 11, 2016",Five Stars,1455148800,,,


In [None]:
# Add a binary sentiment `label` column derived from the star rating in `overall`.
#   overall >= 4  -> 1 (positive)
#   overall <= 2  -> 0 (negative)
# Ratings that match neither condition (e.g. 3 stars, or a missing rating) carry
# no clear sentiment. Previously np.select fell back to its default of 0 for
# them, silently labelling those reviews as negative. They are now dropped.

# conditions, evaluated in order
conditions = [
    myData['overall'] >= 4,
    myData['overall'] <= 2,
]

# value assigned for each condition above
values = [1, 0]

# sentinel for rows matched by none of the conditions
UNLABELED = -1

# compute the labels, then keep only the rows that received a real label
labels = np.select(conditions, values, default=UNLABELED)
is_labeled = labels != UNLABELED

# .copy() so the new column is written to an independent frame, not a view
myData = myData.loc[is_labeled].copy()
myData['label'] = labels[is_labeled]

# previewing the new column
myData.head(3)

Unnamed: 0,reviewerID,asin,reviewerName,verified,reviewText,overall,reviewTime,summary,unixReviewTime,style,vote,image,label
0,A1H1DL4K669VQ9,1393774,Judith Paladino,True,Love it!! Great seller!,5,"04 29, 2016",Five Stars,1461888000,,,,1
1,A3V5XBBT7OZG5G,1393774,gflady,True,One of my very favourite albums from one of my...,5,"02 23, 2016",One of my very favourite albums from one of my...,1456185600,,,,1
2,A3SNL7UJY7GWBI,1393774,Lady Leatherneck,True,"THank you Jesus Lord God, that brother Green's...",5,"02 11, 2016",Five Stars,1455148800,,,,1


In [None]:
# only keep the verified reviews
# myData = myData[myData['verified'] == True]
# len(myData)

743956

In [None]:
# Clean `reviewText` into a new `clean_review` column.

# Make sure 'reviewText' holds strings. Missing reviews are replaced by an empty
# string first; casting NaN straight to str would produce the literal word
# "nan", which would then survive cleaning as a real token.
myData['reviewText'] = myData['reviewText'].fillna('').astype(str)

myData['clean_review'] = (
    myData['reviewText']
    # replace everything except letters and apostrophes with a space
    .str.replace("[^a-zA-Z']", " ", regex=True)
    # remove leading and trailing whitespace
    .str.strip()
    # convert to lowercase
    .str.lower()
)


In [None]:
# Remove English stop words from each cleaned review

def _drop_stopwords(review):
    """Return `review` without stop words, remaining words joined by single spaces."""
    kept = (word for word in review.split() if word not in en_stopwords)
    return ' '.join(kept)

myData['clean_review'] = myData['clean_review'].apply(_drop_stopwords)


In [None]:

#defining function for tokenization

def tokenization(text):
    """Split `text` into word tokens.

    The text is split on runs of non-word characters (regex ``\\W+``), so
    whitespace, punctuation and apostrophes all act as separators. Empty
    strings produced by leading/trailing separators are discarded.

    Parameters
    ----------
    text : str
        The text to tokenize.

    Returns
    -------
    list of str
        The non-empty tokens, in order of appearance. Empty for empty input.
    """
    # The raw-string pattern matters: the previous pattern 'W+' (no backslash)
    # split on literal capital "W" characters instead of on non-word characters.
    return [token for token in re.split(r'\W+', text) if token]
# Tokenize every cleaned review
myData['clean_review'] = myData['clean_review'].apply(tokenization)

In [None]:
# Inspect the processed review column
myData.loc[:, 'clean_review']

0                                          love great seller
1          one favourite albums one favourite singers hap...
2          thank jesus lord god brother green's music sti...
3          recall loving albums maybe one forgot figured ...
4          keith green pioneer field christian rock loved...
                                 ...                        
1443750    night hawk ten outstanding new recordings cowb...
1443751                                                   ok
1443752         great music great sound love music lots hits
1443753                        really good fun quality stuff
1443754    great lp guess love albums order came time cd'...
Name: clean_review, Length: 743956, dtype: object

In [None]:
# `targets` is never assigned anywhere in this notebook, so the original
# `type(targets)` raised NameError on a fresh kernel (Restart & Run All).
# NOTE(review): presumably `targets` was the label array; this checks the
# array type of the `label` column instead. Confirm the intent.
type(myData['label'].values)

numpy.ndarray

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable
train_data, test_data = train_test_split(
    myData,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Build the vocabulary from the training reviews only (num_words caps it at 10000)
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(texts=train_data['clean_review'])

In [None]:
# Map each review in both splits to its sequence of word indices
train_sequences, test_sequences = (
    tokenizer.texts_to_sequences(split['clean_review'])
    for split in (train_data, test_data)
)

In [None]:
# Bring every sequence to the same fixed length
maxlen = 256
x_train, x_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (train_sequences, test_sequences)
)

In [None]:
# Report the dimensions of the padded feature matrices
for caption, array in (('Shape of x_train:', x_train), ('Shape of x_test:', x_test)):
    print(caption, array.shape)

Shape of x_train: (1155004, 256)
Shape of x_test: (288751, 256)


In [None]:
# Pull the label column of each split out as an array
y_train, y_test = (split['label'].values for split in (train_data, test_data))

In [None]:
# Report the dimensions of the label arrays
for caption, array in (('Shape of y_train:', y_train), ('Shape of y_test:', y_test)):
    print(caption, array.shape)

Shape of y_train: (1155004,)
Shape of y_test: (288751,)


In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Model architecture: embedding -> average over the sequence -> dense -> sigmoid output
model = keras.Sequential([
    # input_dim equals the num_words value passed to the Tokenizer earlier in the notebook
    layers.Embedding(input_dim=10000, output_dim=16),
    layers.GlobalAveragePooling1D(),
    layers.Dense(units=16, activation='relu'),
    # single sigmoid unit for the binary label
    layers.Dense(units=1, activation='sigmoid'),
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 16)          160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


Compile the network using the Adam optimizer and the binary cross-entropy loss function.

In [None]:
# Configure training: binary cross-entropy loss, Adam optimizer, accuracy metric
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Fit for 10 epochs in batches of 256.
# NOTE(review): the test split is also passed as validation data here.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=256,
    epochs=10,
    validation_data=(x_test, y_test),
)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8fe39f48b0>

In [None]:
# Score the trained model on the held-out test split and report its accuracy
loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=False)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9082


In [None]:
# Repeat of the test-set evaluation (same call as the preceding cell)
loss, accuracy = model.evaluate(x=x_test, y=y_test, verbose=False)
print(f'Test Accuracy: {accuracy:.4f}')

Test Accuracy: 0.9314
