In [1]:
import findspark
import numpy as np
import re
import nltk
import tensorflow as tf
import json
from nltk.corpus import stopwords

""" 


This code imports the following Python libraries:

findspark: This library is used to find and initialize a Spark session.
numpy: This library is used for scientific computing with Python.
re: This library provides regular expression operations.
nltk: This library provides natural language processing (NLP) tools.
tensorflow: This library is used for machine learning and deep learning.
json: This library is used to encode and decode JSON data.
stopwords: This library provides a list of common English stop words.
These libraries are commonly used for tasks such as:

Text preprocessing: This involves cleaning and transforming text data to make it suitable for machine learning or NLP tasks.
Feature engineering: This involves creating new features from existing features to improve the performance of machine learning models.
Machine learning: This involves training and evaluating machine learning models to make predictions or decisions.
Deep learning: This involves training and evaluating deep learning models to solve complex machine learning problems.
Natural language processing: This involves developing and applying computer programs to understand and process human language.






"""

In [2]:
from numpy import array

from keras.preprocessing.text import one_hot, Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dropout, Dense
from keras.layers import Flatten, GlobalMaxPooling1D, Embedding, Conv1D, LSTM
from sklearn.model_selection import train_test_split



""""

This code imports the following classes and functions:

array from numpy: This function is used to create NumPy arrays.
one_hot from keras.preprocessing.text: This function converts a text sequence to a one-hot encoding.
Tokenizer from keras.preprocessing.text: This class is used to tokenize text into sequences of integers.
pad_sequences from keras.preprocessing.sequence: This function pads a sequence of integers to a fixed length.
Sequential from keras.models: This class is used to create sequential Keras models.
Activation from keras.layers: This class is used to add activation functions to Keras layers.
Dropout from keras.layers: This class is used to add dropout regularization to Keras layers.
Dense from keras.layers: This class is used to add fully connected layers to Keras models.
Flatten from keras.layers: This class is used to flatten the output of a Keras layer.
GlobalMaxPooling1D from keras.layers: This class is used to perform global max pooling on the output of a 1D Keras layer.
Embedding from keras.layers: This class is used to add embedding layers to Keras models.
Conv1D from keras.layers: This class is used to add convolutional layers to Keras models.
LSTM from keras.layers: This class is used to add long short-term memory (LSTM) layers to Keras models.
train_test_split from sklearn.model_selection: This function splits a dataset into training and test sets.

Commonly utilized for buildingbuilding and training text classification models with Keras(sentiment analysis)



"""

In [3]:
import pyspark
print(pyspark.version)

"""
importing pyspark and checking version
"""

<module 'pyspark.version' from 'c:\\Users\\siyam\\anaconda3\\lib\\site-packages\\pyspark\\version.py'>


In [4]:
findspark.init()
findspark.find()

'C:\\spark\\spark-3.4.1-bin-hadoop3.tgz'

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext

""" 
initializing a spark session and importing SQLContext object for reading data from our json file 
"""


In [7]:
## Loading the dataset

In [16]:
spark = SparkSession.builder.appName('SentimentAnalysisRNN').getOrCreate()

FileNotFoundError: [WinError 2] The system cannot find the file specified

In [1]:
dfReviews = SQLContext.read.json(r"Appliances.json", multiline=True).select("reviewText","overall")

"""
It reads a JSON file called Appliances.json into a Spark DataFrame using the SQLContext.read.json() method.
It specifies that the JSON file contains multiline records by setting the multiline parameter to True.
It selects the reviewText and overall columns from the DataFrame using the select() method.
The resulting DataFrame will contain two columns: reviewText and overall. The reviewText column will contain the text of the customer review, and the overall column will contain the customer's overall rating of the appliance out of 5.


"""

NameError: name 'SQLContext' is not defined

In [25]:
dfReviews.shape

"""
The variable dfReviews_shape will be a tuple containing two elements: the number of rows and the number of columns in the DataFrame.

"""

NameError: name 'dfReviews' is not defined

In [26]:
 ##Checking for missing values

dfReviews.isnull().values.any()

NameError: name 'dfReviews' is not defined

In [None]:
dfReviews = dfReviews[['reviewText','overall']]
dfReviews.columns = ['ProductReview', 'Sentiment']

dfReviews.head()


"""


The code dfReviews = dfReviews[['reviewText','overall']] selects the reviewText and overall columns from the dfReviews DataFrame and stores them in a new DataFrame called dfReviews.

The code dfReviews.columns = ['ProductReview', 'Sentiment'] renames the reviewText and overall columns to ProductReview and Sentiment, respectively.

The code dfReviews.head() prints the first few rows of the dfReviews DataFrame.

"""

In [None]:

POSITIVE_VALUE = 1
NEGATIVE_VALUE = 0
def determine_sentiments(labels):
  sentiments = []
  for label in labels:
    if label > 3.0:
      sentiment = POSITIVE_VALUE
    elif label <= 3.0:
      sentiment = NEGATIVE_VALUE
    sentiments.append(sentiment)
  return sentiments


""" 
The code above essentially is a function 
that takes the ratings out of 5 and assigns them a 
positive or negative value (1 or 0) based on the the ratign being 
smaller or greater than 3. 
The values are stored in a array and ultimately will be used used to encode our sentiment column values

"""

In [None]:
dfReviews['Sentiment'] = determine_sentiments(dfReviews.Sentiment)

In [None]:
# Let's observe distribution of positive / negative sentiments in dataset

import seaborn as sns
sns.countplot(x='Sentiment', data=dfReviews)

In [None]:
dfReviews["ProductReview"][2]

# You can see that our text contains punctuations, brackets, HTML tags and numbers 
# We will preprocess this text in the next section

In [None]:
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    '''Removes HTML tags: replaces anything between opening and closing <> with empty space'''

    return TAG_RE.sub('', text)

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
def preprocess_text(sen):
    '''Cleans text data up, leaving only 2 or more char long non-stepwords composed of A-Z & a-z only
    in lowercase'''
    
    sentence = sen.lower()

    # Remove html tags
    sentence = remove_tags(sentence)

    # Remove punctuations and numbers
    sentence = re.sub('[^a-zA-Z]', ' ', sentence)

    # Single character removal
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence)  # When we remove apostrophe from the word "Mark's", the apostrophe is replaced by an empty space. Hence, we are left with single character "s" that we are removing here.

    # Remove multiple spaces
    sentence = re.sub(r'\s+', ' ', sentence)  # Next, we remove all the single characters and replace it by a space which creates multiple spaces in our text. Finally, we remove the multiple spaces from our text as well.

    # Remove Stopwords
    pattern = re.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
    sentence = pattern.sub('', sentence)

    return sentence

In [None]:
# Calling preprocessing_text function on appliance product_reviews

X = []
sentences = list(dfReviews['ProductReview'])
for sen in sentences:
    X.append(preprocess_text(sen))

In [None]:
# Sample cleaned up appliance review 

X[2]

# As we shall use Word Embeddings, stemming/lemmatization is not performed as a preprocessing step here
#

In [None]:
y= dfReviews['Sentiment']
#assigning target variable

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# The train set will be used to train our deep learning models 
# while test set will be used to evaluate how well our model performs 
#The train and test split is also how we adress issues associated to overtfitting and underfitting.

In [None]:
# Embedding layer expects the words to be in numeric form 
# Using Tokenizer function from keras.preprocessing.text library
# Method fit_on_text trains the tokenizer 
# Method texts_to_sequences converts sentences to their numeric form

word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(X_train)

X_train = word_tokenizer.texts_to_sequences(X_train)
X_test = word_tokenizer.texts_to_sequences(X_test)

In [None]:
vocab_length = len(word_tokenizer.word_index) + 1

vocab_length


"""


The vocab_length of a word tokenizer is the number of unique words that the tokenizer has seen. 
The word_tokenizer.word_index property returns a dictionary that maps words to their integer indices. 
The keys to the dictionary are the unique words that the tokenizer has seen, and the values are the integer indices that are assigned to the words.

The + 1 in the vocab_length = len(word_tokenizer.word_index) + 1 expression is to account for the [UNK] token. 
The [UNK] token is used to represent words that the tokenizer has not seen before.
Can be considered and used as a hyperparameter for when we are fiting our model.
"""

In [None]:
# Padding all reviews to fixed length 100

maxlen = 100

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

In [None]:
# Load GloVe word embeddings and create an Embeddings Dictionary

from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('a2_glove.6B.100d.txt', encoding="utf8")

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary [word] = vector_dimensions
glove_file.close()


""" 
The code creates a dictionary of word embeddings from the GloVe word embedding file a2_glove.6B.100d.txt.

The GloVe word embedding file is a text file that contains one line per word.
Each line contains the word and its corresponding word embedding vector. 
The word embedding vector is a dense vector of real numbers that represents the meaning of the word.

The code iterates over the lines in the GloVe file and splits each line into two parts: the word and the word embedding vector. 
The word embedding vector is converted to a NumPy array. 
The word and the word embedding vector are then added to the embeddings_dictionary dictionary.



"""

In [None]:
# Create Embedding Matrix having 100 columns 
# Containing 100-dimensional GloVe word embeddings for all words in our corpus.

embedding_matrix = zeros((vocab_length, 100))
for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [None]:
embedding_matrix.shape

""" 
he shape of an embedding matrix is (vocab_size, embedding_dim), where:

vocab_size is the number of unique words in the vocabulary.
The embedding_dim is the dimensionality of the word embeddings.
For example, if the vocabulary contains 10,000 unique words and the embedding dimension is 100, 
then the embedding matrix will have a shape of (10000, 100).

The embedding matrix is a dense matrix that represents the meaning of words in a machine learning model. 
Each row in the embedding matrix represents a word, and each column in the embedding matrix represents a dimension of the word's meaning.

The embedding matrix is typically initialized with random values, and then the model learns to update the values of the embedding matrix during training.



"""

In [None]:
from keras.layers import LSTM

In [None]:
# Neural Network architecture

lstm_model = Sequential()
#Adding the embedding layer to the model 
embedding_layer = Embedding(vocab_length, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)

#Adding an LSTM Layer to the model
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(128))

#Output layer of the model that predicts the  sentiment of the input text/review
lstm_model.add(Dense(1, activation='sigmoid'))


"""

Embedding layer: The embedding layer converts each word in the input text to a dense vector of real numbers. The embedding matrix is initialized with the pre-trained GloVe word embeddings.
LSTM layer: The LSTM layer learns long-term dependencies in the input text.
Output layer: The output layer predicts the class of the input text.
The trainable=False parameter in the embedding_layer layer prevents the embedding matrix from being updated during training. This is useful if you are using pre-trained word embeddings and you do not want to update them.

To train the model, you can use the model.fit() method. The training dataset should contain text data and the corresponding class labels.

Once the model is trained, you can use it to predict the class of new text data using the model.predict() method.



"""

In [None]:
# Model compiling

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(lstm_model.summary())

In [None]:
# Model Training

lstm_model_history = lstm_model.fit(X_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

In [None]:
# Predictions on the Test Set

score = lstm_model.evaluate(X_test, y_test, verbose=1)

In [None]:
# Model Performance

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

In [None]:
""" 
Model Optimization

Tuning the hyperparameters of the embedding layer, adjusting the parameters by increasing/decreasing weights,vocab length, embedding dimensionality and the maximum lenght of the input text sequences

"""

In [None]:
# Model Performance Charts

import matplotlib.pyplot as plt

plt.plot(lstm_model_history.history['acc'])
plt.plot(lstm_model_history.history['val_acc'])

plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()

plt.plot(lstm_model_history.history['loss'])
plt.plot(lstm_model_history.history['val_loss'])

plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train','test'], loc='upper left')
plt.show()


"""

The lstm_model_history.history variable contains a dictionary of metrics that were recorded during training. 
The acc key in the dictionary contains the accuracy of the model on the training dataset, and the val_acc key contains the accuracy of the model on the test dataset.

The plt.plot() function is used to plot the accuracy of the model on the training and test datasets. The plt.title(), plt.ylabel(), and plt.xlabel() functions are used to set the title, y-axis label, and x-axis label of the plot, respectively. 
The plt.legend() function is used to add a legend to the plot.

The plt.show() function is used to display the plot.

The loss key in the lstm_model_history.history dictionary contains the loss of the model on the training dataset, and the val_loss key contains the loss of the model on the test dataset.

The plt.plot(), plt.title(), plt.ylabel(), plt.xlabel(), and plt.legend() functions are used to create a plot of the loss of the model on the training and test datasets, similar to the plot of the accuracy of the model.

The plt.show() function is used to display the plot.

These plots can be used to evaluate the performance of the LSTM model. The accuracy and loss of the model on the training and test datasets can be used to determine whether the model is overfitting or underfitting the training data.

If the accuracy of the model on the test dataset is significantly lower than the accuracy of the model on the training dataset, then the model is overfitting the training data. Overfitting occurs when the model learns the specific features of the training data too well, and is unable to generalize to new data.

If the accuracy of the model on the test dataset is significantly higher than the accuracy of the model on the training dataset, then the model is underfitting the training data. Underfitting occurs when the model does not learn enough features from the training data, and is unable to make accurate predictions on new data.

The accuracy and loss of the model on the training and test datasets can be used to tune the hyperparameters of the LSTM model, such as the number of epochs to train the model for and the learning rate. The goal of tuning the hyperparameters is to improve the accuracy of the model on the test dataset without overfitting the training data.




"""