## Data Preprocessing and EDA

### Importing libraries

In [11]:
import pandas as pd
import re 

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
import string
from nltk.stem import WordNetLemmatizer


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sophia.bouchama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sophia.bouchama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\sophia.bouchama\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [12]:
# Read in file
reviews_data = pd.read_csv("../data/Reviews.csv")

In [13]:
reviews_data = reviews_data.drop_duplicates(subset=["UserId","Time","Text"])

In [14]:
# HelpfulnessDenominator should be greater than or equal to HelpfulnessNumerator. Remove those that do not satisfy this condition 
reviews_data = reviews_data[reviews_data["HelpfulnessNumerator"] <= reviews_data["HelpfulnessDenominator"]]

In [15]:
# Convert the Time column into datetime format
reviews_data['Time'] = pd.to_datetime(reviews_data['Time'], unit='s')

In [16]:
# Create columns for Positive, Negative and Neutral Reviews based on Score
reviews_data['PositiveReviews'] = reviews_data['Score'] > 3
reviews_data['NegativeReviews'] = reviews_data['Score'] < 3
reviews_data['NeutralReviews'] = reviews_data['Score'] == 3

In [17]:
# Create a preprocessing function with optional steps
def preprocessing(sentence, remove_html=True, lowercasing=True, remove_numbers=True, remove_punctuation=True, tokenize=True, remove_stopwords=True, lemmatize=True):

    # Removing whitespaces
    sentence = sentence.strip()

    # Removing HTML tags
    if remove_html:
        sentence = re.sub(r'<.*?>', '', sentence)

    # Lowercasing
    if lowercasing:
        sentence = sentence.lower()

    # Removing numbers
    if remove_numbers:
        sentence = ''.join(char for char in sentence if not char.isdigit())

    # Removing punctuation
    if remove_punctuation:
        for punctuation in string.punctuation:
            sentence = sentence.replace(punctuation, '')

    # Tokenizing
    if tokenize:    
        sentence = word_tokenize(sentence)

    # Removing stopwords
    if remove_stopwords and tokenize:
        stop_words = set(stopwords.words('english'))
        sentence = [word for word in sentence if word not in stop_words]

    # Lemmatizing
    if lemmatize and tokenize:
        lemmatizer = WordNetLemmatizer()
        lemmatized = [lemmatizer.lemmatize(word) for word in sentence]
        sentence = " ".join(lemmatized)

    return sentence   

## RNN (LSTM) Model

Firstly, we need to create our labelled data to create a binary classification problem. 

label 0 = negative review

label 1 = positive review


Where positive review is > 3 and negative review is < 3.

Create a copy of the dataframe in order to apply specific preprocessing for this model

In [18]:

lstm_df = reviews_data.copy()

In [22]:
lstm_df['Text'] = lstm_df['Text'].apply(lambda x: preprocessing(x, lowercasing=False, remove_numbers=False, remove_punctuation=True, tokenize=False))


In [23]:
lstm_df['Label'] = lstm_df['PositiveReviews'].astype(int)

In [24]:
from sklearn.model_selection import train_test_split

X = lstm_df['Text']
y= lstm_df['Label']
X_train, X_test, y_train, y_test = train_test_split(lstm_df['Text'], lstm_df['Label'], test_size=0.2, random_state=42, stratify=y)


In [47]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)  # Fit only on training data
X_train_token = tokenizer.texts_to_sequences(X_train)
X_test_token = tokenizer.texts_to_sequences(X_test)


X_train_pad = pad_sequences(X_train_token, padding='post', dtype='float32', maxlen=100)
X_test_pad = pad_sequences(X_test_token, padding='post', dtype='float32', maxlen=100)


In [48]:
from tensorflow.keras import Sequential, layers

vocab_size = len(tokenizer.word_index)
embedding_dim = 50

# Define model architecture
model = Sequential()
model.add(layers.Embedding(input_dim=vocab_size + 1,
                    output_dim=embedding_dim,
                    mask_zero=True))
model.add(layers.LSTM(20))
model.add(layers.Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])



In [49]:
from tensorflow.keras.callbacks import EarlyStopping

es = EarlyStopping(patience=4, restore_best_weights=True)

In [50]:
# Train the model
model.fit(X_train_pad, y_train, epochs=10, batch_size=16, validation_split=0.2, callbacks=[es])



Epoch 1/10


[1m15756/15756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3378s[0m 214ms/step - accuracy: 0.8647 - loss: 0.3165 - val_accuracy: 0.9068 - val_loss: 0.2272
Epoch 2/10
[1m15756/15756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1493s[0m 95ms/step - accuracy: 0.9235 - loss: 0.1920 - val_accuracy: 0.9104 - val_loss: 0.2256
Epoch 3/10
[1m15756/15756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1495s[0m 95ms/step - accuracy: 0.9442 - loss: 0.1435 - val_accuracy: 0.9106 - val_loss: 0.2318
Epoch 4/10
[1m15756/15756[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1533s[0m 97ms/step - accuracy: 0.9607 - loss: 0.1060 - val_accuracy: 0.9027 - val_loss: 0.2553
Epoch 5/10
[1m 2210/15756[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m23:09[0m 103ms/step - accuracy: 0.9769 - loss: 0.0691

In [None]:
# Save the model
model.model.save("./RNN1.h5")

In [None]:
# Evaluate the model
model.evaluate(X_test_pad, y_test)