# Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
import plotly.express as px
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
import re
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import utils
import os
from sklearn.metrics import confusion_matrix, classification_report

import warnings
warnings.filterwarnings("ignore")

In [2]:
'''Build a function to convert data into a data frame with 2 columns: label, text,
The function takes the file or the train/test data
and a for loop loops over the text in the file to split texts from labels'''

def process_data(file):
    data = []
    for index, row in file.iterrows():
         # first line data is raw data
        line = row[0]

        #split lines into text and labels
        label, text = line.split(' ', 1)

        #remove the __label__ only keep the number
        label = label.replace('__label__', '')

        #append
        data.append((label, text.strip()))

    cols = ['label', 'review']
    return pd.DataFrame(data, columns=cols)

# 2.2 Data Cleaning

In [3]:
def text_cleaning(text):
  #convert to lower case
  text=text.lower()

  #remove special characters and numsbers and extra whitespace
  pattern_punc = r'[^a-zA-Z\s]'
  text = re.sub(pattern_punc, '', text)
  return text


# Data-set loading

In [4]:
#train_path = '/Dataset/Amazon Reviews DataSet/train.ft.txt.bz2'
#test_path = '/Dataset/Amazon Reviews DataSet/test.ft.txt.bz2'

In [5]:
train_path = 'DataSet/train.ft.txt.bz2/train.ft.txt.bz2'
test_path = 'dataSet/test.ft.txt.bz2/test.ft.txt.bz2'

#3- Read Data


In [6]:
train_data=pd.read_csv(train_path,compression='bz2',delimiter='\t')
test_data=pd.read_csv(test_path,compression='bz2',delimiter='\t')

In [7]:
"""train_data=pd.read_csv(
    train_path,compression='bz2',
    delimiter='\t'
).sample(frac=0.4, random_state=42) # random_state ensures reproducibility
test_data=pd.read_csv(
    test_path,
    compression='bz2',
    delimiter='\t'
).sample(frac=0.5, random_state=42) # random_state ensures reproducibility"""

"train_data=pd.read_csv(\n    train_path,compression='bz2',\n    delimiter='\t'\n).sample(frac=0.4, random_state=42) # random_state ensures reproducibility\ntest_data=pd.read_csv(\n    test_path,\n    compression='bz2',\n    delimiter='\t'\n).sample(frac=0.5, random_state=42) # random_state ensures reproducibility"

In [8]:
print(train_data.head())

  __label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^
0  __label__2 The best soundtrack ever to anythin...                                                                                                                                                                                                                                                                                                                                                                                                   
1  __label__2 Amazing!: This soundtrack is my fav...                                                                    

it's shown that the data consists of 2 parts:¶
label, text. Therefore we will build a function that separates each part from each other and create a dataframe of 2 columns

In [9]:
test_data.head()

Unnamed: 0,"__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I'm in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life's hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing ""Who was that singing ?"""
0,__label__2 One of the best game music soundtra...
1,__label__1 Batteries died within a year ...: I...
2,"__label__2 works fine, but Maha Energy is bett..."
3,__label__2 Great for the non-audiophile: Revie...
4,__label__1 DVD Player crapped out after one ye...


In [10]:
print("Train data shape" ,train_data.shape)
print("Test data shape" ,test_data.shape)

Train data shape (3599999, 1)
Test data shape (399999, 1)


In [11]:
train=process_data(train_data)
train.head()

Unnamed: 0,label,review
0,2,The best soundtrack ever to anything.: I'm rea...
1,2,Amazing!: This soundtrack is my favorite music...
2,2,Excellent Soundtrack: I truly like this soundt...
3,2,"Remember, Pull Your Jaw Off The Floor After He..."
4,2,an absolute masterpiece: I am quite sure any o...


In [12]:
test=process_data(test_data)
test.head()

Unnamed: 0,label,review
0,2,One of the best game music soundtracks - for a...
1,1,Batteries died within a year ...: I bought thi...
2,2,"works fine, but Maha Energy is better: Check o..."
3,2,Great for the non-audiophile: Reviewed quite a...
4,1,DVD Player crapped out after one year: I also ...


1 is the negative review while 2 is the positive one so we will replace 2 with 1 and 1 with 0

In [13]:
train['label'].value_counts()

label
1    1800000
2    1799999
Name: count, dtype: int64

In [14]:
train['label']=train['label'].replace({"2":"1","1":"0"})
test['label']=test['label'].replace({"2":"1","1":"0"})

In [15]:
print("Count of lables in the train data is:",train['label'].value_counts())
print("_"*50)
print("Count of lables in the test data is:",test['label'].value_counts())

Count of lables in the train data is: label
0    1800000
1    1799999
Name: count, dtype: int64
__________________________________________________
Count of lables in the test data is: label
0    200000
1    199999
Name: count, dtype: int64


# 4- Cleaned Data

In [16]:
train['review_cleaned']=train['review'].apply(text_cleaning)
train.head()
train.tail()

Unnamed: 0,label,review,review_cleaned
3599994,0,Don't do it!!: The high chair looks great when...,dont do it the high chair looks great when it ...
3599995,0,"Looks nice, low functionality: I have used thi...",looks nice low functionality i have used this ...
3599996,0,"compact, but hard to clean: We have a small ho...",compact but hard to clean we have a small hous...
3599997,0,what is it saying?: not sure what this book is...,what is it saying not sure what this book is s...
3599998,1,Makes My Blood Run Red-White-And-Blue: I agree...,makes my blood run redwhiteandblue i agree tha...


In [17]:
test['review_cleaned']=test['review'].apply(text_cleaning)
test.head()
test.tail()

Unnamed: 0,label,review,review_cleaned
399994,0,Unbelievable- In a Bad Way: We bought this Tho...,unbelievable in a bad way we bought this thoma...
399995,0,"Almost Great, Until it Broke...: My son reciev...",almost great until it broke my son recieved th...
399996,0,Disappointed !!!: I bought this toy for my son...,disappointed i bought this toy for my son who...
399997,1,Classic Jessica Mitford: This is a compilation...,classic jessica mitford this is a compilation ...
399998,0,"Comedy Scene, and Not Heard: This DVD will be ...",comedy scene and not heard this dvd will be a ...


In [18]:
import nltk
nltk.download('stopwords')
print(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

# 5- Tokenization and Padding

In [19]:
max_words = 1000
max_len = 100

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(train['review_cleaned'])

X_train = tokenizer.texts_to_sequences(train['review_cleaned'])
X_test = tokenizer.texts_to_sequences(test['review_cleaned'])

X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

In [20]:
y_train=train['label']
y_test=test['label']

#6- Data Splitting

In [21]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [22]:
X_train = np.array(X_train)
print("Train:", X_train.shape)
y_train = np.array(y_train)
print("Train:", y_train.shape)
print("-"*50)
X_valid = np.array(X_valid)
print("Valid:", X_valid.shape)
y_valid = np.array(y_valid)
print("Valid:", y_valid.shape)
print("-"*50)
X_test = np.array(X_test)
print("Test:", X_test.shape)
y_test = np.array(y_test)
print("Test:", y_test.shape)

Train: (2879999, 100)
Train: (2879999,)
--------------------------------------------------
Valid: (720000, 100)
Valid: (720000,)
--------------------------------------------------
Test: (399999, 100)
Test: (399999,)


In [23]:
y_train=y_train.astype('int32')
y_valid=y_valid.astype('int32')
y_test=y_test.astype('int32')



# 7- LSTM model building

In [24]:
model = Sequential()
model.add(Input(shape=(max_words,), dtype='int32'))
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=256))
model.add(Dense(1,activation='sigmoid'))
model.summary()

# 8- Model Training

In [None]:
model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
model.fit(X_train,y_train,validation_data=(X_valid,y_valid),epochs=5,batch_size=2048, verbose=1)

Epoch 1/5
[1m  37/1407[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m2:08:11[0m 6s/step - accuracy: 0.6134 - loss: 0.6726

# 9- Model Evaluation

In [None]:
history = model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=5, batch_size=2048, verbose=1)

# Plotting loss
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

# Plotting accuracy
plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
loss,acc=model.evaluate(X_test,y_test)
print("Validation Loss = ", loss)
print("Validation Accuracy = ", acc)
print("Validation Accuracy = ", acc*100,"%")

In [None]:
loss, acc = model.evaluate(X_test, y_test)
print("Test Loss = ", loss)
print("Test Accuracy = ", acc*100, "%")

# 10- Classification Report

In [None]:
preds = model.predict(X_test)

threshold = 0.5
pred_labels = (preds >= threshold)
true_labels = y_test

cr = classification_report(true_labels, pred_labels)
print(cr)

# 11- Confusion Matrix

In [None]:
import seaborn as sns
CM = confusion_matrix(y_test, pred_labels)
sns.heatmap(CM, annot = True,fmt='g', cmap='Reds')
CM

#12 Save model

In [None]:
model.save('sentiment_lstm_model.h5')