<a href="https://colab.research.google.com/github/Sifatkhan-1915020/deeplearning-/blob/main/Amazon_Reviews_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'amazon-reviews:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F3868600%2F6713271%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20241012%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20241012T143356Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D5c0ea9b8e1a7d79167f6ad0e95c38b1dc1ffc1be4f98848174befecd1b592005ddcdc797b9f3347fa1ab5e89a86ff7c609a1c7f5a90804616ddf294f35de949f08c99e6f1deefe0770a725a47cebdcb142e8e8577f1650d2c037e48e8f75a211518011ef4739fbecbe7a179de1e51012cc52db201ce014b99f531b4b817cae73d5ff3c698b91611c9ddc46147676212c72b79e66433aa5093235d95cb8b73a9bf27b1aa6e11857fa4b7991761bf669d23d691e5d20a358f46df890ccdb3ff7cdc9646f477bc51d188e08efcb27726fe356511bfb42ee2d8764bb35846dabf674de96ac02faed49954e875c6e75b17541194ac7b4868ef70e4a6ec9a243ea39d2'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# and unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # maxsplit=1: only the first ':' separates the directory from the URL.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it we only show a byte
            # counter instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            try:
                total_length = int(content_length)
            except (TypeError, ValueError):
                total_length = 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while data:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # The tar branch reopens the file by name, so buffered bytes must
            # reach the disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name: `as tarfile` would rebind
                # the imported `tarfile` module for the rest of the notebook.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it has to be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


# Import Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm
import random

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from collections import Counter
from string import punctuation

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score , f1_score, accuracy_score,confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.layers import Dense , Embedding , Bidirectional , LSTM

# The corpora used below (stop words, WordNet, the punkt tokenizer models) are
# not bundled with the nltk package, so fetch them up front; otherwise the
# first call to stopwords / word_tokenize / lemmatize raises LookupError on a
# fresh runtime. Identifiers unknown to the installed NLTK version are
# expected to be reported as failures by nltk.download rather than raised.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

# Shared lemmatizer instance used by DataPrep.
lemma = WordNetLemmatizer()

# Loading Dataset

In [None]:
# Load the reviews from a JSON Lines file (lines=True: one JSON object per line).
# NOTE(review): the path is relative to the working directory, while the
# download cell unpacks the data under /kaggle/input/amazon-reviews — confirm
# the file is actually reachable from here.
df = pd.read_json('Amazon reviews.json' , lines = True)

In [None]:
# Display the column names of the loaded frame.
df.columns

In [None]:
# Keep only the review text and the star rating; every other column is unused.
unused_columns = [
    'reviewerID',
    'asin',
    'reviewerName',
    'helpful',
    'summary',
    'unixReviewTime',
    'reviewTime',
]
df = df.drop(columns=unused_columns)

In [None]:
# Give the two remaining columns the names used by the rest of the notebook.
column_renames = {"reviewText": "Review", "overall": "Rating"}
df = df.rename(columns=column_renames)

In [None]:
# Number of whitespace-separated tokens per review. Non-string entries (e.g.
# NaN for a record without review text) count as length 0 instead of raising
# AttributeError on `.split()`.
df['review_len'] = [
    len(text.split()) if isinstance(text, str) else 0
    for text in df['Review']
]

In [None]:
# Keep only reviews that are between 20 and 40 words long (both inclusive).
length_in_range = (df['review_len'] >= 20) & (df['review_len'] <= 40)
df = df[length_in_range]

In [None]:
def convert_label(df) :
    """Map one row's star rating to a binary sentiment label.

    ``df`` is a single row (anything indexable by ``'Rating'``). Returns 0
    (negative) when the rating is at most 3.0 and 1 (positive) otherwise.
    """
    return 0 if df['Rating'] <= 3.0 else 1

In [None]:
# Replace the star rating by its binary label, computed row by row.
df['Rating'] = df.apply(func=convert_label, axis='columns')

In [None]:
# Preview the first rows after filtering and relabelling.
df.head()

# EDA

In [None]:
# Set the theme before the figure is created so that it applies to these axes.
sns.set_theme(style='darkgrid', palette='pastel')
color = sns.color_palette(palette='pastel')

label_count = df['Rating'].value_counts()
explode = [0.02]*len(label_count)

fig,axes = plt.subplots(nrows=1, ncols=2, figsize=(10,5))

# Left: share of each label.
axes[0].pie(label_count.values, labels=label_count.index, autopct='%1.1f%%', colors=color, explode=explode)
axes[0].set_title('Percentage Label')

# Right: absolute count of each label. The column is passed through the
# x/data keywords; a bare positional Series is interpreted as `data` by
# current seaborn releases and no longer plots the label counts.
sns.countplot(x='Rating', data=df, ax=axes[1])
axes[1].set_title('Count Label')
axes[1].set_xlabel('Label')
axes[1].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
def MostWordsUsed(txt , n_words) :
    """Return the ``n_words`` most frequent words of column ``txt`` of the global ``df``.

    Digits and non-word characters are stripped, words are lower-cased, and
    punctuation tokens and English stop words are ignored.

    Returns a list of ``(word, count)`` tuples, most frequent first.
    """
    # Join with a space: an empty separator glues the last word of a review
    # to the first word of the next one and corrupts the counts.
    all_text = ' '.join(df[txt].dropna().astype(str))

    all_text = re.sub(r'\d+', '', all_text) # numbers
    all_text = re.sub(r'[^\w\s]', '', all_text) # special characters

    punc = set(punctuation)
    stop_words = set(stopwords.words('english'))

    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so "The" / "And" would otherwise pass the filter and be counted.
    lowered = (word.lower() for word in all_text.split())
    word_counts = Counter(
        word for word in lowered if word not in punc and word not in stop_words
    )

    return word_counts.most_common(n_words)

In [None]:
# Bar chart of the 40 most frequent words in the raw reviews.
top_words = MostWordsUsed('Review' , 40)

xaxis = [term for term, _ in top_words]
yaxis = [frequency for _, frequency in top_words]

plt.figure(figsize=(16,5))
plt.bar(xaxis , yaxis)
plt.title('Most Commonly Used Words', fontsize=25)
plt.xlabel('Word')
plt.ylabel('Frequency')
plt.xticks(rotation=45)
# Leave room at the bottom for the rotated tick labels.
plt.subplots_adjust(bottom=0.15)
plt.show()

In [None]:
# Count of reviews per word count, restricted to lengths in (10, 1000].
plt.figure(figsize=(16,5))
length_subset = df[(df['review_len']>10) & (df['review_len']<=1000)]
ax = sns.countplot(data=length_subset, x='review_len', palette='Blues_r')
plt.title('Count of sentence with high number of words', fontsize=25)
# Hide the y ticks; the exact counts are written on the bars instead.
plt.yticks([])
first_bars = ax.containers[0]
ax.bar_label(first_bars)
plt.ylabel('count')
plt.xlabel('')
plt.show()

# Data Preprocessing

In [None]:
def DataPrep(text) :
    """Clean one review and return it as a space-joined string of lemmas.

    Steps: strip digits and non-word characters, tokenize, lower-case, drop
    punctuation tokens and English stop words, lemmatize with the shared
    ``lemma`` instance.
    """
    text = re.sub(r'\d+', '', text) # numbers
    text = re.sub(r'[^\w\s]', '', text) # special characters

    # tokenization
    tokens = nltk.word_tokenize(text)

    punc = set(punctuation)
    stop_words = set(stopwords.words('english'))

    # Lower-case BEFORE the stop-word test: the stop-word list is lower-case,
    # so capitalised stop words ("The", "This", ...) would otherwise survive
    # the filter and end up in the cleaned text.
    words = [token.lower() for token in tokens]
    words = [word for word in words if word not in punc and word not in stop_words]

    # lemmatization
    words = [lemma.lemmatize(word) for word in words]

    return ' '.join(words)

In [None]:
# Run the cleaning pipeline on every review.
df['cleaned_reviews'] = df['Review'].map(DataPrep)

In [None]:
# Report how many cleaned reviews repeat an earlier one.
duplicate_count = int(df["cleaned_reviews"].duplicated().sum())
print(f'There are around {duplicate_count} duplicated reviews, we will remove them.')

In [None]:
# Keep the first occurrence of every cleaned review.
df = df.drop_duplicates(subset="cleaned_reviews")

# Split the data

In [None]:
# Hold out 20% of the cleaned reviews for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    df['cleaned_reviews'],
    df['Rating'],
    train_size=0.80,
    random_state=42,
)

In [None]:
# Display the number of training and validation samples.
len(x_train) , len(x_val)

# Feature Extraction

In [None]:
# Learn the TF-IDF vocabulary from the training texts only.
vec = TfidfVectorizer()
vec.fit(x_train)
# `get_feature_names()` has been removed from scikit-learn; the fitted
# `vocabulary_` mapping holds one entry per feature in every release.
print("No. of feature words: ",len(vec.vocabulary_))

In [None]:
# Vectorize both splits with the fitted vocabulary and densify the result.
x_train, x_val = (
    vec.transform(texts).toarray()
    for texts in (x_train, x_val)
)

In [None]:
# Display the shapes of the training and validation feature matrices.
x_train.shape , x_val.shape

# Logistic Regression

In [None]:
# Fit a logistic-regression baseline on the TF-IDF features (seeded).
lr = LogisticRegression(random_state=42)
lr.fit(x_train , y_train)

In [None]:
# Score of the logistic regression on the data it was trained on.
train_acc1 = lr.score(x_train , y_train)

In [None]:
# Validation predictions and metrics for the logistic regression.
lr_pred = lr.predict(x_val)
val_acc1 = accuracy_score(y_val, lr_pred)

# Precision, recall and F1 all use the same weighted averaging.
val_precision1, val_recall1, val_f1score1 = (
    weighted_metric(y_val, lr_pred, average='weighted')
    for weighted_metric in (precision_score, recall_score, f1_score)
)

In [None]:
# Print the logistic-regression summary, one metric per line, each followed
# by an empty line.
lr_report_lines = (
    f"The training accuracy for logistic regression : {(train_acc1*100):0.2f}%\n",
    f"The validation accuracy for logistic regression : {(val_acc1*100):0.2f}%\n",
    f"The precision for logistic regression : {val_precision1:0.2f}\n",
    f"The recall for logistic regression : {val_recall1:0.2f}\n",
    f"The f1 score for logistic regression : {val_f1score1:0.2f}\n",
)
for lr_report_line in lr_report_lines:
    print(lr_report_line)

In [None]:
# Confusion matrix of the logistic regression on the validation split,
# drawn as an annotated heatmap.
lr_cm = confusion_matrix(y_val , lr_pred)
sns.heatmap(lr_cm, annot=True,fmt='3g')
plt.show()

# Random Forest Classifier

In [None]:
# Fit a random forest on the TF-IDF features. Seeded like the data split and
# the logistic regression so that repeated runs give the same scores.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train , y_train)

In [None]:
# Score of the random forest on the data it was trained on.
train_acc2 = rf.score(x_train , y_train)

In [None]:
# Validation predictions and metrics for the random forest.
rf_pred = rf.predict(x_val)

# Accuracy must be computed from the forest's own predictions; scoring
# `lr_pred` here would just repeat the logistic-regression accuracy.
val_acc2 = accuracy_score(y_val , rf_pred)

val_precision2 = precision_score(y_val , rf_pred , average='weighted')
val_recall2 = recall_score(y_val , rf_pred , average='weighted')
val_f1score2 = f1_score(y_val , rf_pred , average='weighted')

In [None]:
# Print the random-forest summary, one metric per line, each followed by an
# empty line.
rf_report_lines = (
    f"The training accuracy for Random Forest : {(train_acc2*100):0.2f}%\n",
    f"The validation accuracy for Random Forest : {(val_acc2*100):0.2f}%\n",
    f"The precision for Random Forest : {val_precision2:0.2f}\n",
    f"The recall for Random Forest : {val_recall2:0.2f}\n",
    f"The f1 score for Random Forest : {val_f1score2:0.2f}\n",
)
for rf_report_line in rf_report_lines:
    print(rf_report_line)

In [None]:
# Confusion matrix of the random forest on the validation split. Plot
# `rf_cm` itself; plotting `lr_cm` would show the logistic-regression matrix
# a second time.
rf_cm = confusion_matrix(y_val , rf_pred)
sns.heatmap(rf_cm, annot=True,fmt='3g')
plt.show()

# LSTM

In [None]:
# Split the cleaned texts again for the LSTM, with the same proportion and
# seed as the split used for the classical models.
X_train, X_val, Y_train, Y_val = train_test_split(
    df['cleaned_reviews'],
    df['Rating'],
    train_size=0.80,
    random_state=42,
)

In [None]:
# Display the number of training and validation samples for the LSTM.
len(X_train) , len(X_val)

In [None]:
# Flatten every cleaned review into one token list and count the tokens.
corpus = []
for cleaned_text in df['cleaned_reviews']:
    corpus.extend(cleaned_text.split())

words_count = Counter(corpus)
# (word, count) pairs, most frequent first.
sorted_words = words_count.most_common()

In [None]:
# Hyper-parameters of the LSTM input pipeline.
VOCAB_SIZE = len(sorted_words)        # number of distinct words in the cleaned corpus
EMBEDDING_DIM = 300                   # size of each word vector
MAX_LEN = df['review_len'].max()      # longest raw review, in words

In [None]:
def lstm_dataprep(row_data) :
    tokenizer = Tokenizer(num_words=VOCAB_SIZE , oov_token='<OOV>')
    tokenizer.fit_on_texts(row_data)
    seqs = tokenizer.texts_to_sequences(row_data)
    pad_seqs = pad_sequences(seqs , maxlen = MAX_LEN , padding='post')

    return pad_seqs

In [None]:
X_train = lstm_dataprep(X_train)
X_val = lstm_dataprep(X_val)

In [None]:
# Display the shapes of the padded training and validation sequences.
X_train.shape , X_val.shape

In [None]:
# Binary sentiment classifier: embedding -> two stacked bidirectional LSTMs
# -> dense head with a single sigmoid output.
model = Sequential([
    # One EMBEDDING_DIM-sized vector per token id; VOCAB_SIZE + 1 rows in the table.
    Embedding(VOCAB_SIZE + 1 , EMBEDDING_DIM , input_length=MAX_LEN) ,
    # return_sequences=True so the next LSTM receives the full sequence.
    # NOTE(review): 265 units looks like a typo for 256 — confirm.
    Bidirectional(LSTM(265 , return_sequences=True)) ,
    Bidirectional(LSTM(128)) ,
    Dense(64 , activation='relu') ,
    # Single sigmoid unit, matching the binary_crossentropy loss used at compile time.
    Dense(1 , activation='sigmoid')
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Binary cross-entropy with Adam; track accuracy, precision and recall under
# the history keys 'accuracy', 'precision' and 'recall'.
lstm_metrics = ['accuracy', Precision(name = 'precision'), Recall(name = 'recall')]
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=lstm_metrics,
)

In [None]:
# Train for 5 epochs with batches of 64, evaluating on the validation split
# after every epoch.
history = model.fit(
    x=X_train,
    y=Y_train,
    batch_size=64,
    epochs=5,
    validation_data=(X_val, Y_val),
)

In [None]:
# Accuracy of the last epoch on the training and validation data.
train_acc3, val_acc3 = (
    history.history[accuracy_key][-1]
    for accuracy_key in ('accuracy', 'val_accuracy')
)

In [None]:
# Last-epoch training metrics, one per line, each followed by an empty line.
lstm_train_lines = (
    f"The training loss for LSTM is : {history.history['loss'][-1]:0.2f}\n",
    f"The training accuracy for LSTM is : {(history.history['accuracy'][-1]*100):0.2f}%\n",
    f"The training precision for LSTM is : {history.history['precision'][-1]:0.2f}\n",
    f"The training recall for LSTM is : {history.history['recall'][-1]:0.2f}\n",
)
for lstm_train_line in lstm_train_lines:
    print(lstm_train_line)

In [None]:
# Last-epoch validation metrics, one per line, each followed by an empty line.
lstm_val_lines = (
    f"The validation loss for LSTM is : {history.history['val_loss'][-1]:0.2f}\n",
    f"The validation accuracy for LSTM is : {(history.history['val_accuracy'][-1]*100):0.2f}%\n",
    f"The validation precision for LSTM is : {history.history['val_precision'][-1]:0.2f}\n",
    f"The validation recall of for LSTM is : {history.history['val_recall'][-1]:0.2f}\n",
)
for lstm_val_line in lstm_val_lines:
    print(lstm_val_line)

In [None]:
# One figure per tracked metric, each showing the training and validation
# curves over the epochs: (history key, title, x label, y label).
history_plots = (
    ('loss', 'Training/Validation loss over Epochs', 'Epochs', 'loss'),
    ('accuracy', 'Training/Validation accuracy over Epochs', 'epoch', 'Accuracy'),
    ('precision', 'Training/Validation precision over Epochs', 'epoch', 'Precision'),
    ('recall', 'Training/Validation recall over Epochs', 'epoch', 'Recall'),
)

for metric_key, plot_title, x_label, y_label in history_plots:
    plt.subplots(figsize=(6,4))
    plt.plot(history.history[metric_key], label='training')
    # Keras stores the validation curve under the same key prefixed by 'val_'.
    plt.plot(history.history['val_' + metric_key], label='validation')

    plt.title(plot_title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.legend()
    plt.show()

In [None]:
# Predicted probabilities on the validation split, rounded to hard 0/1 labels.
y_val_pred = model.predict(X_val).round()

In [None]:
# Confusion matrix of the LSTM's rounded predictions on the validation split,
# drawn as an annotated heatmap.
lstm_cm = confusion_matrix(Y_val , y_val_pred)
sns.heatmap(lstm_cm, annot=True,fmt='3g')
plt.show()

# Compare between models

In [None]:
# Grouped bar chart comparing training and validation accuracy per model.
models = ['Logistic Regression','RandomForest','LSTM']
train_scores = [train_acc1, train_acc2, train_acc3]
val_scores = [val_acc1, val_acc2, val_acc3]

width = 0.25
x = np.arange(len(models))

fig, ax = plt.subplots(figsize=(20, 10))

# Training bars sit left of each tick, validation bars right of it.
rects1 = ax.bar(x - width, train_scores, width, label='Train Accuracy')
rects2 = ax.bar(x + width, val_scores, width, label='Validation Accuracy')

ax.set(
    xlabel='Models',
    ylabel='Accuracy',
    title='Comparison of Training and Validation Accuracies',
)
ax.set_xticks(x)
ax.set_xticklabels(models)
ax.legend()

def autolabel(rects):
    """Annotate every bar in ``rects`` with its height, formatted to three decimals.

    The text is drawn on the global ``ax``, centred horizontally on the bar
    and offset 2 points above its top.
    """
    for bar in rects:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            f'{bar_height:.3f}',
            xy=(bar_center, bar_height),
            xytext=(0, 2),
            textcoords="offset points",
            ha='center',
            va='bottom',
        )

# Write the accuracy value on top of every bar, then render the chart.
for bar_group in (rects1, rects2):
    autolabel(bar_group)

plt.show()