In [7]:
import pandas as pd
import numpy as np
import re, string, warnings
warnings.filterwarnings("ignore")

# NLP
import nltk
nltk.download('punkt_tab')
nltk.download("stopwords")
nltk.download("wordnet")

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# ML
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.over_sampling import RandomOverSampler

# Deep Learning
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

import pickle

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\varsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\varsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [8]:
df = pd.read_csv("amazon_reviews.csv")

X = df["reviewText"].astype(str)
y = df["overall"]

y = y.apply(lambda x: 0 if x <= 2 else (1 if x == 3 else 2))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)


In [9]:
def clean_text(text):
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', ' ', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    return text.lower()

stopword_set = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()


In [10]:
def preprocess(text):
    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stopword_set]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return " ".join(tokens)

X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

In [11]:
vocab_size = 20000
maxlen = 200

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen, padding='post')

In [12]:

ros = RandomOverSampler()
X_train_bal, y_train_bal = ros.fit_resample(X_train_pad, y_train)

model = Sequential([
    Embedding(vocab_size, 128, input_length=maxlen),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print(model.summary())

None


In [13]:

es = EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True
)


In [14]:

history = model.fit(
    X_train_bal,
    y_train_bal,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=[es],
    verbose=1
)


Epoch 1/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 163ms/step - accuracy: 0.8000 - loss: 0.4903 - val_accuracy: 0.9148 - val_loss: 0.3555
Epoch 2/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 155ms/step - accuracy: 0.9852 - loss: 0.0573 - val_accuracy: 0.8584 - val_loss: 0.3566
Epoch 3/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 156ms/step - accuracy: 0.9965 - loss: 0.0149 - val_accuracy: 1.0000 - val_loss: 8.5769e-04
Epoch 4/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 111ms/step - accuracy: 0.9988 - loss: 0.0043 - val_accuracy: 1.0000 - val_loss: 1.7674e-04
Epoch 5/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 100ms/step - accuracy: 0.9995 - loss: 0.0017 - val_accuracy: 1.0000 - val_loss: 2.7859e-04
Epoch 6/10
[1m251/251[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 99ms/step - accuracy: 1.0000 - loss: 2.6400e-04 - val_accuracy: 1.0000 - val_loss: 4.

In [15]:

loss, acc = model.evaluate(X_test_pad, y_test, verbose=0)
print("\nBiLSTM Test Loss:", loss)
print("BiLSTM Test Accuracy:", acc)

pred_probs = model.predict(X_test_pad)
pred_labels = np.argmax(pred_probs, axis=1)

print("\nBiLSTM Classification Report:")
print(classification_report(y_test, pred_labels))


BiLSTM Test Loss: 0.8099352121353149
BiLSTM Test Accuracy: 0.917005717754364
[1m39/39[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step

BiLSTM Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.55      0.62        88
           1       0.20      0.18      0.19        34
           2       0.95      0.97      0.96      1107

    accuracy                           0.92      1229
   macro avg       0.62      0.56      0.59      1229
weighted avg       0.91      0.92      0.91      1229



In [16]:
model.save("sentiment_bilstm_model.h5")

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

print("\nModel + Tokenizer saved successfully!")





Model + Tokenizer saved successfully!
