1. Import Libraries
python

In [4]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from nltk.corpus import stopwords
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import BertTokenizer, TFBertForSequenceClassification

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Swapnil\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


2. Data Preprocessing
python

In [14]:
# Load the dataset
data = pd.read_csv('Reviews.csv')  # Replace with the path to your extracted Reviews.csv

# Display the first few rows
print(data.head())

# Select relevant columns under shorter names. rename() returns a new DataFrame,
# so the column assignments below write to an independent frame instead of a
# selection of the raw one (which is what triggers SettingWithCopyWarning).
data = data[['Text', 'Score']].rename(columns={'Text': 'review', 'Score': 'score'})

# Convert scores to sentiment labels (Positive: 1, Negative: 0).
# Only scores strictly above 3 count as positive, so a score of 3 is labelled 0.
# Vectorised comparison instead of a per-row lambda; the result is an int64 column.
data['sentiment'] = (data['score'] > 3).astype('int64')

# Build the English stop-word set used by the text-cleaning step
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def clean_text(text, stopword_set=None):
    """Normalize one review: keep letters only, lowercase, and drop stop words.

    Args:
        text: Raw review text. A non-string value (e.g. NaN or None coming from
            an empty CSV cell) is treated as an empty review instead of raising
            a TypeError inside ``re.sub``.
        stopword_set: Optional collection of lowercase words to remove. When
            omitted, the module-level ``stop_words`` set is used, which keeps
            existing one-argument calls working unchanged.

    Returns:
        The remaining words joined by single spaces ('' when nothing is left).
    """
    if not isinstance(text, str):
        return ''
    if stopword_set is None:
        # Resolved at call time so the global can be (re)defined after this def.
        stopword_set = stop_words
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuation and numbers
    tokens = letters_only.lower().split()  # Lowercase, then split on any whitespace
    return ' '.join(word for word in tokens if word not in stopword_set)

# Apply text cleaning. assign() builds a new frame instead of writing a column
# into `data` in place, so re-running this cell gives the same result and does
# not raise SettingWithCopyWarning when `data` is a selection of another frame.
data = data.assign(cleaned_reviews=data['review'].apply(clean_text))

# The train/test split is performed once, in section 3 (Split Data). The
# identical split that was repeated in this cell was redundant and is removed.

   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Swapnil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


3. Split Data

In [15]:
# Hold out 20% of the cleaned reviews as a test set; the remaining 80% is used
# for training. Only these two subsets are produced here (no separate
# validation set), and random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_reviews'],
    data['sentiment'],
    random_state=42,
    test_size=0.2,
)


4. Sentiment Analysis with LSTM

4.1 Tokenization and Padding

In [16]:
# Fit the vocabulary on the training reviews only (capped via num_words=5000);
# words outside it are replaced by the '<OOV>' token when encoding.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=5000)
tokenizer.fit_on_texts(X_train)

# Encode each split as lists of word indices (train first, then test)
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Bring every sequence to exactly max_len entries, padding at the end
# (padding='post'); longer sequences are truncated by pad_sequences.
max_len = 100
X_train_padded, X_test_padded = (
    pad_sequences(seqs, maxlen=max_len, padding='post')
    for seqs in (X_train_seq, X_test_seq)
)


4.2 Build and Train LSTM Model

In [17]:
# Sequential, Embedding, LSTM, Dense and Dropout are already imported in the
# notebook's import cell, so they are not re-imported here.

# Define the LSTM model: embedding -> two stacked LSTMs -> sigmoid output for
# the binary sentiment label.
# `input_length` is deliberately not passed to Embedding: the argument is
# deprecated in Keras 3 and the sequence length is taken from the input batch.
model = Sequential([
    Embedding(input_dim=5000, output_dim=128),  # input_dim matches Tokenizer(num_words=5000)
    LSTM(64, return_sequences=True),  # emit every timestep so the next LSTM gets a sequence
    Dropout(0.5),
    LSTM(32),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): the test split is also passed as validation data, so the val_*
# metrics are not an independent check of the final test score. Consider
# carving a validation set out of the training data instead.
history = model.fit(
    X_train_padded,
    y_train,
    epochs=5,
    validation_data=(X_test_padded, y_test),
    batch_size=64,
)



Epoch 1/5
[1m7106/7106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m745s[0m 104ms/step - accuracy: 0.8221 - loss: 0.4231 - val_accuracy: 0.8961 - val_loss: 0.2524
Epoch 2/5
[1m7106/7106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m768s[0m 108ms/step - accuracy: 0.9025 - loss: 0.2410 - val_accuracy: 0.9112 - val_loss: 0.2218
Epoch 3/5
[1m7106/7106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m777s[0m 109ms/step - accuracy: 0.9218 - loss: 0.1981 - val_accuracy: 0.9167 - val_loss: 0.2101
Epoch 4/5
[1m7106/7106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m779s[0m 110ms/step - accuracy: 0.9354 - loss: 0.1686 - val_accuracy: 0.9219 - val_loss: 0.2105
Epoch 5/5
[1m7106/7106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m779s[0m 110ms/step - accuracy: 0.9466 - loss: 0.1429 - val_accuracy: 0.9233 - val_loss: 0.2170


4.3 Evaluate LSTM Model

In [34]:
# Make predictions: threshold the sigmoid outputs at 0.5 and flatten the
# per-sample prediction column to 1-D so it has the same shape as y_test.
y_pred = (model.predict(X_test_padded) > 0.5).astype("int32").ravel()

# Cheap guard against stale notebook state: features and labels must at least
# describe the same number of samples.
assert len(y_pred) == len(y_test), "X_test_padded and y_test have different lengths"

# NOTE(review): the output recorded for this cell (accuracy ~0.77, class-0
# recall 0.04) disagrees with the val_accuracy of ~0.92 reported during training
# on nominally the same X_test_padded / y_test, and the execution count jumps
# from In[17] to In[34]. That suggests `model`, `X_test_padded` or `y_test` was
# reassigned in between. Confirm with Restart Kernel -> Run All before trusting
# these numbers.

# Evaluate model performance
print("LSTM Model Accuracy: ", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 20ms/step
LSTM Model Accuracy:  0.7678796034866436
              precision    recall  f1-score   support

           0       0.27      0.04      0.07     24666
           1       0.78      0.97      0.87     89025

    accuracy                           0.77    113691
   macro avg       0.53      0.50      0.47    113691
weighted avg       0.67      0.77      0.69    113691

