# Sentiment Analysis Using LSTM

The objective of this project is to perform sentiment analysis on movie reviews using an LSTM-based model. We'll classify reviews as positive or negative based on their text content.

In [1]:
!pip install tensorflow nltk




In [2]:
!pip install tensorflow-datasets



# Import Libraries

In [14]:
# Import Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, Input
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
import re
import tensorflow_datasets as tfds

# Load the IMDb Dataset

In [8]:
# Fetch the IMDb reviews dataset together with its metadata object.
# as_supervised=True makes each example a (review, label) pair, which is how
# the later cells unpack it.
imdb, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)

# Keep the two splits under their own names for the cells below.
train_data = imdb['train']
test_data = imdb['test']


# Convert Dataset to DataFrame

In [9]:
# Convert a slice of the TensorFlow dataset into a plain Python list.
# Only the first 100 samples are taken here; this frame is just a preview.
train_examples = [
    (review.numpy().decode('utf-8'), label.numpy())
    for review, label in train_data.take(100)
]

# Build a pandas DataFrame from the (text, label) pairs
df_train = pd.DataFrame(
    train_examples,
    columns=['Review', 'Label'],
)

# Show the first few rows of the DataFrame
df_train.head()


Unnamed: 0,Review,Label
0,This was an absolutely terrible movie. Don't b...,0
1,"I have been known to fall asleep during films,...",0
2,Mann photographs the Alberta Rocky Mountains i...,0
3,This is the kind of film for a snowy Sunday af...,1
4,"As others have mentioned, all the women that g...",1


# Text Preprocessing

In [10]:
# Make sure the NLTK stopword corpus is available locally, then keep the
# English stopwords in a set so membership checks during cleaning are fast.

nltk.download('stopwords')
STOPWORDS = {word for word in stopwords.words('english')}

# Clean the text: strip HTML tags, non-word characters, digits and stopwords.
def clean_text(text, stop_words=None):
    """Normalize a raw review string into space-separated lowercase tokens.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML-style tags (e.g. ``<br />``) with a space so the tag
         name does not survive as a junk token such as ``br``;
      3. replace every non-word character (punctuation, whitespace) with a space;
      4. delete runs of digits;
      5. split on whitespace and drop stopwords.

    Args:
        text: The review as a ``str``.
        stop_words: Optional collection of lowercase words to drop. When
            omitted, the module-level ``STOPWORDS`` set is used.

    Returns:
        The cleaned text: remaining tokens joined by single spaces (an empty
        string if nothing remains).
    """
    if stop_words is None:
        stop_words = STOPWORDS

    text = text.lower()
    # Tags must go before the \W pass; otherwise only the angle brackets and
    # slash are stripped and the tag name is left behind as a word.
    # Requiring a letter right after '<' avoids eating text like "3 < 5 > 2".
    text = re.sub(r'</?[a-z][^<>]*>', ' ', text)
    text = re.sub(r'\W', ' ', text)  # punctuation and whitespace -> space
    text = re.sub(r'\d+', '', text)  # remove digits
    # str.split() already collapses repeated whitespace, so no separate
    # whitespace-normalization pass is needed.
    return ' '.join(word for word in text.split() if word not in stop_words)

def _clean_split(dataset):
    """Return (cleaned_sentences, labels) for one dataset split.

    Each element of ``dataset`` is unpacked as a (text, label) pair whose
    members expose ``.numpy()``. The text arrives as bytes and is decoded as
    UTF-8 before cleaning. Calling ``str()`` on the bytes object instead would
    produce its repr (``"b'...'"``), leaking a leading ``b`` token and escape
    fragments such as ``xc3`` into every review.
    """
    sentences = []
    labels = []
    for s, l in dataset:
        sentences.append(clean_text(s.numpy().decode('utf-8')))
        labels.append(int(l.numpy()))
    return sentences, labels


# Process training data
train_sentences, train_labels = _clean_split(train_data)

# Process testing data
test_sentences, test_labels = _clean_split(test_data)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Tokenization and Padding

In [11]:
# Hyperparameters of the model

vocab_size = 3000  # Choose based on statistics
# Placeholder token for out-of-vocabulary words. It was an empty string
# (most likely an '<OOV>' lost in export), which put a '' key in word_index.
oov_tok = '<OOV>'
embedding_dim = 100
max_length = 200  # Choose based on statistics, for example 150 to 200
padding_type = 'post'  # pad short reviews at the end
trunc_type = 'post'    # cut long reviews at the end

# Tokenize sentences: the vocabulary is fitted on the training text only, so
# test-only words map to the OOV token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index

# Convert train dataset to sequences and pad/truncate to max_length.
# The padding/truncating modes are passed as variables; the previous code
# passed the literal string 'padding_type', which is not a valid mode, and
# never used trunc_type.
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, maxlen=max_length,
                             padding=padding_type, truncating=trunc_type)

# Convert test dataset to sequences and pad/truncate the same way
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, maxlen=max_length,
                            padding=padding_type, truncating=trunc_type)


# Build the LSTM Model

In [15]:
# Assemble the network layer by layer

model = Sequential()
model.add(Input(shape=(max_length,)))                                # one padded review of token ids
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))  # token id -> dense vector
model.add(Bidirectional(LSTM(64)))                                    # read the sequence in both directions
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))                             # single probability output

# Binary cross-entropy matches the single sigmoid output
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Print the layer/parameter overview
model.summary()


# Train the Model

In [16]:
# Labels were collected as Python lists; turn them into numpy arrays

train_labels, test_labels = np.array(train_labels), np.array(test_labels)

# Train for a fixed number of epochs, holding out 10% of the training
# data for validation.
num_epochs = 5
history = model.fit(
    train_padded,
    train_labels,
    validation_split=0.1,
    epochs=num_epochs,
    verbose=1,
)


Epoch 1/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 84ms/step - accuracy: 0.7131 - loss: 0.5479 - val_accuracy: 0.8544 - val_loss: 0.3736
Epoch 2/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 119ms/step - accuracy: 0.8682 - loss: 0.3295 - val_accuracy: 0.8620 - val_loss: 0.3288
Epoch 3/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 121ms/step - accuracy: 0.8944 - loss: 0.2703 - val_accuracy: 0.8576 - val_loss: 0.3586
Epoch 4/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 112ms/step - accuracy: 0.9098 - loss: 0.2363 - val_accuracy: 0.8728 - val_loss: 0.3157
Epoch 5/5
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 114ms/step - accuracy: 0.9170 - loss: 0.2179 - val_accuracy: 0.8740 - val_loss: 0.3139


# Evaluate the Model

In [17]:
# Score every padded test review with the trained model

prediction = model.predict(test_padded)

# Turn probabilities into hard labels: 1 if p >= 0.5 else 0.
# The model has a single output unit, so the predictions are flattened to one
# value per review before thresholding; the result is kept as a list of ints.
pred_labels = (prediction.ravel() >= 0.5).astype(int).tolist()

# Report the fraction of test reviews labelled correctly
test_accuracy = accuracy_score(test_labels, pred_labels)
print("Accuracy of prediction on test set: ", test_accuracy)


[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 38ms/step
Accuracy of prediction on test set:  0.86032


### Conclusion

This notebook built a sentiment analysis model with an LSTM architecture to classify IMDb movie reviews as positive or negative. The model reaches about 86% accuracy on the test set and can be improved further by tuning hyperparameters or employing more advanced techniques.