# LSTM for Sentiment Analysis

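This notebook demonstrates how to implement an **LSTM model** in Keras for binary sentiment classification of movie reviews: the reviews are cleaned, encoded as padded integer sequences, used to train the network, and the trained model is evaluated on a held-out split.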

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, SpatialDropout1D, LSTM, Dense
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# Download the NLTK data packages by name. 'stopwords' is the corpus that
# clean_text reads; 'punkt' is presumably for word_tokenize, which is imported
# above but not called in the cells shown here.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
_ENGLISH_STOPWORDS = None  # filled on first use by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def clean_text(text, stop_words=None):
    """Normalise one raw review into a space-separated string of kept tokens.

    Steps, in order:
      1. remove anything that looks like an HTML tag (``<...>``),
      2. replace every character other than ASCII letters and apostrophes
         with a space,
      3. lower-case and split on whitespace,
      4. drop tokens that are stop words.

    Args:
        text: The raw review text (a ``str``).
        stop_words: Optional collection of lower-case tokens to drop. When
            omitted, NLTK's English stop-word list is used; it is loaded once
            and cached as a set instead of being re-read for every token.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    # NOTE: tags are removed without inserting a space, so "a<br/>b" becomes
    # the single token "ab".
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z']", " ", text)
    tokens = text.lower().split()
    return " ".join(t for t in tokens if t not in stop_words)

In [None]:
# Load the raw reviews and store a cleaned copy of each review's text.
df = pd.read_csv("movie_reviews.csv")
df['clean_review'] = df['review'].apply(clean_text)

# Sequential hold-out split: the first `train_size` rows are used for training
# and every remaining row for testing (the rows are not shuffled here).
train_size = 800
train_reviews, test_reviews = df['clean_review'][:train_size], df['clean_review'][train_size:]
train_labels, test_labels = df['sentiment'][:train_size], df['sentiment'][train_size:]

In [None]:
# Whitespace-tokenise both splits (the reviews are already cleaned strings).
tokenized_train = list(map(str.split, train_reviews))
tokenized_test = list(map(str.split, test_reviews))

# Only the training split contributes tokens to the vocabulary.
token_counter = Counter(word for tokens in tokenized_train for word in tokens)

# Training tokens get ids 1..N in first-seen order; 0 is reserved for padding
# and the next free id (N + 1) stands in for tokens never seen in training.
vocab_map = {word: token_id for token_id, word in enumerate(token_counter, start=1)}
vocab_map['PAD_INDEX'] = 0
vocab_map['NOT_FOUND_INDEX'] = len(vocab_map)
vocab_size = len(vocab_map)

In [None]:
def encode_and_pad(texts, vocab, max_len):
    """Turn cleaned reviews into a fixed-width matrix of token ids.

    Args:
        texts: Iterable of whitespace-separated review strings.
        vocab: Mapping from token to integer id; it must contain the key
            'NOT_FOUND_INDEX', whose id is used for tokens missing from it.
        max_len: Width passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``sequence.pad_sequences`` on the encoded reviews.
        Only ``maxlen`` is supplied, so Keras' default padding and truncation
        settings apply.
    """
    id_rows = []
    for document in texts:
        row = []
        for word in document.split():
            # Unknown tokens fall back to the dedicated out-of-vocabulary id.
            row.append(vocab.get(word, vocab['NOT_FOUND_INDEX']))
        id_rows.append(row)
    return sequence.pad_sequences(id_rows, maxlen=max_len)

# Every review is padded to the token count of the longest training review.
max_len = max(len(tokens) for tokens in map(str.split, train_reviews))
X_train, X_test = (
    encode_and_pad(reviews, vocab_map, max_len)
    for reviews in (train_reviews, test_reviews)
)

# The label encoding is learned from the training labels and reused for the
# test labels.
le = LabelEncoder()
y_train, y_test = le.fit_transform(train_labels), le.transform(test_labels)

In [None]:
# Hyper-parameters: size of each token embedding and of the LSTM state.
embedding_dim = 128
lstm_units = 64

# Embedding -> spatial dropout -> LSTM -> single sigmoid unit.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_units, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [None]:
# Train for 5 epochs in batches of 100, holding out 10% of the training data
# for validation.
model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=100,
    validation_split=0.1,
    verbose=1,
)

In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# Predicted probabilities above 0.5 are counted as class 1.
y_pred_prob = model.predict(X_test)
y_pred = (y_pred_prob > 0.5).astype(int)

# Headline metrics, printed one per line in a fixed order.
for heading, metric in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(heading, metric(y_test, y_pred))

# Per-class breakdown, labelled with the class names the encoder learned.
report = classification_report(y_test, y_pred, target_names=le.classes_)
print("\nClassification Report:\n", report)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

## Confusion Matrix Output
![Results](sentiment-analysis.png)