In [7]:
import re
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense


In [16]:
df = pd.read_csv('../Datasets/NLP/sentiment_analysis.csv')
df = df[['text', 'sentiment']]
df.dropna(inplace=True)

stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)

df['clean_text'] = df['text'].astype(str).apply(clean_text)

texts = df['clean_text'].tolist()
labels = df['sentiment'].values

MAX_WORDS = 5000
MAX_LEN = 100

tokenizer = Tokenizer(
    num_words=MAX_WORDS,
    oov_token="<OOV>",
    lower=True
)

tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

X = pad_sequences(sequences, padding='post', maxlen=MAX_LEN)

le = LabelEncoder()
y = le.fit_transform(labels)

print("Input shape:", X.shape)

EMBEDDING_DIM = 100
GLOVE_PATH = "../Datasets/NLP/glove.txt"   # change if needed

embeddings_index = {}

with open(GLOVE_PATH, encoding="utf8") as f:
    for line in f:
        values = line.rstrip().split(" ")
        if len(values) != EMBEDDING_DIM + 1:
            continue
        word = values[0]
        vector = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = vector

print("Loaded GloVe vectors:", len(embeddings_index))
print("Vector size check:", next(iter(embeddings_index.values())).shape)

word_index = tokenizer.word_index
vocab_size = min(MAX_WORDS, len(word_index) + 1)

embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_LEN,
        trainable=False   # set True if you want fine-tuning
    ),
    LSTM(128),
    Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.summary()

Input shape: (27480, 100)
Loaded GloVe vectors: 1291147
Vector size check: (100,)




In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.8, random_state = 42)

model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    validation_split=0.2
)


Epoch 1/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 81ms/step - accuracy: 0.4026 - loss: -3.2968 - val_accuracy: 0.4094 - val_loss: -6.9521
Epoch 2/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 135ms/step - accuracy: 0.4026 - loss: -3.6313 - val_accuracy: 0.4094 - val_loss: -7.6415
Epoch 3/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 118ms/step - accuracy: 0.4026 - loss: -3.9675 - val_accuracy: 0.4094 - val_loss: -8.3178
Epoch 4/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 66ms/step - accuracy: 0.4026 - loss: -4.3079 - val_accuracy: 0.4094 - val_loss: -9.0005
Epoch 5/5
[1m550/550[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 213ms/step - accuracy: 0.4026 - loss: -4.6467 - val_accuracy: 0.4094 - val_loss: -9.7037


<keras.src.callbacks.history.History at 0x17cc6e92f90>