In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
df = pd.read_csv("customer_feedback.csv")


In [4]:
def map_sentiment(score):
    if score <= 2:
        return 0
    elif score == 3:
        return 1
    else:
        return 2

df["Sentiment"] = df["FeedbackScore"].apply(map_sentiment)


In [5]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r'(.)\1{2,}', r'\1', text)  # reduce repeated chars
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(w, pos='v') for w in tokens if w not in stop_words and len(w) > 2]
    return " ".join(tokens)

df["CleanedText"] = df["FeedbackText"].apply(clean_text)


In [6]:
texts = df["CleanedText"].tolist()
labels = df["Sentiment"].tolist()

max_words = 20000
max_len = 250

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=max_len)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, stratify=labels, random_state=42
)

y_train_cat = to_categorical(y_train, num_classes=3)
y_test_cat = to_categorical(y_test, num_classes=3)

In [8]:
weights = class_weight.compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(weights))

In [9]:
model = Sequential([
    Embedding(input_dim=max_words, output_dim=256, input_length=max_len),
    SpatialDropout1D(0.3),
    Bidirectional(LSTM(96, return_sequences=True)),
    Dropout(0.4),
    BatchNormalization(),
    Bidirectional(LSTM(48)),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])




In [10]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [11]:
# Callbacks
callbacks = [
    EarlyStopping(patience=5, restore_best_weights=True),
    ReduceLROnPlateau(patience=3, factor=0.5)
]

In [12]:
# Train
history = model.fit(
    X_train, y_train_cat,
    epochs=30,
    batch_size=32,
    validation_data=(X_test, y_test_cat),
    class_weight=class_weights,
    callbacks=callbacks,
    verbose=1
)

Epoch 1/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 1s/step - accuracy: 0.3656 - loss: 1.0957 - val_accuracy: 0.4500 - val_loss: 1.0968 - learning_rate: 0.0010
Epoch 2/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.4539 - loss: 1.0688 - val_accuracy: 0.4500 - val_loss: 1.0967 - learning_rate: 0.0010
Epoch 3/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1s/step - accuracy: 0.4125 - loss: 1.1223 - val_accuracy: 0.4500 - val_loss: 1.0957 - learning_rate: 0.0010
Epoch 4/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 1s/step - accuracy: 0.3492 - loss: 1.1034 - val_accuracy: 0.4500 - val_loss: 1.0939 - learning_rate: 0.0010
Epoch 5/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 2s/step - accuracy: 0.3969 - loss: 1.0914 - val_accuracy: 0.4500 - val_loss: 1.0934 - learning_rate: 0.0010
Epoch 6/30
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 1s/step - accuracy: 0

In [13]:
# Evaluate
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"\n✅ Final Test Accuracy: {accuracy * 100:.2f}%")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step - accuracy: 0.4500 - loss: 1.0934

✅ Final Test Accuracy: 45.00%
