In [None]:
# Step 1: Install dependencies
!pip install -q pandas numpy scikit-learn tensorflow nltk


In [None]:
# Step 2: Imports
import pandas as pd
import numpy as np
import re
import nltk
# Download the 'wordnet' and 'omw-1.4' NLTK packages up front; presumably these
# are the corpora WordNetLemmatizer (used in the cleaning step) relies on — TODO confirm.
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [None]:
# Step 3: Load dataset
# Read the reviews, keep only the text and score columns, drop incomplete rows,
# cast the score to int, and derive a binary label: 1 for scores >= 4, else 0.
df = (
    pd.read_csv("flipkart_fashion.csv")
    .loc[:, ['FeedbackText', 'FeedbackScore']]
    .dropna()
    .assign(FeedbackScore=lambda frame: frame['FeedbackScore'].astype(int))
    .assign(sentiment=lambda frame: frame['FeedbackScore'].apply(lambda score: 1 if score >= 4 else 0))
)


In [None]:
# Step 4: Text cleaning with lemmatization
lemmatizer = WordNetLemmatizer()


def clean_text(text):
    """Return a normalized copy of one review.

    The input is coerced to ``str`` and lowercased; anything starting with
    ``http`` up to the next whitespace is removed; every character that is not
    a letter, digit or whitespace is deleted; whitespace runs are collapsed to
    single spaces; finally each remaining token is passed through
    ``lemmatizer.lemmatize`` and the tokens are re-joined with single spaces.
    """
    lowered = str(text).lower()
    without_urls = re.sub(r"http\S+", "", lowered)
    alnum_only = re.sub(r"[^a-zA-Z0-9\s]", "", without_urls)
    tokens = re.sub(r"\s+", " ", alnum_only).strip().split()
    return " ".join(map(lemmatizer.lemmatize, tokens))


# Clean every review, then keep only rows whose cleaned text is longer than
# 10 characters.
df = (
    df.assign(FeedbackText=df['FeedbackText'].apply(clean_text))
      .loc[lambda frame: frame['FeedbackText'].str.len() > 10]
)


In [None]:
# Step 5: Tokenization
# Tunable sizes used here and again when the model is built.
vocab_size = 10000
maxlen = 150

# Model inputs (cleaned review text) and binary targets.
texts = df['FeedbackText'].values
labels = df['sentiment'].values

# NOTE(review): the tokenizer is fit on every row, i.e. before the train/test
# split performed in the next step, so the vocabulary also sees test reviews.
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(texts)

# Convert each review to a list of word indices and pad at the end up to `maxlen`.
sequences = tokenizer.texts_to_sequences(texts)
padded = pad_sequences(sequences, padding='post', maxlen=maxlen)


In [None]:
# Step 6: Split data
# Stratify on the label so the train and test partitions keep the same
# positive/negative ratio; an unstratified split of imbalanced labels can leave
# the minority class under-represented in one of them.
X_train, X_test, y_train, y_test = train_test_split(
    padded, labels, test_size=0.2, random_state=42, stratify=labels
)

# Step 7: Handle class imbalance
# Key each weight by its actual class label (not by its position in the array)
# so the mapping stays correct even if a label is absent from y_train.
classes = np.unique(y_train)
weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {int(label): float(weight) for label, weight in zip(classes, weights)}


In [None]:
# Step 8: Build smaller model
# Embedding -> spatial dropout -> small bidirectional LSTM -> dense head with a
# single sigmoid unit for the binary sentiment label.
model = Sequential([
    # `input_length` is deprecated in current Keras releases; the sequence
    # length is provided through model.build() below instead.
    Embedding(input_dim=vocab_size, output_dim=32),
    SpatialDropout1D(0.25),
    Bidirectional(LSTM(8)),
    Dropout(0.5),
    Dense(16, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Build explicitly with (batch, maxlen) inputs so summary() can report layer
# output shapes and parameter counts before training.
model.build(input_shape=(None, maxlen))
model.summary()



In [None]:
# Step 9: Train
# The callbacks can only act when training runs for more than one epoch:
# - EarlyStopping ends training once val_loss has not improved for 3 epochs
#   and restores the best weights seen.
# - ReduceLROnPlateau multiplies the learning rate by 0.7 after one epoch
#   without val_loss improvement.
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.7, patience=1, verbose=1)

history = model.fit(
    X_train, y_train,
    validation_split=0.2,  # hold out 20% of the training data for validation
    epochs=20,  # upper bound; EarlyStopping may end training sooner
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop, reduce_lr],
    verbose=2
)


40/40 - 9s - 215ms/step - accuracy: 0.6070 - loss: 0.6830 - val_accuracy: 0.9625 - val_loss: 0.6462 - learning_rate: 1.0000e-03


In [None]:
# Step 10: Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"\n✅ Final Test Accuracy : {acc * 100:.2f}%")

# Accuracy alone is hard to interpret on imbalanced labels, so also report what
# always predicting the most frequent class would score on this test set.
y_true = np.asarray(y_test, dtype=int)
majority_rate = np.bincount(y_true, minlength=2).max() / len(y_true)
print(f"Majority-class baseline : {majority_rate * 100:.2f}%")

# Confusion matrix at a 0.5 decision threshold (rows: actual, columns: predicted).
y_pred = (model.predict(X_test, verbose=0).ravel() > 0.5).astype(int)
pd.crosstab(y_true, y_pred, rownames=['actual'], colnames=['predicted'])


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 36ms/step - accuracy: 0.9504 - loss: 0.6442

✅ Final Test Accuracy : 95.75%
