In [13]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential    #building rnn
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, Embedding ,LSTM  #model layer




In [4]:
# Load the raw dataset and list its column names as a quick schema check.
data = pd.read_csv('swiggy.csv')
print(list(data.columns))

['ID', 'Area', 'City', 'Restaurant Price', 'Avg Rating', 'Total Rating', 'Food Item', 'Food Type', 'Delivery Time', 'Review']


In [5]:
# Normalise review text in one pass: lower-case it, then delete every character
# that is not a lower-case letter, a digit, or whitespace.
data["Review"] = (
    data["Review"]
    .str.lower()
    .replace(r'[^a-z0-9\s]', '', regex=True)
)
def label_sentiment_num(rating):
    """Map a numeric rating to a coarse sentiment label.

    Parameters
    ----------
    rating : float
        Numeric rating. May be missing (NaN / None).

    Returns
    -------
    str or None
        "negative" when rating <= 2.5, "neutral" when 2.5 < rating <= 3.5,
        "positive" when rating > 3.5, and None when the rating is missing.
    """
    # NaN compares False against every threshold, so without this guard a
    # missing rating would silently fall through to the "positive" branch.
    if pd.isna(rating):
        return None
    if rating <= 2.5:
        return "negative"
    if rating <= 3.5:
        return "neutral"
    return "positive"

# Derive the sentiment label from the average rating, then discard every row
# that still contains a missing value in any column.
data = data.assign(
    sentiment=data['Avg Rating'].apply(label_sentiment_num)
).dropna()
print(data)

        ID               Area       City  Restaurant Price  Avg Rating  \
0        1             Suburb  Ahmedabad               600         4.2   
1        2  Business District       Pune               200         4.7   
2        3             Suburb  Bangalore               600         4.7   
3        4  Business District     Mumbai               900         4.0   
4        5          Tech Park     Mumbai               200         4.7   
...    ...                ...        ...               ...         ...   
7995  7996        City Center     Mumbai               300         4.0   
7996  7997           Downtown    Chennai               100         4.7   
7997  7998          Tech Park    Chennai               900         4.5   
7998  7999           Old Town      Delhi               500         4.2   
7999  8000           Downtown      Delhi               400         4.5   

      Total Rating        Food Item       Food Type Delivery Time  \
0             6198            Sushi       

In [6]:
max_features = 5000   # vocabulary size cap passed to build_vocab
max_length = 200    # fixed sequence length passed to pad_sequences_custom
def build_vocab(texts, max_features=None):
    """Build a word -> integer index mapping ordered by descending frequency.

    Parameters
    ----------
    texts : iterable of str
        Documents; each is tokenised with ``str.split()``.
    max_features : int, optional
        When truthy, only the ``max_features`` most frequent words are kept.

    Returns
    -------
    dict
        Maps each kept word to its 1-based frequency rank (1 = most frequent).
        Index 0 is never assigned.
    """
    frequencies = {}
    for document in texts:
        for token in document.split():
            if token in frequencies:
                frequencies[token] += 1
            else:
                frequencies[token] = 1

    # Stable sort: words with equal counts keep first-seen order.
    ranked = sorted(frequencies.items(), key=lambda pair: pair[1], reverse=True)

    # Optionally keep only the head of the ranking.
    if max_features:
        ranked = ranked[:max_features]

    return {token: rank for rank, (token, _) in enumerate(ranked, start=1)}

# Build the word -> index mapping from the review column, capped at max_features words.
word_index = build_vocab(data["Review"].tolist(), max_features=max_features)


def texts_to_sequences(texts, word_index):
    """Encode each text as a list of integer word indices.

    Parameters
    ----------
    texts : iterable of str
        Documents; each is tokenised with ``str.split()``.
    word_index : dict
        Mapping from word to integer index.

    Returns
    -------
    list of list of int
        One index list per text, in input order. Words absent from
        ``word_index`` are encoded as 0.
    """
    lookup = word_index.get
    return [
        [lookup(token, 0) for token in document.split()]
        for document in texts
    ]

# Encode every review as a list of vocabulary indices (0 for words missing from word_index).
sequences = texts_to_sequences(data["Review"].tolist(), word_index)

def pad_sequences_custom(sequences, maxlen):
    """Force every sequence to length ``maxlen`` and stack them into an array.

    Shorter sequences are left-padded with zeros; sequences of length
    ``maxlen`` or more keep only their first ``maxlen`` items.

    Parameters
    ----------
    sequences : iterable of list of int
        Index sequences to normalise.
    maxlen : int
        Target length of each row.

    Returns
    -------
    numpy.ndarray
        Array built from the padded / truncated rows.
    """
    rows = []
    for tokens in sequences:
        missing = maxlen - len(tokens)
        if missing > 0:
            # Too short: zeros go in front so the real tokens sit at the end.
            rows.append([0] * missing + tokens)
        else:
            # Long enough: keep the leading maxlen tokens.
            rows.append(tokens[:maxlen])
    return np.array(rows)

# Features: index sequences padded/truncated to max_length.
# Targets: the string sentiment labels produced by label_sentiment_num.
X = pad_sequences_custom(sequences, max_length)
y = data['sentiment'].values


In [7]:
# First split: hold out 20% of the samples as the test set, stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# Second split: carve 10% of the remaining training samples off as a validation
# set, again stratified; X_train / y_train are rebound to the reduced training set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)


In [14]:


max_words = 5000  # same as tokenizer
embedding_dim = 128

# Stacked-LSTM classifier over padded word-index sequences.
model = Sequential([
    # Word ids produced by build_vocab run from 1 to max_words and 0 is used for
    # padding / unknown words, so the embedding table needs max_words + 1 rows;
    # with input_dim=max_words the id max_words would be out of range.
    Embedding(input_dim=max_words + 1, output_dim=embedding_dim),
    LSTM(128, return_sequences=True),   # emit every timestep for the next LSTM
    Dropout(0.3),
    LSTM(64),                           # emit only the final state
    Dropout(0.3),
    Dense(64, activation="relu"),
    Dense(3, activation="softmax")   # 3 classes
])


model.compile(
    loss="sparse_categorical_crossentropy",  # for integer labels
    optimizer="adam",
    metrics=["accuracy"]
)



model.summary()


In [16]:
# Turn the string sentiment labels into integer class ids.
encoder = LabelEncoder()
y = encoder.fit_transform(data["sentiment"])

# The encoder is re-fitted on the training labels; validation and test labels
# reuse that fitted mapping. Each result is stored as a float32 numpy array.
y_train = np.asarray(encoder.fit_transform(y_train), dtype="float32")
y_val = np.asarray(encoder.transform(y_val), dtype="float32")
y_test = np.asarray(encoder.transform(y_test), dtype="float32")


In [17]:
# Cast all three label arrays to float32 in a single step.
y_train, y_val, y_test = (
    labels.astype("float32") for labels in (y_train, y_val, y_test)
)



In [19]:

# Train on the training split; loss and accuracy on the validation split are
# reported after every epoch.
# NOTE(review): the captured log for this cell shows the same accuracy in every
# epoch — confirm the labels are learnable from the review text before tuning
# epochs or batch size.
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,        # number of passes over the training split
    batch_size=64
)


# Final evaluation on the held-out test split.
loss, acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {acc:.2f}")


Epoch 1/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 193ms/step - accuracy: 0.7160 - loss: 0.5954 - val_accuracy: 0.7156 - val_loss: 0.5957
Epoch 2/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 190ms/step - accuracy: 0.7160 - loss: 0.5957 - val_accuracy: 0.7156 - val_loss: 0.5964
Epoch 3/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 217ms/step - accuracy: 0.7160 - loss: 0.5958 - val_accuracy: 0.7156 - val_loss: 0.5959
Epoch 4/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 228ms/step - accuracy: 0.7160 - loss: 0.5959 - val_accuracy: 0.7156 - val_loss: 0.5961
Epoch 5/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 235ms/step - accuracy: 0.7160 - loss: 0.5958 - val_accuracy: 0.7156 - val_loss: 0.5957
Epoch 6/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 211ms/step - accuracy: 0.7160 - loss: 0.5962 - val_accuracy: 0.7156 - val_loss: 0.5959
Epoch 7/10
[1m90/90[