In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import pickle
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import load_model
print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Load the Swiggy dataset
# Reads 'swiggy.csv' (a relative path, so it resolves against the current
# working directory) into `data`, then prints the row count and the column
# names as a quick sanity check.
data = pd.read_csv('swiggy.csv')
print(f"Loaded {len(data)} records from swiggy.csv")
print(f"Columns: {data.columns.tolist()}")

Loaded 8000 records from swiggy.csv
Columns: ['ID', 'Area', 'City', 'Restaurant Price', 'Avg Rating', 'Total Rating', 'Food Item', 'Food Type', 'Delivery Time', 'Review']


In [5]:
# Preprocess data
# Normalise the review text in a single pass: lowercase it, then strip every
# character that is not a-z, 0-9 or whitespace.
data["Review"] = (
    data["Review"]
    .str.lower()
    .replace(r'[^a-z0-9\s]', '', regex=True)
)
# Discard rows that lack either the review text or the rating.
data = data.dropna(subset=['Review', 'Avg Rating'])

def label_sentiment_num(rating):
    """Bucket a numeric rating into a sentiment label.

    Args:
        rating: Numeric rating value.

    Returns:
        "negative" when ``rating <= 2.5``, "neutral" when ``rating <= 3.5``,
        otherwise "positive". A value that compares False against both
        bounds (for example NaN) therefore also yields "positive".
    """
    # Upper bounds are checked in ascending order; the first match wins.
    for upper_bound, label in ((2.5, "negative"), (3.5, "neutral")):
        if rating <= upper_bound:
            return label
    return "positive"

# Derive a sentiment label for every row from its 'Avg Rating' value.
# NOTE(review): the label comes from the rating column, not from the review
# text itself — confirm this is the intended target for a text classifier.
data['sentiment'] = data['Avg Rating'].apply(label_sentiment_num)
# Show how many rows fall into each label.
print(f"Original sentiment distribution:")
print(data['sentiment'].value_counts())

Original sentiment distribution:
sentiment
positive    5727
neutral     2273
Name: count, dtype: int64


In [5]:
# Check if we have negative examples
negative_count = int((data['sentiment'] == 'negative').sum())

if negative_count > 0:
    print(f"Found {negative_count} negative examples")
else:
    print("No negative examples found! Adding synthetic negative examples...")

    # Common negative phrases for restaurant reviews
    negative_phrases = [
        "terrible food and bad service",
        "worst experience ever",
        "cold and tasteless food",
        "disgusting and overpriced",
        "horrible delivery and rude staff",
        "never order from here again",
        "waste of money and time",
        "food was terrible and cold",
        "very bad service and slow",
        "disappointed with the quality",
        "tasteless and bland food",
        "late delivery and wrong order",
        "poor quality and expensive",
        "awful experience and rude",
        "bad taste and dirty place",
        "not fresh and overcooked",
        "unhygienic and unhealthy",
        "worse than expected and cold",
        "cheap quality and small portions",
        "unpleasant and frustrating service"
    ]

    # Create synthetic negative examples: the phrase list repeated 100 times,
    # every row labelled 'negative' with a fixed rating of 2.0.
    synthetic_negative = pd.DataFrame(
        [
            {'Review': phrase, 'sentiment': 'negative', 'Avg Rating': 2.0}
            for phrase in negative_phrases * 100
        ],
        columns=['Review', 'sentiment', 'Avg Rating'],
    )

    # Add to original data. Only these three columns of the original frame
    # are kept; the index is rebuilt from 0.
    data = pd.concat(
        [data[['Review', 'sentiment', 'Avg Rating']], synthetic_negative],
        ignore_index=True,
    )
    print("After adding synthetic negatives:")
    print(data['sentiment'].value_counts())

No negative examples found! Adding synthetic negative examples...
After adding synthetic negatives:
sentiment
positive    5727
neutral     2273
negative    2000
Name: count, dtype: int64


In [6]:
# Preview the first rows of the working frame.
data.head()

Unnamed: 0,ID,Area,City,Restaurant Price,Avg Rating,Total Rating,Food Item,Food Type,Delivery Time,Review,sentiment
0,1,Suburb,Ahmedabad,600,4.2,6198,Sushi,Fast Food,30-40 min,good but nothing extraordinary,positive
1,2,Business District,Pune,200,4.7,4865,Pepperoni Pizza,Non-Vegetarian,50-60 min,good but nothing extraordinary,positive
2,3,Suburb,Bangalore,600,4.7,2095,Waffles,Fast Food,50-60 min,late delivery ruined it,positive
3,4,Business District,Mumbai,900,4.0,6639,Sushi,Vegetarian,50-60 min,best meal ive had in a while,positive
4,5,Tech Park,Mumbai,200,4.7,6926,Spring Rolls,Gluten-Free,20-30 min,mediocre experience,positive


In [9]:
# (rows, columns) of the working frame.
data.shape

(8000, 11)

In [11]:
# Column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                8000 non-null   int64  
 1   Area              8000 non-null   object 
 2   City              8000 non-null   object 
 3   Restaurant Price  8000 non-null   int64  
 4   Avg Rating        8000 non-null   float64
 5   Total Rating      8000 non-null   int64  
 6   Food Item         8000 non-null   object 
 7   Food Type         8000 non-null   object 
 8   Delivery Time     8000 non-null   object 
 9   Review            8000 non-null   object 
 10  sentiment         8000 non-null   object 
dtypes: float64(1), int64(3), object(7)
memory usage: 687.6+ KB


In [12]:
# Summary statistics for the numeric columns.
data.describe()

Unnamed: 0,ID,Restaurant Price,Avg Rating,Total Rating
count,8000.0,8000.0,8000.0,8000.0
mean,4000.5,544.5875,4.1299,4979.9775
std,2309.54541,287.968871,0.645791,2877.285148
min,1.0,100.0,3.0,51.0
25%,2000.75,300.0,3.5,2476.0
50%,4000.5,500.0,4.2,4989.5
75%,6000.25,800.0,4.7,7498.0
max,8000.0,1000.0,5.0,10000.0


In [13]:
# Number of rows that exactly duplicate an earlier row.
data.duplicated().sum()

np.int64(0)

In [14]:
# Build vocabulary
# Cap on how many distinct tokens the vocabulary may hold.
max_features = 5_000
# Number of token positions every encoded review is padded/truncated to.
max_length = 200

def build_vocab(texts, max_features=None):
    """Map each whitespace-separated token in ``texts`` to a 1-based frequency rank.

    Args:
        texts: Iterable of documents. Each item is passed through ``str()``
            and split on whitespace.
        max_features: Maximum number of tokens to keep, most frequent first.
            ``None`` keeps every token; ``0`` yields an empty vocabulary.

    Returns:
        dict mapping token -> index, where index 1 is the most frequent
        token. Index 0 is never assigned. Tokens with equal counts keep
        their first-seen order.

    Raises:
        ValueError: If ``max_features`` is negative.
    """
    if max_features is not None and max_features < 0:
        raise ValueError(f"max_features must be >= 0 or None, got {max_features}")

    word_counts = {}
    for text in texts:
        for word in str(text).split():
            word_counts[word] = word_counts.get(word, 0) + 1

    # sorted() is stable even with reverse=True, so ties stay in insertion order.
    ranked = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)

    # Compare against None explicitly: a cap of 0 is a real (empty) cap,
    # not a request for the full vocabulary.
    if max_features is not None:
        ranked = ranked[:max_features]

    return {word: rank for rank, (word, _) in enumerate(ranked, start=1)}

# Build the token -> index mapping from every review, capped at max_features tokens.
word_index = build_vocab(data["Review"].tolist(), max_features=max_features)
print(f"Vocabulary size: {len(word_index)}")

Vocabulary size: 72


In [15]:
# Convert texts to sequences
def texts_to_sequences(texts, word_index):
    """Encode each text as the list of vocabulary indices of its tokens.

    Args:
        texts: Iterable of documents; each is converted with ``str()`` and
            split on whitespace.
        word_index: Mapping of token -> integer index.

    Returns:
        A list with one list of indices per input text. Tokens missing
        from ``word_index`` are encoded as 0.
    """
    lookup = word_index.get
    return [
        [lookup(token, 0) for token in str(document).split()]
        for document in texts
    ]

# Encode every review as a list of vocabulary indices.
sequences = texts_to_sequences(data["Review"].tolist(), word_index)

# Pad sequences
def pad_sequences_custom(sequences, maxlen):
    """Left-pad or truncate each sequence to exactly ``maxlen`` items.

    Args:
        sequences: Iterable of index sequences (lists or other sliceable
            sequences).
        maxlen: Target length of every row.

    Returns:
        numpy.ndarray of shape ``(len(sequences), maxlen)``. Shorter
        sequences are padded with zeros on the left; longer ones keep only
        their first ``maxlen`` items. An empty input yields an integer
        array of shape ``(0, maxlen)``.
    """
    padded = []
    for seq in sequences:
        # list() lets tuples and other sequence types through the
        # concatenation below.
        kept = list(seq[:maxlen])
        padded.append([0] * (maxlen - len(kept)) + kept)

    # np.array([]) would be 1-D; keep the result two-dimensional so callers
    # can rely on the (n, maxlen) shape.
    if not padded:
        return np.empty((0, maxlen), dtype=int)
    return np.array(padded)

# Feature matrix: one row of max_length token indices per review.
X = pad_sequences_custom(sequences, max_length)
# Targets: the raw values of the 'sentiment' column.
y = data['sentiment'].values

print(f"X shape: {X.shape}, y shape: {y.shape}")

X shape: (8000, 200), y shape: (8000,)


In [31]:
# Split data
# Hold out 20% of the rows as the test set, then carve 10% of the remaining
# training rows off as a validation set. Both splits are stratified by label
# and use the same fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42, stratify=y_train
)

# Encode labels
# The encoder is fitted on the training labels only and then reused to
# transform the validation and test labels.
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_val_encoded = encoder.transform(y_val)
y_test_encoded = encoder.transform(y_test)

# Report the label order learned by the encoder and the per-class row counts.
print(f"Label classes: {encoder.classes_}")
print(f"Train distribution: {np.bincount(y_train_encoded)}")
print(f"Test distribution: {np.bincount(y_test_encoded)}")

Label classes: ['neutral' 'positive']
Train distribution: [1636 4124]
Test distribution: [ 455 1145]


In [32]:
# Train the model
# NOTE(review): `model` is not created in any cell visible in this notebook
# export — confirm a cell that builds and compiles it runs before this one;
# otherwise this cell fails with NameError on a fresh kernel.
# NOTE(review): the recorded log shows accuracy fixed at 0.7160 in every
# epoch, which equals the majority-class share of the training labels
# (4124 / 5760) — the model appears to predict a single class.
print("Training model...")
history = model.fit(
    X_train, y_train_encoded,
    validation_data=(X_val, y_val_encoded),
    epochs=10,
    batch_size=64
)

# Evaluate
# Unpacking into two names assumes evaluate() returns exactly [loss, accuracy],
# i.e. the model was compiled with a single metric — TODO confirm.
loss, acc = model.evaluate(X_test, y_test_encoded)
print(f"Test Accuracy: {acc:.2f}")

Training model...
Epoch 1/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 233ms/step - accuracy: 0.7160 - loss: 0.5969 - val_accuracy: 0.7156 - val_loss: 0.5999
Epoch 2/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 232ms/step - accuracy: 0.7160 - loss: 0.5968 - val_accuracy: 0.7156 - val_loss: 0.5963
Epoch 3/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 287ms/step - accuracy: 0.7160 - loss: 0.5981 - val_accuracy: 0.7156 - val_loss: 0.6003
Epoch 4/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 390ms/step - accuracy: 0.7160 - loss: 0.5977 - val_accuracy: 0.7156 - val_loss: 0.5991
Epoch 5/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 371ms/step - accuracy: 0.7160 - loss: 0.5969 - val_accuracy: 0.7156 - val_loss: 0.5970
Epoch 6/10
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 311ms/step - accuracy: 0.7160 - loss: 0.5963 - val_accuracy: 0.7156 - val_loss: 0.5961
Epoc

In [23]:
# Save the model and artifacts
# Writes the trained model plus everything needed to preprocess new text
# (vocabulary, label encoder, length settings) under smartapp/.
# NOTE(review): assumes the smartapp/ directory already exists — confirm.
print("Saving model and artifacts...")

# Save Keras model
model.save('smartapp/sentiment_model.h5')
print("Saved: smartapp/sentiment_model.h5")

# Save word_index
# Token -> index mapping used to encode review text.
with open('smartapp/word_index.pkl', 'wb') as f:
    pickle.dump(word_index, f)
print("Saved: smartapp/word_index.pkl")

# Save label encoder
# Needed to turn predicted class indices back into sentiment strings.
with open('smartapp/label_encoder.pkl', 'wb') as f:
    pickle.dump(encoder, f)
print("Saved: smartapp/label_encoder.pkl")

# Save max_length for later use
# Stored as a dict with keys 'max_length' and 'max_features'.
with open('smartapp/model_config.pkl', 'wb') as f:
    pickle.dump({'max_length': max_length, 'max_features': max_features}, f)
print("Saved: smartapp/model_config.pkl")



Saving model and artifacts...
Saved: smartapp/sentiment_model.h5
Saved: smartapp/word_index.pkl
Saved: smartapp/label_encoder.pkl
Saved: smartapp/model_config.pkl


In [24]:
# Test the saved model
# Smoke test of the exported artifacts: everything used for prediction below
# is read back from disk rather than taken from in-memory training state.
print("Testing saved model...")

# Load saved artifacts
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
loaded_model = load_model('smartapp/sentiment_model.h5')
with open('smartapp/word_index.pkl', 'rb') as f:
    loaded_word_index = pickle.load(f)
with open('smartapp/label_encoder.pkl', 'rb') as f:
    loaded_encoder = pickle.load(f)
# Take the sequence length from the saved config so the test exercises the
# same value a consumer of the exported files would see.
with open('smartapp/model_config.pkl', 'rb') as f:
    loaded_config = pickle.load(f)
loaded_max_length = loaded_config['max_length']

# Test reviews
test_reviews = [
    "The food was amazing and delicious!",
    "Terrible service and cold food",
    "It was okay, nothing special",
    "Best restaurant ever, highly recommend!",
    "Worst experience ever, never coming back",
    "Fresh and tasty, will order again",
    "Hate the long wait and rude staff",
    "Average quality but decent"
]

# Preprocess exactly as the training data was cleaned: lowercase, then drop
# every character outside a-z, 0-9 and whitespace. Using the same pattern
# avoids train/inference skew (e.g. underscores or non-ASCII letters that a
# \w-based pattern would keep).
cleaned_reviews = [
    re.sub(r'[^a-z0-9\s]', '', review.lower().strip())
    for review in test_reviews
]
# Reuse the notebook's own encoding and padding helpers instead of a second
# hand-written copy of that logic.
test_sequences = texts_to_sequences(cleaned_reviews, loaded_word_index)
test_batch = pad_sequences_custom(test_sequences, loaded_max_length)

# Predict all reviews in a single call; one row of class scores per review.
predictions = loaded_model.predict(test_batch, verbose=0)

print("Test Results:")
for review, pred in zip(test_reviews, predictions):
    # Highest-scoring class, mapped back to its sentiment string.
    pred_class = np.argmax(pred)
    sentiment = loaded_encoder.inverse_transform([pred_class])[0]
    confidence = float(pred[pred_class] * 100)

    print(f"Review: '{review}'")
    print(f"  Sentiment: {sentiment} ({confidence:.1f}%)")

print("Model training and export complete!")



Testing saved model...
Test Results:
Review: 'The food was amazing and delicious!'
  Sentiment: positive (67.5%)
Review: 'Terrible service and cold food'
  Sentiment: positive (69.8%)
Review: 'It was okay, nothing special'
  Sentiment: positive (67.1%)
Review: 'Best restaurant ever, highly recommend!'
  Sentiment: positive (67.8%)
Review: 'Worst experience ever, never coming back'
  Sentiment: positive (68.0%)
Review: 'Fresh and tasty, will order again'
  Sentiment: positive (68.3%)
Review: 'Hate the long wait and rude staff'
  Sentiment: positive (68.1%)
Review: 'Average quality but decent'
  Sentiment: positive (66.3%)
Model training and export complete!
