In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [34]:
df = pd.read_csv('Zomato_reviews.csv')

In [35]:
df.head(1)

Unnamed: 0,rating,review_text
0,1.0,"Their service is worst, pricing in menu is dif..."


In [36]:
df.shape

(27762, 2)

In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27762 entries, 0 to 27761
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rating       27762 non-null  float64
 1   review_text  27748 non-null  object 
dtypes: float64(1), object(1)
memory usage: 433.9+ KB


In [38]:
df['rating'].unique()

array([1. , 5. , 4. , 3. , 2. , 4.5, 1.5, 3.5, 2.5])

In [39]:
# Round ratings to nearest integer
df['rating'] = df['rating'].round().astype(int)

# Check classes
print("Unique Ratings:", df['rating'].unique())


Unique Ratings: [1 5 4 3 2]


In [40]:
df.isnull().sum()

Unnamed: 0,0
rating,0
review_text,14


In [41]:
df = df.dropna()

In [42]:
df.isnull().sum()

Unnamed: 0,0
rating,0
review_text,0


In [43]:
import pandas as pd
import numpy as np
import string
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from nltk.corpus import  stopwords
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

In [44]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [45]:
def process(text) :
    text = text.lower()                                                                # Converting string into lower case
    text = "".join([i        for  i in text       if i not in string.punctuation  ])  # Removing Punctuations
    tokens = word_tokenize(text)                                                             # Tokenize
    # Stop Word Removal
    tokens = [ i        for i in tokens          if i not  in  stopwords.words('english') ]

    return " ".join(tokens)

In [46]:
df['cleaned_message'] = df['review_text'].apply(process)

In [47]:
df.head()

Unnamed: 0,rating,review_text,cleaned_message
0,1,"Their service is worst, pricing in menu is dif...",service worst pricing menu different bill give...
1,5,really appreciate their quality and timing . I...,really appreciate quality timing tried thattil...
2,4,"Went there on a Friday night, the place was su...",went friday night place surprisingly empty int...
3,4,A very decent place serving good food.\r\nOrde...,decent place serving good food ordered chilli ...
4,5,One of the BEST places for steaks in the city....,one best places steaks city tried beef steak c...


In [48]:

df = df[df['cleaned_message'].str.strip() != '']

df = df.reset_index(drop=True)

print("Cleaned Data Shape:", df.shape)

Cleaned Data Shape: (27691, 3)


In [49]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned_message'])

y = df['rating']

print("TF-IDF Shape:", X.shape)


TF-IDF Shape: (27691, 20155)


In [50]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123, stratify=y)  # stratify to keep class distribution same


In [51]:
feature_names = tfidf.get_feature_names_out()
print("Sample Features:", feature_names[:20])


Sample Features: ['00' '00ã' '010' '015' '01dec2018' '02' '03' '03rd' '05' '055' '06102018'
 '10' '100' '1000' '1000rs' '100100' '100150' '100also' '100am' '100dark']


In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Model
model = LogisticRegression(multi_class='multinomial', solver='saga', max_iter=1000)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8116988626105796

Classification Report:
               precision    recall  f1-score   support

           1       0.87      0.88      0.88       625
           2       0.92      0.62      0.74       421
           3       0.80      0.66      0.72       752
           4       0.78      0.87      0.82      2126
           5       0.82      0.83      0.83      1615

    accuracy                           0.81      5539
   macro avg       0.84      0.77      0.80      5539
weighted avg       0.82      0.81      0.81      5539



In [53]:
import joblib

# TF-IDF vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Logistic Regression model
joblib.dump(model, 'rating_prediction_model.pkl')

print("Model and vectorizer saved!")


Model and vectorizer saved!


In [58]:
# InterFace
import joblib
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Load model & vectorizer
tfidf = joblib.load('tfidf_vectorizer.pkl')
model = joblib.load('rating_prediction_model.pkl')

# Same preprocess function as training
def process(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = "".join([i for i in text if i not in string.punctuation])
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if i not in stopwords.words('english')]
    return " ".join(tokens)

# Prediction Function
def predict_rating(review_text):
    cleaned_text = process(review_text)
    features = tfidf.transform([cleaned_text])
    predicted_rating = model.predict(features)[0]
    return predicted_rating

# Example usage
review = input("Enter your review: ")
predicted = predict_rating(review)
print(f"Predicted Rating: {predicted}")


Enter your review: Zomato Is Failing Its Customers: A Frustrated User’s Experience   In the age of convenience, food delivery apps like Zomato promised to make our lives easier. Unfortunately, that promise seems to be fading fast. Today, Zomato is no longer about customer satisfaction—it's about dodging accountability.   Terrible Customer Support Experience   The biggest frustration with Zomato is its absolutely terrible customer service. Gone are the days when you could actually speak to a human. Now, you’re stuck with a chatbot that offers only a handful of limited options—most of which have nothing to do with your actual issue. If your concern doesn’t fit into one of their pre-selected boxes, you’re out of luck.   No Way to Reach a Human   Need to explain your issue in detail? Want to talk to someone who can actually help? Too bad. There’s no call support anymore. Zomato has completely removed the option to speak with a real person. This leaves customers feeling ignored and helpless