# Project: Swiggy’s Restaurant Recommendation System using Streamlit
This project covers data cleaning, one-hot encoding, similarity- and clustering-based recommendations, and a Streamlit UI.

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity


In [10]:
# Location of the raw restaurant CSV (an absolute, machine-specific path).
RAW_DATA_PATH = r"C:\Swiggy restaurant\swiggy.csv"

# Read the whole file into a DataFrame and report its (rows, columns) shape.
restaurants_df = pd.read_csv(filepath_or_buffer=RAW_DATA_PATH)
print(f"Dataset Loaded: {restaurants_df.shape}")


Dataset Loaded: (148541, 11)


In [11]:
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line to the output.
restaurants_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB
None


In [17]:
# Discard rows that are exact duplicates of an earlier row.
restaurants_df = restaurants_df.drop_duplicates()

# Remove every '₹' and ',' character from `cost`, then parse the remainder as a
# number; entries that still cannot be parsed become NaN.
restaurants_df['cost'] = pd.to_numeric(
    restaurants_df['cost'].replace('[₹,]', '', regex=True),
    errors='coerce',
)

# Function to clean rating_count values
def clean_rating_count(value):
    """
    Convert one raw rating_count entry into an integer count.

    The value is stringified and stripped first, so non-string inputs (NaN, None,
    numbers) are accepted. Handled forms:

    - empty text or text containing 'Too Few Ratings'  -> 0
    - '<number>+ ratings' / '<number>+' / '<number>'   -> int(number)
    - '<number>K+ ratings' / '<number>K'               -> int(number * 1000)
    - anything else that is not a plain non-negative number -> 0

    value: raw cell value from the rating_count column.
    Returns: int, never raises for unparseable text.
    """
    text = str(value).strip()
    if text == '' or 'Too Few Ratings' in text:
        return 0

    # Remove the decorations first so that the 'K' form and the plain form share
    # one parsing path. Leaving the word "ratings" in place for 'K' values makes
    # float() fail, which would turn every "1K+ ratings"-style entry into 0.
    numeric_part = (
        text.replace('+', '').replace('ratings', '').replace('rating', '').strip()
    )

    multiplier = 1
    if 'K' in numeric_part:
        numeric_part = numeric_part.replace('K', '').strip()
        multiplier = 1000

    # Accept only digits with at most one decimal point (e.g. "20", "1.5").
    if not numeric_part.replace('.', '', 1).isdigit():
        return 0
    try:
        return int(float(numeric_part) * multiplier)
    except ValueError:
        # str.isdigit() also accepts some Unicode digits that float() rejects.
        return 0

# Normalise every raw rating_count entry to an integer via clean_rating_count.
restaurants_df['rating_count'] = restaurants_df['rating_count'].apply(clean_rating_count)

# Parse ratings as numbers; entries that cannot be parsed become NaN.
restaurants_df['rating'] = pd.to_numeric(restaurants_df['rating'], errors='coerce')

# Replace missing cost / rating values with that column's own median.
for numeric_column in ('cost', 'rating'):
    column_median = restaurants_df[numeric_column].median()
    restaurants_df[numeric_column] = restaurants_df[numeric_column].fillna(column_median)

# Rows without a name, city or cuisine are removed.
restaurants_df.dropna(subset=['name', 'city', 'cuisine'], inplace=True)

# Drop the id / link / menu columns, skipping any that are already absent so the
# cell can be re-run without raising.
cols_to_drop = [
    column for column in ('id', 'link', 'menu')
    if column in restaurants_df.columns
]
if cols_to_drop:
    restaurants_df.drop(columns=cols_to_drop, inplace=True)

# Persist the cleaned frame (without the index) and report its shape.
restaurants_df.to_csv("cleaned_data.csv", index=False)
print(f"Cleaned data saved. Shape: {restaurants_df.shape}")

Cleaned data saved. Shape: (148441, 8)


In [18]:
# Write the cleaned frame to cleaned_data.csv without the index column (the same
# target file and options as the save at the end of the cleaning cell), then
# confirm completion.
restaurants_df.to_csv(path_or_buf="cleaned_data.csv", index=False)
print("Data Cleaning Completed! Saved as cleaned_data.csv")

Data Cleaning Completed! Saved as cleaned_data.csv


In [19]:
# Build a one-hot encoder over the two categorical columns and fit it in one step
# (fit() returns the encoder itself). It is configured to ignore categories it has
# not seen during fitting and to produce dense rather than sparse output.
category_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).fit(
    restaurants_df[['city', 'cuisine']]
)

# Serialise the fitted encoder to encoder.pkl in the working directory.
with open("encoder.pkl", "wb") as enc_file:
    enc_file.write(pickle.dumps(category_encoder))

print("Encoder trained & saved.")


Encoder trained & saved.


In [20]:
# Select the numeric model inputs explicitly by name. Building this frame by
# dropping a few text columns would leave the free-text `name` column in place
# (coerced to NaN and then median-filled, i.e. noise), and would also pick up the
# `cluster` column as a feature if this cell is re-run after clustering.
numeric_features = (
    restaurants_df[['rating', 'rating_count', 'cost']]
    .apply(pd.to_numeric, errors='coerce')   # defensive: force numeric dtypes
    .replace([np.inf, -np.inf], np.nan)       # treat infinities as missing
)

# Fill any remaining gaps with the per-column median.
numeric_features = numeric_features.fillna(numeric_features.median())

# Standardise each feature; `scaler` is kept so queries can be transformed with
# the same fitted parameters later.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(numeric_features)


In [21]:
CLUSTERS = 10

# n_init is pinned explicitly: scikit-learn's default number of k-means restarts
# differs between releases (10 historically, 'auto' from 1.4), so leaving it
# implicit makes the cluster assignments depend on the installed version.
# random_state fixes the centroid initialisation for reproducibility.
kmeans_model = KMeans(
    n_clusters=CLUSTERS,
    n_init=10,
    random_state=42
)

# Fit on the scaled numeric features and store each row's cluster label.
restaurants_df['cluster'] = kmeans_model.fit_predict(
    scaled_features
)

# Persist the labelled data (without the index) for later use.
restaurants_df.to_csv("clustered_data.csv", index=False)
print("Clustering completed.")


Clustering completed.


In [22]:
def get_recommendations(user_input, top_k=5):
    """
    Return the restaurants most cosine-similar to a query feature vector.

    Relies on the notebook-level objects numeric_features, scaler,
    scaled_features and restaurants_df; rows of scaled_features are matched to
    restaurants_df by position.

    user_input: list of numeric values matching numeric_features columns
    top_k: maximum number of restaurants to return. Values <= 0 yield an empty
        result; values larger than the dataset return every row.

    Returns: DataFrame with the columns name, city, cuisine, rating and cost,
        ordered from most to least similar.
    """
    result_columns = ['name', 'city', 'cuisine', 'rating', 'cost']

    # Guard the slice below: `scores[-0:]` selects the WHOLE array, so without
    # this check top_k=0 would return every restaurant instead of none.
    if top_k <= 0:
        return restaurants_df.iloc[0:0][result_columns]

    # Wrap the query in a one-row frame carrying the training column names so the
    # scaler receives input in the same layout it was fitted on.
    user_df = pd.DataFrame(
        [user_input],
        columns=numeric_features.columns
    )

    user_scaled = scaler.transform(user_df)

    # One similarity score per restaurant, flattened from shape (1, n) to (n,).
    similarity_scores = cosine_similarity(
        user_scaled, scaled_features
    ).flatten()

    # argsort is ascending: take the last top_k positions and reverse them so the
    # best match comes first.
    top_indices = similarity_scores.argsort()[-top_k:][::-1]

    return restaurants_df.iloc[top_indices][result_columns]


In [23]:
# Use the first row's own feature values as the query vector.
sample_input = numeric_features.iloc[0].values
recommended = get_recommendations(sample_input)

print(recommended)
# info() prints its own report and returns None; calling it directly avoids the
# stray "None" line that print(restaurants_df.info()) would add.
restaurants_df.info()

                                      name      city               cuisine  \
0                           AB FOODS POINT    Abohar      Beverages,Pizzas   
148540                      Lazeez kitchen  Yavatmal                Pizzas   
9                                yummy hub    Abohar                Indian   
12                           Swastik Dhaba    Abohar          North Indian   
17      wah ji waah veg and non veg corner    Abohar  North Indian,Chinese   

        rating   cost  
0          4.0  200.0  
148540     4.0  200.0  
9          4.0  200.0  
12         4.0  200.0  
17         4.0  200.0  
<class 'pandas.core.frame.DataFrame'>
Index: 148441 entries, 0 to 148540
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   name          148441 non-null  object 
 1   city          148441 non-null  object 
 2   rating        148441 non-null  float64
 3   rating_count  148441 non-null  int64  
 4   cost         