In [1]:
# === 1. Imports and settings ===

import os
import numpy as np
import pandas as pd

from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import pickle
import random

# Seed both random number generators used in this notebook so that repeated
# runs produce the same results.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# The project root is taken to be the parent of the current working directory;
# the data and models folders sit directly underneath it.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
models_dir = os.path.join(project_root, "models")
data_dir = os.path.join(project_root, "data")

# Create the folders up front; exist_ok keeps a re-run of this cell harmless.
os.makedirs(models_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Data dir: {data_dir}")
print(f"Models dir: {models_dir}")


Project root: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit
Data dir: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\data
Models dir: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\models


In [2]:
# === 2. Load raw data ===

# Read the raw export from the data folder into `df_raw`; later cells derive
# their own frames from it instead of modifying it.
csv_path = os.path.join(data_dir, "swiggy.csv")
df_raw = pd.read_csv(csv_path)

print(f"Raw shape: {df_raw.shape}")
df_raw.head()


Raw shape: (148541, 11)


Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,lic_no,link,address,menu
0,567335,AB FOODS POINT,Abohar,--,Too Few Ratings,₹ 200,"Beverages,Pizzas",22122652000138,https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50+ ratings,₹ 200,"Sweets,Bakery",12117201000112,https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100+ ratings,₹ 100,Beverages,22121652000190,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20+ ratings,₹ 250,"Fast Food,Indian",22119652000167,https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,--,Too Few Ratings,₹ 250,"Italian-American,Fast Food",12122201000053,https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [21]:

# === 3. Data understanding and cleaning ===
# Columns (from problem): 
# ['id','name','city','rating','rating_count','cost','cuisine','lic_no','link','address','menu']

# Columns excluded from modelling: menu, address, link, lic_no.
# Names that are not present in the raw frame are skipped rather than raising.
cols_to_drop = ["menu", "address", "link", "lic_no"]
df = df_raw.drop(columns=[name for name in cols_to_drop if name in df_raw.columns])

print(f"Columns after drop: {df.columns.tolist()}")

# 3.1 Remove exact duplicate rows and report how many went away
before_dups = len(df)
df = df.drop_duplicates()
after_dups = len(df)
print(f"Duplicates removed: {before_dups - after_dups}")

# 3.2 Handle missing values:
# rating: may have '--'
# rating_count: e.g. "Too Few Ratings", "50+ ratings"
# cost: e.g. '₹ 200'

# Clean rating: convert '--' to NaN, then to float
def clean_rating(x):
    """Convert a raw rating value to a float.

    The value is stringified and stripped first. The placeholder "--" and the
    empty string mean "no rating" and give NaN, as does anything that cannot
    be parsed as a number.
    """
    missing_markers = ("--", "")
    try:
        text = str(x).strip()
        return np.nan if text in missing_markers else float(text)
    except Exception:
        # Best effort: an unparseable rating is treated as missing.
        return np.nan

# Overwrite the raw rating strings in place with their parsed float values
# (NaN where no rating is available); the column keeps the name "rating".
df["rating"] = df["rating"].map(clean_rating)

# Clean rating_count: extract leading number
import re
def clean_rating_count(x):
    """Parse a rating-count label into the integer it states.

    Examples: "50+ ratings" -> 50, "100+ ratings" -> 100, "1K+ ratings" -> 1000.

    Parameters
    ----------
    x : any
        Raw value of the rating_count column; it is stringified before parsing.

    Returns
    -------
    int or float
        The first integer found in the text, scaled by 1,000 or 1,000,000 when
        it is immediately followed by a standalone "K"/"M" suffix. NaN is
        returned for "Too Few ..." labels, blank text and text without digits.
    """
    text = str(x)
    if "Too Few" in text or text.strip() == "":
        return np.nan

    # Group 1: the leading integer. Group 2: an optional magnitude suffix that
    # must end at a word boundary, so "1K+" scales but "5kg" or "3min" do not.
    # Without the suffix handling "1K+ ratings" would be read as 1 and would
    # rank below "50+ ratings".
    match = re.search(r"(\d+)([KkMm]\b)?", text)
    if match is None:
        return np.nan

    suffix = (match.group(2) or "").lower()
    multiplier = {"k": 1_000, "m": 1_000_000}.get(suffix, 1)
    return int(match.group(1)) * multiplier

# Overwrite the raw rating-count labels in place with their parsed numeric
# values (NaN where no count is available); the column keeps its name.
df["rating_count"] = df["rating_count"].map(clean_rating_count)

# Clean cost: remove currency and commas
def clean_cost(x):
    """Convert a raw cost value such as '₹ 200' to a float.

    The rupee sign and thousands separators are removed and the remainder is
    stripped. An empty remainder or text that is not a number gives NaN.
    """
    text = str(x)
    for symbol in ("₹", ","):
        text = text.replace(symbol, "")
    text = text.strip()

    if not text:
        return np.nan
    try:
        return float(text)
    except Exception:
        # Best effort: an unparseable cost is treated as missing.
        return np.nan

# Overwrite the raw cost strings in place with their parsed numeric values.
df["cost"] = df["cost"].map(clean_cost)

# The three numeric columns were cleaned in place above, so they keep their
# original names; these are the numeric fields used for modelling.
num_cols = ["rating", "rating_count", "cost"]

# Basic missing handling: a row with none of the three numeric values is dropped
before_na = len(df)
df = df.dropna(subset=num_cols, how="all")
after_na = len(df)
print(f"Rows dropped where all numeric fields missing: {before_na - after_na}")

# Remaining gaps are filled with the median of their own column
medians = df[num_cols].median()
for col in num_cols:
    df[col] = df[col].fillna(medians[col])

# Final cleaning: every remaining row must have a city, a cuisine and a name
df = df.dropna(subset=["city", "cuisine", "name"])
print(f"Shape after final cleaning: {df.shape}")

# Save cleaned_data.csv (descriptive columns included, index column omitted)
cleaned_path = os.path.join(data_dir, "cleaned_data.csv")
df.to_csv(cleaned_path, index=False)
print(f"Saved cleaned data to: {cleaned_path}")

Columns after drop: ['id', 'name', 'city', 'rating', 'rating_count', 'cost', 'cuisine']
Duplicates removed: 0
Rows dropped where all numeric fields missing: 115
Shape after final cleaning: (148414, 7)
Saved cleaned data to: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\data\cleaned_data.csv


In [22]:
# === 4. Preprocessing: MultiLabelBinarizer for cuisine, and encoding city ===

# cuisine is multi-label string like "Beverages,Pizzas"
# Use MultiLabelBinarizer instead of OneHotEncoder

# 4.1 Encoder for the multi-label cuisine column
mlb = MultiLabelBinarizer()

# Encode from a copy so the cleaned frame `df` is not touched by this section.
df_prep = df.copy()

def split_cuisine(val):
    """Split a comma-separated cuisine string into a list of labels.

    "North Indian,Chinese" -> ["North Indian", "Chinese"]. Each label is
    stripped of surrounding whitespace and empty pieces are discarded. A
    missing value (as judged by pd.isna) gives an empty list.
    """
    if pd.isna(val):
        return []

    labels = []
    for piece in str(val).split(","):
        label = piece.strip()
        if label != "":
            labels.append(label)
    return labels

# Turn every cuisine string into a list of labels, then binarize the labels
# into one indicator column per cuisine, named "cuisine_<label>".
cuisine_lists = df_prep["cuisine"].apply(split_cuisine)
cuisine_encoded = mlb.fit_transform(cuisine_lists)
cuisine_df = pd.DataFrame(
    cuisine_encoded,
    index=df_prep.index,
    columns=[f"cuisine_{label}" for label in mlb.classes_],
)
print(f"Cuisine encoded shape: {cuisine_df.shape}")

# 4.2 City encoding: one indicator column per city via pandas.get_dummies
city_dummies = pd.get_dummies(df_prep["city"], prefix="city")
print(f"City encoded shape: {city_dummies.shape}")

# 4.3 Combine numeric + city + cuisine, in that column order
numeric_df = df_prep[["rating", "rating_count", "cost"]]
encoded_df = pd.concat([numeric_df, city_dummies, cuisine_df], axis=1)
print(f"Encoded data shape: {encoded_df.shape}")

# The encoded rows must still carry the same index labels as the cleaned rows
assert (encoded_df.index == df_prep.index).all(), "Index mismatch between cleaned and encoded data"

# Write the feature matrix next to the cleaned data (index column omitted)
encoded_path = os.path.join(data_dir, "encoded_data.csv")
encoded_df.to_csv(encoded_path, index=False)
print(f"Saved encoded data to: {encoded_path}")

# Save encoder (mlb) for use in Streamlit app
mlb_path = os.path.join(models_dir, "mlb_cuisine.pkl")
with open(mlb_path, "wb") as f:
    pickle.dump(mlb, f)
print(f"Saved MultiLabelBinarizer to: {mlb_path}")


Cuisine encoded shape: (148414, 126)
City encoded shape: (148414, 821)
Encoded data shape: (148414, 950)
Saved encoded data to: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\data\encoded_data.csv
Saved MultiLabelBinarizer to: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\models\mlb_cuisine.pkl


In [23]:
# === 5. Feature scaling ===

# Fit the scaler on the full encoded matrix, then apply it to the same data.
scaler = StandardScaler().fit(encoded_df)
X_scaled = scaler.transform(encoded_df)

# Persist the fitted scaler alongside the other model artefacts.
scaler_path = os.path.join(models_dir, "scaler.pkl")
with open(scaler_path, "wb") as f:
    pickle.dump(scaler, f)

print(f"Scaler fitted and saved to: {scaler_path}")
print(f"Scaled feature matrix shape: {X_scaled.shape}")

Scaler fitted and saved to: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\models\scaler.pkl
Scaled feature matrix shape: (148414, 950)


In [24]:
# === 6. Unsupervised recommendation engine: KMeans + cosine similarity / nearest neighbors ===

# Choose number of clusters (heuristic; you could do elbow method)
# Number of clusters is a hand-picked heuristic (an elbow analysis could
# refine it); adjust if needed.
n_clusters = 20

kmeans = KMeans(
    n_clusters=n_clusters,
    n_init=10,
    random_state=RANDOM_STATE,
).fit(X_scaled)

kmeans_path = os.path.join(models_dir, "kmeans_model.pkl")
with open(kmeans_path, "wb") as f:
    pickle.dump(kmeans, f)
print(f"KMeans trained with {n_clusters} clusters and saved to: {kmeans_path}")

# Optional: cosine-distance neighbour search over the same scaled features
nn_model = NearestNeighbors(algorithm="brute", metric="cosine").fit(X_scaled)

nn_path = os.path.join(models_dir, "nn_model.pkl")
with open(nn_path, "wb") as f:
    pickle.dump(nn_model, f)
print(f"NearestNeighbors model saved to: {nn_path}")


KMeans trained with 20 clusters and saved to: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\models\kmeans_model.pkl
NearestNeighbors model saved to: d:\GuviDatascienceCourse\Project\4thProject(Swiggy’s Restaurant Recommendation System using Streamlit\ProjectTopicsWise\Streamlit\models\nn_model.pkl


In [25]:
# === 7. Helper function inside notebook (for testing) ===

def recommend_restaurants(
    city: str,
    min_rating: float,
    max_cost: float,
    preferred_cuisines: list,
    top_n: int = 10
):
    """
    Recommend up to ``top_n`` restaurants that satisfy the given filters.

    1) Filter the cleaned data on city, rating, cost and cuisine.
    2) Scale the encoded features of the surviving rows and rank them by
       cosine distance to their own centroid, which acts as a pseudo
       "user profile".

    Uses the notebook-level objects ``df`` (cleaned data), ``encoded_df``
    (encoded features, row-aligned with ``df``) and ``scaler``.

    Parameters
    ----------
    city : str
        Exact value of the ``city`` column to keep.
    min_rating : float
        Inclusive lower bound on ``rating``.
    max_cost : float
        Inclusive upper bound on ``cost``.
    preferred_cuisines : list
        Cuisine labels; a row is kept when it has at least one of them.
        Labels without a matching ``cuisine_<label>`` column are ignored, so a
        list made up only of unknown labels matches nothing. An empty list
        (or None) disables the cuisine filter.
    top_n : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the columns id, name, city, rating,
        rating_count, cost and cuisine, nearest to the centroid first. An
        empty frame is returned (and a message printed) when nothing matches.

    Raises
    ------
    ValueError
        If rows match the filters but ``top_n`` is smaller than 1.
    """
    # A 0..n-1 index makes the positions from np.where usable as .loc labels.
    # encoded_df is only accessed by position, so it needs no copy or reindex.
    df_clean = df.reset_index(drop=True)

    mask = (df_clean["city"] == city)
    mask &= (df_clean["rating"] >= min_rating)
    mask &= (df_clean["cost"] <= max_cost)

    # Cuisine filter: at least one of the preferred cuisines should appear
    if preferred_cuisines:
        known_cols = [
            f"cuisine_{pc}"
            for pc in preferred_cuisines
            if f"cuisine_{pc}" in encoded_df.columns
        ]
        # With no known column the selection has zero columns and any(axis=1)
        # is False for every row, i.e. nothing matches.
        cuisine_mask = (encoded_df[known_cols].to_numpy() == 1).any(axis=1)
        mask &= cuisine_mask

    filtered_idx = np.where(mask)[0]
    if len(filtered_idx) == 0:
        print("No restaurants found with given filters.")
        return df_clean.head(0)

    if top_n < 1:
        raise ValueError(f"top_n must be at least 1, got {top_n!r}")

    # Scaling is row-wise, so transforming only the candidate rows gives the
    # same values as slicing a fully transformed matrix, at a fraction of the cost.
    X_sub = scaler.transform(encoded_df.iloc[filtered_idx])

    nn = NearestNeighbors(metric="cosine", algorithm="brute")
    nn.fit(X_sub)

    # Query: the centroid of the candidates acts as a pseudo "user profile"
    centroid = X_sub.mean(axis=0).reshape(1, -1)
    _, indices = nn.kneighbors(centroid, n_neighbors=min(top_n, len(filtered_idx)))

    # Map positions within the candidate subset back to positions in df_clean
    result_indices = filtered_idx[indices[0]]
    cols_to_show = [
        "id", "name", "city",
        "rating", "rating_count", "cost",
        "cuisine"
    ]
    return df_clean.loc[result_indices, cols_to_show]

# Quick smoke test: the city with the most rows, no cuisine filter
sample_city = df["city"].value_counts().idxmax()
print(f"Sample city: {sample_city}")

test_recos = recommend_restaurants(
    sample_city,
    min_rating=3.5,
    max_cost=400,
    preferred_cuisines=[],
    top_n=5,
)
test_recos

Sample city: Bikaner


Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine
28884,461143,Paratha Party,Bikaner,4.0,50.0,250.0,North Indian
28667,234087,The Parivar Food,Bikaner,4.0,50.0,250.0,North Indian
28247,266809,Mr Curry Singh,Bikaner,4.0,50.0,250.0,North Indian
28871,318926,Dhaba by taj,Bikaner,4.0,50.0,300.0,North Indian
29552,309545,ROYAL HAVELI,Bikaner,4.0,50.0,300.0,North Indian
