In [None]:
# Data Loading

In [10]:
import pandas as pd

df = pd.read_csv("data/swiggy.csv")
print(df.shape)
print(df.head())


(148541, 11)
       id               name    city rating     rating_count   cost  \
0  567335     AB FOODS POINT  Abohar     --  Too Few Ratings  ₹ 200   
1  531342  Janta Sweet House  Abohar    4.4      50+ ratings  ₹ 200   
2  158203  theka coffee desi  Abohar    3.8     100+ ratings  ₹ 100   
3  187912          Singh Hut  Abohar    3.7      20+ ratings  ₹ 250   
4  543530      GRILL MASTERS  Abohar     --  Too Few Ratings  ₹ 250   

                      cuisine          lic_no  \
0            Beverages,Pizzas  22122652000138   
1               Sweets,Bakery  12117201000112   
2                   Beverages  22121652000190   
3            Fast Food,Indian  22119652000167   
4  Italian-American,Fast Food  12122201000053   

                                                link  \
0  https://www.swiggy.com/restaurants/ab-foods-po...   
1  https://www.swiggy.com/restaurants/janta-sweet...   
2  https://www.swiggy.com/restaurants/theka-coffe...   
3  https://www.swiggy.com/restaurants/sin

In [None]:
# Data Cleaning

In [None]:
import numpy as np
import pandas as pd
import re

# 1) Clean COST
df['cost'] = df['cost'].astype(str).str.replace(r'[^0-9]', '', regex=True)
df['cost'] = pd.to_numeric(df['cost'], errors='coerce').astype('Int64')

# 2) Clean RATING_COUNT
def clean_rating_count(x):
    if pd.isna(x):
        return 0

    x = str(x).strip()

    # "Too Few Ratings" -> 5
    if "Too Few" in x:
        return 5

    # "1K+ ratings", "5K+ ratings" etc.
    if "K" in x.upper():
        nums = re.findall(r'\d+', x)
        if nums:
            return int(nums[0]) * 1000

    # "50+ ratings", "100+ ratings" etc.
    nums = re.findall(r'\d+', x)
    if nums:
        return int(nums[0])

    return 0

df['rating_count'] = df['rating_count'].apply(clean_rating_count).astype('Int64')

# 3) Clean RATING (city + cuisine + cost)


# Convert "--" and "" to NaN
df['rating'] = df['rating'].replace(["--", ""], np.nan)
df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

# Level 1: Fill using (city, cuisine, cost)
mean_lvl1 = df.groupby(['city', 'cuisine', 'cost'])['rating'].transform('mean')
df['rating'] = df['rating'].fillna(mean_lvl1)

# Level 2: Fill remaining NaN using (city, cuisine)
mean_lvl2 = df.groupby(['city', 'cuisine'])['rating'].transform('mean')
df['rating'] = df['rating'].fillna(mean_lvl2)

# Level 3: Fill remaining NaN using (city)
mean_lvl3 = df.groupby('city')['rating'].transform('mean')
df['rating'] = df['rating'].fillna(mean_lvl3)

df['rating'] = df['rating'].fillna(df['rating'].median())


# Round final ratings
df['rating'] = df['rating'].round(2)

# Make sure cuisine is string
df['cuisine'] = df['cuisine'].astype(str)

# 1) Build a mask for rows where cuisine looks like a time / time range
time_mask = (
    df['cuisine'].str.contains(r'\d{1,2}\s*[:.]\s*\d{2}', case=False, na=False) |  # 8:15, 10.30 etc
    df['cuisine'].str.contains(r'\b(am|pm)\b', case=False, na=False) |           # AM / PM
    df['cuisine'].str.contains(r'\bto\b', case=False, na=False)                  # "to" in time ranges
)

# 2) Check how many such rows exist (just to see)
print("Rows with time-like cuisine:", time_mask.sum())

# 3) Drop those rows
df = df[~time_mask].reset_index(drop=True)




In [None]:
# Data Preprocessing

In [17]:
df = pd.read_csv(r"C:\Users\priya\OneDrive\Desktop\projects\Swiggy-Restaurant-Recommendation-System\data\swiggy_cleaned.csv")
print(df.shape)

(148245, 11)


In [18]:
categorical_cols = ['city', 'cuisine']
numerical_cols = ['rating', 'rating_count', 'cost']


In [19]:
# Split cuisine into list since having multiple values

df['cuisine'] = df['cuisine'].apply(
    lambda x: ','.join(x) if isinstance(x, list) else x
)
# One hot encoding

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_cols = ['city', 'cuisine']
numerical_cols = ['rating', 'rating_count', 'cost']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
) 
#  Fit & Transform Data
encoded_data = preprocessor.fit_transform(df[categorical_cols + numerical_cols])



In [20]:
#Convert to DataFrame (INDEX MATCH TO CLEANED DATA)
#Sparse DataFrame (INDEX SAFE)
encoded_df = pd.DataFrame.sparse.from_spmatrix(
    encoded_data,
    index=df.index
)




In [21]:
#Save the sparse file
from scipy.sparse import save_npz
save_npz("encoded_data.npz", encoded_data)


In [22]:
# Pickle file
import pickle
with open("encoder.pkl", "wb") as f:
    pickle.dump(preprocessor, f)


In [None]:
# Model Training

In [3]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
# Load cleaned (non-encoded) data
cleaned_df = pd.read_csv("data/swiggy_cleaned.csv")

# Load encoded + normalized data
encoded_data = load_npz("encoded_data.npz")


In [5]:
print(cleaned_df.shape)
print(encoded_data.shape)

(148245, 11)
(148245, 2952)


In [49]:
N_CLUSTERS = 20


In [None]:
from sklearn.cluster import MiniBatchKMeans

kmeans_model = MiniBatchKMeans(
    n_clusters=N_CLUSTERS,
    random_state=42,
    batch_size=1024
)

# MODEL TRAINING 
cluster_labels = kmeans_model.fit_predict(encoded_data)


In [54]:
cleaned_df['cluster'] = cluster_labels


In [55]:
import pickle

with open("kmeans_model.pkl", "wb") as f:
    pickle.dump(kmeans_model, f)


In [56]:
cleaned_df.to_csv("data/swiggy_with_clusters.csv", index=False)


In [57]:
def recommend_hotels_kmeans(
    city,
    top_n=10
):
    # Filter by city
    city_df = cleaned_df[
        cleaned_df['city'].str.lower() == city.lower()
    ]

    if city_df.empty:
        return "No restaurants found for this city"

    # Find dominant cluster in this city
    dominant_cluster = city_df['cluster'].mode()[0]

    # Recommend from same cluster & city
    recommendations = cleaned_df[
        (cleaned_df['cluster'] == dominant_cluster) &
        (cleaned_df['city'].str.lower() == city.lower())
    ]

    # Rank by quality & popularity
    recommendations = recommendations.sort_values(
        by=['rating', 'rating_count'],
        ascending=False
    )

    return recommendations[
        ['name', 'city', 'cuisine', 'rating', 'rating_count', 'cost']
    ].head(top_n)


In [60]:
recommend_hotels("Abohar", top_n=5)


Unnamed: 0,name,cuisine,rating,rating_count,cost
19,FOODY MOOD,"Fast Food,Chinese",4.7,20,300
14,Bharawan Da Dhaba,Indian,4.4,50,300
1,Janta Sweet House,"Sweets,Bakery",4.4,50,200
29,Domino's Pizza,Pizzas,4.4,20,400
18,Shri Balaji fast food and Variety store,Indian,4.4,5,100
