# Swiggy Recommendation Methodology
This notebook shows how to build a recommendation system using:
1. K-Means Clustering
2. Cosine Similarity (Nearest Neighbors)

It uses `cleaned_data.csv` for mapping and `numerical_data.csv` as feature matrix.

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors


In [2]:
# Load the restaurant metadata (used to show readable results) and the
# numeric feature matrix (used for clustering and nearest-neighbour search).
df_cleaned = pd.read_csv("cleaned_data.csv")
df_num = pd.read_csv("numerical_data.csv")

# Every later cell pairs row i of df_num with row i of df_cleaned, so a
# row-count mismatch would attach recommendations to the wrong restaurants.
# Fail loudly here instead of producing misleading output further down.
if len(df_cleaned) != len(df_num):
    raise ValueError(
        f"Row count mismatch: cleaned_data.csv has {len(df_cleaned)} rows, "
        f"numerical_data.csv has {len(df_num)} rows"
    )

print("Cleaned data shape:", df_cleaned.shape)
print("Numerical feature matrix shape:", df_num.shape)

df_cleaned.head()

Cleaned data shape: (148442, 10)
Numerical feature matrix shape: (148442, 3)


Unnamed: 0,id,name,city,rating,rating_count,cost,cuisine,link,address,menu
0,567335,AB FOODS POINT,Abohar,4.0,50.0,200.0,"Beverages,Pizzas",https://www.swiggy.com/restaurants/ab-foods-po...,"AB FOODS POINT, NEAR RISHI NARANG DENTAL CLINI...",Menu/567335.json
1,531342,Janta Sweet House,Abohar,4.4,50.0,200.0,"Sweets,Bakery",https://www.swiggy.com/restaurants/janta-sweet...,"Janta Sweet House, Bazar No.9, Circullar Road,...",Menu/531342.json
2,158203,theka coffee desi,Abohar,3.8,100.0,100.0,Beverages,https://www.swiggy.com/restaurants/theka-coffe...,"theka coffee desi, sahtiya sadan road city",Menu/158203.json
3,187912,Singh Hut,Abohar,3.7,20.0,250.0,"Fast Food,Indian",https://www.swiggy.com/restaurants/singh-hut-n...,"Singh Hut, CIRCULAR ROAD NEAR NEHRU PARK ABOHAR",Menu/187912.json
4,543530,GRILL MASTERS,Abohar,4.0,50.0,250.0,"Italian-American,Fast Food",https://www.swiggy.com/restaurants/grill-maste...,"GRILL MASTERS, ADA Heights, Abohar - Hanumanga...",Menu/543530.json


In [3]:
# ===== K-Means Clustering =====
# Group the restaurants into k clusters using the numeric feature matrix.
k = 10  # number of clusters
kmeans = KMeans(n_clusters=k, random_state=42)  # fixed seed
cluster_labels = kmeans.fit(df_num).labels_

# Store each restaurant's cluster id next to its metadata
df_cleaned['cluster'] = cluster_labels

# Report how many restaurants landed in each cluster
print("Cluster distribution:", df_cleaned['cluster'].value_counts(), sep="\n")


Cluster distribution:
cluster
8    51918
0    36654
4    22901
6    19735
3     9541
2     2945
9     2235
5     2119
7      393
1        1
Name: count, dtype: int64


In [4]:
# Example: list restaurants that share a cluster with the one at row 0
idx = 0  # example index
restaurant = df_cleaned.iloc[idx]
cluster_label = restaurant['cluster']

print("Selected Restaurant:")
print(restaurant.loc[['name', 'city', 'cuisine', 'rating', 'cost', 'cluster']])

# Keep only rows whose cluster id matches the selected restaurant's
# (the selected restaurant itself is part of this subset)
similar_cluster = df_cleaned.loc[df_cleaned['cluster'].eq(cluster_label)]

print(f"Restaurants in same cluster (cluster {cluster_label}):")
similar_cluster.head(10)[['name', 'city', 'cuisine', 'rating', 'cost']]

Selected Restaurant:
name         AB FOODS POINT
city                 Abohar
cuisine    Beverages,Pizzas
rating                  4.0
cost                  200.0
cluster                   8
Name: 0, dtype: object
Restaurants in same cluster (cluster 8):


Unnamed: 0,name,city,cuisine,rating,cost
0,AB FOODS POINT,Abohar,"Beverages,Pizzas",4.0,200.0
1,Janta Sweet House,Abohar,"Sweets,Bakery",4.4,200.0
3,Singh Hut,Abohar,"Fast Food,Indian",3.7,250.0
4,GRILL MASTERS,Abohar,"Italian-American,Fast Food",4.0,250.0
5,Sam Uncle,Abohar,Continental,3.6,200.0
9,yummy hub,Abohar,Indian,4.0,200.0
12,Swastik Dhaba,Abohar,North Indian,4.0,200.0
16,Roll Express,Abohar,Fast Food,4.0,200.0
17,wah ji waah veg and non veg corner,Abohar,"North Indian,Chinese",4.0,200.0
22,Royal Chicken,Abohar,"Mughlai,North Indian",4.2,200.0


In [5]:
# ===== Cosine Similarity (Nearest Neighbors) =====
# Index every restaurant's numeric features for cosine-distance lookups
nn_model = NearestNeighbors(n_neighbors=6, metric='cosine').fit(df_num)

# Ask for 6 neighbours of restaurant 0: one slot allows for the restaurant
# itself, which is filtered out below, leaving at most 5 recommendations
idx = 0
distances, indices = nn_model.kneighbors(df_num.iloc[[idx]], n_neighbors=6)
recommended_indices = list(filter(lambda i: i != idx, indices[0]))[:5]

print("Top 5 similar restaurants (Cosine Similarity):")
df_cleaned[['name', 'city', 'cuisine', 'rating', 'cost']].iloc[recommended_indices]


Top 5 similar restaurants (Cosine Similarity):


Unnamed: 0,name,city,cuisine,rating,cost
148441,Lazeez kitchen,Yavatmal,Pizzas,4.0,200.0
148437,The Food Delight,Yavatmal,"Fast Food,Snacks",4.0,200.0
25,PubG Cafe,Abohar,"Chinese,Pizzas",4.0,200.0
30,Joker Cafe,Abohar,Indian,4.0,200.0
34,Bhatti Ki Rasoi,Abohar,North Indian,4.0,200.0


In [6]:
# Persist both fitted models so they can be reloaded without retraining
for _filename, _model in (("kmeans_model.pkl", kmeans), ("nn_model.pkl", nn_model)):
    with open(_filename, "wb") as f:
        pickle.dump(_model, f)

print("Saved kmeans_model.pkl and nn_model.pkl")

Saved kmeans_model.pkl and nn_model.pkl


In [7]:

# Start again from the files on disk. The freshly loaded df_cleaned does not
# carry the 'cluster' column that was added earlier in the notebook.
df_cleaned, df_num = (
    pd.read_csv(csv_name) for csv_name in ("cleaned_data.csv", "numerical_data.csv")
)

# Restore the all-restaurants nearest-neighbour model pickled above (it was
# fitted on df_num). Only unpickle files you created yourself: pickle.load
# can execute arbitrary code from an untrusted file.
with open("nn_model.pkl", "rb") as f:
    nn_model_all = pickle.load(f)

In [8]:


# Example: the user picked the restaurant at row 0 of df_cleaned;
# remember its city so recommendations can be limited to that city
idx = 0
selected_city = df_cleaned.at[idx, "city"]
print(f"Selected Restaurant is in {selected_city}")

Selected Restaurant is in Abohar


In [9]:


# STEP A: Row positions (0..N-1) of every restaurant in the selected city
city_mask = df_cleaned["city"] == selected_city
indices_in_city = np.where(city_mask)[0]

# STEP B: Extract the numeric rows for only that city.
# np.where yields positions, so select with .iloc rather than label-based .loc.
df_num_city = df_num.iloc[indices_in_city].reset_index(drop=True)

# STEP C: Build a new NearestNeighbors on that city-only subset.
# kneighbors raises ValueError when asked for more neighbours than there are
# fitted rows, so cap the request for cities with fewer than 6 restaurants.
n_neighbors_city = min(6, len(df_num_city))
nn_model_city = NearestNeighbors(n_neighbors=n_neighbors_city, metric="cosine")
nn_model_city.fit(df_num_city)

# STEP D: Find the position of idx within indices_in_city
# Because idx is a global position (0..N-1), find its position in the city subset
pos_in_city = np.where(indices_in_city == idx)[0][0]

# STEP E: Query for neighbors
distances, city_indices = nn_model_city.kneighbors(
    df_num_city.iloc[[pos_in_city]], n_neighbors=n_neighbors_city
)

# STEP F: Map those city-local positions (0..M-1) back to df_cleaned rows.
# The restaurant itself is removed by position rather than by assuming it is
# the first hit: rows with identical features tie at distance 0.
recommended_local_positions = [p for p in city_indices[0] if p != pos_in_city][:5]
recommended_indices = indices_in_city[recommended_local_positions]

print("Recommended restaurants in the same city:")
print(df_cleaned.iloc[recommended_indices][["name", "city", "cuisine", "rating", "cost"]])


Recommended restaurants in the same city:
                                  name    city               cuisine  rating  \
9                            yummy hub  Abohar                Indian     4.0   
12                       Swastik Dhaba  Abohar          North Indian     4.0   
17  wah ji waah veg and non veg corner  Abohar  North Indian,Chinese     4.0   
25                           PubG Cafe  Abohar        Chinese,Pizzas     4.0   
30                          Joker Cafe  Abohar                Indian     4.0   

     cost  
9   200.0  
12  200.0  
17  200.0  
25  200.0  
30  200.0  


In [10]:
def recommend_same_city(df_cleaned, df_num, chosen_idx=0, n_recs=5):
    """
    Recommend restaurants from the same city as the chosen restaurant.

    Candidates are the rows of ``df_cleaned`` whose ``city`` equals the chosen
    restaurant's city. They are ranked by cosine distance between the
    matching rows of ``df_num``.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Restaurant metadata; must contain a ``city`` column.
    df_num : pandas.DataFrame
        Numeric feature matrix whose i-th row describes the i-th row of
        ``df_cleaned``.
    chosen_idx : int, default 0
        Row position of the chosen restaurant. With the default integer index
        created by ``pd.read_csv`` this equals the row label.
    n_recs : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list or numpy.ndarray
        Row positions of the recommended restaurants. An empty list when the
        chosen restaurant is the only one in its city; fewer than ``n_recs``
        entries when the city has fewer other restaurants.
    """
    chosen_city = df_cleaned["city"].iloc[chosen_idx]
    city_mask = (df_cleaned["city"] == chosen_city).values
    indices_in_city = np.where(city_mask)[0]
    if len(indices_in_city) <= 1:
        return []

    # np.where yields row positions, so select with .iloc (not label-based .loc)
    df_num_city = df_num.iloc[indices_in_city].reset_index(drop=True)

    # kneighbors raises ValueError when asked for more neighbours than there
    # are fitted rows, so cap the request at the number of rows in the city.
    n_neighbors = min(n_recs + 1, len(indices_in_city))
    nn_city = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
    nn_city.fit(df_num_city)

    pos_in_city = np.where(indices_in_city == chosen_idx)[0][0]
    _, neighbors_local = nn_city.kneighbors(
        df_num_city.iloc[[pos_in_city]], n_neighbors=n_neighbors
    )

    # Remove the chosen restaurant by position; rows with identical features
    # tie at distance 0, so it is not guaranteed to be the first hit.
    recs_local = [r for r in neighbors_local[0] if r != pos_in_city][:n_recs]
    return indices_in_city[recs_local]


In [12]:
# Show the chosen restaurant, then its closest matches from the same city
idx = 0
display_cols = ["name", "city", "cuisine", "rating", "cost"]

print("User-selected restaurant:")
display(df_cleaned.loc[[idx], display_cols])

recommended_indices = recommend_same_city(df_cleaned, df_num, chosen_idx=idx, n_recs=5)

print("\nTop 5 same-city recommendations:")
display(df_cleaned.loc[recommended_indices, display_cols])

User-selected restaurant:


Unnamed: 0,name,city,cuisine,rating,cost
0,AB FOODS POINT,Abohar,"Beverages,Pizzas",4.0,200.0



Top 5 same-city recommendations:


Unnamed: 0,name,city,cuisine,rating,cost
9,yummy hub,Abohar,Indian,4.0,200.0
12,Swastik Dhaba,Abohar,North Indian,4.0,200.0
17,wah ji waah veg and non veg corner,Abohar,"North Indian,Chinese",4.0,200.0
25,PubG Cafe,Abohar,"Chinese,Pizzas",4.0,200.0
30,Joker Cafe,Abohar,Indian,4.0,200.0


In [15]:
def recommend_same_city_and_cuisine(df_cleaned, df_num, chosen_idx=0, n_recs=5):
    """
    Recommend restaurants sharing both city and cuisine with the chosen one.

    A candidate matches only when its ``cuisine`` value is exactly equal to
    the chosen restaurant's ``cuisine`` string. A comma-separated value such
    as "Beverages,Pizzas" is compared as a whole, not tag by tag. Matching
    candidates are ranked by cosine distance between their ``df_num`` rows.

    Parameters
    ----------
    df_cleaned : pandas.DataFrame
        Restaurant metadata; must contain ``city`` and ``cuisine`` columns.
    df_num : pandas.DataFrame
        Numeric feature matrix whose i-th row describes the i-th row of
        ``df_cleaned``.
    chosen_idx : int, default 0
        Row position of the chosen restaurant. With the default integer index
        created by ``pd.read_csv`` this equals the row label.
    n_recs : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list or numpy.ndarray
        Row positions of the recommended restaurants. An empty list when no
        other restaurant matches; fewer than ``n_recs`` entries when fewer
        other restaurants match.
    """
    # 1) Look up city & cuisine for the chosen restaurant
    chosen_city = df_cleaned["city"].iloc[chosen_idx]
    chosen_cuisine = df_cleaned["cuisine"].iloc[chosen_idx]

    # 2) Build a mask for (same city) AND (identical cuisine string)
    mask_same_city = (df_cleaned["city"] == chosen_city)
    mask_same_cuis = (df_cleaned["cuisine"] == chosen_cuisine)
    mask = (mask_same_city & mask_same_cuis).values
    indices_sub = np.where(mask)[0]    # row positions of matching rows

    # 3) Nothing to recommend when only the chosen restaurant (or nothing) matches
    if len(indices_sub) <= 1:
        return []

    # 4) Subset the numeric features to only those rows.
    #    np.where yields positions, so select with .iloc (not label-based .loc)
    df_num_sub = df_num.iloc[indices_sub].reset_index(drop=True)

    # 5) Build & fit a new NearestNeighbors on this filtered subset.
    #    kneighbors raises ValueError when asked for more neighbours than
    #    there are fitted rows, so cap the request at the subset size.
    n_neighbors = min(n_recs + 1, len(indices_sub))
    nn_sub = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine")
    nn_sub.fit(df_num_sub)

    # 6) Find the local position of chosen_idx within indices_sub
    pos_in_sub = np.where(indices_sub == chosen_idx)[0][0]

    # 7) Query for neighbors and drop the chosen restaurant by position
    #    (rows with identical features tie, so it may not be the first hit)
    _, neighbors_local = nn_sub.kneighbors(
        df_num_sub.iloc[[pos_in_sub]], n_neighbors=n_neighbors
    )
    recs_local = [r for r in neighbors_local[0] if r != pos_in_sub][:n_recs]

    # 8) Map those local positions back to positions in the full frame
    return indices_sub[recs_local]

In [16]:

# Pick a restaurant to test with, e.g. the one at row 9
idx = 9
display_cols = ["name", "city", "cuisine", "rating", "cost"]

# Ask for up to five matches that share its city and cuisine
recommended_idxs = recommend_same_city_and_cuisine(df_cleaned, df_num, chosen_idx=idx, n_recs=5)

# Show the chosen restaurant followed by the recommendations
print("Chosen Restaurant:")
display(df_cleaned.loc[[idx], display_cols])

print("\nTop 5 same-city & same-cuisine recommendations:")
display(df_cleaned.loc[recommended_idxs, display_cols])

Chosen Restaurant:


Unnamed: 0,name,city,cuisine,rating,cost
9,yummy hub,Abohar,Indian,4.0,200.0



Top 5 same-city & same-cuisine recommendations:


Unnamed: 0,name,city,cuisine,rating,cost
30,Joker Cafe,Abohar,Indian,4.0,200.0
61,SHRI GANESH SWEET HOUSE,Abohar,Indian,4.0,200.0
14,Bharawan Da Dhaba,Abohar,Indian,4.4,300.0
35,SETHI PUNJABI RASOI,Abohar,Indian,4.0,300.0
26,Verma Dhaba,Abohar,Indian,4.0,300.0
