In [8]:
!pip install -q pandas scikit-learn numpy matplotlib seaborn joblib

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder
import joblib


In [14]:
# ---------------------------------------------
# STEP 1: LOAD THE DATASET
# ---------------------------------------------
file_path = "/content/Dataset  (1).csv"   # <-- CHANGE if your file has a different name

df = pd.read_excel(file_path)

In [15]:
df.info()
df.describe(include='all').T


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
Restaurant ID,9551.0,,,,9051128.349178,8791521.282104,53.0,301962.5,6004089.0,18352291.5,18500652.0
Restaurant Name,9551.0,7446.0,Cafe Coffee Day,83.0,,,,,,,
Country Code,9551.0,,,,18.365616,56.750546,1.0,1.0,1.0,1.0,216.0
City,9551.0,141.0,New Delhi,5473.0,,,,,,,
Address,9551.0,8918.0,"Dilli Haat, INA, New Delhi",11.0,,,,,,,
Locality,9551.0,1208.0,Connaught Place,122.0,,,,,,,
Locality Verbose,9551.0,1265.0,"Connaught Place, New Delhi",122.0,,,,,,,
Longitude,9551.0,,,,64.126574,41.467058,-157.948486,77.081343,77.191964,77.282006,174.832089
Latitude,9551.0,,,,25.854381,11.007935,-41.330428,28.478713,28.570469,28.642758,55.97698
Cuisines,9542.0,1825.0,North Indian,936.0,,,,,,,


In [16]:
# Missing values
df.isna().sum()

# Unique counts for categorical columns
for col in ['Cuisines','Price Range','Rating','Restaurant Name']:
    if col in df.columns:
        print(col, df[col].nunique())

# Quick plots
if 'Rating' in df.columns:
    sns.histplot(df['Rating'].dropna(), bins=10)
    plt.title('Rating distribution')
    plt.show()


Cuisines 1825
Restaurant Name 7446


In [17]:
# 1. Standardize column names to simple ones
df = df.rename(columns=lambda x: x.strip())

# 2. Fill missing cuisines with empty string
cuis_col = 'Cuisines' if 'Cuisines' in df.columns else 'Cuisine'
df[cuis_col] = df[cuis_col].fillna('').astype(str)

# 3. Process price: convert common formats to integers/buckets
if 'Price Range' in df.columns:
    # Example: if values like '$', '$$', '1','2' etc. map to integers
    def price_to_int(x):
        try:
            x = str(x).strip()
            if x.isdigit(): return int(x)
            if x.count('$')>0: return x.count('$')
            # fallback
            return np.nan
        except:
            return np.nan
    df['price_bucket'] = df['Price Range'].apply(price_to_int)
    # Fill missing with median bucket
    df['price_bucket'].fillna(int(df['price_bucket'].median()), inplace=True)
else:
    df['price_bucket'] = 0


In [18]:
def clean_cuisines(text):
    text = text.lower()
    # remove extra characters if needed
    parts = [c.strip() for c in text.split(',') if c.strip()!='']
    # deduplicate preserving order
    seen = set(); out=[]
    for p in parts:
        if p not in seen:
            seen.add(p); out.append(p)
    return " ".join(out)

df['cuisines_clean'] = df[cuis_col].apply(clean_cuisines)


In [19]:
df['combined_features'] = df['cuisines_clean'] + " " + df['price_bucket'].astype(str)
# Optionally add city/location or tags if available:
for col in ['Location', 'Category']:
    if col in df.columns:
        df['combined_features'] += " " + df[col].astype(str).str.lower()


In [20]:
tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
tfidf_matrix = tfidf.fit_transform(df['combined_features'])

# Cosine similarity (restaurant vs restaurant)
similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)


In [24]:
def recommend_restaurants(cuisine_pref, price_pref, top_n=5):
    # create user profile string (apply same cleaning)
    user_cuis = clean_cuisines(cuisine_pref)
    user_profile = user_cuis + " " + str(price_pref)
    user_vector = tfidf.transform([user_profile])
    scores = cosine_similarity(user_vector, tfidf_matrix).flatten()
    top_idx = scores.argsort()[::-1][:top_n]
    return df.iloc[top_idx][['Restaurant Name', 'cuisines_clean', 'Price range']].assign(score=scores[top_idx])

In [25]:
recommend_restaurants("Italian, Pizza", 2, top_n=10)


Unnamed: 0,Restaurant Name,cuisines_clean,Price range,score
166,Biaggi's Ristorante Italiano,italian pizza,3,1.0
228,Pie Slingers Pizzeria,italian pizza,1,1.0
146,Flatbread Neapolitan Pizzeria,italian pizza,2,1.0
255,Centro,italian pizza,3,1.0
8277,Pizza Hut Delivery,italian pizza,2,1.0
3243,NYC.PIE,italian pizza,2,1.0
8258,Affamato,italian pizza,1,1.0
286,Shot Tower Inn,italian pizza,2,1.0
674,Pizza Hut,italian pizza,3,1.0
1406,Baking Bad,italian pizza,3,1.0


In [27]:
# Example (requires user-item interactions dataset which you may not have)
# If you don't have user data, do manual checks + sample user tests.
df['combined_features_weighted'] = (df['cuisines_clean'] + ' ') * 3 + ' ' + df['price_bucket'].astype(str)
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_features_weighted'])


In [28]:
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(similarity_matrix, 'similarity_matrix.pkl')
df.to_csv('restaurants_processed.csv', index=False)
