In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the raw CSV into a DataFrame and preview the first rows
df = pd.read_csv(
    filepath_or_buffer='Dataset .csv',
    encoding='utf-8',
)
df.head(5)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.58445,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229


#### Some columns contain special (non-ASCII) characters, and some columns are irrelevant to our recommendation system.

#### We will handle these issues in the next steps

In [3]:
# Discard every row that has at least one missing value
df = df.dropna(axis='index', how='any')

In [4]:
# Collect the names of the columns in which at least one string value
# contains a character outside the ASCII range (code point > 127)

special_char_cols = [
    column_name
    for column_name in df.columns
    if df[column_name].apply(
        lambda cell: isinstance(cell, str) and not cell.isascii()
    ).any()
]

print(special_char_cols)

['Restaurant Name', 'City', 'Address', 'Locality', 'Locality Verbose', 'Cuisines', 'Currency']


In [5]:
# Build the working frame without the columns the recommender does not use

irrelevant_columns = [
    'Restaurant ID', 'Address', 'Locality', 'Locality Verbose',
    'Currency', 'Switch to order menu', 'Rating color', 'Rating text',
]
data = df.drop(columns=irrelevant_columns)
data.head(5)

Unnamed: 0,Restaurant Name,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Price range,Aggregate rating,Votes
0,Le Petit Souffle,162,Makati City,121.027535,14.565443,"French, Japanese, Desserts",1100,Yes,No,No,3,4.8,314
1,Izakaya Kikufuji,162,Makati City,121.014101,14.553708,Japanese,1200,Yes,No,No,3,4.5,591
2,Heat - Edsa Shangri-La,162,Mandaluyong City,121.056831,14.581404,"Seafood, Asian, Filipino, Indian",4000,Yes,No,No,4,4.4,270
3,Ooma,162,Mandaluyong City,121.056475,14.585318,"Japanese, Sushi",1500,No,No,No,4,4.9,365
4,Sambo Kojin,162,Mandaluyong City,121.057508,14.58445,"Japanese, Korean",1500,Yes,No,No,4,4.8,229


In [6]:
# Repeat the non-ASCII scan on the reduced frame: keep the name of every
# column where some string value has a character with code point > 127

sp_char_cols = [
    column_name
    for column_name in data.columns
    if data[column_name].apply(
        lambda cell: isinstance(cell, str) and not cell.isascii()
    ).any()
]

sp_char_cols

['Restaurant Name', 'City', 'Cuisines']

In [7]:
# Number of distinct restaurant names (missing values are not counted)
data['Restaurant Name'].nunique(dropna=True)

7437

In [8]:
# checking for special characters in the restaurant names
import re

def contains_special_characters(name):
    """Tell whether `name` has any character besides ASCII letters, digits or a space.

    Parameters
    ----------
    name : str
        Text to inspect.

    Returns
    -------
    bool
        True if at least one character falls outside ``A-Z``, ``a-z``,
        ``0-9`` and the space character; False otherwise (including for
        the empty string).
    """
    return re.search('[^A-Za-z0-9 ]', name) is not None

# Flag each restaurant name with contains_special_characters, then count the flagged ones
names_flagged = data['Restaurant Name'].map(contains_special_characters)
special_char_count = names_flagged.sum()

print(special_char_count)


2171


In [9]:
# Keep only the restaurants whose names consist solely of ASCII letters, digits
# and spaces. The pattern is the same one used by contains_special_characters
# (defined in the previous cell), so the helper is not redefined here.
# NOTE(review): the pattern also rejects ordinary punctuation (hyphens,
# apostrophes, '&', ...), not just mis-encoded characters - confirm that
# dropping those restaurants is intended.

# True for the rows to DROP (the name contains a special character);
# na=False treats a missing name as "no special character".
mask = data['Restaurant Name'].str.contains('[^A-Za-z0-9 ]', regex=True, na=False)

# .copy() makes `dt` an independent frame, so the column assignments in later
# cells modify `dt` itself rather than a slice of `data`.
dt = data[~mask].copy()

dt.head()

Unnamed: 0,Restaurant Name,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Price range,Aggregate rating,Votes
0,Le Petit Souffle,162,Makati City,121.027535,14.565443,"French, Japanese, Desserts",1100,Yes,No,No,3,4.8,314
1,Izakaya Kikufuji,162,Makati City,121.014101,14.553708,Japanese,1200,Yes,No,No,3,4.5,591
3,Ooma,162,Mandaluyong City,121.056475,14.585318,"Japanese, Sushi",1500,No,No,No,4,4.9,365
4,Sambo Kojin,162,Mandaluyong City,121.057508,14.58445,"Japanese, Korean",1500,Yes,No,No,4,4.8,229
5,Din Tai Fung,162,Mandaluyong City,121.056314,14.583764,Chinese,1000,No,No,No,3,4.4,336


In [10]:
# Scan the filtered frame: list every column in which some string value
# still contains a character with code point > 127

sp_char_cols = [
    column_name
    for column_name in dt.columns
    if dt[column_name].apply(
        lambda cell: isinstance(cell, str) and not cell.isascii()
    ).any()
]

sp_char_cols

['City']

In [11]:
# checking unique values in City column with special characters

def has_special_chars(text):
    """Report whether the string form of `text` contains a non-ASCII character.

    `text` is converted with ``str()`` first, so non-string values are
    accepted. Returns True if any character has a code point above 127,
    False otherwise.
    """
    return not str(text).isascii()

# One boolean per row: does the city name contain a non-ASCII character?
unique_with_special_chars = dt['City'].map(has_special_chars)

# Distinct city names among the flagged rows
flagged_cities = dt['City'][unique_with_special_chars]
city_unique_values = flagged_cities.unique()

city_unique_values

array(['Bras�_lia', 'S��o Paulo', '��stanbul'], dtype=object)

In [12]:
# removing special characters and normalizing text

import unicodedata

def normalize_text(column):
    """Return a copy of `column` with every string entry reduced to ASCII.

    For each string entry: underscores become spaces, ASCII characters are
    kept as they are, every other character is NFKD-decomposed and only the
    ASCII part of the decomposition is retained (so an accented letter keeps
    its base letter and an undecomposable character disappears), and the
    result is stripped of leading/trailing whitespace. Entries that are not
    strings are passed through unchanged.

    Parameters
    ----------
    column : pandas.Series
        Values to clean.

    Returns
    -------
    pandas.Series
        The element-wise cleaned values.
    """
    def to_ascii(char):
        # ASCII characters need no work
        if ord(char) < 128:
            return char
        decomposed = unicodedata.normalize('NFKD', char)
        return decomposed.encode('ascii', 'ignore').decode('utf-8')

    def normalize_entry(entry):
        if not isinstance(entry, str):
            return entry
        spaced = entry.replace('_', ' ')
        return ''.join(map(to_ascii, spaced)).strip()

    return column.apply(normalize_entry)

# Replace the City column with its ASCII-normalised version
cleaned_city = normalize_text(dt['City'])
dt['City'] = cleaned_city

# List the distinct city names after cleaning to verify the result
unique_cities = dt['City'].unique()

unique_cities

array(['Makati City', 'Mandaluyong City', 'Pasay City', 'Pasig City',
       'San Juan City', 'Santa Rosa', 'Tagaytay City', 'Taguig City',
       'Bras lia', 'Rio de Janeiro', 'So Paulo', 'Albany', 'Armidale',
       'Athens', 'Augusta', 'Balingup', 'Beechworth', 'Boise',
       'Cedar Rapids/Iowa City', 'Chatham-Kent', 'Clatskanie', 'Columbus',
       'Consort', 'Dalton', 'Davenport', 'Des Moines', 'Dicky Beach',
       'Dubuque', 'East Ballina', 'Fernley', 'Flaxton', 'Forrest',
       'Gainesville', 'Hepburn Springs', 'Huskisson', 'Inverloch',
       'Lakes Entrance', 'Lakeview', 'Lincoln', 'Lorn', 'Macon',
       'Mayfield', 'Mc Millan', 'Middleton Beach', 'Montville',
       'Ojo Caliente', 'Orlando', 'Palm Cove', 'Paynesville', 'Penola',
       'Pensacola', 'Phillip Island', 'Pocatello', 'Potrero', 'Princeton',
       'Rest of Hawaii', 'Savannah', 'Singapore', 'Sioux City',
       'Tampa Bay', 'Trentham East', 'Valdosta', 'Vernonia',
       'Victor Harbor', 'Vineland Station', 'W

In [13]:
# Restore the proper spelling of the city names that lost characters
# during normalisation (damaged name -> corrected name)

city_mapping = {}
city_mapping['Bras lia'] = 'Brasilia'
city_mapping['So Paulo'] = 'Sao Paulo'
city_mapping['stanbul'] = 'Istanbul'

dt['City'] = dt['City'].replace(to_replace=city_mapping)

# Distinct city names after the correction, for a visual check
mapped_cities = dt['City'].unique()

mapped_cities

array(['Makati City', 'Mandaluyong City', 'Pasay City', 'Pasig City',
       'San Juan City', 'Santa Rosa', 'Tagaytay City', 'Taguig City',
       'Brasilia', 'Rio de Janeiro', 'Sao Paulo', 'Albany', 'Armidale',
       'Athens', 'Augusta', 'Balingup', 'Beechworth', 'Boise',
       'Cedar Rapids/Iowa City', 'Chatham-Kent', 'Clatskanie', 'Columbus',
       'Consort', 'Dalton', 'Davenport', 'Des Moines', 'Dicky Beach',
       'Dubuque', 'East Ballina', 'Fernley', 'Flaxton', 'Forrest',
       'Gainesville', 'Hepburn Springs', 'Huskisson', 'Inverloch',
       'Lakes Entrance', 'Lakeview', 'Lincoln', 'Lorn', 'Macon',
       'Mayfield', 'Mc Millan', 'Middleton Beach', 'Montville',
       'Ojo Caliente', 'Orlando', 'Palm Cove', 'Paynesville', 'Penola',
       'Pensacola', 'Phillip Island', 'Pocatello', 'Potrero', 'Princeton',
       'Rest of Hawaii', 'Savannah', 'Singapore', 'Sioux City',
       'Tampa Bay', 'Trentham East', 'Valdosta', 'Vernonia',
       'Victor Harbor', 'Vineland Station', '

In [14]:
# Preview the first five rows of the cleaned frame
dt.head(5)

Unnamed: 0,Restaurant Name,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Price range,Aggregate rating,Votes
0,Le Petit Souffle,162,Makati City,121.027535,14.565443,"French, Japanese, Desserts",1100,Yes,No,No,3,4.8,314
1,Izakaya Kikufuji,162,Makati City,121.014101,14.553708,Japanese,1200,Yes,No,No,3,4.5,591
3,Ooma,162,Mandaluyong City,121.056475,14.585318,"Japanese, Sushi",1500,No,No,No,4,4.9,365
4,Sambo Kojin,162,Mandaluyong City,121.057508,14.58445,"Japanese, Korean",1500,Yes,No,No,4,4.8,229
5,Din Tai Fung,162,Mandaluyong City,121.056314,14.583764,Chinese,1000,No,No,No,3,4.4,336


In [15]:
# Preview the last five rows of the cleaned frame
dt.tail(5)

Unnamed: 0,Restaurant Name,Country Code,City,Longitude,Latitude,Cuisines,Average Cost for two,Has Table booking,Has Online delivery,Is delivering now,Price range,Aggregate rating,Votes
9538,Starbucks,208,Istanbul,29.043734,41.077696,Cafe,30,No,No,No,2,4.9,1042
9539,Valonia,208,Istanbul,29.002896,41.044813,"Restaurant Cafe, Desserts",80,No,No,No,3,4.2,874
9540,Draft Gastro Pub,208,Istanbul,29.074116,40.963935,Bar Food,130,No,No,No,4,4.9,522
9545,Baltazar,208,Istanbul,28.981103,41.025785,"Burger, Izgara",90,No,No,No,3,4.3,870
9548,Huqqa,208,Istanbul,29.03464,41.055817,"Italian, World Cuisine",170,No,No,No,4,3.7,661


In [16]:
# checking to ensure we have no more special characters
# Iterate over dt's own columns: the frame being verified is `dt`, so taking
# the column names from `data` only worked while both frames happened to
# share the same columns.
sp_char_cols = [
    col
    for col in dt.columns
    if dt[col].apply(lambda x: isinstance(x, str) and any(ord(c) > 127 for c in x)).any()
]

sp_char_cols

[]

#### We have handled the inconsistencies by dropping restaurants whose names contain special characters, normalizing the remaining text to ASCII, and correcting the affected city names. Now we can proceed with the recommendation system.

In [18]:
# One-hot encode cuisines

def _split_cuisines(value):
    """Turn a ', '-separated cuisine string into a list of trimmed names.

    Non-string values (e.g. the lists produced by an earlier run of this
    cell) are returned unchanged, which keeps the cell safe to re-run.
    """
    if isinstance(value, str):
        return [part.strip() for part in value.split(', ')]
    return value

dt['Cuisines'] = dt['Cuisines'].apply(_split_cuisines)
cuisines_encoded = dt['Cuisines'].explode().str.strip().unique()

# Creating binary columns for each cuisine
# The list items are trimmed above, so they compare equal to the trimmed
# names in `cuisines_encoded`.
cuisine_sets = dt['Cuisines'].apply(lambda names: set(names))
cuisine_flags = pd.DataFrame(
    {
        cuisine: cuisine_sets.apply(lambda names, c=cuisine: int(c in names))
        for cuisine in cuisines_encoded
    },
    index=dt.index,
)

# Attach all indicator columns in one concat instead of inserting them one by
# one (which fragments the frame). Columns left over from a previous run of
# this cell are dropped first so they are replaced, not duplicated.
dt = pd.concat(
    [dt.drop(columns=cuisine_flags.columns, errors='ignore'), cuisine_flags],
    axis=1,
)

In [19]:
# Encoding binary columns as numerical values
binary_columns = ['Has Table booking', 'Has Online delivery', 'Is delivering now']
for col in binary_columns:
    # Accept the raw 'Yes' label as well as an already-encoded 1, so that
    # re-running this cell keeps the flags instead of resetting them all to 0.
    # Every other value (e.g. 'No' or 0) maps to 0, as before.
    dt[col] = dt[col].apply(lambda x: 1 if x in ('Yes', 1) else 0)

In [20]:
# User preferences that drive the recommendation: the cuisines to match,
# the exact price range to keep, and the lowest acceptable aggregate rating
user_preferences = {
    'Preferred Cuisines': ['Cafe', 'Desserts'],
    'Price Range': 2,
    'Minimum Rating': 4.0,
}

In [21]:
# Indicator vector aligned with `cuisines_encoded`: 1.0 where the cuisine is
# one of the user's preferred cuisines, 0.0 elsewhere
preferred_cuisines = user_preferences['Preferred Cuisines']
user_cuisine_vector = np.array(
    [1.0 if name in preferred_cuisines else 0.0 for name in cuisines_encoded],
    dtype=float,
)

# Cuisine similarity = dot product of each restaurant's cuisine indicator row
# with the user vector, i.e. the number of preferred cuisines it serves
cuisine_matrix = dt[list(cuisines_encoded)].to_numpy()
dt['Cuisine Similarity'] = cuisine_matrix @ user_cuisine_vector

In [22]:
# Filtering restaurants based on price range and rating
price_matches = dt['Price range'] == user_preferences['Price Range']
rating_ok = dt['Aggregate rating'] >= user_preferences['Minimum Rating']

# .copy() gives an independent frame: the next cell adds a 'Final Score'
# column to it, and that must not be an assignment into a slice of `dt`.
dt_filtered = dt[price_matches & rating_ok].copy()

In [23]:
# Normalizing scores and computing the final score
similarity_max = dt_filtered['Cuisine Similarity'].max()
rating_max = dt_filtered['Aggregate rating'].max()

# Each component is divided by its maximum so that it lies in [0, 1].
# If that maximum is 0 (e.g. no restaurant serves a preferred cuisine) or NaN
# (no rows left after filtering), the division would turn every score into
# NaN and make the ranking meaningless, so the component contributes 0.
similarity_part = (
    dt_filtered['Cuisine Similarity'] / similarity_max if similarity_max > 0 else 0.0
)
rating_part = (
    dt_filtered['Aggregate rating'] / rating_max if rating_max > 0 else 0.0
)

# Equal weighting of cuisine match and rating
dt_filtered['Final Score'] = 0.5 * similarity_part + 0.5 * rating_part

In [24]:
# Rank by final score (highest first) and keep the top five
ranked = dt_filtered.sort_values(by='Final Score', ascending=False)
recommended_restaurants = ranked.head(5)

display_columns = ['Restaurant Name', 'City', 'Final Score']
print(recommended_restaurants[display_columns])


                 Restaurant Name         City  Final Score
2315                   Churrolto    Hyderabad     0.979592
18    Hobing Korean Dessert Cafe  Taguig City     0.959184
9452                    My Sugar    Cape Town     0.948980
9520             Turta Home Cafe       Ankara     0.938776
2396                French Toast        Kochi     0.938776


#### The recommendation system is now ready to provide personalized restaurant recommendations based on user preferences. Users can input their preferred cuisines, price range, and minimum rating, and the system will return a list of recommended restaurants that match their criteria.

In [26]:
# Second scenario for testing the recommendation system: different cuisines,
# a higher price range and a stricter minimum rating
user_preferences = {}
user_preferences['Preferred Cuisines'] = ['Burger', 'Italian']
user_preferences['Price Range'] = 3
user_preferences['Minimum Rating'] = 4.5

In [27]:
# Indicator vector aligned with `cuisines_encoded`: 1.0 where the cuisine is
# one of the user's preferred cuisines, 0.0 elsewhere
preferred_cuisines = user_preferences['Preferred Cuisines']
user_cuisine_vector = np.array(
    [1.0 if name in preferred_cuisines else 0.0 for name in cuisines_encoded],
    dtype=float,
)

# Cuisine similarity = dot product of each restaurant's cuisine indicator row
# with the user vector, i.e. the number of preferred cuisines it serves
cuisine_matrix = dt[list(cuisines_encoded)].to_numpy()
dt['Cuisine Similarity'] = cuisine_matrix @ user_cuisine_vector

In [28]:
# Filtering restaurants based on price range and rating
price_matches = dt['Price range'] == user_preferences['Price Range']
rating_ok = dt['Aggregate rating'] >= user_preferences['Minimum Rating']

# .copy() gives an independent frame: the next cell adds a 'Final Score'
# column to it, and that must not be an assignment into a slice of `dt`.
dt_filtered = dt[price_matches & rating_ok].copy()

In [29]:
# Normalizing scores and computing the final score
similarity_max = dt_filtered['Cuisine Similarity'].max()
rating_max = dt_filtered['Aggregate rating'].max()

# Each component is divided by its maximum so that it lies in [0, 1].
# If that maximum is 0 (e.g. no restaurant serves a preferred cuisine) or NaN
# (no rows left after filtering), the division would turn every score into
# NaN and make the ranking meaningless, so the component contributes 0.
similarity_part = (
    dt_filtered['Cuisine Similarity'] / similarity_max if similarity_max > 0 else 0.0
)
rating_part = (
    dt_filtered['Aggregate rating'] / rating_max if rating_max > 0 else 0.0
)

# Equal weighting of cuisine match and rating
dt_filtered['Final Score'] = 0.5 * similarity_part + 0.5 * rating_part

In [30]:
# Rank by final score (highest first) and keep the top five
ranked = dt_filtered.sort_values(by='Final Score', ascending=False)
recommended_restaurants = ranked.head(5)

display_columns = ['Restaurant Name', 'City', 'Final Score']
print(recommended_restaurants[display_columns])


         Restaurant Name            City  Final Score
3658         Owl is Well       New Delhi     0.959184
9404              Solita      Manchester     0.750000
9451             Jarryds       Cape Town     0.739796
47             TT Burger  Rio de Janeiro     0.739796
2483  The Fusion Kitchen          Mumbai     0.729592


#### The recommendation system has been tested using different user preferences, and it has successfully provided personalized restaurant recommendations that match the user's criteria. The system can be further improved by incorporating additional features such as user reviews, vote counts, and location (the aggregate rating is already part of the score) to enhance the recommendation quality.

#### Conclusion: The recommendation system shows that it is possible to provide personalized restaurant recommendations based on user preferences.