In [1]:
# Make Google Drive visible to this Colab runtime so the dataset stored
# there can be read through the local filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
import sys
import csv

# Raise the csv module's per-field size cap to the largest integer the
# platform supports, so that very long text fields are not rejected
# when the file is parsed with the Python engine.
# The call returns the previous limit, which the notebook displays.
largest_field_size = sys.maxsize
csv.field_size_limit(largest_field_size)

131072

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Loading the Zomato dataset from the mounted Drive folder.
# The Python parsing engine is used and malformed rows are skipped
# instead of aborting the whole read.
zomato_csv_path = '/content/drive/MyDrive/zomato.csv'
df = pd.read_csv(
    zomato_csv_path,
    encoding='utf-8',
    engine='python',
    on_bad_lines='skip',
)

In [4]:
# Dropping unnecessary columns
df = df.drop(columns=['url', 'phone', 'dish_liked', 'menu_item', 'listed_in(city)'])

# Renaming columns for better readability and consistency
df = df.rename(columns={'approx_cost(for two people)': 'cost',
                        'listed_in(type)': 'type'})

# Capitalizing the first letter of each word in the 'name' column.
# The vectorized .str accessor leaves missing names as NaN instead of
# raising AttributeError the way a plain x.title() call would.
df['name'] = df['name'].str.title()

# Converting 'online_order' and 'book_table' columns to Boolean True/False.
# The result is assigned back to the DataFrame: calling
# replace(..., inplace=True) on an attribute-accessed column only mutates
# a temporary Series once pandas Copy-on-Write is active, leaving df unchanged.
yes_no_to_bool = {'Yes': True, 'No': False}
for flag_column in ('online_order', 'book_table'):
    df[flag_column] = df[flag_column].replace(yes_no_to_bool)

# Cleaning and converting the 'cost' column to a float type.
# The comma is treated as a thousands separator ('1,200' -> 1200.0) and is
# therefore removed; turning it into a decimal point would shrink such
# values to 1.2 while comma-free values like '800' stayed at 800.0.
# Missing costs become the string 'nan', which float conversion maps to NaN.
df['cost'] = (
    df['cost']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(float)
)

# Removing rows where 'rate' is 'NEW' or '-' (rows with a missing rate are kept)
df = df.loc[~df['rate'].isin(['NEW', '-'])].reset_index(drop=True)


# Removing the '/5' part from the 'rate' column and converting it to float
def remove_slash(x):
    """Strip the '/5' suffix from a rating string; pass non-strings through."""
    return x.replace('/5', '') if isinstance(x, str) else x


df['rate'] = df['rate'].apply(remove_slash).str.strip().astype('float')

In [5]:
### Content-Based Recommendation System Using TF-IDF Vectorization & Cosine Similarity

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Creating a TF-IDF Vectorizer, but ignoring English stop words
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Transforming the 'reviews_list' column of the DataFrame into a
# TF-IDF-weighted term-document matrix.
# Missing reviews are replaced by an empty string first, because the
# vectorizer raises ValueError when it meets a NaN document.
review_texts = df['reviews_list'].fillna('')
tfidf_matrix = tfidf_vectorizer.fit_transform(review_texts)

# Calculating the cosine similarity matrix from the TF-IDF matrix.
# TfidfVectorizer L2-normalizes each row by default, so the plain dot
# product computed by linear_kernel equals the cosine similarity.
# NOTE: the result is a dense (n_rows x n_rows) array, so memory use grows
# quadratically with the number of restaurants.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

def get_recommendations(df, title, similarity=None, top_n=5):
    """Return the names of the restaurants most similar to ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'name' column whose row order matches the rows of the
        similarity matrix. Rows are addressed by position, so the index
        labels of ``df`` do not matter.
    title : str
        Exact restaurant name to look up. When several rows share the name,
        the first one is used as the query row.
    similarity : 2-D array-like, optional
        Pairwise similarity matrix (dense, indexable by row position).
        Defaults to the notebook-level ``cosine_sim``.
    top_n : int, optional
        Maximum number of distinct names to return (default 5).

    Returns
    -------
    list
        Up to ``top_n`` distinct restaurant names, most similar first. The
        queried name itself is never included. Ties keep row order.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``name == title``.
    ValueError
        If the similarity row length differs from the number of rows in ``df``.
    """
    if similarity is None:
        # Fall back to the matrix built earlier in the notebook.
        similarity = cosine_sim

    names = df['name'].to_numpy()

    # Positional lookup: the matrix rows and names are both addressed by
    # position, so index labels must not be used here.
    match_positions = np.flatnonzero((df['name'] == title).to_numpy())
    if match_positions.size == 0:
        raise IndexError(f"No restaurant named {title!r} found in the DataFrame")

    scores = np.asarray(similarity[match_positions[0]], dtype=float).ravel()
    if scores.shape[0] != len(names):
        raise ValueError(
            f"Similarity row has {scores.shape[0]} entries but the DataFrame "
            f"has {len(names)} rows; they must describe the same restaurants"
        )

    # Descending by score; the stable sort keeps row order among equal
    # scores, matching sorted(..., reverse=True) on (position, score) pairs.
    ranked_positions = np.argsort(-scores, kind='stable')

    recommended_restaurants = []
    # Seeding with the title excludes every row of the queried restaurant.
    seen_names = {title}
    for position in ranked_positions:
        if len(recommended_restaurants) >= top_n:
            break
        candidate = names[position]
        # Ensuring no duplicate recommendations
        if candidate not in seen_names:
            seen_names.add(candidate)
            recommended_restaurants.append(candidate)

    return recommended_restaurants

# Querying against a copy of the cleaned DataFrame
df_copied = df.copy()

# Scenario: Getting recommendations for the restaurant 'Spice Elephant'
query_title = 'Spice Elephant'
recommended_restaurants = get_recommendations(df_copied, title=query_title)

# Displaying the recommended names as the cell output
recommended_restaurants

['Atithi', 'Flavours - Octave Hotel & Spa', 'Cinnamon', 'Paprica', 'Tamarind']