In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import sys
import csv

# Set the maximum field size allowed in CSV
# to the maximum size supported by the system
csv.field_size_limit(sys.maxsize)

131072

In [3]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Reading the 'zomato.csv' file
df = pd.read_csv('/content/drive/MyDrive/zomato.csv',
                 engine='python', on_bad_lines='skip', encoding='utf-8')

In [4]:
# Dropping unnecessary columns
df = df.drop(['url', 'phone', 'dish_liked', 'menu_item', 'listed_in(city)'], axis=1)

# Renaming columns for better readability and consistency
df = df.rename(columns={'approx_cost(for two people)':'cost',
                        'listed_in(type)':'type'})

# Capitalizing the first letter of each word in the 'name' column
df.name = df.name.apply(lambda x:x.title())

# Converting 'online_order' and 'book_table' columns to Boolean True/False
df.online_order.replace(('Yes','No'),(True, False),inplace=True)
df.book_table.replace(('Yes','No'),(True, False),inplace=True)

# Cleaning and converting the 'cost' column to a float type
df['cost'] = df['cost'].astype(str)
df['cost'] = df['cost'].apply(lambda x: x.replace(',','.'))
df['cost'] = df['cost'].astype(float)

# Removing rows where 'rate' is 'NEW' or '-'
df = df.loc[df.rate !='NEW']
df = df.loc[df.rate !='-'].reset_index(drop=True)

# Removing the '/5' part from the 'rate' column and converting it to float
remove_slash = lambda x: x.replace('/5', '') if isinstance(x, str) else x
df.rate = df.rate.apply(remove_slash).str.strip().astype('float')

In [5]:
### User-Based Collaborative Filtering Recommendations Using Cosine Similarity of Rating Patterns

from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Counting the number of ratings in each 'reviews_list' entry
array_sizes = df['reviews_list'].str.count(r'Rated \d\.\d')

# Filtering the DataFrame for restaurants
# with 50 or more ratings (Assuming for scenario)
df_filtered = df[array_sizes >= 50]

# Extracting individual ratings and converting them to floats
ratings_list = df_filtered['reviews_list'].str.findall(r'Rated (\d\.\d)').explode().dropna()
ratings_list = ratings_list.astype(float)

# Creating an array of user IDs, with each ID repeated
# according to the number of ratings they've given
user_ids = np.repeat(np.arange(len(df_filtered)), df_filtered['reviews_list'].str.count(r'Rated \d\.\d')).astype(int)

# Limiting each user's ratings
# to their first 50 reviews (Assuming for scenario)
ratings_list = ratings_list.groupby(user_ids).head(50)
user_ids = np.repeat(np.arange(len(df_filtered)), 50)

# DF - user IDs, restaurant IDs, and ratings
ratings_df = pd.DataFrame({
    'user_id': user_ids,
    'restaurant_id': np.tile(np.arange(len(df_filtered)), ratings_list.groupby(user_ids).size().max()),
    'rating': ratings_list.values
}).dropna()

# Creating a pivot table for user-item (restaurant) interactions
ratings_pivot = ratings_df.pivot_table(index='user_id', columns='restaurant_id', values='rating', fill_value=0)

# Converting the pivot table into a sparse matrix
ratings_matrix = csr_matrix(ratings_pivot.values)

# Calculating the cosine similarity
# between users based on their rating patterns
user_similarity = cosine_similarity(ratings_matrix)

In [6]:
restaurant_names = df['name'].tolist()

def recommend_items(user_id, user_similarity, ratings_pivot, top_n=5):
    # Sorting other users based on similarity to the target user
    # and excluding the target user itself
    similar_users = user_similarity[user_id].argsort()[::-1]
    similar_users = similar_users[1:]

    # Predicting ratings for items by dot product of the similarity vector and the ratings matrix
    recommendations = np.dot(user_similarity[user_id, :].reshape(1, -1), ratings_pivot.values).reshape(-1)

    # Identifying items already rated by the user (don't need ~)
    already_rated = np.nonzero(ratings_pivot.iloc[user_id].to_numpy())[0]

    # Setting recommendations for already rated items to 0 to exclude them
    recommendations[already_rated] = 0

    # Getting top N recommendations, ignoring already rated items
    recommendation_ids = recommendations.argsort()[::-1][:top_n]

    # Converting restaurant indices to names
    recommended_item_names = [restaurant_names[i] for i in recommendation_ids]

    return recommended_item_names

# Scenario: Recommending items for user with ID 0
recommended_items = recommend_items(0, user_similarity, ratings_pivot)
print("Recommended items for user 0:", recommended_items)

Recommended items for user 0: ['Shree Cool Point', 'Corner House Ice Cream', 'Petoo', "Pizza Baker'S", 'Kanti Sweets']
