# GOURMET GUIDE

### Foundation Project 1: Food Recommendation Model

#### Group 6

* Charanjeet Singh - 12220064
* Pooja Nilesh Doshi - 12220028
* Snigdha Debashis Bhattacharjee - 12220067
* Vinayak Dave - 12220047

In [1]:
#!pip install threadpoolctl==3.1.0
#!pip install yellowbrick
#!pip install pca
#!pip install spacy==3.5.2
#!python -m spacy download en

In [2]:
import json
import string
import spacy
import matplotlib as m
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import numpy as np
import os
import re
import pandas as pd
from pandas import DataFrame
from scipy import stats
from scipy.stats import norm
import seaborn as sns
import threadpoolctl
import warnings
warnings.filterwarnings("ignore")
from pca import pca
from sklearn import datasets
from sklearn.preprocessing import MinMaxScaler
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics.pairwise import linear_kernel
from sklearn.cluster import KMeans

In [3]:
zomato = pd.read_csv(r"C:\Users\Green Lantern\OneDrive\Documents\ISB AMPBA Winter 2023\Term 2\Foundational Project 1\FP1_Project 4_Group6_Final\restaurant dataset\zomato.csv")

#### DATA CLEANING

In [4]:
#Adding Restaurant ID
zomato.insert(0, "rest_id", range(1, len(zomato) + 1))

In [5]:
#Preserving the original dataset
df = zomato.copy()

In [6]:
df.shape

(51717, 18)

In [7]:
df.columns

Index(['rest_id', 'url', 'address', 'name', 'online_order', 'book_table',
       'rate', 'votes', 'phone', 'location', 'rest_type', 'dish_liked',
       'cuisines', 'approx_cost(for two people)', 'reviews_list', 'menu_item',
       'listed_in(type)', 'listed_in(city)'],
      dtype='object')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51717 entries, 0 to 51716
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   rest_id                      51717 non-null  int64 
 1   url                          51717 non-null  object
 2   address                      51717 non-null  object
 3   name                         51717 non-null  object
 4   online_order                 51717 non-null  object
 5   book_table                   51717 non-null  object
 6   rate                         43942 non-null  object
 7   votes                        51717 non-null  int64 
 8   phone                        50509 non-null  object
 9   location                     51696 non-null  object
 10  rest_type                    51490 non-null  object
 11  dish_liked                   23639 non-null  object
 12  cuisines                     51672 non-null  object
 13  approx_cost(for two people)  51

In [9]:
#Deleting rows with null values in 'dish_liked', 'rest_type', 'rate', 'approx_cost(for two people)'
df.dropna(subset=['dish_liked'], how='all', inplace=True)
df.dropna(subset=['rest_type'], how='all', inplace=True)
df.dropna(subset=['rate'], how='all', inplace=True)
df.dropna(subset=['approx_cost(for two people)'], how='all', inplace=True)

In [10]:
#Deleting Unnnecessary Columns
df = df.drop(['address','listed_in(type)', 'listed_in(city)', 'menu_item','phone'],axis=1)

In [11]:
#Removing the Duplicates
df.duplicated().sum()
df.drop_duplicates(inplace=True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23406 entries, 0 to 51715
Data columns (total 13 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   rest_id                      23406 non-null  int64 
 1   url                          23406 non-null  object
 2   name                         23406 non-null  object
 3   online_order                 23406 non-null  object
 4   book_table                   23406 non-null  object
 5   rate                         23406 non-null  object
 6   votes                        23406 non-null  int64 
 7   location                     23406 non-null  object
 8   rest_type                    23406 non-null  object
 9   dish_liked                   23406 non-null  object
 10  cuisines                     23406 non-null  object
 11  approx_cost(for two people)  23406 non-null  object
 12  reviews_list                 23406 non-null  object
dtypes: int64(2), object(11)
memory 

In [13]:
#Changing the column names
df = df.rename(columns={'approx_cost(for two people)':'cost'})
df.columns

Index(['rest_id', 'url', 'name', 'online_order', 'book_table', 'rate', 'votes',
       'location', 'rest_type', 'dish_liked', 'cuisines', 'cost',
       'reviews_list'],
      dtype='object')

In [14]:
#Changing the cost to Float
df['cost'] = df['cost'].astype(str)
df['cost'] = df['cost'].apply(lambda x: x.replace(',','.'))
df['cost'] = df['cost'].astype(float) 

In [15]:
df.head()

Unnamed: 0,rest_id,url,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost,reviews_list
0,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,Yes,Yes,4.1/5,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ..."
1,2,https://www.zomato.com/bangalore/spice-elephan...,Spice Elephant,Yes,No,4.1/5,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din..."
2,3,https://www.zomato.com/SanchurroBangalore?cont...,San Churro Cafe,Yes,No,3.8/5,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ..."
3,4,https://www.zomato.com/bangalore/addhuri-udupi...,Addhuri Udupi Bhojana,No,No,3.7/5,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper..."
4,5,https://www.zomato.com/bangalore/grand-village...,Grand Village,No,No,3.8/5,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ..."


In [16]:
#'rate'
df['rate'].unique()

array(['4.1/5', '3.8/5', '3.7/5', '4.6/5', '4.0/5', '4.2/5', '3.9/5',
       '3.0/5', '3.6/5', '2.8/5', '4.4/5', '3.1/5', '4.3/5', '2.6/5',
       '3.3/5', '3.5/5', '3.8 /5', '3.2/5', '4.5/5', '2.5/5', '2.9/5',
       '3.4/5', '2.7/5', '4.7/5', 'NEW', '2.4/5', '2.2/5', '2.3/5',
       '4.8/5', '3.9 /5', '4.2 /5', '4.0 /5', '4.1 /5', '2.9 /5',
       '2.7 /5', '2.5 /5', '2.6 /5', '4.5 /5', '4.3 /5', '3.7 /5',
       '4.4 /5', '4.9/5', '2.1/5', '2.0/5', '1.8/5', '3.4 /5', '3.6 /5',
       '3.3 /5', '4.6 /5', '4.9 /5', '3.2 /5', '3.0 /5', '2.8 /5',
       '3.5 /5', '3.1 /5', '4.8 /5', '2.3 /5', '4.7 /5', '2.4 /5',
       '2.1 /5', '2.2 /5', '2.0 /5', '1.8 /5'], dtype=object)

In [17]:
#Removing '/5' from 'rate' and dropping 'NEW' restaurants
df['rate'] = df['rate'].str.replace('/5', '')

df = df.loc[df.rate !='NEW']
df = df.loc[df.rate !='-'].reset_index(drop=True)

df['rate'].unique()

array(['4.1', '3.8', '3.7', '4.6', '4.0', '4.2', '3.9', '3.0', '3.6',
       '2.8', '4.4', '3.1', '4.3', '2.6', '3.3', '3.5', '3.8 ', '3.2',
       '4.5', '2.5', '2.9', '3.4', '2.7', '4.7', '2.4', '2.2', '2.3',
       '4.8', '3.9 ', '4.2 ', '4.0 ', '4.1 ', '2.9 ', '2.7 ', '2.5 ',
       '2.6 ', '4.5 ', '4.3 ', '3.7 ', '4.4 ', '4.9', '2.1', '2.0', '1.8',
       '3.4 ', '3.6 ', '3.3 ', '4.6 ', '4.9 ', '3.2 ', '3.0 ', '2.8 ',
       '3.5 ', '3.1 ', '4.8 ', '2.3 ', '4.7 ', '2.4 ', '2.1 ', '2.2 ',
       '2.0 ', '1.8 '], dtype=object)

In [18]:
#Converting 'online_order' and 'book_table'columns from String to Boolean 
df.online_order.replace(('Yes','No'),(True, False),inplace=True)
df.book_table.replace(('Yes','No'),(True, False),inplace=True)

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23259 entries, 0 to 23258
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rest_id       23259 non-null  int64  
 1   url           23259 non-null  object 
 2   name          23259 non-null  object 
 3   online_order  23259 non-null  bool   
 4   book_table    23259 non-null  bool   
 5   rate          23259 non-null  object 
 6   votes         23259 non-null  int64  
 7   location      23259 non-null  object 
 8   rest_type     23259 non-null  object 
 9   dish_liked    23259 non-null  object 
 10  cuisines      23259 non-null  object 
 11  cost          23259 non-null  float64
 12  reviews_list  23259 non-null  object 
dtypes: bool(2), float64(1), int64(2), object(8)
memory usage: 2.0+ MB


In [20]:
df.head()

Unnamed: 0,rest_id,url,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost,reviews_list
0,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...","North Indian, Mughlai, Chinese",800.0,"[('Rated 4.0', 'RATED\n A beautiful place to ..."
1,2,https://www.zomato.com/bangalore/spice-elephan...,Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...","Chinese, North Indian, Thai",800.0,"[('Rated 4.0', 'RATED\n Had been here for din..."
2,3,https://www.zomato.com/SanchurroBangalore?cont...,San Churro Cafe,True,False,3.8,918,Banashankari,"Cafe, Casual Dining","Churros, Cannelloni, Minestrone Soup, Hot Choc...","Cafe, Mexican, Italian",800.0,"[('Rated 3.0', ""RATED\n Ambience is not that ..."
3,4,https://www.zomato.com/bangalore/addhuri-udupi...,Addhuri Udupi Bhojana,False,False,3.7,88,Banashankari,Quick Bites,Masala Dosa,"South Indian, North Indian",300.0,"[('Rated 4.0', ""RATED\n Great food and proper..."
4,5,https://www.zomato.com/bangalore/grand-village...,Grand Village,False,False,3.8,166,Basavanagudi,Casual Dining,"Panipuri, Gol Gappe","North Indian, Rajasthani",600.0,"[('Rated 4.0', 'RATED\n Very good restaurant ..."


In [21]:
df.to_excel(r"C:\Users\Green Lantern\OneDrive\Documents\ISB AMPBA Winter 2023\Term 2\Foundational Project 1\FP1_Project 4_Group6_Final\restaurant dataset\zomato1.xlsx")

#### INITIAL TEXT PROCESSING

In [22]:
nlp = spacy.load('en_core_web_sm')

Name

In [23]:
#Review sample data
df[['name']].head(10)

Unnamed: 0,name
0,Jalsa
1,Spice Elephant
2,San Churro Cafe
3,Addhuri Udupi Bhojana
4,Grand Village
5,Timepass Dinner
6,Onesta
7,Penthouse Cafe
8,Smacznego
9,CafÃÂÃÂÃÂÃÂÃÂÃÂÃÂÃÂ© Down The A...


In [24]:
#Clean the names and capitalize the first letter
df['name'] = df['name'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()).strip().title())
df[['name']].head(10)

Unnamed: 0,name
0,Jalsa
1,Spice Elephant
2,San Churro Cafe
3,Addhuri Udupi Bhojana
4,Grand Village
5,Timepass Dinner
6,Onesta
7,Penthouse Cafe
8,Smacznego
9,Cafãâãâãâãâãâãâãâãâ Down The Alley


Reviews

In [25]:
#Review sample data
df[['reviews_list']].head(10)

Unnamed: 0,reviews_list
0,"[('Rated 4.0', 'RATED\n A beautiful place to ..."
1,"[('Rated 4.0', 'RATED\n Had been here for din..."
2,"[('Rated 3.0', ""RATED\n Ambience is not that ..."
3,"[('Rated 4.0', ""RATED\n Great food and proper..."
4,"[('Rated 4.0', 'RATED\n Very good restaurant ..."
5,"[('Rated 3.0', 'RATED\n Food 3/5\nAmbience 3/..."
6,"[('Rated 5.0', 'RATED\n I personally really l..."
7,"[('Rated 3.0', ""RATED\n I had been to this pl..."
8,"[('Rated 4.0', ""RATED\n Easy to locate\nVFM 3..."
9,"[('Rated 4.0', 'RATED\n We ended up here on a..."


In [26]:
#Lower Casing of Reviews
df["reviews_list"] = df["reviews_list"].str.lower()
df[['reviews_list']].head(10)

Unnamed: 0,reviews_list
0,"[('rated 4.0', 'rated\n a beautiful place to ..."
1,"[('rated 4.0', 'rated\n had been here for din..."
2,"[('rated 3.0', ""rated\n ambience is not that ..."
3,"[('rated 4.0', ""rated\n great food and proper..."
4,"[('rated 4.0', 'rated\n very good restaurant ..."
5,"[('rated 3.0', 'rated\n food 3/5\nambience 3/..."
6,"[('rated 5.0', 'rated\n i personally really l..."
7,"[('rated 3.0', ""rated\n i had been to this pl..."
8,"[('rated 4.0', ""rated\n easy to locate\nvfm 3..."
9,"[('rated 4.0', 'rated\n we ended up here on a..."


In [27]:
#Removal of Puctuations in Reviews
def without_punctuation(text):
    # Remove punctuation using regular expressions
    text_without_punct = re.sub(r'[^\w\s]', '', text)
    return text_without_punct

df['reviews_list'] = df['reviews_list'].apply(without_punctuation)
df[['reviews_list']].head(10)

Unnamed: 0,reviews_list
0,rated 40 ratedn a beautiful place to dine int...
1,rated 40 ratedn had been here for dinner with...
2,rated 30 ratedn ambience is not that good eno...
3,rated 40 ratedn great food and proper karnata...
4,rated 40 ratedn very good restaurant in neigh...
5,rated 30 ratedn food 35nambience 35nservice 3...
6,rated 50 ratedn i personally really liked thi...
7,rated 30 ratedn i had been to this place with...
8,rated 40 ratedn easy to locatenvfm 355ntaste ...
9,rated 40 ratedn we ended up here on a saturda...


In [28]:
#Removal of Stopwords in Reviews
stop_words = set(stopwords.words('english'))

def without_stopwords(text):
    # Remove "rated number" pattern
    text = re.sub(r'rated \d+ ratedn', '', text)
    # Remove "ratedn" pattern
    text = re.sub(r'ratedn', '', text)
    
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return ' '.join(filtered_tokens)

df['reviews_list'] = df['reviews_list'].apply(without_stopwords)
df[['reviews_list']].head(10)

Unnamed: 0,reviews_list
0,beautiful place dine inthe interiors take back...
1,dinner family turned good choose suitable ages...
2,ambience good enough pocket friendly cafe quan...
3,great food proper karnataka style full meals t...
4,good restaurant neighbourhood buffet system pr...
5,food 35nambience 35nservice 35nnhad family lun...
6,personally really liked place ambience rooftop...
7,place one friends small place decent nice plac...
8,easy locatenvfm 355ntaste 55nyummy cheesyyy fr...
9,ended saturday afternoon hectic daynthe food g...


Cuisines

In [29]:
#Review sample data
df[['cuisines']].head(10)

Unnamed: 0,cuisines
0,"North Indian, Mughlai, Chinese"
1,"Chinese, North Indian, Thai"
2,"Cafe, Mexican, Italian"
3,"South Indian, North Indian"
4,"North Indian, Rajasthani"
5,North Indian
6,"Pizza, Cafe, Italian"
7,"Cafe, Italian, Continental"
8,"Cafe, Mexican, Italian, Momos, Beverages"
9,Cafe


In [30]:
#Split the cuisines
df['cuisines'] = df['cuisines'].str.split(', ')

#Explode the list of cuisines into separate rows
df = df.explode('cuisines')

#Reset the index
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,rest_id,url,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost,reviews_list
0,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",North Indian,800.0,beautiful place dine inthe interiors take back...
1,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",Mughlai,800.0,beautiful place dine inthe interiors take back...
2,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",Chinese,800.0,beautiful place dine inthe interiors take back...
3,2,https://www.zomato.com/bangalore/spice-elephan...,Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",Chinese,800.0,dinner family turned good choose suitable ages...
4,2,https://www.zomato.com/bangalore/spice-elephan...,Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",North Indian,800.0,dinner family turned good choose suitable ages...


Restaurant Type

In [31]:
#Review sample data
df[['rest_type']].head(10)

Unnamed: 0,rest_type
0,Casual Dining
1,Casual Dining
2,Casual Dining
3,Casual Dining
4,Casual Dining
5,Casual Dining
6,"Cafe, Casual Dining"
7,"Cafe, Casual Dining"
8,"Cafe, Casual Dining"
9,Quick Bites


In [32]:
#Split the restaurant types
df['rest_type'] = df['rest_type'].str.split(', ')

#Explode the list of restaurant types into separate rows
df = df.explode('rest_type')

#Reset the index
df = df.reset_index(drop=True)

df.head()

Unnamed: 0,rest_id,url,name,online_order,book_table,rate,votes,location,rest_type,dish_liked,cuisines,cost,reviews_list
0,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",North Indian,800.0,beautiful place dine inthe interiors take back...
1,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",Mughlai,800.0,beautiful place dine inthe interiors take back...
2,1,https://www.zomato.com/bangalore/jalsa-banasha...,Jalsa,True,True,4.1,775,Banashankari,Casual Dining,"Pasta, Lunch Buffet, Masala Papad, Paneer Laja...",Chinese,800.0,beautiful place dine inthe interiors take back...
3,2,https://www.zomato.com/bangalore/spice-elephan...,Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",Chinese,800.0,dinner family turned good choose suitable ages...
4,2,https://www.zomato.com/bangalore/spice-elephan...,Spice Elephant,True,False,4.1,787,Banashankari,Casual Dining,"Momos, Lunch Buffet, Chocolate Nirvana, Thai G...",North Indian,800.0,dinner family turned good choose suitable ages...


Cost

In [33]:
#Filter the dataframe to include restaurants with cost less than 10
cost_lessthan_100 = df[df['cost'] <= 100]
cost_lessthan_100.shape

(27414, 13)

In [34]:
#Updating the cost of restaurants with cost less than 10 to 50
df.loc[df['cost'] <= 100, 'cost'] = 100

We are updating the cost of restaurants that are currently listed as less than Rs.100 because such low cost values seem unrealistic. Therefore, we are setting the starting minimum cost to Rs.100. By making this change, we aim to ensure that the cost values in the dataset better reflect the current pricing trends.

In [35]:
#Extracting the 'cost' column as a numpy array
cost_values = df['cost'].values.reshape(-1, 1)

In [36]:
#Defining the number of clusters
num_clusters = 3

In [37]:
#Performing K-means clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(cost_values)

KMeans(n_clusters=3, random_state=42)

In [38]:
#Retrieving the cluster labels
cluster_labels = kmeans.labels_

In [39]:
#Adding the cluster labels to the dataframe
df['cost_cluster'] = cluster_labels

In [40]:
#Retrieving the unique cluster labels
unique_clusters = df['cost_cluster'].unique()

#Printing the unique cluster labels and their counts
for cluster in unique_clusters:
    cluster_count = len(df[df['cost_cluster'] == cluster])
    print(f"Cluster {cluster}: {cluster_count} restaurants")

Cluster 1: 24441 restaurants
Cluster 2: 24300 restaurants
Cluster 0: 32102 restaurants


In [41]:
#Iterating over the unique clusters
for cluster in unique_clusters:
    cluster_data = df[df['cost_cluster'] == cluster]
    
    #Retrieving the minimum and maximum cost values
    min_cost = cluster_data['cost'].min()
    max_cost = cluster_data['cost'].max()
    
    #Printing the cost range for the current cluster
    print(f"Cluster {cluster}: Rs.{min_cost} to Rs.{max_cost}")

Cluster 1: Rs.600.0 to Rs.950.0
Cluster 2: Rs.300.0 to Rs.550.0
Cluster 0: Rs.100.0 to Rs.250.0


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80843 entries, 0 to 80842
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   rest_id       80843 non-null  int64  
 1   url           80843 non-null  object 
 2   name          80843 non-null  object 
 3   online_order  80843 non-null  bool   
 4   book_table    80843 non-null  bool   
 5   rate          80843 non-null  object 
 6   votes         80843 non-null  int64  
 7   location      80843 non-null  object 
 8   rest_type     80843 non-null  object 
 9   dish_liked    80843 non-null  object 
 10  cuisines      80843 non-null  object 
 11  cost          80843 non-null  float64
 12  reviews_list  80843 non-null  object 
 13  cost_cluster  80843 non-null  int32  
dtypes: bool(2), float64(1), int32(1), int64(2), object(8)
memory usage: 7.2+ MB


In [43]:
#Extract a random 15% of the data
sample_df = df.sample(frac=0.15, random_state=42)

In [44]:
sample_df.to_excel("C:/Users/Green Lantern/OneDrive/Documents/ISB AMPBA Winter 2023/Term 2/Foundational Project 1/FP1_Project 4_Group6_Final/restaurant dataset/zomato2.xlsx")

#### SPLITTING DATA INTO TRAINING AND TEST DATA

In [45]:
from sklearn.model_selection import train_test_split
train_df_0,test_df_0 = train_test_split(sample_df,test_size = 0.22,random_state = 50)

In [46]:
train_df_0.shape

(9458, 14)

In [47]:
test_df_0.shape

(2668, 14)

In [48]:
#Preserving the original datasets
train_df = train_df_0.copy()
test_df = test_df_0.copy()

In [49]:
train_df.to_excel("C:/Users/Green Lantern/OneDrive/Documents/ISB AMPBA Winter 2023/Term 2/Foundational Project 1/FP1_Project 4_Group6_Final/restaurant dataset/zomato_train_df.xlsx")

In [50]:
test_df.to_excel("C:/Users/Green Lantern/OneDrive/Documents/ISB AMPBA Winter 2023/Term 2/Foundational Project 1/FP1_Project 4_Group6_Final/restaurant dataset/zomato_test_df.xlsx")