In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Location of the raw Zomato CSV, relative to the notebook's working directory.
dataset_filepath = './Dataset/zomato.csv'
# Read the whole file into `df`; the cells below clean and extend this frame.
df = pd.read_csv(dataset_filepath)
# Uncomment to preview the first rows:
#df.head()

In [3]:
# Show the column names of the loaded frame as a NumPy array.
df.columns.values

array(['url', 'address', 'name', 'online_order', 'book_table', 'rate',
       'votes', 'phone', 'location', 'rest_type', 'dish_liked',
       'cuisines', 'approx_cost(for two people)', 'reviews_list',
       'menu_item', 'listed_in(type)', 'listed_in(city)'], dtype=object)

In [4]:
# Feature engineering on the cost column.
#
# Inputs : `df` as loaded from the CSV.
# Outputs: `df` (cleaned, de-duplicated, with mean/delta cost columns) plus the
#          lookup tables `mean_cost_location`, `mean_cost_rest_type` and
#          `mean_rest_type_and_loc`, which later cells inspect.

COST_COL = 'approx_cost(for two people)'


def _mean_cost_table(frame, keys):
    """Average cost for two per group.

    Parameters
    ----------
    frame : pd.DataFrame
        Restaurant table containing `COST_COL` and the grouping column(s).
    keys : str or list of str
        Column name(s) to group by.

    Returns
    -------
    pd.DataFrame
        One row per group: the key column(s) plus a 'mean_cost' column.
    """
    return (
        frame.groupby(keys, as_index=False)[COST_COL]
        .mean()
        .rename(columns={COST_COL: 'mean_cost'})
    )


def _lookup_group_mean(frame, key, table):
    """Return, for every row of `frame`, the 'mean_cost' of that row's group.

    Parameters
    ----------
    frame : pd.DataFrame
        Restaurant table; must contain the column `key`.
    key : str
        Single grouping column shared by `frame` and `table`.
    table : pd.DataFrame
        Lookup table produced by `_mean_cost_table(frame, key)`.

    Returns
    -------
    pd.Series
        Float series aligned with `frame.index`.
    """
    per_group = table.set_index(key)['mean_cost']
    # Rows whose key is missing have no group (groupby drops NaN keys). They
    # keep the 0.0 placeholder the original loop-based version left behind.
    # NOTE(review): NaN may be the more honest value for those rows - confirm
    # before the normalisation step is written.
    return frame[key].map(per_group).fillna(0.0)


print(df.shape)
# Drop rows from the dataset with NaN in the cost column.
# .copy() makes `df` an independent frame so the column writes below are
# unambiguous (no SettingWithCopy situation).
df = df.dropna(subset=[COST_COL]).copy()
print(df.shape)
# Remove the thousands separators ("1,200" -> "1200") and convert to int.
# astype(str) first keeps the cell re-runnable once the column is already int.
df[COST_COL] = (
    df[COST_COL].astype(str).str.replace(",", "", regex=False).astype(int)
)
# Drop duplicate restaurant entries based on name and address column
df = df.drop_duplicates(subset=['name', 'address'], keep='first').copy()
print(df.shape)

# Pending: Encode info about whether a restaurant is a chain or not.

# Mean cost per location, i.e. average cost of all restaurants in a given area,
# broadcast back onto every restaurant row with one vectorised lookup.
# (The previous `df[col][indices] = value` form was chained assignment: it
# writes through a temporary Series and does nothing under copy-on-write.)
mean_cost_location = _mean_cost_table(df, 'location')
locations = mean_cost_location['location'].tolist()
df['mean_cost_by_location'] = _lookup_group_mean(df, 'location', mean_cost_location)

# Deviation of each restaurant's cost for two from its location's mean
df['delta_mean_cost_by_location'] = df[COST_COL] - df['mean_cost_by_location']

# Mean cost per restaurant type, broadcast the same way
mean_cost_rest_type = _mean_cost_table(df, 'rest_type')
restaurant_types = mean_cost_rest_type['rest_type'].tolist()
df['mean_cost_by_rest_type'] = _lookup_group_mean(df, 'rest_type', mean_cost_rest_type)

# Deviation between cost of the restaurant and its rest_type's mean
df['delta_mean_cost_by_rest_type'] = df[COST_COL] - df['mean_cost_by_rest_type']

# Mean cost per rest_type, given the location. Only the lookup table is built
# here; the per-row column stays a 0.0 placeholder until the pending step below
# is implemented.
mean_rest_type_and_loc = _mean_cost_table(df, ['rest_type', 'location'])
df['mean_cost_by_rest_type_and_loc'] = 0.0


# Pending: Calculate deviation between cost of the restaurant and corresponding mean_rest_type_and_location

# Pending: One-Hot-Encode the "listed_in(type)" column

# Pending: Normalize all columns

(51717, 17)
(51371, 17)
(12444, 17)


In [5]:
# Drop the raw columns that none of the later cells in this notebook read.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the columns are already gone.
columns_to_drop = [
    'url', 'address', 'phone', 'reviews_list', 'menu_item', 'cuisines',
    'online_order', 'book_table', 'rate', 'votes', 'dish_liked',
]
df = df.drop(columns=columns_to_drop, errors='ignore')
# Preview a window of rows by position (the index has gaps after the
# de-duplication, so positions and labels differ).
df.iloc[8980:9000]

Unnamed: 0,name,location,rest_type,approx_cost(for two people),listed_in(type),listed_in(city),mean_cost_by_location,delta_mean_cost_by_location,mean_cost_by_rest_type,delta_mean_cost_by_rest_type
23930,Leon's Take Away,Banaswadi,"Takeaway, Delivery",450,Delivery,Kalyan Nagar,378.04878,71.95122,379.703356,70.296644
23931,Nandhini,Banaswadi,"Casual Dining, Bar",600,Delivery,Kalyan Nagar,378.04878,221.95122,1149.019608,-549.019608
23932,Thalis Of India,Hennur,Delivery,500,Delivery,Kalyan Nagar,388.28125,111.71875,435.623236,64.376764
23933,Pizza Hut,Nagawara,Quick Bites,750,Delivery,Kalyan Nagar,428.846154,321.153846,310.155626,439.844374
23934,Donne Biryani Kuteera,Banaswadi,Quick Bites,200,Delivery,Kalyan Nagar,378.04878,-178.04878,310.155626,-110.155626
23935,Marwari Basa,Nagawara,Quick Bites,300,Delivery,Kalyan Nagar,428.846154,-128.846154,310.155626,-10.155626
23936,Krispy Kreme,Kalyan Nagar,Dessert Parlor,300,Delivery,Kalyan Nagar,525.495868,-225.495868,320.445633,-20.445633
23937,Belgian Waffle Factory,Kalyan Nagar,Dessert Parlor,400,Delivery,Kalyan Nagar,525.495868,-125.495868,320.445633,79.554367
23938,Subway,Kalyan Nagar,Quick Bites,600,Delivery,Kalyan Nagar,525.495868,74.504132,310.155626,289.844374
23939,New Krishna Sagar,HBR Layout,Casual Dining,400,Delivery,Kalyan Nagar,380.0,20.0,753.526505,-353.526505


In [156]:
# Average cost for two per location (one row per location).
# A stray leading "." made this line a SyntaxError; it has been removed.
cost_locs = df.groupby('location', as_index=False)['approx_cost(for two people)'].mean()

In [161]:
# (rows, columns) of the per-location mean table: one row per distinct location.
cost_locs.shape

(93, 2)

In [162]:
# Average cost for two per restaurant type; `rest_type` stays a regular column.
grouped_by_rest_type = df.groupby('rest_type', as_index=False)
cost_rest_type = grouped_by_rest_type['approx_cost(for two people)'].mean()

In [169]:
# Average cost for two for every (rest_type, location) pair present in `df`.
grouped_by_type_and_loc = df.groupby(['rest_type', 'location'], as_index=False)
type_and_loc = grouped_by_type_and_loc['approx_cost(for two people)'].mean()

In [170]:
# (rows, columns) of the (rest_type, location) mean table.
type_and_loc.shape

(1410, 3)

In [173]:
# Show every row of the pair table from position 900 to the end.
type_and_loc.iloc[900:]

Unnamed: 0,rest_type,location,approx_cost(for two people)
900,"Food Court, Quick Bites",Brigade Road,400.000000
901,"Food Court, Quick Bites",CV Raman Nagar,400.000000
902,"Food Court, Quick Bites",Domlur,400.000000
903,"Food Court, Quick Bites",Electronic City,500.000000
904,"Food Court, Quick Bites",Koramangala 6th Block,325.000000
905,"Food Court, Quick Bites",Koramangala 7th Block,400.000000
906,"Food Court, Quick Bites",Malleshwaram,400.000000
907,"Food Court, Quick Bites",Marathahalli,466.666667
908,"Food Court, Quick Bites",Whitefield,516.666667
909,"Food Court, Quick Bites",Yeshwantpur,300.000000


In [26]:
# Peek at the first five rows of the per-location mean table.
mean_cost_location.head(5)

Unnamed: 0,location,mean_cost_location
0,BTM,378.982118
1,Banashankari,380.101695
2,Banaswadi,378.04878
3,Bannerghatta Road,423.340381
4,Basavanagudi,339.620853


In [27]:
# First ten entries of the `location` column.
df['location'].head(10)

0    Banashankari
1    Banashankari
2    Banashankari
3    Banashankari
4    Basavanagudi
5    Basavanagudi
6     Mysore Road
7    Banashankari
8    Banashankari
9    Banashankari
Name: location, dtype: object

In [64]:
# Small hand-made frame for trying out the "broadcast the group mean onto each
# row" logic before applying it to the real data.
df_t = pd.DataFrame(data=['BTM','HSR','Brigade','HSR','BTM','BTM'], columns=['location'])
df_t['cost'] = [200,500,600,450,150,300]
# Float placeholder (0.0, not 0) so the column already has the dtype of the
# mean costs written into it later; the real feature cell uses 0.0 as well.
df_t['mean_cost'] = 0.0
df_t

Unnamed: 0,location,cost,mean_cost
0,BTM,200,0
1,HSR,500,0
2,Brigade,600,0
3,HSR,450,0
4,BTM,150,0
5,BTM,300,0


In [65]:
# Average `cost` per location in the toy frame; `location` stays a column.
# NOTE(review): this rebinds `mean_cost_location`, the same name the feature
# engineering cell uses for the real per-location table, so whichever cell ran
# last decides what the name refers to.
mean_cost_location = df_t.groupby('location', as_index=False)['cost'].mean()
mean_cost_location

Unnamed: 0,location,cost
0,BTM,216.666667
1,Brigade,600.0
2,HSR,475.0


In [66]:
# Distinct locations, taken from the group-mean table, as a plain Python list.
locations = list(mean_cost_location['location'])
locations

['BTM', 'Brigade', 'HSR']

In [67]:
# Broadcast each location's mean cost onto the matching rows of the toy frame.
# The column is rebuilt with one vectorised lookup instead of the loop that
# wrote through `df_t['mean_cost'][indices] = ...`. That form is chained
# assignment: it goes through a temporary Series, is not guaranteed to reach
# `df_t`, and is a no-op under pandas copy-on-write.
# Replacing the whole column also avoids any dtype clash with the placeholder.
mean_by_location = mean_cost_location.set_index('location')['cost']
df_t['mean_cost'] = df_t['location'].map(mean_by_location)
df_t

Unnamed: 0,location,cost,mean_cost
0,BTM,200,216.666667
1,HSR,500,475.0
2,Brigade,600,600.0
3,HSR,450,475.0
4,BTM,150,216.666667
5,BTM,300,216.666667


In [68]:
# Signed distance of every row's cost from its location's mean cost.
cost_minus_mean = df_t['cost'].sub(df_t['mean_cost'])
df_t['delta_mean_cost_location'] = cost_minus_mean
df_t

Unnamed: 0,location,cost,mean_cost,delta_mean_cost_location
0,BTM,200,216.666667,-16.666667
1,HSR,500,475.0,25.0
2,Brigade,600,600.0,0.0
3,HSR,450,475.0,-25.0
4,BTM,150,216.666667,-66.666667
5,BTM,300,216.666667,83.333333


In [45]:
# Pull the single mean cost stored for 'Brigade' out of the lookup table.
# .tolist() converts to a native Python value before indexing.
is_brigade = mean_cost_location['location'] == 'Brigade'
val = mean_cost_location.loc[is_brigade, 'cost'].tolist()[0]

In [46]:
# Check which type the extracted scalar has.
type(val)

float

In [47]:
# Display the extracted value itself.
val

600.0

In [73]:
# Peek at the first ten rows of the per-rest_type mean table.
mean_cost_rest_type.head(10)

Unnamed: 0,rest_type,approx_cost(for two people)
0,Bakery,375.37037
1,"Bakery, Beverage Shop",166.666667
2,"Bakery, Cafe",546.428571
3,"Bakery, Dessert Parlor",358.571429
4,"Bakery, Food Court",500.0
5,"Bakery, Kiosk",600.0
6,"Bakery, Quick Bites",383.58209
7,"Bakery, Sweet Shop",200.0
8,Bar,1263.157895
9,"Bar, Cafe",1000.0


In [23]:
# Show only the last row of the (rest_type, location) mean table.
mean_rest_type_and_loc.tail(1)

Unnamed: 0,rest_type,location,approx_cost(for two people)
1409,"Takeaway, Delivery",Wilson Garden,400.0


In [26]:
# Walk every row of the (rest_type, location) mean table as a namedtuple
# (index=False leaves the frame index out of the tuple).
# `row` is still bound after the loop and then holds the last row, which is
# what gets printed; `val` ends up bound to that same tuple.
# NOTE(review): with an empty table the loop body never runs, so `row` would be
# unbound here (or left over from an earlier execution).
for row in mean_rest_type_and_loc.itertuples(index=False):
    val = row
print(row)

Pandas(rest_type='Takeaway, Delivery', location='Wilson Garden', _2=400.0)


In [33]:
# Type of the third field of `row`, the tuple left bound by the itertuples loop.
type(row[2])

float

In [35]:
# Index labels of all rows that are a 'Bakery' located in 'BTM'.
is_bakery = df['rest_type'] == 'Bakery'
in_btm = df['location'] == 'BTM'
bakeries = df.index[is_bakery & in_btm].tolist()

In [36]:
# Display the collected index labels.
bakeries

[1050,
 1196,
 1793,
 1945,
 2913,
 8472,
 9408,
 9501,
 9531,
 9628,
 10209,
 10215,
 10216,
 10255]

In [17]:
for row in bake

<pandas.core.indexing._iLocIndexer at 0x7f1d445b7ea8>