In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from statistics import mean
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for folder, _, fileNames in os.walk('/kaggle/input'):
    for fileName in fileNames:
        fullPath = os.path.join(folder, fileName)
        print(fullPath)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
# Load the raw Zomato Bangalore restaurants CSV from the Kaggle input directory.
zomatoData = pd.read_csv('/kaggle/input/zomato-bangalore-restaurants/zomato.csv')

In [4]:
# Work on a copy so the frame loaded from disk (zomatoData) stays unmodified.
zomato = zomatoData.copy()

In [5]:
# Preview the first rows of the working copy.
zomato.head()

Data cleaning
1. drop()
2. dropna()
3. drop_duplicates()

In [6]:
# Columns removed before the analysis; everything else is kept as-is.
unusedColumns = ['phone', 'dish_liked', 'menu_item', 'url', 'address']
zomato = zomato.drop(columns = unusedColumns)

In [7]:
# (rows, columns) after the column drop above.
zomato.shape

In [8]:
# Discard every row that has at least one missing value.
zomato = zomato.dropna()

In [9]:
# (rows, columns) after rows with missing values were removed.
zomato.shape

In [10]:
# Number of rows that are exact duplicates of an earlier row (all columns equal).
zomato.duplicated().sum()

In [11]:
# Keep only the first occurrence of each fully duplicated row.
zomato = zomato.drop_duplicates()

In [12]:
# Re-count duplicated rows; this should be 0 once duplicates have been dropped.
zomato.duplicated().sum()

In [13]:
# Number of rows left in the working frame.
len(zomato)

In [14]:
# reset_index() returns a new frame, so the result must be bound back to the
# name for the reset to take effect. drop=True discards the old labels instead
# of inserting them as an extra leading column, which would shift the
# positional column lookups (iloc) used later in this notebook.
zomato = zomato.reset_index(drop = True)
zomato.head()

In [15]:
def getReviewList(reviewString):
    """Extract the numeric ratings embedded in a raw reviews string.

    Each occurrence of ``'Rated'`` is expected to be followed by one separator
    character and then a rating of up to three characters (for example
    ``'Rated 4.0'``).

    Parameters
    ----------
    reviewString : str
        Raw review text to scan.

    Returns
    -------
    list of float
        Ratings in order of appearance. A list is always returned (possibly
        empty), including for an empty string or when the final rating sits at
        the very end of the text, so callers can safely take ``len()`` of it.
        Scanning stops at the first rating that cannot be parsed as a float;
        ratings collected before that point are returned.
    """
    marker = 'Rated'
    ratingWidth = 3
    reviews = []
    cursor = reviewString.find(marker)
    while cursor != -1:
        # Skip the marker plus the single separator character that follows it.
        start = cursor + len(marker) + 1
        try:
            reviews.append(float(reviewString[start : start + ratingWidth]))
        except ValueError:
            # Malformed rating: keep what was parsed so far and stop scanning.
            break
        cursor = reviewString.find(marker, start + ratingWidth)
    return reviews

def castRateToFloat(rate):
    """Convert a rating string such as ``'4.1/5'`` to the float ``4.1``.

    The ``'/5'`` suffix is removed and the remainder is parsed with ``float``,
    which tolerates surrounding whitespace.
    """
    withoutScale = rate.replace('/5', '')
    return float(withoutScale)

In [16]:
# Replace each raw review string with the list of ratings parsed from it.
parsedReviews = zomato['reviews_list'].apply(getReviewList)
zomato['reviews_list'] = parsedReviews

In [17]:
# Strip the comma separators from the cost strings and convert them to float.
costColumn = 'approx_cost(for two people)'
zomato[costColumn] = zomato[costColumn].apply(lambda cost: float(cost.replace(',', '')))

In [18]:
# Create mean_rating as the mean of each row's reviews_list, rounded to two
# decimals; rows without any parsed rating get 0.0.
# The column is built in a single pass and addressed by name, so it is
# float-typed from the start. Seeding it with the integer 0 and writing floats
# in afterwards relies on implicit dtype upcasting, which newer pandas
# deprecates, and a per-row iloc loop is slow on a frame of this size.
zomato['mean_rating'] = zomato['reviews_list'].apply(
    lambda ratings: round(mean(ratings), 2) if ratings else 0.0
)
zomato.head()

Dividing the data set into two subsets
1. Old restaurants
2. New restaurants

In [19]:
# Split on the rate column: 'NEW' marks restaurants without a rating yet and
# '-' marks a missing rating; everything else counts as an "old" restaurant.
# .copy() makes both subsets independent frames, so the column assignments
# performed on them later do not trigger SettingWithCopyWarning.
isNew = zomato['rate'] == 'NEW'
isUnrated = zomato['rate'] == '-'
zomatoOld = zomato.loc[~isNew & ~isUnrated].copy()
zomatoNew = zomato.loc[isNew].copy()

In [20]:
# Show the (rows, columns) of the old and new subsets, one per line.
print(zomatoOld.shape, zomatoNew.shape, sep = '\n')

The string values in the rate column (formatted as `#/5`) are cast to float by removing the `/5` suffix and converting the remaining number.

In [21]:
# Convert every 'x/5' rating string in the old-restaurant subset to a float.
rateAsFloat = zomatoOld['rate'].apply(castRateToFloat)
zomatoOld['rate'] = rateAsFloat

In [22]:
# Rows whose reviews produced no usable rating carry a mean_rating of 0;
# replace those zeros with the value from column 3 and keep all other rows.
# NOTE(review): column 3 is presumably 'rate' — confirm against the column order.
fallbackRating = zomatoOld.iloc[:, 3]
hasMeanRating = zomatoOld['mean_rating'] != 0
zomatoOld['mean_rating'] = zomatoOld['mean_rating'].where(hasMeanRating, fallbackRating)
# Display the frame that was just modified (zomatoOld, not the unsplit zomato).
zomatoOld.head()

**Descriptive statistics**

In [23]:
# Summary statistics of the cost for two among old restaurants.
stats = zomatoOld['approx_cost(for two people)'].describe()
lowerQuartile, upperQuartile = stats['25%'], stats['75%']
print('Mean:', stats['mean'])
print('Median:', stats['50%'])
print('Interquartile range:', upperQuartile - lowerQuartile)
# The "Outliers" line prints the lower and upper 1.5 * IQR bounds.
print(
    'Outliers:',
    lowerQuartile - 1.5*(upperQuartile - lowerQuartile),
    upperQuartile + 1.5*(upperQuartile - lowerQuartile),
)

In [24]:
# Pearson correlation between mean rating and cost for two.
# The two numeric columns are correlated directly: DataFrame.corr() over the
# whole frame raises in pandas >= 2.0 because the frame also holds string
# columns and numeric_only now defaults to False.
correlation = zomatoOld['mean_rating'].corr(zomatoOld['approx_cost(for two people)'])
print(correlation)

**Visualization**

In [25]:
# Horizontal bar chart of the ten most frequent listed_in(city) values.
top10Cities = zomato['listed_in(city)'].value_counts().head(10)
plt.figure(figsize = (10, 7))
sns.barplot(y = top10Cities.index, x = top10Cities)
plt.title('Top 10 streets/cities with maximum number of restaurants')
plt.xlabel('Number of restaurants')
plt.ylabel('City name')

In [26]:
# Horizontal bar chart of the ten restaurant names that occur most often,
# i.e. the chains with the most outlets in the data set.
top10Restaurants = zomato['name'].value_counts().head(10)
plt.figure(figsize = (10, 7))
sns.barplot(y = top10Restaurants.index, x = top10Restaurants)
plt.title('Restaurant with most outlets Banglore')
plt.xlabel('Number of outlets')
plt.ylabel('Restaurants name')

In [27]:
# Histogram of mean ratings for old restaurants, with the overall mean in red.
# histplot is the replacement for distplot, which seaborn deprecated in 0.11;
# with kde=False both draw a plain count histogram.
sns.histplot(zomatoOld['mean_rating'], kde = False, color = 'blue', bins = 20)
# Series.mean() is vectorised; statistics.mean iterates element by element.
plt.axvline(zomatoOld['mean_rating'].mean(), color = 'red')

In [40]:
# Flatten the comma-separated entries of the sixth-from-last column into one
# Series of individual, whitespace-trimmed names.
# NOTE(review): column -6 is presumably the cuisines column — confirm.
cuisineList = pd.Series([
    cuisine.strip()
    for entry in zomato.iloc[:, -6]
    for cuisine in entry.split(',')
])
top10Cuisines = cuisineList.value_counts().head(10)
plt.figure(figsize = (10, 7))
sns.barplot(y = top10Cuisines.index, x = top10Cuisines)
plt.title('Top 10 cuisines in Banglore')
plt.xlabel('Count')
plt.ylabel('Cuisine')

In [57]:
# Count plot: number of listings for a hand-picked set of restaurant types.
# (matplotlib.pyplot is already imported as plt in the notebook's first cell.)
restTypesOfInterest = ['Bar','Pub','Quick Bites','Casual Dining','Cafe','Dessert Parlor','Delivery','Casual Dining, Bar']

plt.figure(figsize=(10,5))
chart = sns.countplot(
    data=zomato[zomato['rest_type'].isin(restTypesOfInterest)],x='rest_type'
)
chart.set_title('Number of restaurants by type')
# Rotate and enlarge the category labels so they stay readable.
chart.set_xticklabels(chart.get_xticklabels(), rotation=45,fontsize='x-large')

In [31]:
# Overall view: box plot of the cost for two, grouped by listed_in(city),
# for the old-restaurant subset.
# Extra-wide figure; presumably so every category fits along the x axis.
plt.figure(figsize = (30, 7))
# Tick rotation is set before the plot is drawn; keep this statement order.
plt.xticks(rotation=45)
sns.boxplot(x = 'listed_in(city)', y = 'approx_cost(for two people)', data = zomatoOld)