In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
pip install gdown

In [None]:
!gdown https://drive.google.com/uc?id=10rmPrIB8lwn5c3xRFaihJx5lLeu_OXne

# Reading the data

In [None]:
# Load the CSV into `df`, the working DataFrame that the cells below read and modify.
# NOTE(review): presumably this is the file fetched by the gdown cell above — confirm the downloaded file name.
df = pd.read_csv('./zomato.csv')

In [None]:
df.head()

In [None]:
df.columns

In [None]:
df.dtypes

In [None]:
df.shape

# Data Cleaning

**Finding the % of missing data**

In [None]:
df.isnull().sum()

In [None]:
# Names of the columns that contain at least one missing value, in column order.
feature_na = df.columns[df.isnull().any()].tolist()

In [None]:
feature_na

In [None]:
# Report, per affected column, the percentage of rows that are missing (rounded to 4 decimals).
n_rows = len(df)
for column in feature_na:
    missing_pct = np.round((df[column].isnull().sum() / n_rows) * 100, 4)
    print("{} has {}% missing values".format(column, missing_pct))

In [None]:
df['rate'].unique()

In [None]:
# Drop every row that has a missing value in any column listed in feature_na.
# This mutates df in place, so re-running earlier cells will not bring the rows back.
df.dropna(axis='index',subset=feature_na,inplace=True)

In [None]:
df['rate'].unique()

In [None]:
def split(x):
    """Return the part of a rating string that precedes the first '/'.

    Surrounding whitespace is stripped, so '4.1 /5' yields '4.1' rather than
    '4.1 '. A string without a '/' is returned stripped but otherwise
    unchanged. Non-string values (for example NaN) are passed through as-is
    instead of raising AttributeError.
    """
    if not isinstance(x, str):
        return x
    return x.split('/')[0].strip()

In [None]:
# Reduce every rating to the value returned by split() (the text before '/').
df['rate'] = df['rate'].map(split)

In [None]:
df['rate'].unique()

In [None]:
# Map the 'NEW' placeholder to 0 in the rating column only; a frame-wide replace
# would also rewrite any other cell whose value happens to equal 'NEW'.
df['rate'] = df['rate'].replace('NEW', 0)

In [None]:
df['rate'].unique()

In [None]:
# Convert the cleaned rating values to floating point.
df['rate'] = df['rate'].astype(float)

In [None]:
df.dtypes

In [None]:
df.head()

# Calculating Average Rating Of Each Restaurant

In [None]:
# Mean rating per restaurant name, as a one-column DataFrame indexed by name.
df_rate = df.groupby('name')[['rate']].mean()

In [None]:
# Move the restaurant name out of the index into an ordinary column.
df_rate = df_rate.reset_index()

In [None]:
df_rate.head()

In [None]:
df_rate.columns = ['Restaurant Names','Average Rating']

In [None]:
df_rate.head()

In [None]:
# Bar chart of the 20 restaurant names with the highest mean rating.
(
    df.groupby('name')['rate']
    .mean()
    .nlargest(20)
    .plot(kind='bar')
)

# Get the distribution of the Rating column and find out which distribution this feature follows.

In [None]:
# distplot is deprecated in seaborn; histplot on a density scale with a KDE overlay
# is the documented replacement for its default histogram-plus-curve figure.
sns.histplot(df_rate['Average Rating'], kde=True, stat='density')

# Top Restaurant chains in Bangalore

In [None]:
# Horizontal bar chart of the 20 names that appear most often in the listings.
chains = df['name'].value_counts().head(20)
ax = sns.barplot(x=chains, y=chains.index, palette='deep')
ax.set_title("Most famous restaurants chains in Bangaluru")
ax.set_xlabel("Number of outlets")

# How many of the restaurants do not accept online orders?

In [None]:
df.head()

In [None]:
x = df['online_order'].value_counts()

In [None]:
x

In [None]:
import plotly.express as px

In [None]:
# Slice names are taken from the value_counts index so every wedge is labelled with
# the category it actually counts. px.pie's `labels` argument only renames
# column titles; it never names slices, so the previous list had no effect.
label = x.index.tolist()
px.pie(values=x.values, names=label, title='Order Online Availability')

# Total number of restaurant varieties (cuisines) in Bangalore

In [None]:
df.head()

In [None]:
cuisines = df['cuisines'].value_counts()[0:10]
cuisines

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot

In [None]:
# Bar trace of the ten most frequent values of the cuisines column.
top_cuisines = df['cuisines'].value_counts().head(10)
trace1 = go.Bar(x=top_cuisines.index, y=top_cuisines)

In [None]:
iplot([trace1])

# Analyse Approx Cost of 2 People Feature.

In [None]:
df.columns

In [None]:
df['approx_cost(for two people)'].isna().sum()

In [None]:
# Strip thousands separators (e.g. '1,200' -> '1200') with the vectorised string
# accessor; unlike a per-element lambda it leaves missing values as NaN instead of raising.
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].str.replace(',', '', regex=False)

In [None]:
df['approx_cost(for two people)'].unique()

In [None]:
df['approx_cost(for two people)'].dtype

In [None]:
# Convert the comma-free cost strings to integers.
df['approx_cost(for two people)'] = df['approx_cost(for two people)'].astype(int)

In [None]:
df['approx_cost(for two people)'].dtype

In [None]:
# distplot is deprecated in seaborn; histplot on a density scale with a KDE overlay
# is the documented replacement for its default histogram-plus-curve figure.
sns.histplot(df['approx_cost(for two people)'], kde=True, stat='density')

# Analyse "Approx cost of 2 People" vs "rating". Find out whether there is a relationship.

In [None]:
sns.scatterplot(x='rate',y='approx_cost(for two people)',hue='online_order',data=df)

# Is there any difference b/w votes of restaurants accepting and not accepting online orders

In [None]:
sns.boxplot(x='online_order',y='votes',data=df)

In [None]:
px.box(x=df['online_order'],y=df['votes'])

# Is there any difference b/w price of restaurants accepting and not accepting online orders


In [None]:
px.box(x=df['online_order'],y=df['approx_cost(for two people)'])

# Most luxurious restaurant in Bangalore

In [None]:
df['approx_cost(for two people)'].min()

In [None]:
df['approx_cost(for two people)'].max()

In [None]:
# Select the names priced at the observed maximum instead of a hard-coded 5000,
# so the cell stays correct when the data or the cleaning steps change.
df.loc[df['approx_cost(for two people)'] == df['approx_cost(for two people)'].max(), 'name']

# Top 10 most expensive restaurants by approx cost for 2 people

In [None]:
data=df.copy()

In [None]:
data.set_index('name',inplace=True)

In [None]:
data.head()

In [None]:
data['approx_cost(for two people)'].nlargest(10).plot.bar()

# Top 10 Cheapest restaurant with approx cost for 2 people

In [None]:
data['approx_cost(for two people)'].nsmallest(10).plot.bar()

# Find all the restaurants that cost at most 500 for two people (budget hotels), i.e. the affordable ones.

In [None]:
data[data['approx_cost(for two people)']<=500]

In [None]:
# Cost column (indexed by restaurant name) for the rows costing at most 500 for two.
df_budget = data.loc[data['approx_cost(for two people)'] <= 500, 'approx_cost(for two people)']

In [None]:
df_budget = df_budget.reset_index()
df_budget.head()

# Total Restaurants that have good rating >4 and that are budget too

In [None]:
df[(df['rate']>4) & (df['approx_cost(for two people)']<=500)].shape

In [None]:
len(df[(df['rate']>4) & (df['approx_cost(for two people)']<=500)]['name'].unique())

# Total number of affordable hotels at each location of Bangalore

In [None]:
df_new = df[(df['rate']>4) & (df['approx_cost(for two people)']<=500)]
df_new.head()

In [None]:
# For each location in df_new, record the location and its number of distinct restaurant names.
by_location = df_new.groupby('location')
location = [place for place, _ in by_location]
total = [len(group['name'].unique()) for _, group in by_location]

In [None]:
location_df = pd.DataFrame(zip(location,total))
location_df.head()

In [None]:
location_df.columns=['location','restaurant']

In [None]:
location_df.head()

# Finding Best budget Restaurants in any location

In [None]:
def return_budget(location, restaurant, max_cost=400, data=None):
    """Return the distinct restaurant names matching a location, type and cost ceiling.

    Parameters
    ----------
    location : str
        Value to match exactly against the ``location`` column.
    restaurant : str
        Value to match exactly against the ``rest_type`` column.
    max_cost : int, optional
        Inclusive upper bound on ``approx_cost(for two people)``. Defaults to
        400, the threshold this function previously hard-coded.
    data : pandas.DataFrame, optional
        Frame to search. Defaults to the notebook-level ``df``.

    Returns
    -------
    numpy.ndarray
        Unique values of the ``name`` column among the matching rows (empty
        when nothing matches).
    """
    frame = df if data is None else data
    matches = (
        (frame['approx_cost(for two people)'] <= max_cost)
        & (frame['location'] == location)
        & (frame['rest_type'] == restaurant)
    )
    return frame.loc[matches, 'name'].unique()

In [None]:
return_budget('BTM','Quick Bites')

# Which are the foodie areas?

In [None]:
# Twenty locations with the most listings. The data is passed to barplot by keyword
# because current seaborn releases reject positional x/y; this also matches the
# chains plot earlier in the notebook.
restaurant_location = df['location'].value_counts()[0:20]
sns.barplot(x=restaurant_location, y=restaurant_location.index)

# Performing Geographical Analysis

# Find latitude and longitude for each of the location of Bangalore

In [None]:
locations = pd.DataFrame({'Name':df['location'].unique()})
locations.head()

In [None]:
pip install geopy

**Nominatim = the OpenStreetMap geocoding service**

In [None]:
from geopy.geocoders import Nominatim

In [None]:
geolocator = Nominatim(user_agent='app')

In [None]:
# Nominatim's public endpoint expects clients to stay at or below one request per
# second. RateLimiter spaces the calls out, retries service errors, and returns
# None for a lookup that still fails.
from geopy.extra.rate_limiter import RateLimiter

geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

lat_long = []
for place_name in locations['Name']:
    # The result gets its own name so the query string is not overwritten and the
    # notebook-level `location` list built earlier is left untouched.
    place = geocode(place_name)
    if place is None:
        # No match (or lookup failed): keep the row, mark the coordinates as missing.
        lat_long.append(np.nan)
    else:
        lat_long.append((place.latitude, place.longitude))

In [None]:
locations['geo_loc']=lat_long

In [None]:
locations.head()

In [None]:
locations.head()

In [None]:
locations.head()

In [None]:
Rest_locat = pd.DataFrame(df['location'].value_counts().reset_index())
Rest_locat.head()

In [None]:
Rest_locat.columns = ['Name','count']
Rest_locat.head()

In [None]:
Restaurant_locations = Rest_locat.merge(locations,on='Name',how='left').dropna()
Restaurant_locations.head()

In [None]:
np.array(Restaurant_locations['geo_loc'])

In [None]:
# Unzip the (latitude, longitude) pairs into two parallel tuples.
lat, lon = zip(*Restaurant_locations['geo_loc'].tolist())

In [None]:
# Store the coordinates as separate columns and drop the combined tuple column.
Restaurant_locations = (
    Restaurant_locations
    .assign(lat=lat, lon=lon)
    .drop(columns='geo_loc')
)

In [None]:
Restaurant_locations.head()

# Generate Basemap of Bangalore

In [None]:
!pip install folium

In [None]:
import folium
from folium.plugins import HeatMap

In [None]:
def generatebasemap(default_location=None,default_zoom_start=12):
    """Create a folium map centred on ``default_location``.

    Parameters
    ----------
    default_location : list of float, optional
        ``[latitude, longitude]`` of the map centre. When omitted, a fresh
        ``[12.93, 77.62]`` list is used on every call, so no list object is
        shared between calls (the previous mutable default was).
    default_zoom_start : int, optional
        Initial zoom level passed to ``folium.Map``. Defaults to 12.

    Returns
    -------
    folium.Map
        The newly created map.
    """
    if default_location is None:
        default_location = [12.93, 77.62]
    return folium.Map(location=default_location, zoom_start=default_zoom_start)

In [None]:
basemap = generatebasemap()

In [None]:
basemap

# Heatmap of Restaurants

In [None]:
# Add a heat layer to basemap: one [lat, lon, count] row per location, so each point is weighted by its count column.
# NOTE(review): `zoom` does not appear to be a documented HeatMap parameter — confirm it has any effect.
HeatMap(Restaurant_locations[['lat','lon','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)

In [None]:
basemap

# Heatmap of North Indian Restaurants

In [None]:
df.head()

In [None]:
# Rows whose cuisines value is exactly 'North Indian'.
# NOTE(review): this is an exact match, so a row whose cuisines value lists
# 'North Indian' alongside other cuisines is excluded — confirm that is intended.
df2 = df.loc[df['cuisines'].eq('North Indian')]
df2.head()

In [None]:
north_india = df2.groupby('location',as_index=False)['url'].agg('count')

In [None]:
north_india.head()

In [None]:
north_india.columns = ['Name','count']

In [None]:
north_india.head()

In [None]:
north_india = north_india.merge(locations,on='Name',how='left').dropna()
north_india.head()

In [None]:
north_india['lat'],north_india['lon']=zip(*north_india['geo_loc'].values)

In [None]:
north_india.drop('geo_loc',axis=1,inplace=True)

In [None]:
north_india.head()

In [None]:
basemap = generatebasemap()

In [None]:
# Add a heat layer to basemap: one [lat, lon, count] row per location in north_india, weighted by its count column.
# NOTE(review): `zoom` does not appear to be a documented HeatMap parameter — confirm it has any effect.
HeatMap(north_india[['lat','lon','count']].values.tolist(),zoom=20,radius=15).add_to(basemap)

In [None]:
basemap

# Most popular casual dining restaurant chains

In [None]:
df.groupby(['rest_type','name']).agg('count')

In [None]:
df_1 = df.groupby(['rest_type','name']).agg('count')

In [None]:
df_1.sort_values(['url'],ascending=False)

In [None]:
df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))

In [None]:
df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))['url']

In [None]:
df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))['url'].reset_index()

In [None]:
# Flatten the per-(rest_type, name) counts held in df_1 into a table whose 'count'
# column is the 'url' tally, with rows sorted by descending count inside each rest_type.
# NOTE(review): the outer sort_values looks redundant because the lambda re-sorts every group — confirm before removing.
# NOTE(review): the extra index columns produced by reset_index() depend on how groupby.apply labels the groups — inspect `dataset` before relying on them.
dataset=df_1.sort_values(['url'],ascending=False).groupby(['rest_type'],
                as_index=False).apply(lambda x : x.sort_values(by="url",ascending=False))['url'].reset_index().rename(columns={'url':'count'})

In [None]:
dataset

# Which are the most popular casual dining restaurant chains?

In [None]:
casual=dataset[dataset['rest_type']=='Casual Dining']
casual

In [None]:
df.shape

# Wordcloud of dishes liked by cuisines.

In [None]:
!pip install wordcloud

In [None]:
 from wordcloud import WordCloud

In [None]:
df.head()

In [None]:
# Split each comma-separated dish string into a list; any non-string entry becomes [''].
df['update_dish_liked'] = df['dish_liked'].map(
    lambda dishes: [''] if type(dishes) != str else dishes.split(',')
)

In [None]:
df.head()

In [None]:
df['rest_type'].value_counts()[:9].index

In [None]:
rest=df['rest_type'].value_counts()[:9].index

**wordcloud for each and every restaurant**

In [None]:
from wordcloud import WordCloud, STOPWORDS 

In [None]:
df.isna().sum()

In [None]:
df.dropna(axis='index',subset=['rest_type'],inplace=True)

In [None]:
df.dropna(axis='index',subset=['dish_liked'],inplace=True)

In [None]:
df.isna().sum()

**wordcloud for 1 restaurant**

In [None]:
data=df[df['rest_type']=='Quick Bites']

In [None]:
data['dish_liked']

In [None]:
stopwords=set(STOPWORDS)

In [None]:
# Lower-case every whitespace-separated token of each dish_liked entry and chain
# all entries into one space-delimited text, then render it as a word cloud.
dishes = ''.join(
    " ".join(token.lower() for token in entry.split()) + " "
    for entry in data['dish_liked']
)
wordcloud = WordCloud(
    max_font_size=None,
    background_color='white',
    collocations=False,
    stopwords=stopwords,
    width=1500,
    height=1500,
).generate(dishes)
plt.imshow(wordcloud)
plt.axis("off")

In [None]:
def produce_wordcloud(rest):
    """Draw one word cloud of liked dishes per restaurant type on a single figure.

    Parameters
    ----------
    rest : iterable of str
        ``rest_type`` values to plot, one subplot each, three per row.

    Notes
    -----
    Reads the notebook-level ``df`` (columns ``rest_type`` and ``dish_liked``)
    and ``stopwords``. Missing ``dish_liked`` entries are skipped, and a type
    with no dish text gets an empty titled panel, because
    ``WordCloud.generate`` cannot build a cloud from empty text.
    """
    rest = list(rest)
    n_cols = 3
    # Keep the original 3x3 layout for up to nine types and add rows beyond
    # that, instead of failing on the tenth subplot.
    n_rows = max(3, -(-len(rest) // n_cols))

    plt.figure(figsize=(20,30))
    for position, restaurant in enumerate(rest, start=1):
        plt.subplot(n_rows, n_cols, position)
        liked = df.loc[df['rest_type'] == restaurant, 'dish_liked'].dropna()
        # Lower-case every whitespace-separated token and chain all entries
        # into one space-delimited text. No index variable is reused here, so
        # the subplot counter of the enclosing loop is never overwritten.
        dishes = ''.join(" ".join(entry.lower().split()) + " " for entry in liked)
        if dishes.strip():
            wordcloud = WordCloud(max_font_size=None, background_color='white', collocations=False,
                                  stopwords=stopwords, width=1500, height=1500).generate(dishes)
            plt.imshow(wordcloud)
        plt.title(restaurant)
        plt.axis("off")

In [None]:
stopwords = set(STOPWORDS) 
produce_wordcloud(rest)