In [None]:
import warnings
# Silence every warning for the rest of the notebook to keep cell output compact.
# NOTE(review): this is a blanket filter, so deprecation warnings are hidden as well.
warnings.filterwarnings('ignore')

In [None]:
import ast
import os
import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from wordcloud import WordCloud

In [None]:
# Remove the row limit when pandas objects are displayed (None = no truncation).
# NOTE(review): any frame or series shown without .head() will now print every row.
pd.set_option("display.max_rows", None)

In [None]:
# Reading data: load the Zomato Bangalore restaurants CSV into `df`.
# The path is relative to the notebook and follows the Kaggle `../input/<dataset>` layout.
df = pd.read_csv("../input/zomato-bangalore-restaurants/zomato.csv", encoding='utf-8')

In [None]:
# Exploring data: preview the first rows to see the columns and the raw value formats
df.head()

In [None]:
# Exploring features: list the column names exactly as they were loaded from the CSV
df.columns

In [None]:
# Exploring dimensions: report how many rows and columns were loaded
n_rows, n_cols = df.shape
print(f"dataset contains {n_rows} rows and {n_cols} columns")

In [None]:
# Exploring information summary: per-column dtype and non-null count
df.info()

In [None]:
# Statistical summary of the dataset (pandas' default describe() output)
df.describe()

### Features definition:

* **url**: contains the url of the restaurant in the zomato website.
* **address:** contains the address of the restaurant in Bengaluru.
* **name:** contains the name of the restaurant.
* **online_order:** whether online ordering is available in the restaurant or not.
* **book_table:** whether the table booking option is available or not.
* **rate:** contains the overall rating of the restaurant out of 5.
* **votes:** contains total number of rating for the restaurant as of the above mentioned date.
* **phone:** contains the phone number of the restaurant.
* **location:** contains the neighborhood in which the restaurant is located.
* **rest_type:** restaurant type.
* **dish_liked:** dishes people liked in the restaurant
* **cuisines:** food styles, separated by comma
* **approx_cost(for two people):** contains the approximate cost for meal for two people
* **reviews_list:** list of tuples containing the reviews for the restaurant; each tuple holds a rating and the review text
* **menu_item:** contains list of menus available in the restaurant
* **listed_in(type):** type of meal
* **listed_in(city):** contains the neighborhood in which the restaurant is listed


In [None]:
# Replace the three awkward column labels (parentheses, spaces) with short snake_case names.
column_aliases = {
    'approx_cost(for two people)': 'average_cost',
    'listed_in(city)': 'neighborhood',
    'listed_in(type)': 'restaurant_type',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Confirm that the renamed columns are in place
df.columns

## Data Preprocessing:

In [None]:
# check for nulls: number of missing values in each column
df.isnull().sum()

In [None]:
# Number of rows whose menu_item is the literal string '[]' (i.e. an empty list)
int((df['menu_item'] == '[]').sum())

The `menu_item` column contains too many empty lists (`'[]'`) to be useful, so it is dropped below together with `phone`.

In [None]:
# Remove the columns that are not used in this analysis (modifies df in place).
irrelevant_columns = ['phone', 'menu_item']
df.drop(columns=irrelevant_columns, inplace=True)

In [None]:
# Share of missing values per column, as a percentage rounded to two decimals
print("Percentage NaN values in df:")
df.isna().sum().mul(100).div(df.index.size).round(2)

In [None]:
#Check for Duplicates: count rows that are exact copies of an earlier row
df.duplicated().sum()

- Although there are no duplicated rows, there aren't 51k restaurants in Bengaluru.
- **Then how were 51k restaurants (each with a different link) listed on the Zomato website?**

- **The answer is:** the data set was scraped individually for each category (e.g. buffet, dine-out, pubs, bars, delivery, nightlife), so the same restaurant may be listed in more than one category.

### Univariate Analysis

#### What are the top known restaurants?

In [None]:
# The 20 restaurant names that appear most often in the listings
chains = df['name'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=chains, y=chains.index, ax=ax)
ax.set_title("Most famous restaurants")
ax.set_xlabel("Number of restaurants");

In [None]:
# Total number of observations (rows)
total_obs = len(df)
# Number of distinct restaurant names (a missing name counts as one value)
uni_res = df['name'].nunique(dropna=False)
# Number of rows whose name already appeared in an earlier row
dub_res = df['name'].duplicated().sum()

print("Total number of observations =", total_obs)
print("Total number of unique resturant names =", uni_res)
print("Total number of dublicated resturants names =", dub_res)
# Sanity check: every row is either the first occurrence of its name or a repeat of one
print("Are the total sum of both unique resturant names and dublicated names equal to total sum of observations ?",
      dub_res + uni_res == total_obs)

#### What is the availability percentage for ordering online?

In [None]:
# Number of listings for each online_order value
df.online_order.value_counts()

In [None]:
# Exploring features: online_order
# Label each wedge from its own category value instead of a positional list: value_counts()
# orders by frequency, so positional labels are only right when "available" is the larger group.
# Presumably the column holds 'Yes'/'No' (see the value_counts cell above); any other value
# simply keeps its raw label.
online_order_counts = df.online_order.value_counts().rename(
    index={'Yes': 'Available', 'No': 'Not Available'})
online_order_counts.plot(kind='pie', shadow=True, autopct='%1.2f%%')
plt.title('Online Ordering Avilablility')
plt.axis('equal')
plt.show();

In [None]:
# Number of listings for each book_table value
df.book_table.value_counts()

#### What is the availability percentage for booking a table?

In [None]:
# Exploring features: book_table
# Label each wedge from its own category value instead of a positional list: value_counts()
# orders by frequency, so a fixed ['Available', 'Not Available'] list is swapped whenever
# the "not available" group is the larger one.
# Presumably the column holds 'Yes'/'No' (see the value_counts cell above); any other value
# simply keeps its raw label.
book_table_counts = df.book_table.value_counts().rename(
    index={'Yes': 'Available', 'No': 'Not Available'})
book_table_counts.plot(kind='pie', shadow=True, autopct='%1.2f%%')
plt.title(' Book a Table Avilablility')
plt.axis('equal')
plt.show();

In [None]:
# Inspect the distinct raw values of `rate` before cleaning
df.rate.unique()

In [None]:
# Frequency of each raw `rate` value; dropna=False also counts the missing ones
df.rate.value_counts(dropna=False)

In [None]:
# Convert `rate` from text to float: drop spaces and the "/5" suffix, and treat the
# placeholders "NEW" and "-" as missing.
# - np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
# - astype(str) first makes the cell safe to re-run: an already-cleaned float column is
#   turned back into text ("4.1", "nan") and parses to the same values again.
# - regex=False keeps the replacements literal on every pandas version.
rate_text = (
    df['rate']
    .astype(str)
    .str.replace(' ', '', regex=False)
    .str.replace('/5', '', regex=False)
)
df['rate'] = rate_text.replace(["NEW", "-"], np.nan).astype('float')

In [None]:
# Report how much of the data has no rating, then remove those rows in place.
missing_rate_pct = (df['rate'].isna().sum() * 100 / df.index.size).round(2)
print("Percentage of Missing rate of all dataset:", missing_rate_pct)

print("the percentage is acceptable to drop")
df.dropna(subset=['rate'], inplace=True)

#### What is the average rating of all restaurants?

In [None]:
# Plot rating distribution: histogram (36 bins) with a KDE curve on top
rating = df['rate'].astype('float')
histogram_style = dict(edgecolor='black')
kde_style = dict(linewidth=4)
sns.distplot(
    rating,
    hist=True,
    kde=True,
    bins=36,
    color='darkblue',
    hist_kws=histogram_style,
    kde_kws=kde_style,
);

#### Which rating category do most restaurants fall into?

In [None]:
# Exploring features: rate
# Bucket each rating by its integer part and name the bucket from that value.
# A positional label list is wrong here: value_counts() orders buckets by frequency, so
# 'Excellent' would land on the most common bucket rather than the highest one, and a
# fixed list of five labels breaks whenever the data has a different number of buckets.
# NOTE(review): the mapping assumes the labels were meant best-to-worst for 4..0 - confirm.
# A bucket outside the mapping (e.g. a rating of exactly 5.0) keeps its numeric label.
rate_labels = {4: 'Excellent', 3: 'Good', 2: 'Fair', 1: 'Poor', 0: 'Bad'}
rate_counts = (
    df.rate.astype('int')
    .value_counts()
    .sort_index(ascending=False)
    .rename(index=rate_labels)
)
rate_counts.plot(kind='pie', shadow=True, autopct='%1.2f%%')
plt.title(' Restaurant Rates')
plt.axis('equal')
plt.show();

#### What are the most popular locations for restaurants?

In [None]:
# The 20 locations with the most listings
loc = df['location'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=loc, y=loc.index, ax=ax)
ax.set_title("Most Popular Locations")
ax.set_xlabel("Location Count");

#### What are the most popular restaurant types?

In [None]:
# The 20 most common values of rest_type
rest_type = df['rest_type'].value_counts().head(20)

fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(x=rest_type, y=rest_type.index, ax=ax)
ax.set_title("Most Popular restaurants type")
ax.set_xlabel("restaurants type Count");

#### What are the top 10 liked dishes?

In [None]:
plt.figure(figsize=(10,7))
# Count individual dishes rather than whole cell values: counting the raw strings ranks
# dish *combinations*, which does not answer "top 10 liked dishes".
# NOTE(review): assumes dish_liked is a comma-separated list like cuisines - confirm.
# If a cell holds a single dish the split is a no-op, so the result is still valid.
dish_liked = (
    df['dish_liked']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()[:10]
)
sns.barplot(x=dish_liked,y=dish_liked.index)
plt.title("Most Popular dish liked ")
plt.xlabel("dish liked Count");

#### What are the most liked cuisines?

In [None]:
plt.figure(figsize=(10,7))
# `cuisines` holds several food styles separated by commas, so split each cell and count
# the individual cuisines; counting the raw strings would rank cuisine combinations instead.
cuisines = (
    df['cuisines']
    .dropna()
    .str.split(',')
    .explode()
    .str.strip()
    .value_counts()[:10]
)
sns.barplot(x=cuisines,y=cuisines.index)
plt.title("Most Popular cuisines ")
plt.xlabel("cuisines Count");

In [None]:
# Inspect the distinct raw values of `average_cost` before converting them to numbers
df.average_cost.unique()

In [None]:
# Drop rows without a cost, then turn values such as "1,200" into integers.
# astype(str) first makes the cell safe to re-run (an already-converted int column has no
# .str accessor), and regex=False keeps the comma replacement literal on every pandas version.
df.dropna(subset=['average_cost'], inplace=True)
df['average_cost'] = (
    df['average_cost']
    .astype(str)
    .str.replace(',', '', regex=False)
    .astype(int)
)

#### What is the average cost (for two people) across restaurants?

In [None]:
# Distribution of the cleaned average_cost column on a wide, short canvas
fig, ax = plt.subplots(figsize=(16, 4))
cost_values = df['average_cost']
sns.distplot(cost_values, ax=ax)
ax.set_title('Cost Distrubution for all restaurants');

Reviews Extraction:

In [None]:
def _strip_prefix(text, prefix):
    """Return `text` without a leading `prefix` (if present) and without surrounding whitespace.

    Unlike str.strip(prefix), which removes any of the prefix's *characters* from both
    ends, this never touches the end of the text (a review ending in "GREAT" stays intact).
    """
    text = text.strip()
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()


# Flatten reviews_list into one [restaurant name, numeric score, review text] row per review.
all_ratings = []

# total= lets tqdm show a real progress bar (zip() has no length of its own)
for name, reviews in tqdm(zip(df['name'], df['reviews_list']), total=len(df)):
    # Each cell is the text form of a list of (score, review) tuples. literal_eval only
    # parses Python literals, so unexpected file content cannot execute code as eval() would.
    for score, doc in ast.literal_eval(reviews):
        if score:  # reviews without a score are skipped
            all_ratings.append([
                name,
                float(_strip_prefix(score, "Rated")),
                _strip_prefix(doc, "RATED"),
            ])

In [None]:
# One row per review: restaurant name, numeric rating, review text.
rating_df = pd.DataFrame(all_ratings, columns=['name', 'rating', 'review'])
# Keep only letters, digits and whitespace in the review text.
# The pattern is a raw string: "\s" inside a normal string literal is an invalid escape
# sequence that newer Python versions warn about.
rating_df['review'] = rating_df['review'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

In [None]:
# Write the extracted reviews to Ratings.csv in the current working directory
# (default to_csv settings, so the DataFrame index is written as well).
rating_df.to_csv("Ratings.csv")

#### What are the most repeated words in reviews?

In [None]:
# Create and generate a word cloud image from the text of every review.
# The reviews are joined explicitly: str() of a large NumPy array is abbreviated with "...",
# so passing str(values) would build the cloud from only the first and last few reviews.
# NOTE(review): the joined text can be large on the full dataset; sample rating_df first
# if memory or runtime becomes a problem.
wordcloud = WordCloud().generate(' '.join(rating_df['review']))

# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
# Number of listings for each restaurant_type value.
# (The original line ended after the dot, which is a syntax error and stops Run All.)
df.restaurant_type.value_counts()

In [None]:
# Exploring features: restaurant_type - share of listings per type
restaurant_type_counts = df['restaurant_type'].value_counts()
restaurant_type_counts.plot.pie(shadow=True, autopct='%1.2f%%')
plt.title('restaurant types')
plt.axis('equal')
plt.show();

In [None]:
# Exploring features: neighborhood - share of listings per neighborhood on a 10x10 canvas
fig, ax = plt.subplots(figsize=(10, 10))
neighborhood_counts = df['neighborhood'].value_counts()
neighborhood_counts.plot.pie(ax=ax, autopct='%1.2f%%')
ax.set_title('neighborhood Available')
ax.axis('equal')
plt.show();

### Bivariate Analysis

#### What are the top 3 restaurants famous for?

In [None]:
# The three restaurant names with the most listings
rest = df['name'].value_counts()[:3].index


def produce_wordcloud(rest):
    """Draw one review word cloud per restaurant name, side by side in a single row.

    For every name in `rest`, the reviews in the notebook-level `rating_df` whose
    `name` column equals it are joined into one text and rendered with `WordCloud`.

    Parameters
    ----------
    rest : iterable of str
        Restaurant names to plot. Any number of names is accepted; the row gets one
        panel per name.
    """
    names = list(rest)
    if not names:
        return  # nothing to draw

    plt.figure(figsize=(20, 30))
    for i, r in enumerate(names):
        # Size the grid from the input instead of assuming exactly three names
        plt.subplot(1, len(names), i + 1)
        corpus = ' '.join(rating_df.loc[rating_df['name'] == r, 'review'])
        if corpus.strip():
            wordcloud = WordCloud(max_font_size=None, background_color='white', collocations=False,
                                  width=1500, height=1500).generate(corpus)
            plt.imshow(wordcloud)
        # With no review text there is nothing to generate a cloud from, so the panel
        # is left blank (title only) instead of aborting the whole figure.
        plt.title(r)
        plt.axis("off")


produce_wordcloud(rest)

In [None]:
# Remove the intermediate export written earlier by rating_df.to_csv("Ratings.csv").
# The same relative path as the writer is used, so the cleanup no longer depends on the
# working directory being /kaggle/working, and a file that is already gone is tolerated.
if os.path.exists("Ratings.csv"):
    os.remove("Ratings.csv")