# Restaurant Analysis

#### Import Necessary Libraries

In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from geopy.distance  import geodesic
from scipy.stats import pointbiserialr
from scipy.stats import ttest_ind
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
# NOTE(review): absolute path ties the notebook to one workspace layout.
# The literal contains a space before ".csv" — confirm it matches the file on disk.
file = "/workspaces/Restaurant-Analysis/Data/Dataset .csv"

In [4]:
# Load the CSV at `file` into the working DataFrame used by the cells below.
rest_df = pd.read_csv(file)

In [5]:
# Preview the first three rows.
rest_df.head(3)

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270


### Exploring the Dataset

In [6]:
# memory_usage='deep' makes pandas inspect object columns for their real
# memory footprint instead of reporting a shallow estimate.
rest_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Restaurant ID         9551 non-null   int64  
 1   Restaurant Name       9551 non-null   object 
 2   Country Code          9551 non-null   int64  
 3   City                  9551 non-null   object 
 4   Address               9551 non-null   object 
 5   Locality              9551 non-null   object 
 6   Locality Verbose      9551 non-null   object 
 7   Longitude             9551 non-null   float64
 8   Latitude              9551 non-null   float64
 9   Cuisines              9542 non-null   object 
 10  Average Cost for two  9551 non-null   int64  
 11  Currency              9551 non-null   object 
 12  Has Table booking     9551 non-null   object 
 13  Has Online delivery   9551 non-null   object 
 14  Is delivering now     9551 non-null   object 
 15  Switch to order menu 

In [23]:
# (rows, columns)
rest_df.shape

(9551, 21)

In [24]:
# Missing-value count per column.
rest_df.isnull().sum()

Restaurant ID           0
Restaurant Name         0
Country Code            0
City                    0
Address                 0
Locality                0
Locality Verbose        0
Longitude               0
Latitude                0
Cuisines                9
Average Cost for two    0
Currency                0
Has Table booking       0
Has Online delivery     0
Is delivering now       0
Switch to order menu    0
Price range             0
Aggregate rating        0
Rating color            0
Rating text             0
Votes                   0
dtype: int64

In [26]:
# Number of rows whose 'Restaurant ID' repeats an earlier row (0 means IDs are unique).
rest_df.duplicated(['Restaurant ID']).sum()

np.int64(0)

### Data Cleansing

In [7]:
def str_miss(df,col):
    """Fill missing values of ``col`` with the most frequent value in the same city.

    Only rows where ``col`` is null are modified; existing values are left
    untouched. When several values tie for most frequent, the first one
    returned by ``Series.mode()`` is used. Cities where every value of
    ``col`` is missing have no mode and are left as-is, so the caller may
    still need a fallback fill afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'City' column and the column named by ``col``.
        It is modified in place.
    col : str
        Name of the column whose missing values should be imputed.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    missing = df[col].isnull()
    # Only cities that actually contain a missing value need a mode lookup.
    for cit in df.loc[missing, 'City'].unique():
        in_city = df['City'] == cit
        mode_value = df.loc[in_city, col].mode()
        if mode_value.empty:
            # Every value for this city is missing: nothing to impute from.
            continue
        # Assign the scalar mode to the missing rows only, so valid entries
        # in the same city are never overwritten.
        df.loc[in_city & missing, col] = mode_value.iloc[0]

    return df


# Per-city imputation of 'Cuisines'; str_miss modifies rest_df in place and returns it.
rest_df = str_miss(rest_df,'Cuisines')

In [8]:
# Rows still lacking a cuisine after the per-city imputation above.
rest_df[rest_df['Cuisines'].isnull()]

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
346,17606621,HI Lite Bar & Lounge,216,Miller,"109 N Broadway Ave, Miller, SD 57362",Miller,"Miller, Miller",-98.9891,44.5158,,...,Dollar($),No,No,No,No,1,3.4,Orange,Average,11


In [9]:
# Label any cuisines still missing after the per-city fill as 'Unknown'.
rest_df['Cuisines'] = rest_df['Cuisines'].fillna('Unknown')

In [10]:
# Drop 'Locality Verbose' (in the previewed rows it appears to repeat 'Locality'
# plus the city — TODO confirm it carries no extra information).
# NOTE: re-running this cell raises KeyError because the column is already gone.
rest_df = rest_df.drop(columns = 'Locality Verbose')

In [11]:
# Store low-cardinality text columns (2 to 6 distinct values) as 'category'.
# The bounds are written as a chained comparison on purpose: `n <= 6 & n > 1`
# parses as `n <= (6 & n) > 1` because `&` binds tighter than comparisons,
# which silently skips columns with 3 or 5 distinct values.
cat = [
    c for c in rest_df.select_dtypes(include = ['object']).columns
    if 1 < rest_df[c].nunique() <= 6
]
rest_df[cat] = rest_df[cat].astype('category')
# Narrow 'Country Code' from int64 to int32.
rest_df['Country Code'] = rest_df['Country Code'].astype('int32')

In [12]:
# Re-check dtypes and deep memory usage after the conversions above.
rest_df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9551 entries, 0 to 9550
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   Restaurant ID         9551 non-null   int64   
 1   Restaurant Name       9551 non-null   object  
 2   Country Code          9551 non-null   int32   
 3   City                  9551 non-null   object  
 4   Address               9551 non-null   object  
 5   Locality              9551 non-null   object  
 6   Longitude             9551 non-null   float64 
 7   Latitude              9551 non-null   float64 
 8   Cuisines              9551 non-null   object  
 9   Average Cost for two  9551 non-null   int64   
 10  Currency              9551 non-null   object  
 11  Has Table booking     9551 non-null   category
 12  Has Online delivery   9551 non-null   category
 13  Is delivering now     9551 non-null   category
 14  Switch to order menu  9551 non-null   object  
 15  Pric