# Import necessary libraries

In [None]:
import pandas as pd

# Load the Zomato dataset

In [2]:
df = pd.read_csv("zomato.csv")

# Drop columns that are not useful for analysis

In [4]:
df2=df.drop(columns=['url', 'phone', 'rest_type', 'dish_liked', 'reviews_list', 'menu_item', 'listed_in(city)'])

# Rename columns for better readability

In [9]:
df3=df2.rename(columns={'approx_cost(for two people)':'two_people_cost','listed_in(type)':'type_of_restaurant'})
df4=df3.rename(columns={'rate':'rating'})

# Drop rows where data is missing

In [17]:
df4=df4.dropna(subset=['location'])
df4=df4.dropna(subset=['cuisines','two_people_cost'])

# Clean and Convert Cost Column

In [19]:
df4['two_people_cost']=df4['two_people_cost'].str.replace(',', '').astype(int)
df4['two_people_cost']=df4['two_people_cost']/2
df4=df4.rename(columns={'two_people_cost':'cost_per_person'})

# Clean the Rating Column

In [22]:
import numpy as np
def handlerate(value) :
    if(value== 'NEW' or value=='-'):
        return np.nan
    else:
        value = str(value).split('/')
        value = value [0]
        return float(value)
        
df4['rating'] = df4['rating'].apply(handlerate)
df4['rating']=df4['rating'].fillna(df4['rating'].mean)

# Correct Data Types

In [39]:
df4['online_order']=df4['online_order'].astype("string")
df4['book_table']=df4['book_table'].astype("string")
df4['location']=df4['location'].astype("string")
df4['cuisines']=df4['cuisines'].astype("string")

# Remove Duplicates

In [46]:
df4.drop_duplicates(inplace=True)

# Apply Title Casing to key categorical text columns

In [50]:
df4['name']=df4['name'].str.title()
df4['location']=df4['location'].str.title()
df4['cuisines']=df4['cuisines'].str.title()
df4['type_of_restaurant']=df4['type_of_restaurant'].str.title()

In [59]:
df4.drop(columns=['type_of_restuarant'],inplace=True)

# Display information about final dataframe

In [61]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
Index: 31730 entries, 0 to 51716
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   address             31730 non-null  object 
 1   name                31730 non-null  object 
 2   online_order        31730 non-null  string 
 3   book_table          31730 non-null  string 
 4   rating              31730 non-null  object 
 5   votes               31730 non-null  int64  
 6   location            31730 non-null  string 
 7   cuisines            31730 non-null  string 
 8   cost_per_person     31730 non-null  float64
 9   type_of_restaurant  31730 non-null  object 
dtypes: float64(1), int64(1), object(4), string(4)
memory usage: 2.7+ MB


# Saved the cleaned DataFrame

In [62]:
df4.to_csv("zomato_data_analysis.csv", index=False)