# 🧹 Zomato Dataset Cleaning Notebook
This notebook performs data cleaning on the Zomato dataset for use in Tableau or GitHub. Steps include:
- Column renaming and trimming
- Null handling
- Rating and cost column formatting
- Cleaning text fields
- Generating cost per person
- Dropping unnecessary columns
- Exporting final CSV

In [None]:
import pandas as pd
import numpy as np
import csv
import os

In [None]:
# Raise the csv module's per-field size cap so that very long text cells do
# not fail with "field larger than field limit" while the file is parsed.
max_field_chars = 100_000_000
csv.field_size_limit(max_field_chars)

In [None]:
# Read the raw dataset into a DataFrame using pandas' pure-Python parser,
# then preview the first rows.
source_csv = "zomato.csv"
df = pd.read_csv(source_csv, engine='python', encoding='utf-8')
df.head()

In [None]:
# Normalise column headers into snake_case identifiers: trim whitespace,
# lower-case, turn spaces and hyphens into underscores, drop parentheses,
# then collapse underscore runs and trim underscores from both ends.
# `regex` is passed explicitly on every step because its default differs
# between pandas versions.
df.columns = (
    df.columns.str.strip()
    .str.lower()
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
    .str.replace('-', '_', regex=False)
    # A regex is required here: a single literal '__' -> '_' pass leaves a
    # double underscore behind for runs of three or more
    # (e.g. "a - b" -> "a___b" -> "a__b").
    .str.replace(r'_+', '_', regex=True)
    .str.strip('_')
)

In [None]:
# Map alternative spellings of a few headers onto the names the later cells
# use. `rename` silently skips keys that are not present, so this is a no-op
# for any header that already has its target name.
# NOTE(review): the parenthesised keys cannot match once parentheses have
# been stripped from the headers — confirm whether they are still needed.
column_aliases = {
    'approx_cost_for_two_people': 'approx_costfor_two_people',
    'listed_in(type)': 'listed_intype',
    'listed_in(city)': 'listed_incity',
}
df.rename(columns=column_aliases, inplace=True)

In [None]:
# Convert the textual rating (e.g. "4.1/5") into a float column.
# 1. Drop the "/5" suffix and surrounding whitespace. Stripping happens
#    before the placeholder check so padded values such as " NEW" match.
rate_text = df['rate'].astype(str).str.replace('/5', '', regex=False).str.strip()
# 2. Treat the placeholders "NEW" and "-" as missing.
rate_text = rate_text.replace(['NEW', '-'], np.nan)
# 3. Anything that still is not a number (including the "nan" text produced
#    by astype(str) for missing cells) becomes NaN.
df['rate'] = pd.to_numeric(rate_text, errors='coerce')
# 4. Impute missing ratings with the column mean. The result is assigned
#    back instead of calling fillna(inplace=True) on df['rate'], because
#    that chained in-place call does not update df under Copy-on-Write.
df['rate'] = df['rate'].fillna(df['rate'].mean())

In [None]:
# Votes: coerce to numbers, count unparseable/missing entries as 0, and
# store as integers.
votes_numeric = pd.to_numeric(df['votes'], errors='coerce')
df['votes'] = votes_numeric.fillna(0).astype(int)

# Text columns: substitute a readable placeholder for missing values.
text_placeholders = {
    'phone': "Not Available",
    'cuisines': "Unknown",
    'rest_type': "Unknown",
    'dish_liked': "Not Mentioned",
}
for column_name, placeholder in text_placeholders.items():
    df[column_name] = df[column_name].fillna(placeholder)

In [None]:
# Derive a numeric per-person cost from the "cost for two" column.
# Thousands separators are removed first: pd.to_numeric(errors='coerce')
# would otherwise turn a value such as "1,200" into NaN, and it would then
# be replaced by the mean instead of keeping its real amount.
cost_text = (
    df['approx_costfor_two_people']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.strip()
)
cost_for_two = pd.to_numeric(cost_text, errors='coerce')
# Impute the remaining missing costs with the mean. Assigning the result
# (rather than a chained fillna(inplace=True) on df[...]) keeps this working
# under pandas Copy-on-Write.
cost_for_two = cost_for_two.fillna(cost_for_two.mean())
df['cost_per_person'] = cost_for_two / 2
# The source column is superseded by cost_per_person.
df.drop(columns=['approx_costfor_two_people'], inplace=True)

In [None]:
# Replace every carriage return / line feed inside string cells with a space
# (presumably to keep each record on one line in the exported CSV).
df.replace(to_replace=r'\r|\n', value=' ', regex=True, inplace=True)

# Trim surrounding whitespace from string cells; non-string values pass
# through unchanged. Series.map is applied column by column because
# DataFrame.applymap is deprecated since pandas 2.1, while this form works
# on every pandas version.
df = df.apply(
    lambda column: column.map(
        lambda cell: cell.strip() if isinstance(cell, str) else cell
    )
)

In [None]:
# Columns removed from the cleaned output.
columns_to_drop = [
    'url',
    'menu_item',
    'reviews_list',
    'phone',
    'address',
]
# Raises KeyError if any of the listed columns is absent.
df.drop(labels=columns_to_drop, axis='columns', inplace=True)

In [None]:
# Report the size and schema of the cleaned frame, then preview it.
record_count = df.shape[0]
final_columns = df.columns.tolist()
print(f"✅ Number of records: {record_count}")
print("✅ Final columns:", final_columns)
df.head()

In [None]:
# Write the cleaned frame to disk without the index column and confirm
# where it went.
output_path = "zomato_cleaned_for_github.csv"
df.to_csv(path_or_buf=output_path, index=False)
print(f"📁 Cleaned CSV saved to: {output_path}")