In [5]:
# Import Libraries and Load Dataset
import pandas as pd
import numpy as np

# Read the raw CSV (decoded as latin1). `df` keeps the untouched load;
# `data` is an independent copy that the cleaning steps work on.
df = pd.read_csv(r"data/zomato.csv", encoding="latin1")
data = df.copy()

# Report the starting shape and a sample of the raw rows
print("=" * 50, "Step 1: Initial Dataset Loading", "=" * 50, sep="\n")
print(f"Dataset Shape: {data.shape}")
print(
    "\nFirst 5 Rows of Raw Data:",
    data.head().to_string(index=False),
    "=" * 50,
    sep="\n",
)



Step 1: Initial Dataset Loading
Dataset Shape: (9551, 21)

First 5 Rows of Raw Data:
 Restaurant ID        Restaurant Name  Country Code             City                                                                 Address                                   Locality                                             Locality Verbose  Longitude  Latitude                         Cuisines  Average Cost for two         Currency Has Table booking Has Online delivery Is delivering now Switch to order menu  Price range  Aggregate rating Rating color Rating text  Votes
       6317637       Le Petit Souffle           162      Makati City Third Floor, Century City Mall, Kalayaan Avenue, Poblacion, Makati City  Century City Mall, Poblacion, Makati City       Century City Mall, Poblacion, Makati City, Makati City 121.027535 14.565443       French, Japanese, Desserts                  1100 Botswana Pula(P)               Yes                  No                No                   No            3          

In [6]:
# Drop Duplicates and Irrelevant Columns
# Columns removed before de-duplication. Names that are not present in the
# frame are skipped silently because the drop is called with errors='ignore'.
irrelevant_columns = [
    'Restaurant ID',
    'Address',
    'Locality Verbose',
    'Switch to order menu',
    'Country Code',
    'Currency',
    'Has Table booking',
    'Has Online delivery',
    'Is delivering now',
    'Menu Item',
    'Rating text',
    'Rating color',
    'Locality',
    'Phone Numbers',
    'Reservation',
]

# Duplicates are detected on the columns that remain after the drop.
data = data.drop(columns=irrelevant_columns, errors='ignore').drop_duplicates()

# Report the reduced frame
print("=" * 50, "Step 2: Dropping Duplicates and Irrelevant Columns", "=" * 50, sep="\n")
print(f"New Dataset Shape: {data.shape}")
print(f"Remaining Columns: {data.columns.tolist()}")
print(
    "\nFirst 5 Rows After Dropping:",
    data.head().to_string(index=False),
    "=" * 50,
    sep="\n",
)

Step 2: Dropping Duplicates and Irrelevant Columns
New Dataset Shape: (9548, 9)
Remaining Columns: ['Restaurant Name', 'City', 'Longitude', 'Latitude', 'Cuisines', 'Average Cost for two', 'Price range', 'Aggregate rating', 'Votes']

First 5 Rows After Dropping:
       Restaurant Name             City  Longitude  Latitude                         Cuisines  Average Cost for two  Price range  Aggregate rating  Votes
      Le Petit Souffle      Makati City 121.027535 14.565443       French, Japanese, Desserts                  1100            3               4.8    314
      Izakaya Kikufuji      Makati City 121.014101 14.553708                         Japanese                  1200            3               4.5    591
Heat - Edsa Shangri-La Mandaluyong City 121.056831 14.581404 Seafood, Asian, Filipino, Indian                  4000            4               4.4    270
                  Ooma Mandaluyong City 121.056475 14.585318                  Japanese, Sushi                  1500       

In [None]:
# Handle Missing Values
# Rows lacking any of these fields are discarded outright.
essential_cols = ['Restaurant Name', 'Cuisines', 'City', 'Aggregate rating']

data = (
    data
    .dropna(subset=essential_cols)
    .assign(
        # Unparseable vote counts become 0 and the column ends up integer-typed.
        Votes=lambda frame: pd.to_numeric(frame['Votes'], errors='coerce').fillna(0).astype(int),
        # Unparseable coordinates become NaN and are removed by the dropna below.
        Latitude=lambda frame: pd.to_numeric(frame['Latitude'], errors='coerce'),
        Longitude=lambda frame: pd.to_numeric(frame['Longitude'], errors='coerce'),
    )
    .dropna(subset=['Latitude', 'Longitude'])
)

# Report the result of the missing-value pass
print("=" * 50, "Step 3: Handling Missing Values", "=" * 50, sep="\n")
print(f"Dataset Shape After Handling Missing Values: {data.shape}")
print("\nMissing Values Per Column:", data.isnull().sum().to_string(), sep="\n")
print(
    "\nFirst 5 Rows After Handling Missing Values:",
    data.head().to_string(index=False),
    "=" * 50,
    sep="\n",
)

Step 3: Handling Missing Values
Dataset Shape After Handling Missing Values: (9539, 9)

Missing Values Per Column:
Restaurant Name         0
City                    0
Longitude               0
Latitude                0
Cuisines                0
Average Cost for two    0
Price range             0
Aggregate rating        0
Votes                   0

First 5 Rows After Handling Missing Values:
       Restaurant Name             City  Longitude  Latitude                         Cuisines  Average Cost for two  Price range  Aggregate rating  Votes
      Le Petit Souffle      Makati City 121.027535 14.565443       French, Japanese, Desserts                  1100            3               4.8    314
      Izakaya Kikufuji      Makati City 121.014101 14.553708                         Japanese                  1200            3               4.5    591
Heat - Edsa Shangri-La Mandaluyong City 121.056831 14.581404 Seafood, Asian, Filipino, Indian                  4000            4               4

In [None]:
# Normalize Categorical Values
# Lower-case and trim first so each spelling fix below needs only one variant.
data['Cuisines'] = data['Cuisines'].str.lower().str.strip()

# A 'Cuisines' cell can hold a comma-separated list (e.g.
# 'french, japanese, desserts'). A plain dict passed to Series.replace only
# matches a cell whose ENTIRE value equals the key, so misspellings inside a
# list would be left untouched. Whole-word regex patterns fix the spelling
# wherever it occurs in the string.
data['Cuisines'] = data['Cuisines'].replace(
    {
        r'\bchinees\b': 'chinese',
        r'\bsouth-indian\b': 'south indian',
    },
    regex=True,
)
data['City'] = data['City'].str.lower().str.strip()

# Print formatted output
print("=" * 50)
print("Step 4: Normalizing Categorical Values")
print("=" * 50)
print("Sample of Normalized Cuisines (First 10):")
print(data['Cuisines'].unique()[:10].tolist())
print("\nSample of Normalized Cities (First 10):")
print(data['City'].unique()[:10].tolist())
print("\nFirst 5 Rows After Normalization:")
print(data.head().to_string(index=False))
print("=" * 50)

Step 4: Normalizing Categorical Values
Sample of Normalized Cuisines (First 10):
['french, japanese, desserts', 'japanese', 'seafood, asian, filipino, indian', 'japanese, sushi', 'japanese, korean', 'chinese', 'asian, european', 'seafood, filipino, asian, european', 'european, asian, indian', 'filipino']

Sample of Normalized Cities (First 10):
['makati city', 'mandaluyong city', 'pasay city', 'pasig city', 'quezon city', 'san juan city', 'santa rosa', 'tagaytay city', 'taguig city', 'brasí_lia']

First 5 Rows After Normalization:
       Restaurant Name             City  Longitude  Latitude                         Cuisines  Average Cost for two  Price range  Aggregate rating  Votes
      Le Petit Souffle      makati city 121.027535 14.565443       french, japanese, desserts                  1100            3               4.8    314
      Izakaya Kikufuji      makati city 121.014101 14.553708                         japanese                  1200            3               4.5    591
H

In [None]:
# Convert Price and Rating to Numerical
# Entries that cannot be parsed become NaN; for the rating column those
# NaNs are then replaced by 0.
data = data.assign(**{
    'Price range': pd.to_numeric(data['Price range'], errors='coerce'),
    'Aggregate rating': pd.to_numeric(data['Aggregate rating'], errors='coerce').fillna(0),
})

# Report the resulting dtypes and a sample
print("=" * 50, "Step 5: Converting to Numerical Values", "=" * 50, sep="\n")
print(f"Price Range Data Type: {data['Price range'].dtype}")
print(f"Aggregate Rating Data Type: {data['Aggregate rating'].dtype}")
print(
    "\nFirst 5 Rows After Conversion:",
    data.head().to_string(index=False),
    "=" * 50,
    sep="\n",
)

Step 5: Converting to Numerical Values
Price Range Data Type: int64
Aggregate Rating Data Type: float64

First 5 Rows After Conversion:
       Restaurant Name             City  Longitude  Latitude                         Cuisines  Average Cost for two  Price range  Aggregate rating  Votes
      Le Petit Souffle      makati city 121.027535 14.565443       french, japanese, desserts                  1100            3               4.8    314
      Izakaya Kikufuji      makati city 121.014101 14.553708                         japanese                  1200            3               4.5    591
Heat - Edsa Shangri-La mandaluyong city 121.056831 14.581404 seafood, asian, filipino, indian                  4000            4               4.4    270
                  Ooma mandaluyong city 121.056475 14.585318                  japanese, sushi                  1500            4               4.9    365
           Sambo Kojin mandaluyong city 121.057508 14.584450                 japanese, korean 

In [None]:
# Feature Engineering
def categorize_cost(price):
    """Map a numeric price-range value to a cost label.

    Parameters
    ----------
    price : scalar
        Price-range value; may be missing (NaN / None), e.g. after a
        ``pd.to_numeric(..., errors='coerce')`` conversion.

    Returns
    -------
    str or float
        'low' for 1, 'medium' for 2, 'high' for any other non-missing
        value, and NaN when ``price`` is missing.
    """
    # A missing price must stay missing; without this guard it would fall
    # through to the final branch and be reported as 'high'.
    if pd.isna(price):
        return np.nan
    if price == 1:
        return 'low'
    if price == 2:
        return 'medium'
    return 'high'

# Label each row's price range via categorize_cost
data['Cost Category'] = data['Price range'].map(categorize_cost)
# The first comma-separated entry of 'Cuisines', trimmed, is the primary cuisine
data['Primary Cuisine'] = data['Cuisines'].str.split(',').str[0].str.strip()
# Clip ratings into [1.0, 5.0] (values below 1.0 are raised to 1.0), then
# keep one decimal place
data['Rating'] = data['Aggregate rating'].clip(lower=1.0, upper=5.0).round(1)

# Report the engineered columns
print("=" * 50, "Step 6: Feature Engineering", "=" * 50, sep="\n")
print("Cost Category Distribution:", data['Cost Category'].value_counts().to_string(), sep="\n")
print("\nSample of Primary Cuisines (First 10):", data['Primary Cuisine'].unique()[:10].tolist(), sep="\n")
print("\nUnique Rating Values:", data['Rating'].unique().tolist(), sep="\n")
print(
    "\nFirst 5 Rows After Feature Engineering:",
    data.head().to_string(index=False),
    "=" * 50,
    sep="\n",
)

Step 6: Feature Engineering
Cost Category Distribution:
Cost Category
low       4435
medium    3113
high      1991

Sample of Primary Cuisines (First 10):
['french', 'japanese', 'seafood', 'chinese', 'asian', 'european', 'filipino', 'american', 'korean', 'cafe']

Unique Rating Values:
[4.8, 4.5, 4.4, 4.9, 4.0, 4.2, 4.3, 3.6, 4.7, 3.0, 3.8, 3.7, 3.2, 3.1, 1.0, 4.1, 3.3, 4.6, 3.9, 3.4, 3.5, 2.2, 2.9, 2.4, 2.6, 2.8, 2.1, 2.7, 2.5, 1.8, 2.0, 2.3, 1.9]

First 5 Rows After Feature Engineering:
       Restaurant Name             City  Longitude  Latitude                         Cuisines  Average Cost for two  Price range  Aggregate rating  Votes Cost Category Primary Cuisine  Rating
      Le Petit Souffle      makati city 121.027535 14.565443       french, japanese, desserts                  1100            3               4.8    314          high          french     4.8
      Izakaya Kikufuji      makati city 121.014101 14.553708                         japanese                  1200        

In [11]:
# Rating rounding: snap each rating to the nearest 0.5 step to fully meet the
# requirement. Clip to [1.0, 5.0], double, round to a whole number, then halve.
# This replaces the 'Rating' column produced by the feature-engineering step.
data['Rating'] = data['Aggregate rating'].clip(lower=1.0, upper=5.0).mul(2).round().div(2)

In [12]:
# Final Cleaned Dataset
# Keep only the columns that make up the cleaned output and renumber the
# rows from 0 so the index carries no gaps from the rows dropped earlier.
final_data = data[[
    'Restaurant Name', 'City', 'Primary Cuisine', 'Cost Category',
    'Rating', 'Votes', 'Latitude', 'Longitude'
]].reset_index(drop=True)

# Persist the cleaned table (without the index column)
final_data.to_csv("data/cleaned_zomato.csv", index=False)

# Preview the final result. A notebook only renders the value of a cell's
# LAST expression, so the preview has to come after the export.
final_data.head()
