In [32]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

In [None]:
# Load the CSV at cleaned_data/zomato_clean_final.csv into a DataFrame
df = pd.read_csv(filepath_or_buffer="cleaned_data/zomato_clean_final.csv")

In [34]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,name,cuisine,price,link,main_cuisine,rating,cost_for_two
0,Molecule,"Asian, Chinese, Bar Food",1000.0,https://www.zomato.com/gwalior/molecule-lashka...,Asian,4.4,"₹1,000 for two people (approx.) Without alcohol"
1,Kwality Restaurant,"North Indian, Chinese, Continental, Biryani, D...",700.0,https://www.zomato.com/gwalior/kwality-restaur...,North Indian,4.8,₹700 for two people (approx.)
2,Muscle Vision Nutrition,"Continental, North Indian, Chinese, Sandwich, ...",450.0,https://www.zomato.com/gwalior/muscle-vision-n...,Continental,3.5,₹450 for two people (approx.)
3,Starbucks Coffee,"Cafe, Beverages, Coffee",900.0,https://www.zomato.com/gwalior/starbucks-coffe...,Cafe,4.4,₹900 for two people (approx.)
4,Xero Degrees,"Pizza, Fast Food, Chinese, Burger, Sandwich, I...",450.0,https://www.zomato.com/gwalior/xero-degrees-ci...,Pizza,4.0,₹450 for two people (approx.)


In [35]:
# Keep only rows whose name is present and not blank once whitespace is trimmed
df = df.loc[df["name"].notna() & df["name"].str.strip().ne("")]

In [36]:
# Drop rows that repeat an earlier link (the first occurrence is retained),
# then renumber the index from zero
df = df.drop_duplicates(subset=["link"], keep="first")
df = df.reset_index(drop=True)

In [37]:
# Count missing values in each column
df.isna().sum()

name             0
cuisine          0
price           36
link             0
main_cuisine     0
rating           0
cost_for_two     0
dtype: int64

In [38]:
# helpers used by both datasets
def extract_cost(x):
    """Parse a numeric cost out of a free-text "cost for two" value.

    The number that directly follows a rupee sign is preferred
    (e.g. "₹1,000 for two" -> 1000), with commas removed. When no usable
    rupee amount is found, the first match of two to five consecutive digits
    in the comma-stripped text is used instead (a longer digit run is cut to
    its first five digits).

    Parameters
    ----------
    x : object
        Raw cell value. Anything that is not missing is converted with ``str``.

    Returns
    -------
    int or float
        The parsed amount as ``int``, or ``np.nan`` when ``x`` is missing or
        no number can be extracted.
    """
    if pd.isna(x):
        return np.nan

    s = str(x)

    m = re.search(r"₹\s*([\d,]+)", s)
    if m:
        digits = m.group(1).replace(",", "")
        # The capture group also accepts commas, so it may hold no digits at
        # all (e.g. "₹ ,"). int("") would raise ValueError and abort the whole
        # column conversion, so fall through to the generic search instead.
        if digits:
            return int(digits)

    m2 = re.search(r"(\d{2,5})", s.replace(",", ""))
    return int(m2.group(1)) if m2 else np.nan

# Derive the numeric `cost` column by parsing each `cost_for_two` value
df["cost"] = df["cost_for_two"].map(extract_cost)


In [39]:
# Count missing values in each column
df.isna().sum()

name             0
cuisine          0
price           36
link             0
main_cuisine     0
rating           0
cost_for_two     0
cost             0
dtype: int64

In [40]:
# Convert `rating` to a numeric dtype; unparseable entries become NaN
df = df.assign(rating=pd.to_numeric(df["rating"], errors="coerce"))

In [41]:
# Remove the `price` and `cost_for_two` columns; labels that are already
# absent are ignored rather than raising
df = df.drop(["price", "cost_for_two"], axis="columns", errors="ignore")

In [42]:
# Preview the first five rows of the frame
df.head(n=5)

Unnamed: 0,name,cuisine,link,main_cuisine,rating,cost
0,Molecule,"Asian, Chinese, Bar Food",https://www.zomato.com/gwalior/molecule-lashka...,Asian,4.4,1000
1,Kwality Restaurant,"North Indian, Chinese, Continental, Biryani, D...",https://www.zomato.com/gwalior/kwality-restaur...,North Indian,4.8,700
2,Muscle Vision Nutrition,"Continental, North Indian, Chinese, Sandwich, ...",https://www.zomato.com/gwalior/muscle-vision-n...,Continental,3.5,450
3,Starbucks Coffee,"Cafe, Beverages, Coffee",https://www.zomato.com/gwalior/starbucks-coffe...,Cafe,4.4,900
4,Xero Degrees,"Pizza, Fast Food, Chinese, Burger, Sandwich, I...",https://www.zomato.com/gwalior/xero-degrees-ci...,Pizza,4.0,450


In [None]:
# SAVE CLEAN FILE
# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists first (a no-op when it is already there).
output_path = Path("output/zomato_gwl.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

print("CLEANED SHAPE:", df.shape)

CLEANED SHAPE: (501, 6)
