In [None]:
import pandas as pd

# Load raw Swiggy data
df = pd.read_csv(r"C:\Users\Digital Suppliers\Desktop\Swiggy_Recommender\venv\swiggy_data.csv")

print("Shape before cleaning:", df.shape)

Shape before cleaning: (148541, 11)


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 148541 entries, 0 to 148540
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            148541 non-null  int64 
 1   name          148455 non-null  object
 2   city          148541 non-null  object
 3   rating        148455 non-null  object
 4   rating_count  148455 non-null  object
 5   cost          148410 non-null  object
 6   cuisine       148442 non-null  object
 7   lic_no        148312 non-null  object
 8   link          148541 non-null  object
 9   address       148455 non-null  object
 10  menu          148541 non-null  object
dtypes: int64(1), object(10)
memory usage: 12.5+ MB


In [3]:
# Remove duplicate rows
df = df.drop_duplicates()
print("After removing duplicates:", df.shape)

After removing duplicates: (148541, 11)


In [4]:
# ---------- Drop unwanted columns ----------
df = df.drop(columns=["id", "lic_no", "link", "address", "menu"])

In [6]:
# Handle missing values
# Drop rows where important columns are missing
df = df.dropna(subset=["city", "cuisine", "rating", "rating_count", "cost"])

In [7]:
# Fill missing cuisine with "Unknown" (important for recommendation)
df["cuisine"] = df["cuisine"].fillna("Unknown")

In [8]:
df["rating"] = pd.to_numeric(df["rating"], errors="coerce")

In [9]:
#Inspect unique values
print(df['rating_count'].unique())

['Too Few Ratings' '50+ ratings' '100+ ratings' '20+ ratings'
 '500+ ratings' '1K+ ratings' '5K+ ratings' '10K+ ratings']


In [10]:
def clean_rating_count(x):
    if pd.isna(x):
        return None
    x = str(x).strip()

    # Drop unusable values
    if x == "Too Few Ratings":
        return None

    # Handle "K" values
    if "K" in x:
        return int(float(x.replace("K+ ratings", "").replace("K+", "").replace("K", "")) * 1000)

    # Handle normal "+ ratings" values
    if "ratings" in x:
        x = x.replace("+ ratings", "").replace(" ratings", "")
        if x.isdigit():
            return int(x)

    return None

df["rating_count"] = df["rating_count"].apply(clean_rating_count)
df["rating_count"] = df["rating_count"].astype("Int64")  # pandas nullable integer type



In [11]:
print(df['cost'].unique())

['₹ 200' '₹ 100' '₹ 250' '₹ 150' '₹ 300' '₹ 700' '₹ 650' '₹ 400' '₹ 350'
 '₹ 60' '₹ 110' '₹ 399' '₹ 249' '₹ 500' '₹ 99' '₹ 120' '₹ 499' '₹ 299'
 '₹ 199' '₹ 50' '₹ 180' '₹ 349' '₹ 1000' '₹ 599' '₹ 600' '₹ 800' '₹ 450'
 '₹ 149' '₹ 290' '₹ 175' '₹ 125' '₹ 8' '₹ 375' '₹ 275' '₹ 425' '₹ 225'
 '₹ 325' '₹ 75' '₹ 160' '₹ 550' '₹ 220' '₹ 20' '₹ 59' '₹ 1200' '₹ 210'
 '₹ 30' '₹ 310' '₹ 70' '₹ 170' '₹ 449' '₹ 280' '₹ 320' '₹ 1300' '₹ 850'
 '₹ 900' '₹ 40' '₹ 1500' '₹ 140' '₹ 1100' '₹ 410' '₹ 80' '₹ 10' '₹ 1245'
 '₹ 510' '₹ 90' '₹ 260' '₹ 1800' '₹ 5' '₹ 240' '₹ 460' '₹ 1900' '₹ 352'
 '₹ 298' '₹ 2' '₹ 252' '₹ 330' '₹ 750' '₹ 130' '₹ 2000' '₹ 198' '₹ 230'
 '₹ 999' '₹ 3999' '₹ 235' '₹ 1600' '₹ 55' '₹ 179' '₹ 129' '₹ 360' '₹ 85'
 '₹ 248' '₹ 270' '₹ 25' '₹ 159' '₹ 370' '₹ 1050' '₹ 49' '₹ 699' '₹ 340'
 '₹ 190' '₹ 710' '₹ 1250' '₹ 178' '₹ 148' '₹ 380' '₹ 257' '₹ 171' '₹ 193'
 '₹ 189' '₹ 197' '₹ 336' '₹ 251' '₹ 245' '₹ 219' '₹ 188' '₹ 137' '₹ 164'
 '₹ 126' '₹ 268' '₹ 421' '₹ 540' '₹ 239' '₹ 14' '₹ 259' '₹ 1

In [12]:
#clean cost column
def clean_cost(x):
    if pd.isna(x):
        return None
    x = str(x).replace("₹", "").replace(",", "").strip()
    if x.isdigit():
        return int(x)
    return None

df["cost"] = df["cost"].apply(clean_cost)

In [13]:
df["rating"].fillna(df["rating"].mean(), inplace=True)
df["rating_count"].fillna(0, inplace=True)  # assume no ratings → 0
df["cost"].fillna(df["cost"].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["rating"].fillna(df["rating"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["rating_count"].fillna(0, inplace=True)  # assume no ratings → 0
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object o

In [14]:
df["city"].fillna("Unknown", inplace=True)
df["cuisine"].fillna("Unknown", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["city"].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["cuisine"].fillna("Unknown", inplace=True)


In [15]:
# Save cleaned data
df.to_csv(r"C:\Users\Digital Suppliers\Desktop\Swiggy_Recommender\cleaned_data.csv", index=False)

print("Cleaned data saved as cleaned_data.csv")

Cleaned data saved as cleaned_data.csv
