In [20]:
import pandas as pd

# Read every worksheet of the source workbook in one call: sheet_name=None
# makes read_excel return a dict keyed by sheet name instead of one DataFrame.
all_sheets = pd.read_excel(
    "C:/Users/ADMIN/Data-Analytics-Journey/"
    "06-projects/phase2-mini-project/data/"
    "swiggy_dataset.xlsx",
    sheet_name=None,
)

In [21]:
# Pull each worksheet out of the dict under its own name. Note that the
# partners sheet key contains a space ("delivery partners").
df_orders, df_customers = all_sheets["orders"], all_sheets["customers"]
df_delivery_partners, df_restaurants = (
    all_sheets["delivery partners"],
    all_sheets["restaurants"],
)

In [22]:
# Inspect the orders schema as loaded: the three timestamp columns arrive as
# object, and two column names contain spaces and parentheses.
df_orders.dtypes

order_id                              int64
customer_id                           int64
restaurant_id                         int64
dish_ordered                         object
order_time                           object
delivery_time                        object
order_status                         object
price                               float64
quantity                              int64
delivery_address                     object
expected_delivery_time               object
delivery_partner_id                   int64
delivery time (in mins)               int64
expected delivery time (in mins)      int64
dtype: object

In [23]:
# Inspect the customers schema as loaded; phone_number arrives as int64.
df_customers.dtypes

customer_id       int64
customer_name    object
address          object
city             object
email            object
phone_number      int64
dtype: object

In [24]:
# Inspect the delivery-partners schema as loaded.
df_delivery_partners.dtypes

partner_id       int64
partner_name    object
city            object
dtype: object

In [25]:
# Inspect the restaurants schema as loaded; ratings is the only float column.
df_restaurants.dtypes

restaurant_id        int64
restaurant_name     object
city                object
ratings            float64
total_reviews        int64
dtype: object

In [26]:
# Data cleaning operations on df_orders sheet

# Convert the column names to snake_case. regex=False forces literal matching,
# so "(" and ")" are never interpreted as regex syntax regardless of the
# default used by the installed pandas version.
df_orders.columns = (
    df_orders.columns
      .str.strip()
      .str.lower()
      .str.replace(" ", "_", regex=False)
      .str.replace("(", "", regex=False)
      .str.replace(")", "", regex=False)
)

# Convert the timestamp columns to datetime.
# NOTE(review): no format= is passed, so pandas has to infer the layout of each
# column; pass an explicit format once the sheet's timestamp layout is confirmed.
time_cols = [
    "order_time",
    "delivery_time",
    "expected_delivery_time"
]

for col in time_cols:
    missing_before = df_orders[col].isna().sum()
    df_orders[col] = pd.to_datetime(df_orders[col], errors="coerce")
    # errors="coerce" replaces unparseable values with NaT instead of raising,
    # so report any values lost that way rather than dropping them silently.
    coerced = int(df_orders[col].isna().sum() - missing_before)
    if coerced > 0:
        print(f"Warning: {coerced} value(s) in '{col}' could not be parsed and were set to NaT")

# Delivery delay in minutes: actual minus expected duration, so a positive
# value means the order took longer than expected.
df_orders["delivery_delay_mins"] = (
    df_orders["delivery_time_in_mins"]
    - df_orders["expected_delivery_time_in_mins"]
)

# Store the ID columns as categoricals.
# NOTE(review): a categorical only saves memory when values repeat. order_id is
# presumably unique per row, so confirm it actually benefits from this conversion.
id_cols = [
    "order_id",
    "customer_id",
    "restaurant_id",
    "delivery_partner_id"
]

for col in id_cols:
    df_orders[col] = df_orders[col].astype("category")

# Total price of an order: unit price times quantity.
df_orders["order_value"] = df_orders["price"] * df_orders["quantity"]

# Flag orders whose delivery took longer than expected (strictly positive delay).
df_orders["is_delayed"] = df_orders["delivery_delay_mins"] > 0

  df_orders[col] = pd.to_datetime(df_orders[col], errors="coerce")
  df_orders[col] = pd.to_datetime(df_orders[col], errors="coerce")
  df_orders[col] = pd.to_datetime(df_orders[col], errors="coerce")


In [27]:
# Data cleaning operations on df_customers sheet

# Contact fields: lower-case the e-mail addresses and keep phone numbers as
# whitespace-trimmed text instead of integers.
df_customers["email"] = df_customers["email"].str.lower()
df_customers["phone_number"] = df_customers["phone_number"].astype(str).str.strip()

# The customer identifier and the city are stored as categoricals.
for customer_col in ("customer_id", "city"):
    df_customers[customer_col] = df_customers[customer_col].astype("category")


In [28]:
# Data cleaning operations on df_delivery_partners sheet

# Both the partner identifier and the city are stored as categoricals.
for partner_col in ("partner_id", "city"):
    df_delivery_partners[partner_col] = df_delivery_partners[partner_col].astype("category")

In [29]:
# Data cleaning operations on df_restaurants sheet

# Converting data types
df_restaurants["restaurant_id"] = df_restaurants["restaurant_id"].astype("category")
df_restaurants["city"] = df_restaurants["city"].astype("category")

# Bucket the numeric rating into three labelled bands. pd.cut bins are
# right-inclusive, so the bands are [0, 3] -> Average, (3, 4] -> Good and
# (4, 5] -> Excellent. include_lowest=True keeps a rating of exactly 0 in the
# first band; without it that value falls outside every bin and becomes NaN.
# Missing ratings and ratings outside 0-5 still yield NaN.
# NOTE(review): if 0 means "not rated yet" in this dataset, confirm that
# labelling it "Average" is the intended outcome.
df_restaurants["rating_category"] = pd.cut(
    df_restaurants["ratings"],
    bins=[0, 3, 4, 5],
    labels=["Average", "Good", "Excellent"],
    include_lowest=True,
)


In [30]:
# Print the post-cleaning dtypes of all four sheets under their headings.
# A single print with sep="\n" emits the same text as one print per item.
print(
    "Orders sheet\n",
    df_orders.dtypes,
    "\nCustomers sheet\n",
    df_customers.dtypes,
    "\nDelivery Partners sheet\n",
    df_delivery_partners.dtypes,
    "\nRestaurants sheet\n",
    df_restaurants.dtypes,
    sep="\n",
)

Orders sheet

order_id                                category
customer_id                             category
restaurant_id                           category
dish_ordered                              object
order_time                        datetime64[ns]
delivery_time                     datetime64[ns]
order_status                              object
price                                    float64
quantity                                   int64
delivery_address                          object
expected_delivery_time            datetime64[ns]
delivery_partner_id                     category
delivery_time_in_mins                      int64
expected_delivery_time_in_mins             int64
delivery_delay_mins                        int64
order_value                              float64
is_delayed                                  bool
dtype: object

Customers sheet

customer_id      category
customer_name      object
address            object
city             category
email            

In [32]:
# Destination workbook for the cleaned data.
output_path = (
    "C:/Users/ADMIN/Data-Analytics-Journey/06-projects/"
    "phase2-mini-project/data/swiggy_dataset_cleaned.xlsx"
)

# Target sheet name -> cleaned frame, in the order the sheets are written.
cleaned_sheets = {
    "orders_cleaned": df_orders,
    "customers_cleaned": df_customers,
    "delivery_partners_cleaned": df_delivery_partners,
    "restaurants_cleaned": df_restaurants,
}

# Write every frame into one workbook without the index column; the context
# manager saves and closes the file on exit.
with pd.ExcelWriter(output_path, engine="xlsxwriter") as writer:
    for sheet_name, frame in cleaned_sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

print("Cleaned dataset successfully saved to Excel.")

Cleaned dataset successfully saved to Excel.


### Data Export

The cleaned and feature-engineered datasets were exported to a
multi-sheet Excel file to support further analysis and dashboarding
in Excel and Power BI.

## Summary – Data Preparation Completed

All datasets were successfully loaded, cleaned, and standardized.
Key features such as order value and delivery delay were engineered
to support operational and business analysis.

At this stage, the data is fully analysis-ready and can be used for:
- Order cancellation analysis
- Delivery performance evaluation
- Customer and restaurant behavior insights
- Dashboarding in Power BI or Excel

Further analysis and visualization will be performed in subsequent steps.
