### Importing Libraries

In [41]:
# importing libraries

import os # for handling files
import pandas as pd # for data cleaning 

from dotenv import load_dotenv #
load_dotenv(override=True) 


import sqlalchemy as sal # for connecting to sql database

### Cleaning and Filtering data 

In [42]:
# Folder holding the raw CSV exports, configured through the FOLDER_PATH
# environment variable (populated from .env by load_dotenv in the imports cell).
folder_path = os.getenv("FOLDER_PATH")

# os.listdir(None) silently lists the current working directory, so an unset
# FOLDER_PATH would load whatever CSVs happen to be there. Fail loudly instead.
if not folder_path:
    raise RuntimeError("FOLDER_PATH is not set; add it to your .env file or environment.")

# Maps table name (CSV file name without its extension) -> cleaned DataFrame.
cleaned_tables = {}

# Loop through all CSV files in the folder and remove fully blank rows and
# exact duplicate rows, if any exist.
# sorted() makes the processing order reproducible; os.listdir order is arbitrary.
for file in sorted(os.listdir(folder_path)):
    # splitext strips only the trailing extension, whereas str.replace would
    # remove every ".csv" occurrence inside the file name.
    table_name, extension = os.path.splitext(file)
    if extension != ".csv":
        continue

    df_raw = pd.read_csv(os.path.join(folder_path, file))
    original_rows = len(df_raw)

    # Drop rows where every column is NaN, then drop exact duplicate rows.
    # The raw frame is left untouched so re-running this cell is idempotent.
    df_cleaned = df_raw.dropna(how="all").drop_duplicates()

    print(f"{table_name}: Original rows = {original_rows}, After cleaning = {len(df_cleaned)}")

    # Save cleaned DataFrame in dictionary
    cleaned_tables[table_name] = df_cleaned

dim_customer: Original rows = 107776, After cleaning = 107776
dim_delivery_partner: Original rows = 15000, After cleaning = 15000
dim_menu_item: Original rows = 342671, After cleaning = 342671
dim_restaurant: Original rows = 19995, After cleaning = 19995
fact_delivery_performance: Original rows = 149166, After cleaning = 149166
fact_orders: Original rows = 149166, After cleaning = 149166
fact_order_items: Original rows = 342994, After cleaning = 342994
fact_ratings: Original rows = 68842, After cleaning = 68825


In [43]:
# Sanity check: display the first ten rows of the cleaned orders table.
cleaned_tables["fact_orders"].iloc[:10]

Unnamed: 0,order_id,customer_id,restaurant_id,delivery_partner_id,order_timestamp,subtotal_amount,discount_amount,delivery_fee,total_amount,is_cod,is_cancelled
0,ORD202501023439,CUST181110,REST08622,DP05541,2025-01-01 12:00:00,471.62,35.44,30.56,466.74,N,N
1,ORD202501012051,CUST025572,REST02383,DP08091,2025-01-01 12:00:00,255.68,0.0,27.45,283.13,Y,N
2,ORD202501019281,CUST179306,REST14069,DP02021,2025-01-01 12:00:00,428.38,0.0,26.23,454.61,N,N
3,ORD202501000124,CUST191820,REST19745,DP13859,2025-01-01 12:00:00,260.81,0.0,32.75,293.56,N,N
4,ORD202501006518,CUST033760,REST12962,DP09615,2025-01-01 12:00:00,280.33,0.0,25.57,305.9,N,N
5,ORD202501018255,CUST011850,REST01307,DP14063,2025-01-01 12:01:00,310.95,0.0,35.05,345.99,Y,N
6,ORD202501004299,CUST107475,REST12542,DP07728,2025-01-01 12:02:00,206.41,0.0,30.62,237.03,Y,N
7,ORD202501018036,CUST093042,REST13907,DP01276,2025-01-01 12:03:00,300.3,48.31,31.41,283.4,N,N
8,ORD202501009329,CUST104825,REST10267,DP03078,2025-01-01 12:04:00,371.6,0.0,34.35,405.95,N,N
9,ORD202501007498,CUST135654,REST05434,DP11625,2025-01-01 12:06:00,306.23,33.38,20.7,293.55,N,N


### Connecting to MS SQL server and creating Database and Schema 

In [44]:
from sqlalchemy import create_engine, text
from sqlalchemy.engine import make_url
from sqlalchemy.exc import SQLAlchemyError

# Re-read .env so edits to the connection settings are picked up when this
# cell is re-run.
load_dotenv(override=True)

engine_url = os.getenv("DB_ENGINE_URL")  # master DB URL
schema_name = "quick_bite_schema"
database_name = "quick_bite_database"

if not engine_url:
    raise RuntimeError("DB_ENGINE_URL is not set; add it to your .env file or environment.")

# --- 1. Ensure database exists ---
# Connect to master to create the database if it does not exist yet.
engine_master = create_engine(engine_url)

try:
    # AUTOCOMMIT: SQL Server does not allow CREATE DATABASE inside a
    # multi-statement transaction.
    with engine_master.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
        # The name is passed as a bound parameter rather than interpolated
        # into the SQL text.
        result = conn.execute(
            text("SELECT 1 FROM sys.databases WHERE name = :name"),
            {"name": database_name},
        )
        if result.fetchone():
            print(f"Database '{database_name}' already exists.")
        else:
            # Identifiers cannot be bound as parameters, so the name is
            # bracket-quoted instead.
            conn.execute(text(f"CREATE DATABASE [{database_name}];"))
            print(f"Database '{database_name}' created successfully.")
except SQLAlchemyError as e:
    print(f"Failed to create database '{database_name}': {e}")
    # Re-raise: every later cell needs this database, so continuing would
    # only move the failure somewhere less obvious.
    raise
finally:
    # The master engine is only needed for this bootstrap step.
    engine_master.dispose()

# --- 2. Connect to the actual database "quick_bite_database" ---
# Derive the URL from DB_ENGINE_URL so the data is loaded into the same server
# (and with the same driver/credentials) on which the database was created.
# NOTE(review): assumes DB_ENGINE_URL names the database in its path component
# (e.g. ".../master?driver=..."), not inside an odbc_connect= string - confirm.
engine_db = create_engine(make_url(engine_url).set(database=database_name))

# --- 3. Ensure schema exists ---
try:
    with engine_db.connect().execution_options(isolation_level="AUTOCOMMIT") as conn:
        result = conn.execute(
            text("SELECT 1 FROM sys.schemas WHERE name = :name"),
            {"name": schema_name},
        )
        if result.fetchone():
            print(f"Schema '{schema_name}' already exists.")
        else:
            # CREATE SCHEMA is issued through EXEC so it runs as its own batch.
            conn.execute(text(f"EXEC('CREATE SCHEMA [{schema_name}]')"))
            print(f"Schema '{schema_name}' created successfully.")
except SQLAlchemyError as e:
    print(f"Failed to create schema '{schema_name}': {e}")
    # Re-raise: the load cell writes into this schema and cannot succeed without it.
    raise

Database 'quick_bite_database' already exists.
Schema 'quick_bite_schema' already exists.


### Load data to SQL server

In [None]:
# Load every cleaned table inside ONE transaction: if any table fails, the
# whole load is rolled back. Without this, a mid-run failure leaves some tables
# loaded, and re-running the cell appends those tables a second time.
#
# NOTE: if_exists='append' means a successful re-run of this cell still
# appends the same rows again; run it once per batch of CSVs.
with engine_db.begin() as conn:
    for table_name, df in cleaned_tables.items():
        df.to_sql(
            name=table_name,
            con=conn,  # shared connection -> single transaction
            schema=schema_name,
            if_exists='append',  # Append new data if table exists
            index=False
        )
        print(f"{table_name} pushed to SQL Server schema '{schema_name}'.")

# Reached only after the transaction above has committed.
print("ETL Process Completed Successfully!")

dim_customer pushed to SQL Server schema 'quick_bite_schema'.
dim_delivery_partner pushed to SQL Server schema 'quick_bite_schema'.
dim_menu_item pushed to SQL Server schema 'quick_bite_schema'.
dim_restaurant pushed to SQL Server schema 'quick_bite_schema'.
fact_delivery_performance pushed to SQL Server schema 'quick_bite_schema'.
fact_orders pushed to SQL Server schema 'quick_bite_schema'.
fact_order_items pushed to SQL Server schema 'quick_bite_schema'.
fact_ratings pushed to SQL Server schema 'quick_bite_schema'.
Table 'dim_customer' cleaned and replaced in SQL (NaNs & exact duplicates removed).
Table 'dim_delivery_partner' cleaned and replaced in SQL (NaNs & exact duplicates removed).
Table 'dim_menu_item' cleaned and replaced in SQL (NaNs & exact duplicates removed).
Table 'dim_restaurant' cleaned and replaced in SQL (NaNs & exact duplicates removed).
Table 'fact_delivery_performance' cleaned and replaced in SQL (NaNs & exact duplicates removed).
Table 'fact_orders' cleaned and r

### Optional step: if the script was mistakenly run multiple times, use this cell to remove the resulting duplicate rows

In [None]:
# For each loaded table: read it back from SQL Server, drop fully blank rows
# and exact duplicate rows, then write the cleaned result back in its place.
for table_name in cleaned_tables:
    # One transaction per table: if_exists='replace' drops the table before
    # re-inserting, so a failed insert must roll the drop back instead of
    # leaving the table missing.
    with engine_db.begin() as conn:
        # Fetch the table back from SQL
        df_db = pd.read_sql_table(table_name, con=conn, schema=schema_name)

        # Remove rows where all columns are NaN, then exact duplicate rows
        # (all columns identical).
        df_db_cleaned = df_db.dropna(how="all").drop_duplicates()

        # Replace the table in SQL with cleaned data
        df_db_cleaned.to_sql(
            name=table_name,
            con=conn,
            schema=schema_name,
            if_exists='replace',  # Replace the existing table
            index=False
        )

    # Printed only after this table's transaction has committed.
    print(f"Table '{table_name}' cleaned and replaced in SQL (NaNs & exact duplicates removed).")