In [0]:
import logging
import os
from datetime import datetime

# Directory that holds the per-run log files. It is created up front because
# logging.FileHandler opens its file immediately and raises FileNotFoundError
# when the parent directory does not exist (e.g. on a fresh checkout).
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)

# One log file per run, named after the start time (minute resolution).
log_filename = f"{log_dir}/run_{datetime.now().strftime('%Y%m%d_%H%M')}.log"
log_format = "%(asctime)s | %(levelname)s | %(message)s"

# Send INFO-and-above records both to the notebook output and to the run's log file.
# NOTE(review): basicConfig() does nothing when the root logger already has
# handlers (for example when this cell is re-run in the same kernel) — confirm
# whether the hosting environment pre-configures logging.
logging.basicConfig(
    level=logging.INFO,
    format=log_format,
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler(log_filename)
    ]
)

In [0]:
import os

# Mark the beginning of the run in the log.
logging.info("Run started.")

# Identify the cluster and runtime from the environment; "unknown" is
# reported when a variable is not set.
cluster_id = os.getenv("DATABRICKS_CLUSTER_ID", "unknown")
runtime_version = os.getenv("DATABRICKS_RUNTIME_VERSION", "unknown")
logging.info(f"Cluster ID: {cluster_id}")
logging.info(f"Databricks Runtime Version: {runtime_version}")

# Snapshot the logging configuration (file, format, and the root logger's
# level name) so it is recorded alongside the run.
root_level = logging.getLogger().level
config_values = dict(
    log_filename=log_filename,
    log_format=log_format,
    log_level=logging.getLevelName(root_level),
)
logging.info(f"Configuration values: {config_values}")

In [0]:
import os
import random

import numpy as np

# Single seed shared by every random number generator seeded below;
# change it here to vary the run.
SEED = 0

# NOTE: Python fixes hash randomization when the interpreter starts, so setting
# PYTHONHASHSEED here does not change hashing in the already-running kernel;
# the value is only inherited by child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed the stdlib and NumPy global generators.
random.seed(SEED)
np.random.seed(SEED)

In [0]:
%pip freeze > requirements.txt

In [0]:
import hashlib
import json
import glob

# Fingerprint every input CSV so a run can be tied to the exact bytes it read.
# The list is sorted because glob returns matches in arbitrary order; sorting
# keeps the log lines and data_hashes.json in the same order on every run.
input_csv_files = sorted(glob.glob("data/*.csv"))
if not input_csv_files:
    logging.warning("No CSV files found under data/; nothing to hash.")

data_hashes = {}
for csv_file in input_csv_files:
    # Binary mode: the digest must cover the raw bytes, not decoded text.
    with open(csv_file, "rb") as f:
        sha256_hash = hashlib.sha256(f.read()).hexdigest()
    data_hashes[csv_file] = sha256_hash
    logging.info(f"SHA-256 for {csv_file}: {sha256_hash}")

# Persist the path -> digest mapping next to the notebook.
with open("data_hashes.json", "w") as json_file:
    json.dump(data_hashes, json_file, indent=2)
logging.info("Saved SHA-256 hashes to data_hashes.json.")

# Show the digests as the cell output. `dfs` must not be referenced here:
# it is only created when the CSVs are loaded in a later cell, so using it
# in this cell raises NameError on a fresh kernel.
data_hashes

In [0]:
import pandas as pd

# Load every CSV under data/ into a dict keyed by bare file name
# (e.g. "menu_items.csv"). Sorted because glob returns matches in arbitrary
# order; sorting makes the load order and the logged list reproducible.
csv_files = sorted(glob.glob("data/*.csv"))
dfs = {os.path.basename(f): pd.read_csv(f) for f in csv_files}

# Record which frames are available to the rest of the notebook.
logging.info(f"Loaded dataframes: {list(dfs.keys())}")

In [0]:
def _clean_frame(df):
    """Return a cleaned copy of ``df``; the input frame is left untouched.

    Three passes are applied, in order:

    1. Columns whose name contains "date" (case-insensitive) are parsed with
       ``pd.to_datetime``; unparseable values become NaT and are reported.
    2. String values in object columns are stripped of surrounding whitespace.
    3. Object columns that parse entirely as numbers are converted to a
       numeric dtype; all other object columns are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the conversions applied.
    """
    cleaned = df.copy()

    # Convert date columns (selected by name only).
    for col in cleaned.columns:
        if "date" in col.lower():
            nulls_before = int(cleaned[col].isna().sum())
            cleaned[col] = pd.to_datetime(cleaned[col], errors='coerce')
            newly_null = int(cleaned[col].isna().sum()) - nulls_before
            if newly_null:
                # errors='coerce' hides bad values; surface how many were lost.
                logging.warning(
                    f"Column '{col}': {newly_null} values could not be parsed as dates and were set to NaT."
                )

    # Trim text columns. Only real strings are stripped: `.str.strip()` would
    # replace every non-string value in a mixed-type column with NaN.
    for col in cleaned.select_dtypes(include='object').columns:
        cleaned[col] = cleaned[col].map(
            lambda value: value.strip() if isinstance(value, str) else value
        )

    # Correct numeric columns that were read as text. A column is converted
    # only when every value parses; otherwise it is kept unchanged (the
    # behaviour of the deprecated errors='ignore' option).
    for col in cleaned.select_dtypes(include='object').columns:
        try:
            cleaned[col] = pd.to_numeric(cleaned[col])
        except (ValueError, TypeError):
            continue

    return cleaned


# Replace every loaded frame with its cleaned copy.
dfs = {name: _clean_frame(df) for name, df in dfs.items()}

In [0]:
menu_df = dfs["menu_items.csv"]
orders_df = dfs["order_details.csv"]

# Attach the menu attributes to every order row.
# how="left" keeps order rows even when no menu item matches.
# validate="many_to_one" makes pandas raise MergeError if menu_item_id is not
# unique in the menu; without it, duplicate menu keys would silently duplicate
# order rows and inflate every downstream metric.
joined_df = orders_df.merge(
    menu_df,
    left_on="item_id",
    right_on="menu_item_id",
    how="left",
    validate="many_to_one",
)

# Rows without a menu match carry NaN in the menu columns; report how many.
unmatched_rows = int(joined_df["menu_item_id"].isna().sum())
if unmatched_rows:
    logging.warning(f"{unmatched_rows} order rows have no matching menu item.")

logging.info(f"Joined DataFrame shape: {joined_df.shape}")
joined_df.head()


In [0]:
# Build one timestamp per row from the separate date and time columns.
# Both parts are rendered as text, joined with a space and re-parsed;
# combinations that cannot be parsed become NaT.
date_text = joined_df["order_date"].astype(str)
time_text = joined_df["order_time"].astype(str)
joined_df["order_datetime"] = pd.to_datetime(date_text + " " + time_text, errors="coerce")

# The dataset has no quantity column, so each row is assumed to be one unit.
joined_df["quantity"] = 1

# Keep only the columns needed downstream, as an independent copy,
# and derive revenue as price times quantity.
tidy_columns = [
    "order_id",
    "order_datetime",
    "item_name",
    "category",
    "price",
    "quantity",
]
tidy_df = joined_df[tidy_columns].copy()
tidy_df["revenue"] = tidy_df["price"].mul(tidy_df["quantity"])

logging.info(f"Tidy DataFrame shape: {tidy_df.shape}")
tidy_df.head()


In [0]:
logging.info("Computing metrics")

# 1. Top 5 items by quantity: total units per item, largest first.
top_items = (
    tidy_df.groupby("item_name")["quantity"]
    .sum()
    .sort_values(ascending=False)
    .head(5)
    .reset_index()
    .rename(columns={"quantity": "total_quantity"})
)

# 2. Revenue by category: summed revenue per menu category.
revenue_by_cat = (
    tidy_df.groupby("category")["revenue"]
    .sum()
    .reset_index()
)

# 3. Busiest hour of the day, ranked by number of orders.
# tidy_df holds one row per ordered item, so counting rows would count a
# three-item order three times; nunique() counts each order_id once per hour.
# Rows whose order_datetime is NaT have no hour and are left out by groupby.
tidy_df["hour"] = tidy_df["order_datetime"].dt.hour
busiest_hour = (
    tidy_df.groupby("hour")["order_id"]
    .nunique()
    .reset_index()
    .rename(columns={"order_id": "order_count"})
    .sort_values("order_count", ascending=False)
)

logging.info("Metrics computed")
top_items, revenue_by_cat, busiest_hour.head()


In [0]:
from IPython.display import display

# Show each metric table under its own heading. The heading emoji are written
# as real Unicode characters; they had been stored as mis-decoded byte
# sequences that rendered as garbage.
print("📈 Top 5 Items by Quantity")
display(top_items)

print("\n💰 Revenue by Category")
display(revenue_by_cat)

print("\n⏰ Busiest Hour of Day")
display(busiest_hour)


In [0]:
from datetime import datetime
import os

# Tag a copy of each metric table with the name of the metric it holds, so
# rows stay identifiable once the tables are stacked.
top_items_labeled = top_items.assign(metric="top_items_by_quantity")
revenue_by_cat_labeled = revenue_by_cat.assign(metric="revenue_by_category")
busiest_hour_labeled = busiest_hour.assign(metric="busiest_hour_of_day")

# Stack the labeled tables into one frame with a fresh index; columns that a
# given table does not have are filled with NaN for its rows.
labeled_frames = [top_items_labeled, revenue_by_cat_labeled, busiest_hour_labeled]
metrics_df = pd.concat(labeled_frames, ignore_index=True, sort=False)

logging.info(f"Combined metrics DataFrame shape: {metrics_df.shape}")

# Minute-resolution timestamp used in the output file name.
ts = datetime.now().strftime("%Y%m%d_%H%M")

# Write the combined metrics into the repo's output folder (created on demand)
# so every run leaves a versionable CSV behind.
repo_output_dir = "etl_output"
os.makedirs(repo_output_dir, exist_ok=True)
repo_output_path = f"{repo_output_dir}/metrics_{ts}.csv"
metrics_df.to_csv(repo_output_path, index=False)
logging.info(f"Saved metrics to repo path: {repo_output_path}")

metrics_df.head()


In [0]:
logging.info("Running assert tests")

# 1. The tidy frame must contain at least one row.
assert not tidy_df.empty, "Tidy DataFrame is empty!"

# 2. The tidy frame must expose every column the metrics rely on
#    (extra columns are allowed).
expected_cols = {
    "order_id",
    "order_datetime",
    "item_name",
    "category",
    "price",
    "quantity",
    "revenue",
}
missing_cols = expected_cols - set(tidy_df.columns)
assert not missing_cols, f"Missing expected columns in tidy_df: {missing_cols}"

# 3. The combined metrics frame must contain at least one row.
assert not metrics_df.empty, "Metrics DataFrame is empty!"

# 4. Every metric must be represented by at least one row.
expected_metrics = {
    "top_items_by_quantity",
    "revenue_by_category",
    "busiest_hour_of_day",
}
present_metrics = set(metrics_df["metric"].unique())
missing_metrics = expected_metrics - present_metrics
assert not missing_metrics, f"Missing metrics in metrics_df: {missing_metrics}"

# The check mark is written as a real Unicode character; it had been stored as
# a mis-decoded byte sequence that rendered as garbage in the log.
logging.info("All assert tests passed ✔️")
