In [None]:
import os 
# Show where the notebook is running and what the data folder contains.
# Only the last expression of a cell is displayed, so the working directory
# is printed explicitly; os.getcwd() returns it as text, whereas
# os.getcwdb() returns bytes.
print("Working directory:", os.getcwd())

# Sorted so the listing is stable between runs.
sorted(os.listdir("data"))

In [None]:
import pandas as pd
import numpy as np

# Read the four CSV tables from the data folder in one pass.
# The unpacking order matches the order of the paths below.
customer, product, sales, store = map(
    pd.read_csv,
    [
        "data/customer_data.csv",
        "data/product_data.csv",
        "data/sales_data.csv",
        "data/store_data.csv",
    ],
)

# First rows of the sales table as a quick preview.
sales.head()




In [None]:
# Quick sanity check: print the (rows, columns) shape of every loaded table
# to confirm that each file was actually read.

for label, frame in (
    ("CUSTOMER:", customer),
    ("PRODUCT :", product),
    ("SALES    :", sales),
    ("STORE   :", store),
):
    print(label, frame.shape)

sales.head()

In [None]:
# Structure + missing-values summary: for every table, show the columns,
# dtypes and non-null counts, followed by the number of nulls per column.
for name, df in {
    "customers": customer,
    "products": product,
    "sales": sales,
    "stores": store
}.items():
    print(f"\n--- {name.upper()} ---")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would add a stray "None" line after each report.
    df.info()
    print("\nMissing values per column:")
    print(df.isna().sum())

In [None]:
# CLEANING SALES: PARSE DATE
#
# Convert the date column to datetime so later cells can use the .dt accessor
# for month-level grouping. errors="coerce" turns values that cannot be parsed
# into NaT instead of raising.

parsed_dates = pd.to_datetime(sales["date"], errors="coerce")
sales["date"] = parsed_dates

# Number of NaT entries after conversion (values that failed to parse, plus
# any that were already missing).
parsed_dates.isna().sum()

In [None]:
# SALES VALIDATION: NUMERIC LOGIC
# =========================
# Quality checks that run BEFORE any value is edited: each check collects the
# offending rows so they can be inspected, then the row counts are reported.

# Quantity: zero or negative values are flagged.
non_positive_qty = sales["quantity"].le(0)
bad_qty = sales.loc[non_positive_qty]

# Discount: values outside the 0..1 range (0% to 100%) are flagged.
# Missing discounts compare False on both sides, so they are not counted here.
discount_out_of_range = sales["discount"].lt(0) | sales["discount"].gt(1)
bad_discount = sales.loc[discount_out_of_range]

# Returned: anything other than 0 or 1 is flagged.
returned_not_binary = ~sales["returned"].isin([0, 1])
bad_returned = sales.loc[returned_not_binary]

for label, offending_rows in (
    ("Bad quantity rows:", bad_qty),
    ("Bad discount rows:", bad_discount),
    ("Bad returned rows:", bad_returned),
):
    print(label, len(offending_rows))


In [None]:
# HANDLE MISSING DISCOUNT
# =========================
# A missing discount is treated as "no discount" (0), which keeps the row
# in the dataset instead of dropping it.

discount_filled = sales["discount"].fillna(0)
sales["discount"] = discount_filled

# Remaining missing discounts after the fill (expected to be 0).
discount_filled.isna().sum()

In [None]:
# HANDLE MISSING customer_id
# customer_id is the join key to the customer table and the grouping key for
# customer-level revenue, so rows without it are removed.

missing_customer = sales["customer_id"].isna().sum()
print("Missing customer_id before:", missing_customer)

# Keep only the rows that carry a customer_id.
has_customer_id = sales["customer_id"].notna()
sales = sales.loc[has_customer_id]

print("Missing customer_id after:", sales["customer_id"].isna().sum())
print("Sales shape after drop:", sales.shape)

In [None]:
# SALES: MISSING VALUES AFTER CLEANING
# =========================
# Per-column count of nulls still present in the sales table.

sales.isnull().sum()

In [None]:
# PRODUCTS CLEANING
# =========================
# Placeholder category labels are mapped to NaN so that "no category" is
# represented in a single, consistent way.

placeholder_to_nan = {"???": np.nan, "UNKNOWN": np.nan, "Unknown": np.nan}
product["category"] = product["category"].replace(placeholder_to_nan)

# Number of products without a category after the replacement.
product["category"].isna().sum()


In [None]:
# JOIN VALIDATION: SALES + PRODUCTS (indicator)
# =========================
# Two things can go wrong when sales is left-joined to products:
# 1. a product_id in sales has no row in products (no product info for it);
# 2. a product_id occurs more than once in products, in which case the join
#    emits one output row per match and duplicates the sales row.
# The duplicate count below covers (2); the indicator merge covers (1).

duplicate_product_keys = product["product_id"].duplicated().sum()
print("Duplicate product_id rows in products:", duplicate_product_keys)

# indicator=True creates a _merge column:
# - both: matched
# - left_only: sales has product_id not found in products
sales_products_check = sales.merge(
    product,
    on="product_id",
    how="left",
    indicator=True
)

sales_products_check["_merge"].value_counts()

In [None]:
# INSPECT UNMATCHED PRODUCT_IDs (left_only)
#
# These sales rows found no partner in the product table, so every product
# column (list_price included) is missing for them and any revenue computed
# from list_price will be NaN.

unmatched_products = sales_products_check[sales_products_check["_merge"] == "left_only"]

# Print the shape separately: returning a (frame, shape) tuple as the cell
# value would render the frame as plain text instead of a table.
print("Unmatched rows (rows, columns):", unmatched_products.shape)

unmatched_products[["transaction_id", "product_id"]].head()


In [None]:
# MERGE SALES + PRODUCTS (ENRICH SALES)
# =========================
# Left join keeps every sales row and attaches the product columns to it,
# giving a transaction-level table that carries product pricing.

merged = pd.merge(sales, product, how="left", on="product_id")

merged.shape

In [None]:
# Creating Discounted Price + Revenue
#   discounted_price = list_price * (1 - discount)
#   revenue          = quantity * discounted_price
# NOTE(review): rows with returned == 1 are not excluded here, so they still
# contribute to revenue — confirm that this is intended.

price_after_discount = merged["list_price"].mul(1 - merged["discount"])
merged["discounted_price"] = price_after_discount
merged["revenue"] = merged["quantity"].mul(price_after_discount)

preview_columns = ["list_price", "discount", "discounted_price", "quantity", "revenue"]
merged[preview_columns].head()

In [None]:
# TIME FEATURES: MONTH
# month     -> monthly period (e.g. 2023-08), used as the grouping key
# month_num -> calendar month number taken from the date

date_parts = merged["date"].dt
merged["month"] = date_parts.to_period("M")
merged["month_num"] = date_parts.month

merged.loc[:, ["date", "month", "month_num"]].head()

In [None]:
# MONTHLY CUSTOMER REVENUE
#
# Total revenue per (month, customer_id) pair, listed month by month with the
# highest-revenue customers first inside each month.

revenue_per_customer_month = merged.groupby(
    ["month", "customer_id"], as_index=False
)["revenue"].sum()

monthly_customer_revenue = revenue_per_customer_month.sort_values(
    by=["month", "revenue"],
    ascending=[True, False],
)

monthly_customer_revenue.head(10)

In [None]:
# MONTHLY TOTAL REVENUE
#
# Revenue summed over all transactions of each month, in chronological order.

revenue_by_month = merged.groupby("month")["revenue"].agg("sum")
monthly_revenue = revenue_by_month.reset_index().sort_values(by="month")

monthly_revenue

In [None]:

# TOP CUSTOMERS OVERALL
#
# Lifetime revenue per customer, keeping the ten largest totals.

revenue_per_customer = merged.groupby("customer_id")["revenue"].sum()
top_customers = revenue_per_customer.sort_values(ascending=False).iloc[:10]

top_customers

In [None]:
# REVENUE BY PRODUCT CATEGORY
#
# Total revenue per category, largest first.
# groupby() drops rows whose key is NaN by default. Placeholder categories
# were replaced with NaN during product cleaning, and sales rows without a
# matching product have no category either, so the default would silently
# leave their revenue out of this breakdown. dropna=False keeps them as an
# explicit NaN group so the category totals add up to overall revenue.

category_revenue = (
    merged.groupby("category", dropna=False)["revenue"]
          .sum()
          .sort_values(ascending=False)
)

category_revenue

In [None]:
# JOIN VALIDATION: SALES -> CUSTOMERS
#
# Two things can go wrong when sales is left-joined to customers:
# 1. a customer_id in sales has no row in customers;
# 2. a customer_id occurs more than once in customers, in which case the join
#    emits one output row per match and duplicates the sales row.
# The duplicate count below covers (2); the indicator merge covers (1).

duplicate_customer_keys = customer["customer_id"].duplicated().sum()
print("Duplicate customer_id rows in customers:", duplicate_customer_keys)

# indicator=True creates a _merge column:
# - both: matched
# - left_only: sales has customer_id not found in customers
sales_customer_check = sales.merge(
    customer,
    on="customer_id",
    how="left",
    indicator=True
)

sales_customer_check["_merge"].value_counts()

In [None]:
# FINAL MERGE: SALES + PRODUCTS + CUSTOMERS + STORES
#
# Starting from the sales table, left-join each dimension table on its key so
# that every sales row is kept and picks up the matching attributes.

final_df = sales
for dimension_table, join_key in (
    (product, "product_id"),
    (customer, "customer_id"),
    (store, "store_id"),
):
    final_df = final_df.merge(dimension_table, on=join_key, how="left")

# Recompute the derived columns on the final table. The date parse and the
# discount fill are repeated here so this cell does not depend on them having
# been applied to sales beforehand.
final_df["date"] = pd.to_datetime(final_df["date"], errors="coerce")
final_df["discount"] = final_df["discount"].fillna(0)

final_discounted_price = final_df["list_price"] * (1 - final_df["discount"])
final_df["discounted_price"] = final_discounted_price
final_df["revenue"] = final_df["quantity"] * final_discounted_price
final_df["month"] = final_df["date"].dt.to_period("M")

final_df.shape

In [None]:
# FINAL DATASET PREVIEW
# First five rows of the fully merged table.

final_df.iloc[:5]

In [None]:
# Summary
# - Loaded all datasets successfully
# - Validated structure, datatypes, and missing values
# - Cleaned sales table: parsed date to datetime, filled missing discount with 0, dropped rows missing customer_id
# - Validated join keys using indicator merges
# - Merged sales with dimension tables
# - Created discounted_price and revenue
# - Produced monthly revenue + monthly customer revenue outputs
# print("Notebook is cleaned, validated, merged, and analysis-ready")