In [32]:
# Step 1 – Hello, Data!
import pandas as pd

# Load the raw CSV (replace with your actual file path)
df = pd.read_csv("data/synthetic_ecommerce_data.csv")

# Display the first 3 rows
df.head(3)

Unnamed: 0,date,customer_id,product,price,quantity,coupon_code,shipping_city,country,total_price
0,2024-10-19,8be304df-4697-4f40-9f57-5d849fe3947c,Smartphone,699.99,3,,Calgary,Canada,2099.97
1,2025-01-04,124f4ee9-b31b-4c15-a22c-21fd2a263098,Monitor,249.99,2,20OFF,Ottawa,Canada,499.98
2,2024-08-18,fcaac143-fab7-494a-b400-cf3e2f6d6d07,Laptop,999.99,6,NO_CODE,Leeds,UK,5999.94


## Step 2 – Pick the Right Container

For transaction records, a class is most appropriate because it allows us to encapsulate data and behavior (e.g., cleaning, calculating total revenue/profit) in a single unit. Namedtuples are immutable and lack methods; dictionaries are flexible but offer no structure or validation.

In [33]:
# Step 3 – Transaction Class and OO Data Structure
class Transaction:
    def __init__(self, date, customer_id, product, price, quantity,
                 coupon_code, shipping_city, country, total_price):
        self.date = date
        self.customer_id = customer_id
        self.product = product
        self.price = float(price)
        self.quantity = int(quantity)
        self.coupon_code = coupon_code or "NO_CODE"
        self.shipping_city = shipping_city
        self.country = country
        self.total_price = float(total_price)

    def total(self):
        return self.price * self.quantity

    def clean(self):
        if isinstance(self.product, str):
            self.product = self.product.strip().title()
        if isinstance(self.shipping_city, str):
            self.shipping_city = self.shipping_city.strip().title()
        if isinstance(self.country, str):
            self.country = self.country.strip().title()
        if not self.coupon_code:
            self.coupon_code = "NO_CODE"



In [34]:
# Step 4 – Bulk Loader
from typing import List

def load_transactions(df: pd.DataFrame) -> List[Transaction]:
    return [
        Transaction(
            row["date"],
            row["customer_id"],
            row["product"],
            row["price"],
            row["quantity"],
            row["coupon_code"],
            row["shipping_city"],
            row["country"],
            row["total_price"]
        )
        for _, row in df.iterrows()
    ]

transactions = load_transactions(df)

In [35]:
# Step 5 – Quick Profiling

# Basic stats on price and quantity
prices = [t.price for t in transactions]
quantities = [t.quantity for t in transactions]
total_prices = [t.total_price for t in transactions]
cities = {t.shipping_city for t in transactions}
countries = {t.country for t in transactions}

print("Min price:", min(prices))
print("Mean price:", round(sum(prices) / len(prices), 2))
print("Max price:", max(prices))

print("Min quantity:", min(quantities))
print("Max quantity:", max(quantities))

print("Total revenue (calculated):", round(sum(total_prices), 2))

print("Unique cities:", len(cities))
print("Unique countries:", len(countries))


Min price: 89.99
Mean price: 442.83
Max price: 999.99
Min quantity: 1
Max quantity: 10
Total revenue (calculated): 1375038.97
Unique cities: 15
Unique countries: 3


In [36]:
# Step 6 – Spot the Grime
# Show rows with missing or clearly malformed data
dirty_rows = df[df.isna().any(axis=1)]
dirty_rows.head(5)

Unnamed: 0,date,customer_id,product,price,quantity,coupon_code,shipping_city,country,total_price
0,2024-10-19,8be304df-4697-4f40-9f57-5d849fe3947c,Smartphone,699.99,3,,Calgary,Canada,2099.97
3,2025-01-03,bf0df2a4-d230-4113-8150-e0f6e8688588,Smartphone,699.99,8,,Manchester,UK,5599.92
9,2025-04-19,798de1c5-1027-46fa-9c35-c3718a8810f1,Smartphone,699.99,4,,London,UK,2799.96
19,2024-08-07,9ff80ba1-ebe6-483a-bd7f-888b0a38f6d7,Smartphone,699.99,1,,Glasgow,UK,699.99
20,2025-04-21,4f239858-8316-4c9f-8718-62cc20396dc4,Headphones,149.99,6,,Manchester,UK,899.94


In [37]:
# Step 7 – Cleaning Rules

def is_empty_coupon(code):
    return not isinstance(code, str) or code.strip() == ""

# Before cleaning
print("Before cleaning:")
print("  Country fields not titlecased:",
      sum(t.country != t.country.title() for t in transactions))
print("  Shipping city fields not titlecased:",
      sum(t.shipping_city != t.shipping_city.title() for t in transactions))
print("  Empty coupon codes:",
      sum(is_empty_coupon(t.coupon_code) for t in transactions))

# Apply cleaning
for t in transactions:
    t.clean()

# After cleaning
print("\nAfter cleaning:")
print("  Country fields not titlecased:",
      sum(t.country != t.country.title() for t in transactions))
print("  Shipping city fields not titlecased:",
      sum(t.shipping_city != t.shipping_city.title() for t in transactions))
print("  Coupon codes equal to 'NO_CODE':",
      sum(t.coupon_code == "NO_CODE" for t in transactions))


Before cleaning:
  Country fields not titlecased: 371
  Shipping city fields not titlecased: 0
  Empty coupon codes: 121

After cleaning:
  Country fields not titlecased: 0
  Shipping city fields not titlecased: 0
  Coupon codes equal to 'NO_CODE': 104
