## Full ETL Function

In [60]:
# ============================================================
# Task 2: ETL Process for Online Retail Dataset
# ============================================================

import pandas as pd
import sqlite3
import logging
from datetime import timedelta

# ---------------------------
# Logger setup
# ---------------------------
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO
)
logger = logging.getLogger(__name__)

# ---------------------------
# ETL function
# ---------------------------
def etl_process(csv_path, db_path="retail_dw.db"):
    """Run the extract-transform-load pipeline for the retail CSV.

    Extract: read ``csv_path`` (ISO-8859-1 encoded) into a DataFrame and
    parse ``InvoiceDate``; unparseable dates become NaT.

    Transform: drop rows missing ``CustomerID`` or ``InvoiceDate``, keep only
    rows with positive ``Quantity`` and ``UnitPrice``, add
    ``TotalSales = Quantity * UnitPrice``, and keep only rows dated within
    365 days of the latest ``InvoiceDate`` present in the data. From that
    subset build a per-(CustomerID, Country) summary and a time dimension
    with one row per distinct ``InvoiceDate``.

    Load: write the three frames to the SQLite database at ``db_path`` as
    the tables ``SalesFact``, ``CustomerDim`` and ``TimeDim``, replacing any
    existing tables with those names.

    Args:
        csv_path: Path of the source CSV. It must provide the columns
            ``InvoiceDate``, ``CustomerID``, ``Quantity``, ``UnitPrice``
            and ``Country``.
        db_path: Path of the SQLite database file to write.

    Returns:
        None. Progress is reported through the module logger.

    Raises:
        Exception: Any error raised during the run is logged and re-raised
            unchanged.
    """
    try:
        logger.info("ETL started")

        # ---------------------------
        # Extract
        # ---------------------------
        df = pd.read_csv(csv_path, encoding='ISO-8859-1')
        logger.info(f"Extraction: {len(df)} rows loaded from {csv_path}")

        # Convert InvoiceDate to datetime immediately; bad values become NaT
        # and are removed together with the other missing values below.
        df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')

        # ---------------------------
        # Transform
        # ---------------------------
        # Remove missing values
        df = df.dropna(subset=['CustomerID', 'InvoiceDate'])
        logger.info(f"Transformation: After removing missing values -> {len(df)} rows")

        # Keep only rows with a positive quantity and a positive unit price
        # (the log message below calls the dropped rows "outliers").
        df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
        logger.info(f"Transformation: After removing outliers -> {len(df)} rows")

        # Calculate TotalSales
        df['TotalSales'] = df['Quantity'] * df['UnitPrice']

        # Dynamic cutoff date: 365 days before the latest invoice in the data.
        # The comparison is inclusive, so rows exactly at the cutoff are kept.
        latest_date = df['InvoiceDate'].max()
        cutoff_date = latest_date - timedelta(days=365)
        df_last_year = df[df['InvoiceDate'] >= cutoff_date].copy()

        logger.info(f"Transformation: Latest invoice date is {latest_date}, cutoff date is {cutoff_date}")
        logger.info(f"Transformation: After last-year filter -> {len(df_last_year)} rows")

        if df_last_year.empty:
            # Not an error, but worth flagging: the load step will still run
            # and replace the existing tables with empty ones.
            logger.warning("Transformation: No rows remain after filtering; output tables will be empty")

        # Customer summary: one row per (CustomerID, Country) pair
        customer_summary = (
            df_last_year.groupby(['CustomerID', 'Country'])
            .agg(TotalPurchases=('TotalSales', 'sum'),
                 TotalQuantity=('Quantity', 'sum'))
            .reset_index()
        )
        logger.info(f"Transformation: Customer summary has {len(customer_summary)} rows")

        # Time dimension: one row per distinct invoice timestamp
        time_dim = (
            df_last_year[['InvoiceDate']]
            .drop_duplicates()
            .assign(Date=lambda x: x['InvoiceDate'].dt.date,
                    Year=lambda x: x['InvoiceDate'].dt.year,
                    Quarter=lambda x: x['InvoiceDate'].dt.quarter,
                    Month=lambda x: x['InvoiceDate'].dt.month,
                    Day=lambda x: x['InvoiceDate'].dt.day)
        )
        logger.info(f"Transformation: Time dimension has {len(time_dim)} rows")

        # ---------------------------
        # Load
        # ---------------------------
        conn = sqlite3.connect(db_path)
        try:
            df_last_year.to_sql('SalesFact', conn, if_exists='replace', index=False)
            customer_summary.to_sql('CustomerDim', conn, if_exists='replace', index=False)
            time_dim.to_sql('TimeDim', conn, if_exists='replace', index=False)
        finally:
            # Close the connection even when a write fails, so the database
            # handle is never left open.
            conn.close()

        logger.info(f"Loading: Inserted {len(df_last_year)} rows into SalesFact")
        logger.info(f"Loading: Inserted {len(customer_summary)} rows into CustomerDim")
        logger.info(f"Loading: Inserted {len(time_dim)} rows into TimeDim")

        logger.info("ETL completed successfully")

    except Exception as e:
        logger.error(f"ETL failed: {e}")
        raise


# ---------------------------
# Run ETL
# ---------------------------
# Execute the pipeline against the dataset file, writing to the default database path.
etl_process(csv_path="online_retail.csv")

2025-08-12 17:30:37,011 - INFO - ETL started
2025-08-12 17:30:38,009 - INFO - Extraction: 541909 rows loaded from online_retail.csv
2025-08-12 17:30:38,286 - INFO - Transformation: After removing missing values -> 406829 rows
2025-08-12 17:30:38,339 - INFO - Transformation: After removing outliers -> 397884 rows
2025-08-12 17:30:38,474 - INFO - Transformation: Latest invoice date is 2011-12-09 12:50:00, cutoff date is 2010-12-09 12:50:00
2025-08-12 17:30:38,475 - INFO - Transformation: After last-year filter -> 384529 rows
2025-08-12 17:30:38,580 - INFO - Transformation: Customer summary has 4277 rows
2025-08-12 17:30:38,622 - INFO - Transformation: Time dimension has 16630 rows
2025-08-12 17:30:41,845 - INFO - Loading: Inserted 384529 rows into SalesFact
2025-08-12 17:30:41,846 - INFO - Loading: Inserted 4277 rows into CustomerDim
2025-08-12 17:30:41,847 - INFO - Loading: Inserted 16630 rows into TimeDim
2025-08-12 17:30:41,848 - INFO - ETL completed successfully


### Clarification on "Last Year" Filter Requirement

The exam instructions specify filtering sales to the last year, assuming the current date is **August 12, 2025**.  
If applied directly to the **Online Retail** dataset, which contains only transactions from **December 2010 to December 2011**, this filter would result in **zero rows** being selected.

To fulfill the spirit of the requirement while working within the dataset’s constraints, we adapted the logic to:
- Use the **latest available transaction date in the dataset** as the reference “current” date.
- Subtract 365 days from that date to define the cutoff for “last year”; rows with an `InvoiceDate` on or after the cutoff are kept.

This approach preserves the requirement’s intent (analysing the most recent year of data) while ensuring that the fact and dimension tables contain meaningful output for the exam demonstration.
