### Importing the Needed Modules

In [0]:
import sys
import os

# Workspace location of the repository checkout that contains the `src` package.
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Prepend the repo root to the module search path. The insert is skipped when
# the entry is already present, so re-running this cell adds no duplicates.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

# Diagnostics: show where the kernel is running and confirm the path entry.
print("Current working directory:", os.getcwd())
print("Repo root added to path:", PROJECT_ROOT in sys.path)

# Wildcard import: brings every public name of src.paths into the notebook.
from src.paths import *

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# NOTE(review): the first cell already inserts this same path at the front of
# sys.path (guarded against duplicates). This append is unconditional, so each
# re-run of the cell adds another copy of the entry. It is only needed when
# this cell is executed without the first one.
import sys
sys.path.append('/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform')

from src.paths import SILVER_TRANSACTIONS_PATH, DIM_CUSTOMERS_PATH, DIM_PRODUCTS_PATH, DIM_EMPLOYEES_PATH, DIM_STORES_PATH, DIM_DATES_PATH, DIM_DISCOUNTS_PATH, FACT_SALES_PATH
from src.schema_definitions import FACT_SALES_SCHEMA
from delta.tables import DeltaTable

### Querying and Filtering only the sales data from the Silver Transactions Table

In [0]:
# Read the silver transactions table and keep only the rows whose
# transaction_type is exactly "Sale"; all other transaction types are dropped.
silver_transactions_df = spark.read.table(SILVER_TRANSACTIONS_PATH)
sales_df = silver_transactions_df.where(col("transaction_type") == "Sale")

### Fact_sales Schema Reference

In [0]:
# Bare expression: the notebook renders the imported schema object as this
# cell's output. Nothing is assigned or modified here.
FACT_SALES_SCHEMA

### Creating date_sk from the date Column

In [0]:
# Integer date key of the form yyyyMMdd: cast `date` to a calendar date,
# format it as an 8-digit string, then cast that string to int.
date_sk_expr = date_format(col("date").cast("date"), "yyyyMMdd").cast("int")
sales_df = sales_df.withColumn("date_sk", date_sk_expr)

# Preview five rows to eyeball the new column.
sales_df.limit(5).display()

### Joining Dimensions for Fact Sales Creation

In [0]:
# Dimension tables, each aliased so the join conditions and the later column
# selection can refer to their columns unambiguously.
customers_dim = spark.table(DIM_CUSTOMERS_PATH).alias("c")
products_dim = spark.table(DIM_PRODUCTS_PATH).alias("p")
stores_dim = spark.table(DIM_STORES_PATH).alias("st")
employees_dim = spark.table(DIM_EMPLOYEES_PATH).alias("e")
discounts_dim = spark.table(DIM_DISCOUNTS_PATH).alias("d")

# A discount row is attached when the sale date falls inside its inclusive
# start/end window and the product's category and sub-category both match.
# NOTE(review): `sa.discount >= 0` only rejects NULL or negative discounts, so
# sales with a zero discount still pick up a discount_sk. Confirm whether
# `> 0` (or a match on the discount value) was intended.
# NOTE(review): if two discount windows overlap for the same category and
# sub-category, this join yields more than one row per sale line. Verify
# against the discount dimension.
sale_day = col("sa.date").cast("date")
discount_applies = (
    sale_day.between(col("d.discount_start_date"), col("d.discount_end_date"))
    & (col("p.category") == col("d.category"))
    & (col("p.sub_category") == col("d.sub_category"))
    & (col("sa.discount") >= 0)
)

# Left joins throughout: a sale is kept even when a dimension lookup finds no
# match, in which case that dimension's columns are NULL.
fact_joined_df = (
    sales_df.alias("sa")
    .join(customers_dim, col("sa.customer_id") == col("c.customer_id"), "left")
    .join(products_dim, col("sa.product_id") == col("p.product_id"), "left")
    .join(stores_dim, col("sa.store_id") == col("st.store_id"), "left")
    .join(employees_dim, col("sa.employee_id") == col("e.employee_id"), "left")
    .join(discounts_dim, discount_applies, "left")
)


### Selecting the Needed Columns for fact_sales

In [0]:
# Surrogate keys: date_sk comes from the sales rows, the rest from the joined
# dimension aliases.
surrogate_key_cols = [
    col("sa.date_sk"),
    col("c.customer_sk"),
    col("p.product_sk"),
    col("st.store_sk"),
    col("e.employee_sk"),
    col("d.discount_sk"),
]

# Attributes copied from the sales rows under their original names, in the
# order they appear in the fact table.
sale_attribute_names = [
    "invoice_id",
    "line",
    "size",
    "unit_price",
    "quantity",
    "line_total",
    "invoice_total",
    "currency",
    "payment_method",
]

fact_sales_df = fact_joined_df.select(
    *surrogate_key_cols,
    *[col(f"sa.{name}") for name in sale_attribute_names],
    # The source `date` column is exposed as `sale_date` in the fact table.
    col("sa.date").alias("sale_date"),
    # Load timestamp, evaluated when the query runs.
    current_timestamp().alias("_created_at"),
)


### Creating Fact_sales Table with surrogate key

In [0]:
# Create the Delta fact table at FACT_SALES_PATH if it does not already exist;
# an existing table is left untouched, so this cell is safe to re-run.
# `sales_sk` is declared GENERATED ALWAYS AS IDENTITY, so its values are
# produced by the table itself and writers must not supply that column.
# The remaining columns match, by name, the columns selected into
# fact_sales_df.
spark.sql(f"""
CREATE TABLE IF NOT EXISTS {FACT_SALES_PATH} (
    sales_sk Long GENERATED ALWAYS AS IDENTITY,
    date_sk INTEGER,
    customer_sk LONG,
    product_sk LONG,
    store_sk LONG,
    employee_sk LONG,
    discount_sk LONG,
    invoice_id STRING,
    line INTEGER,
    size STRING,
    unit_price DOUBLE,
    quantity INTEGER,
    line_total DOUBLE,
    invoice_total DOUBLE,
    currency STRING,
    payment_method STRING,
    sale_date TIMESTAMP,
    _created_at TIMESTAMP
)
USING DELTA
""")

### Updating the Fact_sales Table

In [0]:
# Insert-only merge of the prepared sales rows into the fact table, keyed on
# (invoice_id, line). Rows whose key already exists in the target are left
# untouched, so re-running the notebook does not re-insert them.
fact_sales_tbl = DeltaTable.forName(spark, FACT_SALES_PATH)

# Columns that identify one fact row; they must stay in sync with the merge
# condition below.
merge_keys = ["invoice_id", "line"]

# MERGE only compares source rows against the target, never against each
# other: if the source carries several rows with the same key and none exists
# in the target yet, every one of them is inserted. The upstream joins (in
# particular the date-range discount join) can multiply a sale line, so the
# source is reduced to one row per key first.
# NOTE(review): dropDuplicates keeps an arbitrary row per key. If it matters
# which discount_sk survives, resolve the ambiguity in the join instead.
fact_sales_src_df = fact_sales_df.dropDuplicates(merge_keys)

# sales_sk is deliberately absent from the insert values: it is an identity
# column generated by the table.
fact_sales_tbl.alias("tgt").merge(
    fact_sales_src_df.alias("src"),
    "tgt.invoice_id = src.invoice_id AND tgt.line = src.line"
).whenNotMatchedInsert(values={
    "date_sk": col("src.date_sk"),
    "customer_sk": col("src.customer_sk"),
    "product_sk": col("src.product_sk"),
    "store_sk": col("src.store_sk"),
    "employee_sk": col("src.employee_sk"),
    "discount_sk": col("src.discount_sk"),
    "invoice_id": col("src.invoice_id"),
    "line": col("src.line"),
    "size": col("src.size"),
    "unit_price": col("src.unit_price"),
    "quantity": col("src.quantity"),
    "line_total": col("src.line_total"),
    "invoice_total": col("src.invoice_total"),
    "currency": col("src.currency"),
    "payment_method": col("src.payment_method"),
    "sale_date": col("src.sale_date"),
    "_created_at": col("src._created_at"),
}).execute()


In [0]:
# Spot-check: render five rows of the persisted fact table.
fact_sales_preview_df = spark.read.table(FACT_SALES_PATH).limit(5)
fact_sales_preview_df.display()

In [0]:
# Total number of rows in the fact table. The count is left as the cell's
# last expression so the notebook renders it.
fact_sales_row_count = spark.read.table(FACT_SALES_PATH).count()
fact_sales_row_count