### Importing the Needed Modules

In [0]:
import sys
import os

# Absolute location of the repository checkout; everything under `src` is imported from here.
PROJECT_ROOT = "/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform"

# Register the repo root at the front of the import path, skipping the insert
# when a previous run of this cell has already registered it.
repo_already_registered = PROJECT_ROOT in sys.path
if not repo_already_registered:
    sys.path.insert(0, PROJECT_ROOT)

# Diagnostic output only: shows where the kernel is running and confirms the path entry.
print("Current working directory:", os.getcwd())
print("Repo root added to path:", PROJECT_ROOT in sys.path)

# Must stay below the sys.path change above, otherwise `src` cannot be resolved.
from src.paths import *

In [0]:
# NOTE(review): the star imports below bring every pyspark.sql.functions name into
# the notebook namespace, shadowing Python built-ins such as sum/min/max/round.
# They are kept because other cells may rely on them.
from pyspark.sql.functions import *
from pyspark.sql.types import *

import sys

# Make the repository importable. The membership check keeps the cell idempotent:
# an unconditional append adds one more duplicate sys.path entry on every re-run.
_repo_root = '/Workspace/Users/thiruvengadamk16@gmail.com/Retail-And-Ecommerce-Analytics-Platform'
if _repo_root not in sys.path:
    sys.path.append(_repo_root)

# Table identifiers and the target schema used by the cells that follow.
from src.paths import (
    SILVER_TRANSACTIONS_PATH,
    DIM_CUSTOMERS_PATH,
    DIM_PRODUCTS_PATH,
    DIM_EMPLOYEES_PATH,
    DIM_STORES_PATH,
    DIM_DATES_PATH,
    FACT_RETURNS_PATH,
)
from src.schema_definitions import FACT_RETURNS_SCHEMA
from delta.tables import DeltaTable

### Querying the Silver Transactions Table and Keeping Only the Returns

In [0]:
# Row predicate: keep only transactions whose type is exactly "Return".
is_return_row = col("transaction_type") == "Return"

# Read the silver transactions table and narrow it to return rows.
returns_df = spark.read.table(SILVER_TRANSACTIONS_PATH).filter(is_return_row)

# Preview a handful of rows rather than rendering the whole table.
returns_df.limit(5).display()

### `fact_returns` Schema Reference

In [0]:
# Bare expression: the notebook renders the schema object imported from
# src.schema_definitions as this cell's output, for reference while building the fact table.
FACT_RETURNS_SCHEMA

### Creating `date_sk` from the `date` Column

In [0]:
# Drop any time-of-day component by casting the transaction date to a calendar date.
return_calendar_date = col("date").cast("date")

# Render the date as yyyyMMdd text and cast it to an integer (e.g. 2024-03-15 -> 20240315).
date_key = date_format(return_calendar_date, "yyyyMMdd").cast("int")

# Attach the key as `date_sk`; re-running the cell simply overwrites the same column.
returns_df = returns_df.withColumn("date_sk", date_key)
returns_df.limit(5).display()

### Joining the Dimension Tables to Build Fact Returns

In [0]:
# Each dimension gets a short alias so that identically named columns can be
# referenced unambiguously after the joins.
customers_dim = spark.table(DIM_CUSTOMERS_PATH).alias("c")
products_dim = spark.table(DIM_PRODUCTS_PATH).alias("p")
stores_dim = spark.table(DIM_STORES_PATH).alias("st")
employees_dim = spark.table(DIM_EMPLOYEES_PATH).alias("e")

# Left joins keep every return row even when a dimension has no matching entry;
# the columns of that dimension are then NULL for the row.
fact_joined_df = (
    returns_df.alias("r")
    .join(customers_dim, on=col("r.customer_id") == col("c.customer_id"), how="left")
    .join(products_dim, on=col("r.product_id") == col("p.product_id"), how="left")
    .join(stores_dim, on=col("r.store_id") == col("st.store_id"), how="left")
    .join(employees_dim, on=col("r.employee_id") == col("e.employee_id"), how="left")
)


### Selecting the Required Columns for `fact_returns`

In [0]:
# Surrogate keys: date_sk from the returns rows, the rest from the joined dimensions.
surrogate_key_columns = [
    col("r.date_sk"),
    col("c.customer_sk"),
    col("p.product_sk"),
    col("st.store_sk"),
    col("e.employee_sk"),
]

# Identifiers and measures taken from the return rows, renamed to the fact table's column names.
return_detail_columns = [
    col("r.invoice_id"),
    col("r.line"),
    col("r.size"),
    col("r.quantity").alias("quantity_returned"),
    col("r.line_total").alias("refund_amount"),
    col("r.currency"),
    col("r.date").alias("return_date"),
]

# Audit column stamped with the time this load is evaluated.
audit_columns = [current_timestamp().alias("_created_at")]

fact_returns_df = fact_joined_df.select(
    *surrogate_key_columns,
    *return_detail_columns,
    *audit_columns,
)


### Creating the `fact_returns` Table with a Surrogate Key

In [0]:
# DDL for the target Delta table. IF NOT EXISTS makes the cell safe to re-run,
# and return_sk is an identity column, so the table generates it rather than the load.
create_fact_returns_ddl = f"""
CREATE TABLE IF NOT EXISTS {FACT_RETURNS_PATH} (
    return_sk Long GENERATED ALWAYS AS IDENTITY,
    date_sk INTEGER,
    customer_sk LONG,
    product_sk LONG,
    store_sk LONG,
    employee_sk LONG,
    invoice_id STRING,
    line INTEGER,
    size STRING,
    quantity_returned INTEGER,
    refund_amount DOUBLE,
    currency STRING,
    return_date TIMESTAMP,
    _created_at TIMESTAMP
)
USING DELTA
"""

spark.sql(create_fact_returns_ddl)

### Updating the Fact_returns Table

In [0]:
# Business key identifying one returned invoice line in the fact table.
merge_keys = ["invoice_id", "line", "return_date"]

# Columns written by the load. return_sk is deliberately absent: the table
# defines it as GENERATED ALWAYS AS IDENTITY, so it cannot be supplied here.
insert_columns = [
    "date_sk",
    "customer_sk",
    "product_sk",
    "store_sk",
    "employee_sk",
    "invoice_id",
    "line",
    "size",
    "quantity_returned",
    "refund_amount",
    "currency",
    "return_date",
    "_created_at",
]

fact_returns_tbl = DeltaTable.forName(spark, FACT_RETURNS_PATH)

# Null-safe equality (<=>): with plain `=`, a NULL in any key column makes the
# condition NULL, the row never matches, and it is re-inserted on every run.
merge_condition = " AND ".join(f"tgt.{key} <=> src.{key}" for key in merge_keys)

# An insert-only merge writes every unmatched source row, so duplicates inside
# the source would all be inserted on first load. Reduce the source to one row
# per business key so the target holds one row per key.
# NOTE(review): dropDuplicates keeps an arbitrary row per key - if duplicates
# are expected upstream, pick the survivor deterministically before this step.
deduplicated_returns_df = fact_returns_df.dropDuplicates(merge_keys)

(
    fact_returns_tbl.alias("tgt")
    .merge(deduplicated_returns_df.alias("src"), merge_condition)
    .whenNotMatchedInsert(
        values={name: col(f"src.{name}") for name in insert_columns}
    )
    .execute()
)


In [0]:
# Spot-check the load: render five rows of the fact table.
fact_returns_preview = spark.read.table(FACT_RETURNS_PATH).limit(5)
fact_returns_preview.display()

In [0]:
# Total number of rows currently in the fact table, shown as the cell output.
fact_returns_row_count = spark.read.table(FACT_RETURNS_PATH).count()
fact_returns_row_count