### Importing the Needed Modules

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Put the project root on sys.path so the `src` package imported below resolves.
# NOTE(review): this is a hardcoded absolute workspace path that embeds a user's
# e-mail address; presumably the `src` imports fail when the notebook is run from
# any other workspace location — consider deriving the path from configuration.
import sys
sys.path.append('/Workspace/Users/mohammedthoufiq9360@gmail.com/Retail-And-Ecommerce-Analytics-Platform')

from src.paths import SILVER_TRANSACTIONS_PATH, DIM_CUSTOMERS_PATH, DIM_PRODUCTS_PATH, DIM_EMPLOYEES_PATH, DIM_STORES_PATH, DIM_DATES_PATH, FACT_RETURNS_PATH
from src.schema_definitions import FACT_RETURNS_SCHEMA
from delta.tables import DeltaTable

### Reading Only the Return Transactions from the Silver Transactions Table

In [0]:
# Load the silver transactions table and keep only rows whose
# transaction_type is exactly "Return".
silver_transactions_df = spark.read.table(SILVER_TRANSACTIONS_PATH)
is_return = col("transaction_type") == "Return"
returns_df = silver_transactions_df.where(is_return)

# Preview a few rows to sanity-check the filter.
returns_df.limit(5).display()

invoice_id,line,customer_id,product_id,size,color,unit_price,quantity,date,discount,line_total,store_id,employee_id,currency,currency_symbol,sku,transaction_type,payment_method,invoice_total,ingestion_ts,_source_file
RET-US-005-04352918,1,306768,14586,UnKnown,UnKnown,26.0,1,2025-01-03T00:00:00.000Z,0.0,-13.0,5,56,USD,$,CHAC14586--,Return,Cash,-13.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
RET-US-005-04359285,1,288754,12924,M,YELLOW,41.0,1,2025-02-13T00:00:00.000Z,0.0,-41.0,5,58,USD,$,FESW12924-M-YELLOW,Return,Credit Card,-41.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
RET-US-005-04359695,2,273484,12018,36,UnKnown,44.5,1,2025-03-01T00:00:00.000Z,0.0,-44.5,5,60,USD,$,FEPA12018-36-,Return,Cash,-133.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
RET-US-005-04362300,1,281886,17668,M,PINK,23.5,1,2025-03-08T00:00:00.000Z,0.0,-23.5,5,58,USD,$,MAT-17668-M-PINK,Return,Credit Card,-23.5,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv
RET-CN-006-02878876,1,381094,4517,XXL,UnKnown,251.0,1,2023-03-15T00:00:00.000Z,0.0,-251.0,6,68,CNY,짜,MAUN4517-XXL-,Return,Cash,-251.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv


### Fact_returns Schema Reference

In [0]:
# Display the target schema imported from src.schema_definitions for reference.
# NOTE(review): the displayed schema names the timestamp column `sale_date`, while
# the CREATE TABLE statement later in this notebook names it `return_date` —
# confirm which name is intended and align the two.
FACT_RETURNS_SCHEMA

{'return_sk': 'long',
 'date_sk': 'integer',
 'customer_sk': 'long',
 'product_sk': 'long',
 'store_sk': 'long',
 'employee_sk': 'long',
 'invoice_id': 'string',
 'line': 'integer',
 'size': 'string',
 'quantity_returned': 'integer',
 'refund_amount': 'double',
 'currency': 'string',
 'sale_date': 'timestamp',
 '_created_at': 'timestamp'}

### Creating date_sk from the date Column

In [0]:
# Build the integer date key: truncate the timestamp to a calendar date,
# render it as yyyyMMdd text, then cast that text to an integer.
transaction_day = col("date").cast("date")
date_sk_expr = date_format(transaction_day, "yyyyMMdd").cast("int")

# Adds the "date_sk" column (or replaces it if this cell is re-run).
returns_df = returns_df.withColumn("date_sk", date_sk_expr)

returns_df.limit(5).display()

invoice_id,line,customer_id,product_id,size,color,unit_price,quantity,date,discount,line_total,store_id,employee_id,currency,currency_symbol,sku,transaction_type,payment_method,invoice_total,ingestion_ts,_source_file,date_sk
RET-US-005-04352918,1,306768,14586,UnKnown,UnKnown,26.0,1,2025-01-03T00:00:00.000Z,0.0,-13.0,5,56,USD,$,CHAC14586--,Return,Cash,-13.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv,20250103
RET-US-005-04359285,1,288754,12924,M,YELLOW,41.0,1,2025-02-13T00:00:00.000Z,0.0,-41.0,5,58,USD,$,FESW12924-M-YELLOW,Return,Credit Card,-41.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv,20250213
RET-US-005-04359695,2,273484,12018,36,UnKnown,44.5,1,2025-03-01T00:00:00.000Z,0.0,-44.5,5,60,USD,$,FEPA12018-36-,Return,Cash,-133.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv,20250301
RET-US-005-04362300,1,281886,17668,M,PINK,23.5,1,2025-03-08T00:00:00.000Z,0.0,-23.5,5,58,USD,$,MAT-17668-M-PINK,Return,Credit Card,-23.5,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv,20250308
RET-CN-006-02878876,1,381094,4517,XXL,UnKnown,251.0,1,2023-03-15T00:00:00.000Z,0.0,-251.0,6,68,CNY,짜,MAUN4517-XXL-,Return,Cash,-251.0,2026-01-14T05:54:30.121Z,dbfs:/Volumes/retail_analytics/raw/kaggle/global_fashion/transactions.csv,20230315


### Joining the Dimension Tables for Fact Returns Creation

In [0]:
# Dimension lookups to attach to the returns, in join order:
# (dimension table, alias used for it in later selects, natural-key column).
dimension_lookups = [
    (DIM_CUSTOMERS_PATH, "c", "customer_id"),
    (DIM_PRODUCTS_PATH, "p", "product_id"),
    (DIM_STORES_PATH, "st", "store_id"),
    (DIM_EMPLOYEES_PATH, "e", "employee_id"),
]

# Start from the returns (alias "r") and left-join each dimension on its natural
# key. Left joins keep every return row even when a lookup finds no match.
# NOTE(review): if a dimension holds more than one row per natural key, these
# joins multiply return rows — verify key uniqueness in the dimension tables.
fact_joined_df = returns_df.alias("r")
for dim_path, dim_alias, natural_key in dimension_lookups:
    fact_joined_df = fact_joined_df.join(
        spark.table(dim_path).alias(dim_alias),
        col(f"r.{natural_key}") == col(f"{dim_alias}.{natural_key}"),
        "left",
    )


### Selecting the Needed Columns for fact_returns

In [0]:
# Surrogate keys: date_sk comes from the returns rows, the rest from the
# joined dimension tables.
surrogate_key_columns = [
    col("r.date_sk"),
    col("c.customer_sk"),
    col("p.product_sk"),
    col("st.store_sk"),
    col("e.employee_sk"),
]

# Attributes and measures taken from the returns rows, renamed where the fact
# table uses a different column name.
return_detail_columns = [
    col("r.invoice_id"),
    col("r.line"),
    col("r.size"),
    col("r.quantity").alias("quantity_returned"),
    col("r.line_total").alias("refund_amount"),
    col("r.currency"),
    col("r.date").alias("return_date"),
]

# Audit column stamped with the time this load is evaluated.
audit_columns = [current_timestamp().alias("_created_at")]

fact_returns_df = fact_joined_df.select(
    *surrogate_key_columns,
    *return_detail_columns,
    *audit_columns,
)


### Creating Fact_returns Table with surrogate key

In [0]:
# DDL for the Delta fact table. return_sk is declared as an identity column, so
# the loads below never supply it. IF NOT EXISTS leaves an existing table as is.
create_fact_returns_ddl = f"""
CREATE TABLE IF NOT EXISTS {FACT_RETURNS_PATH} (
    return_sk Long GENERATED ALWAYS AS IDENTITY,
    date_sk INTEGER,
    customer_sk LONG,
    product_sk LONG,
    store_sk LONG,
    employee_sk LONG,
    invoice_id STRING,
    line INTEGER,
    size STRING,
    quantity_returned INTEGER,
    refund_amount DOUBLE,
    currency STRING,
    return_date TIMESTAMP,
    _created_at TIMESTAMP
)
USING DELTA
"""

spark.sql(create_fact_returns_ddl)

DataFrame[]

### Updating the Fact_returns Table

In [0]:
# Insert-only load of fact_returns: a source row is inserted only when no target
# row carries the same business key (invoice_id, line, return_date). Existing
# rows are never updated or deleted.
fact_returns_tbl = DeltaTable.forName(spark, FACT_RETURNS_PATH)

# Null-safe equality (<=>) is used instead of `=`. With plain `=`, a row whose
# key column is NULL (e.g. a missing return_date) never matches its earlier copy,
# so it would be inserted again on every run of this cell.
merge_condition = """
    tgt.invoice_id <=> src.invoice_id AND
    tgt.line <=> src.line AND
    tgt.return_date <=> src.return_date
    """

# Columns copied one-to-one from source to target. return_sk is deliberately
# absent: the table declares it GENERATED ALWAYS AS IDENTITY.
insert_columns = [
    "date_sk",
    "customer_sk",
    "product_sk",
    "store_sk",
    "employee_sk",
    "invoice_id",
    "line",
    "size",
    "quantity_returned",
    "refund_amount",
    "currency",
    "return_date",
    "_created_at",
]
insert_values = {name: col(f"src.{name}") for name in insert_columns}

# NOTE(review): only a not-matched clause is present, so if the source holds
# several rows with the same business key and none exist in the target yet, all
# of them are inserted — confirm the key is unique in fact_returns_df.
(
    fact_returns_tbl.alias("tgt")
    .merge(fact_returns_df.alias("src"), merge_condition)
    .whenNotMatchedInsert(values=insert_values)
    .execute()
)


DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]

In [0]:
# Preview a handful of rows from the persisted fact table.
fact_returns_preview_df = spark.read.table(FACT_RETURNS_PATH).limit(5)
fact_returns_preview_df.display()

return_sk,date_sk,customer_sk,product_sk,store_sk,employee_sk,invoice_id,line,size,quantity_returned,refund_amount,currency,return_date,_created_at
1,20241226,292086,12720,5,115,RET-US-005-04349555,1,S,1,-18.0,USD,2024-12-26T00:00:00.000Z,2026-01-18T10:53:50.225Z
3,20250215,323164,3005,5,403,RET-US-005-04359374,1,S,1,-49.0,USD,2025-02-15T00:00:00.000Z,2026-01-18T10:53:50.225Z
5,20230107,1118971,3415,6,201,RET-CN-006-02865832,1,M,1,-222.0,CNY,2023-01-07T00:00:00.000Z,2026-01-18T10:53:50.225Z
7,20230906,201635,3322,6,289,RET-CN-006-02917303,2,M,1,-186.45,CNY,2023-09-06T00:00:00.000Z,2026-01-18T10:53:50.225Z
9,20231218,216342,12843,6,181,RET-CN-006-02951967,1,L,1,-333.0,CNY,2023-12-18T00:00:00.000Z,2026-01-18T10:53:50.225Z


In [0]:
# Total number of rows currently stored in the fact table.
fact_returns_row_count = spark.read.table(FACT_RETURNS_PATH).count()
fact_returns_row_count

338829