Transforming Data

In [2]:
from pyspark.sql import *
from pyspark.sql.functions import* 
from pyspark.sql.types import *
from pyspark.sql.window import Window

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 4, Finished, Available, Finished)

In [3]:
retailers_bronze = spark.table("bronze_retailers")

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 5, Finished, Available, Finished)

In [4]:
repayments_bronze= spark.table("bronze_repayments")

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 6, Finished, Available, Finished)

In [5]:
transactions_bronze= spark.table("bronze_transactions")

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 7, Finished, Available, Finished)

In [15]:
from pyspark.sql import functions as F
retailers_silver = retailers_bronze \
    .withColumn("onboarding_date", F.to_date(F.col("onboarding_date"))) \
    .withColumn("has_business_registration", F.col("has_business_registration").cast("boolean")) \
    .withColumn("is_defaulter", F.col("is_defaulter").cast("boolean")) \
    .withColumn("latitude", F.col("latitude").cast("double")) \
    .withColumn("longitude", F.col("longitude").cast("double")) \
    .withColumn("credit_limit", F.col("credit_limit").cast("integer")) \
    .withColumn("phone_number", F.lpad(F.col("phone_number").cast("string"), 11, "0")) \
    .withColumn("account_age_days", 
                F.datediff(F.current_date(), F.col("onboarding_date"))) \
    .where(F.col("credit_limit") > 0) \
    .where(F.col("retailer_id").isNotNull()) \
    .dropDuplicates(["retailer_id"])

# Validate coordinates are in Nigeria range
retailers_silver = retailers_silver \
    .where((F.col("latitude").between(4.0, 14.0)) & 
           (F.col("longitude").between(2.5, 15.0)))

StatementMeta(, 4701e79c-784c-4c87-9800-fde495abc496, 17, Finished, Available, Finished)

In [16]:
retailers_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("Silver.dbo.retailers")

StatementMeta(, 4701e79c-784c-4c87-9800-fde495abc496, 18, Finished, Available, Finished)

In [20]:
transactions_silver = transactions_bronze \
    .withColumn("order_date", F.to_date(F.col("order_date"))) \
    .withColumn("order_amount", F.col("order_amount").cast("integer")) \
    .withColumn("num_products", F.col("num_products").cast("integer")) \
    .withColumn("order_year", F.year(F.col("order_date"))) \
    .withColumn("order_month", F.month(F.col("order_date"))) \
    .withColumn("order_quarter", F.quarter(F.col("order_date"))) \
    .withColumn("product_categories_array", F.split(F.col("product_categories"), "\\|")) \
    .withColumn("manufacturers_array", F.split(F.col("manufacturers"), "\\|")) \
    .where(F.col("order_amount") > 0) \
    .where(F.col("transaction_id").isNotNull()) \
    .where(F.col("retailer_id").isNotNull()) \
    .dropDuplicates(["transaction_id"])

window_spec = Window.partitionBy("retailer_id").orderBy("order_date")
transactions_silver = transactions_silver \
    .withColumn("order_rank", F.row_number().over(window_spec)) \
    .withColumn("is_first_order", F.when(F.col("order_rank") == 1, True).otherwise(False))

StatementMeta(, 4701e79c-784c-4c87-9800-fde495abc496, 22, Finished, Available, Finished)

In [21]:
display(transactions_silver)

StatementMeta(, 4701e79c-784c-4c87-9800-fde495abc496, 23, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, ac9332df-65ca-4ba7-a581-60eba16acc03)

In [22]:
transactions_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("Silver.dbo.transactions")

StatementMeta(, 4701e79c-784c-4c87-9800-fde495abc496, 24, Finished, Available, Finished)

In [25]:
repayments_silver = repayments_bronze \
    .withColumn("order_date", F.to_date(F.col("order_date"))) \
    .withColumn("due_date", F.to_date(F.col("due_date"))) \
    .withColumn("payment_date", F.to_date(F.col("payment_date"))) \
    .withColumn("amount_due", F.col("amount_due").cast("integer")) \
    .withColumn("amount_paid", F.col("amount_paid").cast("integer")) \
    .withColumn("days_late", F.col("days_late").cast("integer")) \
    .withColumn("payment_completion_rate", 
                F.col("amount_paid") / F.col("amount_due")) \
    .withColumn("is_on_time", F.when(F.col("days_late") <= 3, True).otherwise(False)) \
    .withColumn("is_defaulted", 
                F.when(F.col("payment_status") == "Defaulted", True).otherwise(False)) \
    .where(F.col("repayment_id").isNotNull()) \
    .dropDuplicates(["repayment_id"])

StatementMeta(, 4701e79c-784c-4c87-9800-fde495abc496, 27, Finished, Available, Finished)

In [26]:
repayments_silver.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("Silver.dbo.repayments")

StatementMeta(, 4701e79c-784c-4c87-9800-fde495abc496, 28, Finished, Available, Finished)

In [8]:
retailers = spark.table("Silver.dbo.retailers")
repayments = spark.table("Silver.dbo.repayments")
transactions = spark.table("Silver.dbo.transactions")

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 10, Finished, Available, Finished)

In [11]:

from pyspark.sql import functions as F

# Create comprehensive denormalized view
denormalized = retailers \
    .join(transactions, "retailer_id", "left") \
    .join(repayments, "transaction_id", "left") \
    .select(
        # Transaction identifiers
        "transaction_id",
        "repayment_id",
        "order_number",
        
        # Retailer identifiers
        retailers["retailer_id"].alias("retailer_id"),
        "business_name",
        "owner_name",
        
        # Dates
        transactions["order_date"].alias("order_date"),
        repayments["due_date"].alias("due_date"),
        repayments["payment_date"].alias("payment_date"),
        "order_year",
        "order_month",
        "order_quarter",
        
        # Transaction details
        transactions["order_amount"].alias("order_amount"),
        "num_products",
        "product_categories",
        "product_categories_array",
        "manufacturers",
        "manufacturers_array",
        "is_first_order",
        
        # Payment details
        repayments["amount_paid"].alias("amount_paid"),
        repayments["days_late"].alias("days_late"),
        repayments["payment_status"].alias("payment_status"),
        repayments["is_on_time"].alias("is_on_time"),
        repayments["is_defaulted"].alias("is_defaulted"),
        repayments["payment_completion_rate"].alias("payment_completion_rate"),
        
        # Retailer demographics
        "owner_gender",
        "owner_age",
        "shop_type",
        "state",
        "urbanization_level",
        "latitude",
        "longitude",
        
        # Business characteristics
        "years_in_business",
        "months_in_business",
        "num_employees",
        "has_business_registration",
        
        # Alternative data
        "mobile_money_pattern",
        "monthly_mobile_money_txns",
        
        # Credit info
        "credit_segment",
        "credit_limit",
        "is_defaulter",
        retailers["account_age_days"].alias("account_age_days")
    )
denormalized = denormalized \
    .withColumn("order_amount", F.coalesce(F.col("order_amount"), F.lit(0))) \
    .withColumn("amount_paid", F.coalesce(F.col("amount_paid"), F.lit(0))) \
    .withColumn("days_late", F.coalesce(F.col("days_late"), F.lit(0))) \
    .withColumn("payment_status", 
                F.when(F.col("payment_status").isNull(), "No Transaction") # Label changed for clarity
                .otherwise(F.col("payment_status")))
# Add calculated fields
denormalized = denormalized \
    .withColumn("order_to_credit_ratio", 
                F.when(F.col("order_amount").isNull(), 0) # If no order, ratio is 0
                .otherwise(F.col("order_amount") / F.col("credit_limit"))) \
    .withColumn("payment_deficit", F.col("order_amount") - F.col("amount_paid")) \
    .withColumn("is_late_payment", F.when(F.col("days_late") > 3, True).otherwise(False)) \
    .withColumn("is_weekend_order", F.when(F.dayofweek(F.col("order_date")).isin([1, 7]), True).otherwise(False))

# Save denormalized table
denormalized.write \
    .format("delta") \
    .mode("overwrite") \
    .option("overwriteSchema", "true") \
    .saveAsTable("Silver.dbo.silver_retailer_transactions")

print(f"✓ Created denormalized table with {denormalized.count():,} records")

# Show schema
print("\nDenormalized table schema:")
denormalized.printSchema()

# Show sample
print("\nSample data:")
denormalized.select(
    "retailer_id", "business_name", "order_date", "order_amount", 
    "payment_status", "credit_segment"
).show(5, truncate=False)

print("\n✓ Silver layer transformation complete!")
print("\n⭐ Key output: silver_retailer_transactions (denormalized)")
print("   This table will be the SOURCE for all Gold layer transformations")

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 13, Finished, Available, Finished)

✓ Created denormalized table with 75,347 records

Denormalized table schema:
root
 |-- transaction_id: string (nullable = true)
 |-- repayment_id: string (nullable = true)
 |-- order_number: integer (nullable = true)
 |-- retailer_id: string (nullable = true)
 |-- business_name: string (nullable = true)
 |-- owner_name: string (nullable = true)
 |-- order_date: date (nullable = true)
 |-- due_date: date (nullable = true)
 |-- payment_date: date (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_month: integer (nullable = true)
 |-- order_quarter: integer (nullable = true)
 |-- order_amount: integer (nullable = false)
 |-- num_products: integer (nullable = true)
 |-- product_categories: string (nullable = true)
 |-- product_categories_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- manufacturers: string (nullable = true)
 |-- manufacturers_array: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- is_first

In [12]:
from pyspark.sql import functions as F

print("--- VALIDATION REPORT ---")

# TEST 1: The "Null Hunter"
# We check if there are any remaining NULLs in the columns we just fixed.
# EXPECTATION: Result should be 0 for all.

null_check = denormalized.select(
    F.sum(F.when(F.col("amount_paid").isNull(), 1).otherwise(0)).alias("Nulls_in_AmountPaid"),
    F.sum(F.when(F.col("days_late").isNull(), 1).otherwise(0)).alias("Nulls_in_DaysLate"),
    F.sum(F.when(F.col("payment_status").isNull(), 1).otherwise(0)).alias("Nulls_in_Status")
)

print("1. Checking for NULLs (Should be 0):")
null_check.show()

# TEST 2: The "Pending Math" Check
# We look specifically at the rows we labeled "Pending/Active".
# EXPECTATION: 
#   - amount_paid should be 0
#   - payment_deficit should equal order_amount (50k - 0 = 50k)

print("2. Verifying Logic for Pending Transactions:")
pending_txns = denormalized.filter(F.col("payment_status") == "Pending/Active")

if pending_txns.count() > 0:
    print(f"✅ Found {pending_txns.count()} pending transactions.")
    
    # Show the proof
    pending_txns.select(
        "retailer_id", 
        "order_amount", 
        "amount_paid",       # Should be 0.0
        "payment_deficit",   # Should match order_amount
        "payment_status"
    ).show(5)
else:
    print("⚠️ No pending transactions found (This might be correct if all loans are paid, or a Join issue).")

# TEST 3: The "Negative Deficit" Check
# Did we break the math? Deficit should never be negative (unless they overpaid).
negative_deficit = denormalized.filter(F.col("payment_deficit") < 0).count()
if negative_deficit == 0:
    print("✅ Math looks good: No negative deficits found.")
else:
    print(f"ℹ️ Note: Found {negative_deficit} overpayments (negative deficit).")

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 14, Finished, Available, Finished)

--- VALIDATION REPORT ---
1. Checking for NULLs (Should be 0):
+-------------------+-----------------+---------------+
|Nulls_in_AmountPaid|Nulls_in_DaysLate|Nulls_in_Status|
+-------------------+-----------------+---------------+
|                  0|                0|              0|
+-------------------+-----------------+---------------+

2. Verifying Logic for Pending Transactions:
⚠️ No pending transactions found (This might be correct if all loans are paid, or a Join issue).
✅ Math looks good: No negative deficits found.


In [13]:
# Check the distinct status names to find the one that means "Currently Owe Money"
denormalized.select("payment_status").distinct().show()

StatementMeta(, 3ff4e965-9196-4cc0-9458-a0ca48314fcb, 15, Finished, Available, Finished)

+-------------------+
|     payment_status|
+-------------------+
|          Defaulted|
|     Partially Paid|
|Paid (Slight Delay)|
|       Paid On Time|
|          Paid Late|
|     No Transaction|
+-------------------+

