In [0]:
# Base URI of the ETL storage container; the other layer paths are derived from it.
root_path = "abfss://etl@banksourcedata.dfs.core.windows.net"
# Folder under the container that holds the silver tables read by this notebook.
silver_root = root_path + "/silver"

In [0]:
# Load the three silver Delta tables that every gold table below is built from.
df_txn = spark.read.load(f"{silver_root}/transactions", format="delta")
df_acc_prof = spark.read.load(f"{silver_root}/AccountProfile", format="delta")
df_kyc = spark.read.load(f"{silver_root}/kyc", format="delta")

In [0]:
# Folder under the container that receives every gold table written by this notebook.
gold_root = root_path + "/gold"

In [0]:
# KYC dimension: keep one row per (CustomerID, DocumentID) pair from the silver KYC data.
kyc_columns = ["CustomerID", "DocumentType", "DocumentID", "KYCStatus"]
kyc_key = ["CustomerID", "DocumentID"]
dim_kyc = df_kyc.select(*kyc_columns).dropDuplicates(kyc_key)

# The gold table is fully replaced on every run.
dim_kyc.write.save(f"{gold_root}/dim_kyc", format="delta", mode="overwrite")


In [0]:
from pyspark.sql.functions import col
# Customer dimension: one row per CustomerID, with AccountHolderName exposed as CustomerName.
customer_source_columns = [
    "CustomerID",
    "AccountHolderName",
    "Phone",
    "Email",
    "DOB",
    "Address",
    "AccountOpenDate",
]
dim_customer = (
    df_acc_prof
    .select(*customer_source_columns)
    .withColumnRenamed("AccountHolderName", "CustomerName")
    .dropDuplicates(["CustomerID"])
)

# The gold table is fully replaced on every run.
dim_customer.write.save(f"{gold_root}/dim_customer", format="delta", mode="overwrite")


In [0]:
# Account dimension: one row per AccountNumber taken from the account profile data.
account_columns = [
    "AccountNumber",
    "CustomerID",
    "AccountType",
    "BranchName",
    "AccountStatus",
    "AccountOpenDate",
]
dim_account = df_acc_prof.select(*account_columns).dropDuplicates(["AccountNumber"])

# The gold table is fully replaced on every run.
dim_account.write.save(f"{gold_root}/dim_account", format="delta", mode="overwrite")


In [0]:
from pyspark.sql.functions import sequence, to_date, explode, year, month, dayofmonth, weekofyear, quarter

# Single-row frame whose `dates` column is an array with every day from 2020-01-01 to 2035-12-31.
date_range = spark.sql(
    "SELECT sequence(to_date('2020-01-01'), to_date('2035-12-31'), interval 1 day) AS dates"
)

# Explode the array into one row per day, then derive the calendar attributes from that day.
dim_date = (
    date_range
    .select(explode("dates").alias("Date"))
    .select(
        "Date",
        year("Date").alias("Year"),
        month("Date").alias("Month"),
        dayofmonth("Date").alias("Day"),
        weekofyear("Date").alias("Week"),
        quarter("Date").alias("Quarter"),
    )
)

# The gold table is fully replaced on every run.
dim_date.write.save(f"{gold_root}/dim_date", format="delta", mode="overwrite")


In [0]:
from pyspark.sql.functions import col, to_date

# Project the silver transactions down to the fact columns.
# transaction_time is reduced to a calendar date and exposed as `Date`.
# CustomerID, FraudFlag, ATMID and UPIRefID are not selected here.
fact_columns = [
    "TransactionID",
    "account_number",
    "transaction_type",
    "Amount",
    "transaction_status",
    to_date("transaction_time").alias("Date"),
    "bank_name",
    "Location",
    "latitude",
    "longitude",
    "channel",
]
fact_transactions = df_txn.select(*fact_columns)


In [0]:
# dim_date = spark.read.format("delta").load("/gold/dim_date")

# Attach the calendar attributes (Year, Month, Day, Week, Quarter) to every transaction.
# A left join keeps transactions whose Date is missing from dim_date; their calendar
# columns are NULL. dim_date's own Date column is dropped so only one Date remains.
# NOTE: this cell rebinds `fact_transactions`; re-running it without first re-running
# the cell that builds the projection joins dim_date a second time.
fact_transactions = fact_transactions.join(
    dim_date,
    fact_transactions["Date"] == dim_date["Date"],
    "left"
).drop(dim_date["Date"])

# df_txn is a full read of the silver table, so the fact table is replaced rather than
# appended to: with "append", every run of the notebook wrote all transactions again.
# This also matches how every other gold table in this notebook is written.
fact_transactions.write.format("delta").mode("overwrite").save(f"{gold_root}/fact_transactions")

In [0]:
# display(fact_transactions.limit(10))

In [0]:
# fact_transactions.write.format("delta").mode("overwrite").save("/gold/fact_transactions")

In [0]:
from pyspark.sql.functions import count, sum, when, col

# Per-account totals over all transactions currently held in `fact_transactions`.
fact_customer_profile = fact_transactions.groupBy("account_number").agg(
    count("*").alias("TotalTransactions"),
    sum("Amount").alias("TotalSpending"),
    # sum(when(col("FraudFlag")==1, 1).otherwise(0)).alias("FraudCount")
)

display(fact_customer_profile)

# The aggregate is recomputed from scratch on each run, so the table is replaced.
# Appending left one extra set of totals rows per account behind after every run.
fact_customer_profile.write.format("delta").mode("overwrite").save(f"{gold_root}/fact_customer_profile")

account_number,TotalTransactions,TotalSpending
10011,5,72353.59
10026,2,51289.880000000005
10047,9,91698.62
10043,1,26714.88
10044,3,5537.24
10039,4,19534.72
10048,4,14243.74
10020,4,24642.12
10036,5,40704.32
10033,6,31650.23


In [0]:
from pyspark.sql.functions import col, lit, current_date, expr
from delta.tables import DeltaTable

def apply_scd2(silver_df, dim_path, business_keys, tracked_columns, surrogate_key):
    """Maintain a Type-2 slowly changing dimension stored as a Delta table at `dim_path`.

    On the first call (no Delta table at `dim_path`) every source row is written as a
    current version. On later calls, current rows whose tracked columns differ from the
    source are closed (ValidTo = today, IsCurrent = 0) and a new current row is inserted
    for every business key that has no current row (brand-new keys and keys that were
    just closed). Unchanged keys are left untouched.

    Relies on the notebook-provided `spark` session.

    Args:
        silver_df: Source DataFrame; all of its columns are copied into the dimension.
        dim_path: Storage path of the dimension Delta table.
        business_keys: Non-empty list of column names identifying an entity. They must
            exist in both `silver_df` and the dimension table.
        tracked_columns: Column names whose change triggers a new version. They must
            exist in both `silver_df` and the dimension table. May be empty, in which
            case no row is ever closed and only new keys are inserted.
        surrogate_key: Name of the generated key column, filled with `uuid()`.

    Returns:
        None.

    Raises:
        ValueError: If `business_keys` is empty.
    """
    if not business_keys:
        raise ValueError("business_keys must contain at least one column")

    # Keep a single source row per business key. Delta MERGE fails when several source
    # rows match the same target row, and a dimension needs one current row per key.
    # NOTE(review): which duplicate survives is arbitrary; if duplicates can differ in
    # tracked columns, deduplicate deterministically in the caller instead.
    silver_df = silver_df.dropDuplicates(list(business_keys))

    # If dimension table does not exist -> first load
    if not DeltaTable.isDeltaTable(spark, dim_path):
        print("Creating new SCD2 dimension:", dim_path)

        # ValidTo is stored as the string literal '9999-12-31', matching the value
        # used by the insert branch below.
        df_new = silver_df \
            .withColumn(surrogate_key, expr("uuid()")) \
            .withColumn("ValidFrom", current_date()) \
            .withColumn("ValidTo", lit("9999-12-31")) \
            .withColumn("IsCurrent", lit(1))

        df_new.write.format("delta").mode("overwrite").save(dim_path)
        return

    # Load existing dimension table
    dim = DeltaTable.forPath(spark, dim_path)

    # A source row corresponds to the *current* dimension row with the same business keys.
    key_match = " AND ".join(f"silver.{c} = dim.{c}" for c in business_keys)
    current_match = f"{key_match} AND dim.IsCurrent = 1"

    # 1. Expire current rows whose tracked columns changed.
    #    `<=>` is null-safe equality, so a change to or from NULL counts as a change
    #    (a plain `<>` evaluates to NULL there and the change would be missed).
    if tracked_columns:
        change_cond = " OR ".join(
            f"NOT (silver.{c} <=> dim.{c})" for c in tracked_columns
        )
        dim.alias("dim").merge(
            silver_df.alias("silver"),
            f"{current_match} AND ({change_cond})"
        ).whenMatchedUpdate(
            set={
                "ValidTo": "current_date()",
                "IsCurrent": "0"
            }
        ).execute()

    # 2. Insert a new current version for every key without a current row: new keys
    #    plus the keys expired in step 1. The match condition must NOT include the
    #    change predicate; otherwise every unchanged source row is "not matched" and
    #    gets inserted again as a duplicate current row on each run.
    dim.alias("dim").merge(
        silver_df.alias("silver"),
        current_match
    ).whenNotMatchedInsert(
        values={
            surrogate_key: "uuid()",
            **{c: f"silver.{c}" for c in silver_df.columns},
            "ValidFrom": "current_date()",
            "ValidTo": "'9999-12-31'",
            "IsCurrent": "1"
        }
    ).execute()

In [0]:
# Customer SCD2 dimension, keyed by CustomerID.
# tracked_columns must name columns that exist in silver_df. The holder name is
# selected as "AccountHolderName" (it is not aliased here), so that is the name to
# track; "CustomerName" does not exist in this frame and cannot be resolved when the
# merge condition is built.
apply_scd2(
    silver_df=df_acc_prof.select(
        "CustomerID", "AccountHolderName", "Phone", "Email", "Address", "DOB"
    ),
    dim_path=f"{gold_root}/dim_customer_scd2",
    business_keys=["CustomerID"],
    tracked_columns=["AccountHolderName", "Phone", "Email", "Address", "DOB"],
    surrogate_key="CustomerSK"
)

In [0]:
# Account SCD2 dimension, keyed by AccountNumber.
# CustomerID is carried into the dimension but is not one of the tracked columns.
account_scd2_source = df_acc_prof.select(
    "AccountNumber", "CustomerID", "AccountType", "BranchName", "AccountStatus"
)
account_scd2_path = f"{gold_root}/dim_account_scd2"

apply_scd2(
    silver_df=account_scd2_source,
    dim_path=account_scd2_path,
    business_keys=["AccountNumber"],
    tracked_columns=["AccountType", "AccountStatus", "BranchName"],
    surrogate_key="AccountSK",
)


In [0]:
from pyspark.sql.functions import sum, count, to_date

# df_txn = spark.read.format("delta").load("/gold/fact_transactions")

# Daily roll-up of the silver transactions: row count, total amount, and how many
# rows came through the ATM and UPI channels.
txn_date = to_date("transaction_time").alias("Date")
atm_flag = when(col("channel") == "ATM", 1).otherwise(0)
upi_flag = when(col("channel") == "UPI", 1).otherwise(0)

daily_summary = df_txn.groupBy(txn_date).agg(
    count("*").alias("TotalTransactions"),
    sum("Amount").alias("TotalAmount"),
    sum(atm_flag).alias("ATM_Count"),
    sum(upi_flag).alias("UPI_Count"),
)

# The gold table is fully replaced on every run.
daily_summary.write.save(f"{gold_root}/agg_daily_transactions", format="delta", mode="overwrite")


In [0]:
from pyspark.sql.functions import year, month

# Spend and transaction count per account for each calendar month of transaction_time.
txn_year = year("transaction_time").alias("Year")
txn_month = month("transaction_time").alias("Month")

monthly_customer_spend = (
    df_txn
    .groupBy("account_number", txn_year, txn_month)
    .agg(
        sum("Amount").alias("MonthlySpend"),
        count("*").alias("MonthlyTxnCount"),
    )
)

# The gold table is fully replaced on every run.
monthly_customer_spend.write.save(
    f"{gold_root}/agg_customer_monthly_spend", format="delta", mode="overwrite"
)


In [0]:
# Total amount and transaction count per bank_name value.
branch_metrics = [
    sum("Amount").alias("TotalBranchAmount"),
    count("*").alias("TotalBranchTransactions"),
]
branch_summary = df_txn.groupBy("bank_name").agg(*branch_metrics)

# The gold table is fully replaced on every run.
branch_summary.write.save(f"{gold_root}/agg_branch_performance", format="delta", mode="overwrite")


In [0]:
# Transaction count and total amount per channel (grouped on `channel`, not `transaction_type`).
channel_metrics = [
    count("*").alias("TxnCount"),
    sum("Amount").alias("TotalAmount"),
]
channel_summary = df_txn.groupBy("channel").agg(*channel_metrics)

# The gold table is fully replaced on every run.
channel_summary.write.save(f"{gold_root}/agg_channel_performance", format="delta", mode="overwrite")


In [0]:
import os

# JDBC connection settings for SQL Server.
# The password is no longer stored in the notebook: it is read from the JDBC_PASSWORD
# environment variable. If the variable is unset the value is None and the connection
# attempt will fail, so configure it (or a secret-backed equivalent) before use.
# NOTE(review): the previously committed password should be treated as leaked and rotated.
jdbc_url = "<url>"
jdbc_props = {
  "user": os.environ.get("JDBC_USER", "niroop"),
  "password": os.environ.get("JDBC_PASSWORD"),
  "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver"
}


In [0]:
# df = spark.read.format("delta").load(f"{gold_root}/dim_customer_scd2")