In [0]:
read data from adls abfss://raw@shradls.dfs.core.windows.net/

In [0]:
df_raw = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("delimiter", ",") \
    .csv("abfss://raw@shradls.dfs.core.windows.net/")


In [0]:
df_raw.show()

+--------------+-------------------+----------------+-----------+------+--------+-------------+-------------------+
|transaction_id|     transaction_ts|     card_number|merchant_id|amount|currency|      channel|    last_updated_ts|
+--------------+-------------------+----------------+-----------+------+--------+-------------+-------------------+
|             1|2024-09-01 09:05:00|4111111111111111|       M001| 500.0|     INR|          POS|2024-09-01 09:10:00|
|             2|2024-09-01 09:20:00|4111111111111111|       M002|1200.0|     INR|       ONLINE|2024-09-01 09:25:00|
|             3|2024-09-01 09:45:00|5222222222222222|       M003|  75.0|     USD|INTERNATIONAL|2024-09-01 09:50:00|
|             4|2024-09-01 10:00:00|4333333333333333|       M001|3000.0|     INR|          POS|2024-09-01 10:05:00|
|             5|2024-09-01 10:15:00|4111111111111111|       M004| 900.0|     INR|       ONLINE|2024-09-01 10:18:00|
|             6|2024-09-01 10:30:00|5222222222222222|       M002|2500.0|

In [0]:
transaction_id       INT
transaction_ts       TIMESTAMP
card_number           STRING
merchant_id           STRING
amount               DOUBLE
currency              STRING
channel               STRING
last_updated_ts       TIMESTAMP


In [0]:
from pyspark.sql.functions import when, col

df_std = df_raw.withColumn(
    "amount_in_inr",
    when(col("currency") == "USD", col("amount") * 83)
    .otherwise(col("amount"))
)


In [0]:
df_std.show()

+--------------+-------------------+----------------+-----------+------+--------+-------------+-------------------+-------------+
|transaction_id|     transaction_ts|     card_number|merchant_id|amount|currency|      channel|    last_updated_ts|amount_in_inr|
+--------------+-------------------+----------------+-----------+------+--------+-------------+-------------------+-------------+
|             1|2024-09-01 09:05:00|4111111111111111|       M001| 500.0|     INR|          POS|2024-09-01 09:10:00|        500.0|
|             2|2024-09-01 09:20:00|4111111111111111|       M002|1200.0|     INR|       ONLINE|2024-09-01 09:25:00|       1200.0|
|             3|2024-09-01 09:45:00|5222222222222222|       M003|  75.0|     USD|INTERNATIONAL|2024-09-01 09:50:00|       6225.0|
|             4|2024-09-01 10:00:00|4333333333333333|       M001|3000.0|     INR|          POS|2024-09-01 10:05:00|       3000.0|
|             5|2024-09-01 10:15:00|4111111111111111|       M004| 900.0|     INR|       ON

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

window_spec = Window.partitionBy(
    "card_number", "merchant_id", "transaction_ts"
).orderBy(col("last_updated_ts").desc())

df_dedup = (
    df_std
    .withColumn("rn", row_number().over(window_spec))
    .filter(col("rn") == 1)
    .drop("rn")
)


In [0]:
df_dedup.write \
    .format("delta") \
    .mode("overwrite") \
    .saveAsTable("hdfc_data_7405616769635716.prokect.silver_credit_card_transactions")


In [0]:
%sql
SELECT * 
FROM hdfc_data_7405616769635716.prokect.silver_credit_card_transactions
ORDER BY transaction_ts;



transaction_id,transaction_ts,card_number,merchant_id,amount,currency,channel,last_updated_ts,amount_in_inr
16,2024-09-01T08:40:00.000Z,4111111111111111,M001,700.0,INR,POS,2024-09-02T08:30:00.000Z,700.0
1,2024-09-01T09:05:00.000Z,4111111111111111,M001,500.0,INR,POS,2024-09-01T09:10:00.000Z,500.0
2,2024-09-01T09:20:00.000Z,4111111111111111,M002,1200.0,INR,ONLINE,2024-09-01T09:25:00.000Z,1200.0
3,2024-09-01T09:45:00.000Z,5222222222222222,M003,75.0,USD,INTERNATIONAL,2024-09-01T09:50:00.000Z,6225.0
4,2024-09-01T10:00:00.000Z,4333333333333333,M001,2800.0,INR,POS,2024-09-03T09:00:00.000Z,2800.0
5,2024-09-01T10:15:00.000Z,4111111111111111,M004,900.0,INR,ONLINE,2024-09-01T10:18:00.000Z,900.0
6,2024-09-01T10:30:00.000Z,5222222222222222,M002,2500.0,INR,ONLINE,2024-09-01T10:35:00.000Z,2500.0
7,2024-09-01T10:45:00.000Z,4333333333333333,M003,120.0,USD,INTERNATIONAL,2024-09-01T10:50:00.000Z,9960.0
8,2024-09-01T11:00:00.000Z,4111111111111111,M001,1800.0,INR,POS,2024-09-01T11:05:00.000Z,1800.0
9,2024-09-01T11:20:00.000Z,5222222222222222,M004,200.0,USD,INTERNATIONAL,2024-09-01T11:25:00.000Z,16600.0


In [0]:
%sql
SELECT COUNT(*) 
FROM hdfc_data_7405616769635716.prokect.silver_credit_card_transactions;


COUNT(*)
23


In [0]:
df_incremental
df_incremental.createOrReplaceTempView("incremental_transactions")
MERGE INTO hdfc_data_7405616769635716.project.silver_credit_card_transactions AS target
USING incremental_transactions AS source
ON  target.card_number = source.card_number
AND target.merchant_id = source.merchant_id
AND target.transaction_ts = source.transaction_ts

WHEN MATCHED
AND source.last_updated_ts > target.last_updated_ts
THEN UPDATE SET
    target.transaction_id   = source.transaction_id,
    target.amount           = source.amount,
    target.amount_in_inr    = source.amount_in_inr,
    target.currency         = source.currency,
    target.channel          = source.channel,
    target.last_updated_ts  = source.last_updated_ts

WHEN NOT MATCHED
THEN INSERT *;


In [0]:
%sql
CREATE OR REPLACE TABLE hdfc_data_7405616769635716.prokect.gold_daily_transaction_summary
AS
SELECT
    DATE(transaction_ts) AS txn_date,
    COUNT(*) AS total_transactions,
    SUM(amount_in_inr) AS total_amount_in_inr
FROM hdfc_data_7405616769635716.prokect.silver_credit_card_transactions
GROUP BY DATE(transaction_ts);


num_affected_rows,num_inserted_rows


In [0]:
%sql
CREATE OR REPLACE TABLE hdfc_data_7405616769635716.prokect.gold_channel_spend_summary
AS
SELECT
    channel,
    SUM(amount_in_inr) AS total_spend_in_inr
FROM hdfc_data_7405616769635716.prokect.silver_credit_card_transactions
GROUP BY channel;


num_affected_rows,num_inserted_rows


In [0]:
%sql DESCRIBE TABLE hdfc_data_7405616769635716.prokect.silver_credit_card_transactions


col_name,data_type,comment
transaction_id,int,
transaction_ts,timestamp,
card_number,bigint,
merchant_id,string,
amount,double,
currency,string,
channel,string,
last_updated_ts,timestamp,
amount_in_inr,double,


In [0]:
%sql
select * from hdfc_data_7405616769635716.prokect.silver_credit_card_transactions;

transaction_id,transaction_ts,card_number,merchant_id,amount,currency,channel,last_updated_ts,amount_in_inr
16,2024-09-01T08:40:00.000Z,4111111111111111,M001,700.0,INR,POS,2024-09-02T08:30:00.000Z,700.0
1,2024-09-01T09:05:00.000Z,4111111111111111,M001,500.0,INR,POS,2024-09-01T09:10:00.000Z,500.0
8,2024-09-01T11:00:00.000Z,4111111111111111,M001,1800.0,INR,POS,2024-09-01T11:05:00.000Z,1800.0
2,2024-09-01T09:20:00.000Z,4111111111111111,M002,1200.0,INR,ONLINE,2024-09-01T09:25:00.000Z,1200.0
14,2024-09-01T12:45:00.000Z,4111111111111111,M002,3100.0,INR,POS,2024-09-01T12:50:00.000Z,3100.0
11,2024-09-01T12:00:00.000Z,4111111111111111,M003,95.0,USD,INTERNATIONAL,2024-09-01T12:05:00.000Z,7885.0
22,2024-09-03T10:00:00.000Z,4111111111111111,M003,85.0,USD,INTERNATIONAL,2024-09-03T10:05:00.000Z,7055.0
5,2024-09-01T10:15:00.000Z,4111111111111111,M004,900.0,INR,ONLINE,2024-09-01T10:18:00.000Z,900.0
19,2024-09-02T10:20:00.000Z,4111111111111111,M004,1400.0,INR,ONLINE,2024-09-02T10:25:00.000Z,1400.0
4,2024-09-01T10:00:00.000Z,4333333333333333,M001,2800.0,INR,POS,2024-09-03T09:00:00.000Z,2800.0


In [0]:
%sql
SELECT
    CAST(transaction_ts AS DATE) AS txn_date,
    COUNT(*) AS total_transactions,
    SUM(amount_in_inr) AS total_amount_in_inr
FROM hdfc_data_7405616769635716.prokect.silver_credit_card_transactions
GROUP BY CAST(transaction_ts AS DATE)
ORDER BY txn_date;


txn_date,total_transactions,total_amount_in_inr
2024-09-01,16,70160.0
2024-09-02,4,14930.0
2024-09-03,3,11055.0


In [0]:
%sql
SELECT
    channel,
    COUNT(*) AS txn_count,
    SUM(amount_in_inr) AS total_spend_in_inr
FROM hdfc_data_7405616769635716.prokect.silver_credit_card_transactions
GROUP BY channel
ORDER BY total_spend_in_inr DESC;


channel,txn_count,total_spend_in_inr
INTERNATIONAL,7,67645.0
ONLINE,9,14800.0
POS,7,13700.0


In [0]:
%sql
SELECT
    card_number,
    CAST(transaction_ts AS DATE) AS txn_date,
    SUM(amount_in_inr) AS daily_spend_in_inr
FROM hdfc_data_7405616769635716.prokect.silver_credit_card_transactions
GROUP BY card_number, CAST(transaction_ts AS DATE)
HAVING SUM(amount_in_inr) > 10000
ORDER BY daily_spend_in_inr DESC;


card_number,txn_date,daily_spend_in_inr
5222222222222222,2024-09-01,38315.0
4111111111111111,2024-09-01,16085.0
4333333333333333,2024-09-01,15760.0
