In [0]:
%sql
Create schema gold;

In [0]:
%sql
use schema gold;

In [0]:
fraud_flag_df=spark.read.table("hive_metastore.silver.Fraud_Flag");
customer_segmentation_df=spark.read.table("hive_metastore.silver.Customer_segments");
transaction_df_cleaned=spark.read.table("hive_metastore.silver.cleaned_transaction");
customer_df_standardized=spark.read.table("hive_metastore.silver.cleaned_customer");
branch_df=spark.read.table("hive_metastore.silver.cleaned_branch");

In [0]:
segment_distribution = customer_segmentation_df.groupBy("segment_name").count()
segment_distribution.show()


+------------+-----+
|segment_name|count|
+------------+-----+
|  High_Value|   13|
|       Loyal|  581|
| Credit_Risk|  933|
|    New_User|   26|
+------------+-----+



In [0]:
from pyspark.sql import functions as F
unique_customers_per_segment = customer_segmentation_df.groupBy("segment_name").agg(F.countDistinct("customer_id").alias("unique_customers"))
unique_customers_per_segment.show()


+------------+----------------+
|segment_name|unique_customers|
+------------+----------------+
|  High_Value|              13|
|       Loyal|             175|
| Credit_Risk|             288|
|    New_User|              10|
+------------+----------------+



In [0]:
from pyspark.sql import functions as F
avg_segments_per_customer = customer_segmentation_df.groupBy("customer_id").agg(F.count("segment_name").alias("segment_count")).agg(F.avg("segment_count"))
avg_segments_per_customer.show()


+------------------+
|avg(segment_count)|
+------------------+
| 3.778588807785888|
+------------------+



####Daily transaction volume

In [0]:
from pyspark.sql import functions as F


daily_volume = transaction_df_cleaned.groupBy(F.date_trunc('day', 'timestamp').alias('day')).agg(
    F.sum('amount').alias('total_volume'),
    F.count('transaction_id').alias('transaction_count')
)
daily_volume.display()

day,total_volume,transaction_count
2023-01-05T00:00:00Z,144503.83695566654,45
2023-01-20T00:00:00Z,13928.789614200592,52
2023-01-08T00:00:00Z,179715.09124934673,49
2023-01-17T00:00:00Z,127313.40093874931,44
2023-01-26T00:00:00Z,32449.480460882187,48
2023-01-06T00:00:00Z,259853.8012394905,48
2023-01-23T00:00:00Z,300973.4696198702,47
2023-01-01T00:00:00Z,1410.8499903678894,47
2023-01-13T00:00:00Z,78897.4529747963,59
2023-01-22T00:00:00Z,135503.9815375805,50


### weekly transaction volume

In [0]:
weekly_volume = transaction_df_cleaned.groupBy(F.date_trunc('week', 'timestamp').alias('week')).agg(
    F.sum('amount').alias('total_volume'),
    F.count('transaction_id').alias('transaction_count')
)
weekly_volume.display()

week,total_volume,transaction_count
2023-01-23T00:00:00Z,1001274.58828187,336
2023-01-02T00:00:00Z,994626.7481119632,338
2023-01-09T00:00:00Z,941291.9074559212,355
2023-01-30T00:00:00Z,1106369.9766197205,339
2022-12-26T00:00:00Z,1410.8499903678894,47
2023-01-16T00:00:00Z,864536.8983428478,344
2023-02-06T00:00:00Z,974755.3784345388,339
2023-02-13T00:00:00Z,624410.7418041229,363
2023-02-20T00:00:00Z,1256468.9532836676,337
2023-02-27T00:00:00Z,630577.5518959761,202


#### Monthly transaction volume

In [0]:

monthly_volume = transaction_df_cleaned.groupBy(F.date_trunc('month', 'timestamp').alias('month')).agg(
    F.sum('amount').alias('total_volume'),
    F.count('transaction_id').alias('transaction_count')
)
monthly_volume.display()

month,total_volume,transaction_count
2023-01-01T00:00:00Z,4235137.305144787,1522
2023-02-01T00:00:00Z,3802676.7678308487,1376
2023-03-01T00:00:00Z,357909.5212453604,102


### potential fraud detection

In [0]:
potential_fraud = transaction_df_cleaned.filter(F.col('amount') > 10000)
potential_fraud.display()

transaction_id,customer_id,branch_id,channel,transaction_type,amount,currency,timestamp,status
T6074,C1666,B0009,BRANCH,DEPOSIT,34251.96,USD,2023-01-22T19:12:00Z,completed
T6094,C1298,B0001,BRANCH,DEPOSIT,57377.81,USD,2023-01-23T07:07:00Z,pending
T6102,C1729,B0005,MOBILE,DEPOSIT,48323.58,USD,2023-01-23T10:30:00Z,pending
T6105,C1208,B0004,MOBILE,PAYMENT,85108.67,USD,2023-01-23T11:52:00Z,completed
T6113,C1654,B0008,WEB,WITHDRAWAL,18974.85,USD,2023-01-23T15:37:00Z,completed
T6122,C1151,B0007,MOBILE,PAYMENT,25094.43,USD,2023-01-23T19:17:00Z,completed
T6125,C1537,B0007,BRANCH,WITHDRAWAL,64663.26,USD,2023-01-23T20:55:00Z,completed
T5203,C1570,B0004,MOBILE,DEPOSIT,14327.18,USD,2023-01-05T06:06:00Z,completed
T5205,C1538,B0009,WEB,TRANSFER,28476.47,USD,2023-01-05T06:59:00Z,completed
T5207,C1806,B0006,MOBILE,DEPOSIT,69988.62,USD,2023-01-05T07:43:00Z,completed
