### Gold Layer

%md
*Sourcing the shared setup/config notebook*

In [0]:
%run /capstone/config_script/setup_schema_and_config

%md
### Reading source data from the silver layer

In [0]:
# Open one streaming read per silver Delta table; the table names are looked up
# in the `silver_tables` mapping supplied by the config notebook.
fraud_flag_df = (
    spark.readStream
    .format("delta")
    .table(silver_tables['fraud_flag'])
)

customer_segments_df = (
    spark.readStream
    .format("delta")
    .table(silver_tables['customer_segments'])
)

Customer input path: dbfs:/FileStore/capstone/cust_tbl/
Branch input path: dbfs:/FileStore/capstone/branches_tbl/
Transaction input path: dbfs:/FileStore/capstone/txn_tbl/
Checkpoint location: dbfs:/FileStore/capstone/txn_tbl/checkpoint_location/
Checkpoint location silver txn: dbfs:/FileStore/capstone/checkpoint/silver/txn/
Checkpoint location cust txn: dbfs:/FileStore/capstone/checkpoint/silver/cust_txn/
Checkpoint location cust seg : dbfs:/FileStore/capstone/checkpoint/silver/customer_segments/
Checkpoint location fraud flag : dbfs:/FileStore/capstone/checkpoint/silver/fraud_flag/
Checkpoint location gold fraud flag : dbfs:/FileStore/capstone/checkpoint/gold/farud_flag/
Checkpoint location gold cust seg : dbfs:/FileStore/capstone/checkpoint/gold/customer_segments/
Columns for null check: ['transaction_id', 'customer_id']
Expected customer ID length: 5
Ordered fraud flag columns: ['flag_id', 'transaction_id', 'flag_type', 'timestamp', 'confidence_score']
Ordered customer segments col

#### Aggregating customers into segments

In [0]:
import uuid

from pyspark.sql.functions import count, current_timestamp, first, max as spark_max, udf
from pyspark.sql.types import StringType

# Stamp each row with the time it is processed by this query.
customer_segments_df_with_timestamp = customer_segments_df.withColumn("event_timestamp", current_timestamp())

# Apply watermark on the new timestamp column.
# NOTE(review): the aggregation below is not windowed and the sink uses
# "complete" output mode, which has to retain every group, so this watermark
# does not trim aggregation state. Confirm whether it is still wanted.
df_with_watermark = customer_segments_df_with_timestamp.withWatermark("event_timestamp", "10 minutes")

# Collapse the stream to one row per customer.
# - last_updated_date uses max() so it tracks the most recent processing time
#   seen for the customer; first() is non-deterministic and would stay pinned
#   to whichever row happened to be read first.
# - The former count("*") aggregate was dropped immediately afterwards, so it
#   is no longer computed.
# NOTE(review): first() on segment_name / segment_description picks an
# arbitrary row when a customer has several segment records -- confirm that
# is acceptable.
# NOTE: changing the aggregates changes the streaming state schema; restart
# this query with a fresh checkpoint location.
customer_segments_result_df = df_with_watermark.groupBy("customer_id").agg(
    first("segment_name").alias("segment_name"),
    first("segment_description").alias("segment_description"),
    spark_max("event_timestamp").alias("last_updated_date"),
)

# Drop 'segment_count' (if present) and add a surrogate segment_id
def generate_uuid(key=None):
    """Return a UUID as a string.

    Args:
        key: Optional value to derive the id from. When omitted or ``None`` a
            random version-4 UUID is returned. Otherwise ``str(key)`` is hashed
            into a name-based version-5 UUID, so the same key always produces
            the same id.

    Returns:
        The canonical 36-character string form of the UUID.
    """
    if key is None:
        return str(uuid.uuid4())
    return str(uuid.uuid5(uuid.NAMESPACE_OID, str(key)))

uuid_udf = udf(generate_uuid, StringType())

# The sink runs in "complete" mode, so the whole result is rewritten on every
# trigger. A random id would therefore change for every customer on each
# micro-batch; deriving it from customer_id keeps segment_id stable.
# Rows with a null customer_id still fall back to a random id.
customer_segments_result_df = (
    customer_segments_result_df
    .drop("segment_count")
    .withColumn('segment_id', uuid_udf("customer_id"))
)

# Column layout for the gold customer segments table
ordered_columns = [
    'segment_id',
    'customer_id',
    'segment_name',
    'segment_description',
    'last_updated_date',
]
df_ordered = customer_segments_result_df.select(*ordered_columns)

# Start the streaming write to the gold Delta table. "complete" mode emits the
# full aggregated result on every trigger. The StreamingQuery handle is the
# cell's output.
(
    df_ordered.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", checkpoint_location_gold_customer_segments)
    .table(gold_tables['gold_customer_segments'])
)


Out[6]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f384e4234c0>

#### Fraud flag table creation

In [0]:
# Pass the silver fraud-flag stream through to the gold Delta table unchanged,
# appending new rows as they arrive. The StreamingQuery handle is the cell's
# output.
(
    fraud_flag_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_location_gold_fraud_flag)
    .table(gold_tables['gold_fraud_flag'])
)


Out[7]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f384f0c7af0>

#### Checking intermediate results

In [0]:
# Sanity check: number of gold fraud flags per flag_type.
# The table name is taken from the shared config (the same entry the streaming
# writer uses) instead of being hard-coded, so the check cannot drift from the
# table that is actually written.
# The streaming write runs asynchronously, so the counts reflect whatever has
# been committed at the moment this cell runs.
result_flag_df = spark.sql(f"""
    SELECT flag_type, COUNT(*)
    FROM {gold_tables['gold_fraud_flag']}
    GROUP BY flag_type
""")

result_flag_df.show()

+---------------+--------+
|      flag_type|count(1)|
+---------------+--------+
|pattern_anomaly|       2|
| unusual_amount|      18|
|watchlist_match|     176|
+---------------+--------+



In [0]:
# Sanity check: number of gold customer rows per segment_name.
# The table name is taken from the shared config (the same entry the streaming
# writer uses) instead of being hard-coded, so the check cannot drift from the
# table that is actually written.
# The streaming write runs asynchronously, so the counts reflect whatever has
# been committed at the moment this cell runs.
result_seg_df = spark.sql(f"""
    SELECT segment_name, COUNT(*)
    FROM {gold_tables['gold_customer_segments']}
    GROUP BY segment_name
""")

result_seg_df.show()

+------------+--------+
|segment_name|count(1)|
+------------+--------+
|  High_Value|      17|
| Credit_Risk|      27|
|    New_User|       6|
|       Loyal|      13|
+------------+--------+

