### Gold Layer

%md
*Sourcing the config file*

In [0]:
%run /capstone/config_script/setup_schema_and_config

%md
### Reading source data from the silver layer

In [0]:
# Open the two silver-layer Delta tables as streaming sources.
# (A one-off batch load would use spark.read.table(...) on the same table names.)
fraud_flag_df = (
    spark.readStream
    .format("delta")
    .table(silver_tables["fraud_flag"])
)
customer_segments_df = (
    spark.readStream
    .format("delta")
    .table(silver_tables["customer_segments"])
)

Customer input path: dbfs:/FileStore/capstone/cust_tbl/
Branch input path: dbfs:/FileStore/capstone/branches_tbl/
Transaction input path: dbfs:/FileStore/capstone/txn_tbl/
Checkpoint location: dbfs:/FileStore/capstone/txn_tbl/checkpoint_location/
Columns for null check: ['transaction_id', 'customer_id']
Expected customer ID length: 5
Ordered fraud flag columns: ['flag_id', 'transaction_id', 'flag_type', 'timestamp', 'confidence_score']
Ordered customer segments columns: ['customer_id', 'segment_name', 'segment_description', 'last_updated_date']
bronze_tables['txn'] is: bronze.txn_tbl
bronze_tables['cust'] is: bronze.cust_tbl
bronze_tables['branch'] is: bronze.branches_tbl


#### Aggregating customers into segments

In [0]:
from pyspark.sql.functions import count, first
from pyspark.sql import functions as F
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import uuid

# Fixed namespace used to derive segment ids from customer ids, so the same
# customer always maps to the same segment_id.
_SEGMENT_ID_NAMESPACE = uuid.uuid5(uuid.NAMESPACE_URL, "capstone/gold_customer_segments")


def generate_uuid(customer_id=None):
    """Return a UUID string to use as a segment id.

    Args:
        customer_id: Value the id is derived from. When given, the result is a
            name-based UUID (version 5), so repeated calls with the same
            customer_id return the same string. When omitted or None, a random
            UUID (version 4) is returned, as before.

    Returns:
        The UUID in its canonical 36-character string form.
    """
    if customer_id is None:
        return str(uuid.uuid4())
    return str(uuid.uuid5(_SEGMENT_ID_NAMESPACE, str(customer_id)))


uuid_udf = udf(generate_uuid, StringType())

# NOTE(review): the aggregation below is not keyed or windowed on
# last_updated_date and the sink uses "complete" output mode, so this watermark
# is not expected to drop late rows or bound state -- confirm whether it is needed.
df_with_watermark = customer_segments_df.withWatermark("last_updated_date", "10 minutes")

# One output row per customer.
# NOTE(review): first() returns an arbitrary row's value within each group, not
# necessarily the most recently updated one -- confirm this is the intent.
customer_segments_result_df = (
    df_with_watermark.groupBy("customer_id")
    .agg(
        first("segment_name").alias("segment_name"),
        first("segment_description").alias("segment_description"),
        first("last_updated_date").alias("last_updated_date"),
    )
    # "complete" output mode rewrites the whole result on every micro-batch, so
    # the id must be a pure function of the row: a random UUID here would give
    # every customer a different segment_id after each trigger.
    .withColumn("segment_id", uuid_udf(F.col("customer_id")))
)

# Reorder the columns
ordered_columns = ['segment_id', 'customer_id', 'segment_name', 'segment_description', 'last_updated_date']
df_ordered = customer_segments_result_df.select(ordered_columns)

# Continuously write the aggregated result to the gold Delta table. The query
# handle is left as the cell's last expression so the notebook displays it.
(
    df_ordered.writeStream
    .format("delta")
    .outputMode("complete")
    .option("checkpointLocation", "dbfs:/FileStore/capstone/txn_tbl/checkpoint/gold_customer_segments/")
    .table(gold_tables['gold_customer_segments'])
)



Out[54]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f30ccf7cb80>

In [0]:
# Print the column names and types of the streaming DataFrame that is written to the gold customer-segments table.
df_ordered.printSchema()

root
 |-- segment_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- segment_name: string (nullable = true)
 |-- segment_description: string (nullable = true)
 |-- last_updated_date: timestamp (nullable = true)



#### Creating the gold fraud-flag table

In [0]:
# Stream the silver fraud-flag rows, unchanged, into the gold fraud-flag table.
#
# Every streaming query must own its checkpoint directory: the checkpoint stores
# that query's offsets and state. This query previously pointed at
# .../checkpoint/gold_customer_segments/, which the customer-segments query
# already uses, so it now has a dedicated directory.
# NOTE(review): a new checkpoint directory makes the query start from the
# beginning of the source; if the gold table already holds rows from an earlier
# run, clear or de-duplicate it before restarting -- confirm with the table owner.
fraud_flag_df.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", "dbfs:/FileStore/capstone/txn_tbl/checkpoint/gold_fraud_flag/") \
    .table(gold_tables['gold_fraud_flag'])




Out[61]: <pyspark.sql.streaming.query.StreamingQuery at 0x7f30cc7a7550>