## Reading data into Bronze layer from source files

#### Importing the config parameters

In [0]:
%run /capstone/config_script/setup_schema_and_config

#### Reading data from cust table

In [0]:
# Load the customer source data (Delta format) from the configured input path.
cust_df = spark.read.load(input_path_cust, format="delta")

Customer input path: dbfs:/FileStore/capstone/cust_tbl/
Branch input path: dbfs:/FileStore/capstone/branches_tbl/
Transaction input path: dbfs:/FileStore/capstone/txn_tbl/
Checkpoint location: dbfs:/FileStore/capstone/txn_tbl/checkpoint_location/
Columns for null check: ['transaction_id', 'customer_id']
Expected customer ID length: 5
Ordered fraud flag columns: ['flag_id', 'transaction_id', 'flag_type', 'timestamp', 'confidence_score']
Ordered customer segments columns: ['customer_id', 'segment_name', 'segment_description', 'last_updated_date']
bronze_tables['txn'] is: bronze.txn_tbl
bronze_tables['cust'] is: bronze.cust_tbl
bronze_tables['branch'] is: bronze.branches_tbl


#### Reading data from branches table


In [0]:
# Load the branch source data (Delta format) from the configured input path.
branches_df = spark.read.load(input_path_branch, format="delta")

#### Writing data into bronze tables

In [0]:

# Overwrite the customer and branch tables named in `bronze_tables` with the
# DataFrames read above.
cust_df.write.saveAsTable(bronze_tables['cust'], mode='overwrite')
branches_df.write.saveAsTable(bronze_tables['branch'], mode='overwrite')

#### *Schema Evolution Code commented for now*

In [0]:
# cust_df.write.format("delta").mode("append").option("mergeSchema","true").saveAsTable(bronze_tables['cust'])
# branches_df.write.format("delta").mode("append").option("mergeSchema","true").saveAsTable(bronze_tables['branch'])

### Reading streaming transaction data

In [0]:
from pyspark.sql import SparkSession

# `spark` is already used earlier in this notebook, so getOrCreate() returns
# that existing session rather than starting a second one.
spark = SparkSession.builder \
    .appName("Delta Stream Processing") \
    .getOrCreate()

# Read the transaction data as a stream from Delta format.
# No "header" option here: that is a CSV reader setting and has no meaning
# for a Delta source.
streamingDF = spark.readStream.format("delta").load(input_path_txn)

def foreach_batch_function(df, epoch_id):
    """Append one micro-batch to the transaction table and log its epoch.

    Args:
        df: Micro-batch DataFrame handed over by ``foreachBatch``.
        epoch_id: Identifier of the micro-batch; only used in the log line.
    """
    # NOTE(review): this is a plain append — if a micro-batch is retried after
    # a failure it could presumably be written twice. Delta's
    # txnAppId/txnVersion write options may be worth adopting; TODO confirm.
    df.write.format("delta").mode("append").saveAsTable(bronze_tables['txn'])
    print(f"Batch for epoch {epoch_id} processed.")

# Start the streaming write: every 10 seconds a micro-batch is passed to
# foreach_batch_function; progress is tracked at `checkpoint_location`.
query = (
    streamingDF.writeStream
    .foreachBatch(foreach_batch_function)
    .trigger(processingTime='10 seconds')
    .option("checkpointLocation", checkpoint_location)
    .start()
)

# Blocks this cell until the query is stopped or fails.
query.awaitTermination()


Batch for epoch 0 processed.
Batch for epoch 1 processed.
Batch for epoch 2 processed.
Batch for epoch 3 processed.
Batch for epoch 4 processed.
Batch for epoch 5 processed.
Batch for epoch 6 processed.
Batch for epoch 7 processed.
Batch for epoch 8 processed.
Batch for epoch 9 processed.
Batch for epoch 10 processed.
Batch for epoch 11 processed.
Batch for epoch 12 processed.
Batch for epoch 13 processed.
Batch for epoch 14 processed.
Batch for epoch 15 processed.
Batch for epoch 16 processed.
Batch for epoch 17 processed.
Batch for epoch 18 processed.
Batch for epoch 19 processed.
Batch for epoch 20 processed.
Batch for epoch 21 processed.
Batch for epoch 22 processed.
Batch for epoch 23 processed.
Batch for epoch 24 processed.
