In [0]:
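# Optional reset: uncomment to recursively delete the checkpoint directory used by the
# streaming write at the end of this notebook before starting a fresh run.
# dbutils.fs.rm("/mnt/realtimedeai/checkpoints/streaming/sales_transactions/", recurse=True)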


In [0]:

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType,LongType

# Explicit schema for the bronze sales files. Every column is nullable.
# transaction_time is declared as a string here and parsed into a timestamp
# in a later cell.
sales_schema = (
    StructType()
    .add("transaction_id", StringType(), nullable=True)
    .add("product_id", LongType(), nullable=True)
    .add("store_id", LongType(), nullable=True)
    .add("quantity_sold", LongType(), nullable=True)
    .add("sale_amount", DoubleType(), nullable=True)
    .add("transaction_time", StringType(), nullable=True)
)

print("Schema defined successfully!")


In [0]:

# Bronze folder that the stream reads Parquet files from.
streaming_bronze_path = "/mnt/realtimedeai/bronze/streaming/"

# Declare the streaming source: Parquet files under the bronze path, read
# with the explicit sales_schema defined earlier in the notebook.
streaming_sales_df = spark.readStream.load(
    streaming_bronze_path,
    format="parquet",
    schema=sales_schema,
)

print("Streaming data source set up successfully!")




In [0]:
# Deduplication: keep a single row per transaction_id.
# NOTE(review): no watermark is defined on this stream, so Spark presumably has
# to remember every transaction_id it has ever seen and the dedup state grows
# without bound — consider adding an event-time watermark; confirm acceptable.
# NOTE(review): this runs before the validity filter in the next cell, so if the
# retained copy of a transaction_id fails that filter the id is lost entirely —
# verify this ordering is intended.
streaming_sales_cleaned_df = streaming_sales_df.drop_duplicates(
    subset=["transaction_id"]
)

print("Duplicates removed successfully!")


In [0]:

# Step 2: Null Handling & Invalid Data Filtering


from pyspark.sql.functions import col

# Keep only valid sales rows. A comparison against NULL is never true, so rows
# with a missing quantity_sold or sale_amount are removed by the `> 0` checks.
streaming_sales_filtered_df = (
    streaming_sales_cleaned_df
    # Measures must be strictly positive.
    .where((col("quantity_sold") > 0) & (col("sale_amount") > 0))
    # Identifier columns must be present.
    .where(
        col("transaction_id").isNotNull()
        & col("product_id").isNotNull()
        & col("store_id").isNotNull()
    )
)

print("Nulls and invalid rows removed successfully!")


In [0]:
from pyspark.sql.functions import col, to_timestamp

# Replace the string transaction_time column with a parsed timestamp.
# The pattern expects a date, a literal 'T', a time, and six fractional-second
# digits.
# NOTE(review): how values that do not match this pattern are handled (NULL vs.
# error) depends on the Spark configuration — confirm for this cluster.
streaming_sales_timecasted_df = streaming_sales_filtered_df.withColumn(
    "transaction_time",
    to_timestamp("transaction_time", "yyyy-MM-dd'T'HH:mm:ss.SSSSSS"),
)

print("transaction_time converted to TimestampType successfully (with correct format)!")


In [0]:

# Step 4: Add processed_time (ingestion timestamp)


from pyspark.sql.functions import current_timestamp


# Append a processed_time column holding Spark's current timestamp, so each
# row records when this pipeline handled it (as opposed to transaction_time,
# which comes from the source data).
streaming_sales_ready_df = streaming_sales_timecasted_df.withColumn(
    "processed_time", current_timestamp()
)

print("processed_time column added successfully!")


In [0]:


# Destination Delta location in the silver layer, and the checkpoint directory
# the streaming query uses for this sink.
silver_output_path = "/mnt/realtimedeai/silver/streaming/sales_transactions/"
checkpoint_path = "/mnt/realtimedeai/checkpoints/streaming/sales_transactions/"

# Start the streaming write: append-only output in Delta format. The returned
# query handle is kept in write_query so later cells can inspect and stop it.
write_query = streaming_sales_ready_df.writeStream.start(
    path=silver_output_path,
    format="delta",
    outputMode="append",
    checkpointLocation=checkpoint_path,
)

print("Streaming write started successfully! Data is being written to the Silver layer.")


In [0]:
# Display the streaming query's current status as this cell's output.
write_query.status


In [0]:
# Stop the streaming query started by the write cell.
# NOTE(review): under "Run All" this executes right after start(), so the
# stream may be stopped before it has processed much data — confirm this cell
# is meant to be run manually.
write_query.stop()