In [0]:
# Stage the sample input files: copy each one from the workspace filesystem
# into the DBFS directory used as the streaming input location.
dbutils.fs.cp(
    "file:/Workspace/Shared/sales_csv.csv",
    "dbfs:/Filestore/streaming/input/sales_csv.csv",
)
dbutils.fs.cp(
    "file:/Workspace/Shared/customer_datas.json",
    "dbfs:/Filestore/streaming/input/customer_datas.json",
)

True

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize (or reuse) the SparkSession for this notebook.
spark = SparkSession.builder.appName("Structured Streaming Example").getOrCreate()

# Schema for the sales CSV files (DDL string form).
sales_schema = "OrderID INT , OrderDate STRING , CustomerID STRING , PRODUCT STRING, Quantity INT ,Price DOUBLE"

# Read streaming sales data from CSV files.
# The input directory holds both the CSV and the JSON sample files, so the
# reader is restricted to "*.csv" via pathGlobFilter; without the filter the
# CSV source would also try to parse the JSON file and emit garbage rows.
df_sales_stream = (
    spark.readStream
    .format("csv")
    .option("header", "true")
    .option("pathGlobFilter", "*.csv")
    .schema(sales_schema)
    .load("dbfs:/Filestore/streaming/input/")
)

# Schema for the customer JSON files (DDL string form).
customer_schema = "CustomerID STRING, CustomerName STRING, Region STRING , SignupDate STRING"

# Read streaming customer data from JSON files.
# Restricted to "*.json" for the same reason: the CSV file in the shared
# directory would otherwise be parsed as JSON and surface as all-null rows.
df_customers_stream = (
    spark.readStream
    .format("json")
    .option("pathGlobFilter", "*.json")
    .schema(customer_schema)
    .load("dbfs:/Filestore/streaming/input/")
)

# Show the customer stream schema as a quick sanity check.
df_customers_stream.printSchema()


root
 |-- CustomerID: string (nullable = true)
 |-- CustomerName: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- SignupDate: string (nullable = true)



In [0]:
from pyspark.sql.functions import current_date, datediff ,to_timestamp

# Sales projection: keep the order columns, parse OrderDate into a TIMESTAMP,
# and derive TotalAmount = Quantity * Price.
df_sales_transformed = df_sales_stream.select(
    "OrderID",
    to_timestamp("OrderDate", "yyyy-MM-dd HH:mm:ss").alias("OrderDate"),
    "Product",
    "Quantity",
    "Price",
    (col("Quantity") * col("Price")).alias("TotalAmount"),
)
print("Applies transformation on sales data...")

# Declare a 1-day watermark on OrderDate, then sum TotalAmount per Product.
# NOTE(review): the grouping key does not include OrderDate (or a window on
# it), so the watermark likely does not bound aggregation state or drop late
# rows here - confirm whether a time window was intended.
df_sales_aggregated = (
    df_sales_transformed
    .withWatermark("OrderDate", "1 day")
    .groupBy("Product")
    .agg({"TotalAmount": "sum"})
)

print("Aggregated sales data...")

# Customer enrichment: YearsSinceSignup is the day difference between today
# and SignupDate, divided by 365 (a fractional, approximate year count).
df_customer_transformed = df_customers_stream.withColumn(
    "YearsSinceSignup",
    datediff(
        current_date(),
        to_timestamp("SignupDate", "yyyy-MM-dd"),
    ).cast("int") / 365,
)
print("Applies transformation on customer data...")

Applies transformation on sales data...
Aggregated sales data...
Applies transformation on customer data...


In [0]:
# Debug sink 1: stream the per-product sales totals to the console.
# "update" mode emits only the aggregate rows that changed in each trigger.
sales_query = (
    df_sales_aggregated.writeStream
    .format("console")
    .outputMode("update")
    .start()
)

print("Started streaming query to write aggregated sales data to console...")

# Debug sink 2: stream the enriched customer rows to the console.
# "append" mode emits each new row once.
customers_query = (
    df_customer_transformed.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

print("Started streaming query to write transformed customer data to console...")

Started streaming query to write aggregated sales data to console...
Started streaming query to write transformed customer data to console...
