In [0]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
%sh
ls /dbfs/mnt/demo-datasets/DE-Pro-new/bookstore/kafka-raw/

In [0]:
spark.sql("SHOW TABLES").show()

In [0]:
%sql
-- drop table orders_bronze;

In [0]:
df_raw = spark.read.json("dbfs:/mnt/demo-datasets/DE-Pro/bookstore/kafka-raw/01.json")
display(df_raw)

In [0]:
df_raw.printSchema()

In [0]:
def process_bronze():
    """Ingest the raw JSON files into ``bronze_table`` and wait for completion.

    Reads the kafka-raw directory as a stream through the ``cloudFiles`` source
    with an explicit schema, replaces the LONG ``timestamp`` column with
    ``timestamp / 1000`` cast to a timestamp (presumably epoch milliseconds --
    confirm against the producer), derives a ``year_month`` string in
    ``yyyy-MM`` form, and appends the result to ``bronze_table`` partitioned
    by ``topic`` and ``year_month``.

    The write uses an ``availableNow`` trigger, so the call blocks until the
    files currently available have been processed. Returns ``None``.
    """
    schema = "key BINARY, value BINARY, topic STRING, partition LONG, offset LONG, timestamp LONG"

    # Stage 1: streaming source over the landing directory.
    raw_events = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .schema(schema)
        .load("dbfs:/mnt/demo-datasets/DE-Pro-new/bookstore/kafka-raw")
    )

    # Stage 2: convert the numeric timestamp and derive the partition column.
    event_time = (col("timestamp") / 1000).cast("timestamp")
    bronze_events = (
        raw_events
        .withColumn("timestamp", event_time)
        .withColumn("year_month", date_format("timestamp", "yyyy-MM"))
    )

    # Stage 3: configure the sink, start the query and block until it is done.
    bronze_writer = (
        bronze_events.writeStream
        .option("checkpointLocation", "dbfs:/mnt/checkpoints/bronze")
        .option("mergeSchema", True)
        .partitionBy("topic", "year_month")
        .trigger(availableNow=True)
    )
    bronze_writer.table("bronze_table").awaitTermination()

In [0]:
%sh
# ls /dbfs/mnt/demo/
# ls /dbfs/mnt/demo-datasets/DE-Pro-new/bookstore/kafka-raw/
# rm -rf /dbfs/mnt/demo_pro/checkpoints

In [0]:
%sh
# rm -rf /dbfs/mnt/demo_pro_new

In [0]:
process_bronze()

In [0]:
batch_df = spark.table("bronze_table")
display(batch_df)

In [0]:
%sql
select count(*) from bronze_table;

In [0]:
json_schema = "order_id STRING, order_timestamp Timestamp, customer_id STRING, quantity BIGINT, total BIGINT, books ARRAY<STRUCT<book_id STRING, quantity BIGINT, subtotal BIGINT>>"

def process_silver():
    """Stream parsed orders from ``bronze_table`` into ``silver_table``.

    Keeps only rows whose ``topic`` is ``'orders'``, parses the binary
    ``value`` column as JSON using the notebook-level ``json_schema``,
    flattens the parsed struct into top-level columns and drops rows whose
    ``quantity`` is not greater than zero.

    The write uses an ``availableNow`` trigger and the call blocks until the
    query terminates. Returns ``None``.
    """
    # Column expression that decodes the payload into a struct named "v".
    order_payload = from_json(col("value").cast("string"), json_schema).alias("v")

    bronze_orders = spark.readStream.table("bronze_table").filter("topic = 'orders'")
    valid_orders = (
        bronze_orders
        .select(order_payload)
        .select("v.*")
        .filter("quantity > 0")
    )

    silver_writer = (
        valid_orders.writeStream
        .option("checkpointLocation", "dbfs:/mnt/checkpoints/silver")
        .trigger(availableNow=True)
    )
    silver_writer.table("silver_table").awaitTermination()

In [0]:
process_silver()

In [0]:
%sql
select * from silver_table;

In [0]:
%sql
select count(*) from silver_table;

In [0]:
%sql
alter table silver_table add constraint timestamp_within_range check (order_timestamp >= '2020-01-01');

In [0]:
%sql
describe history silver_table;

In [0]:
%sql
describe extended silver_table;

In [0]:
%sql
select * from silver_table limit 1;

In [0]:
%sql
insert into silver_table values('10','2022-03-12','C023472',0,0,NULL),
('20','2019-01-14','R23432',0,0,NULL),
('30','2020-01-01','F2341',0,0,NULL)

In [0]:
%sql
select * from silver_table
where order_id in ('10','20','30');

In [0]:
%sql
alter table silver_table add constraint valid_quantity check (quantity > 0)

In [0]:
%sql
select * from silver_table
where quantity <= 0;

In [0]:
json_schema = "order_id STRING, order_timestamp Timestamp, customer_id STRING, quantity BIGINT, total BIGINT, books ARRAY<STRUCT<book_id STRING, quantity BIGINT, subtotal BIGINT>>"

def process_silver():
    """Stream parsed, validated orders from ``bronze_table`` into ``silver_table``.

    Keeps only rows whose ``topic`` is ``'orders'``, parses the binary
    ``value`` column as JSON using the notebook-level ``json_schema`` and
    flattens the parsed struct. Rows are then filtered so that they satisfy
    the same rules as the table's CHECK constraints: ``quantity > 0`` and
    ``order_timestamp >= '2020-01-01'``.

    The write uses an ``availableNow`` trigger and the call blocks until the
    query terminates. Returns ``None``.
    """
    query = (spark.readStream.table("bronze_table")
             .filter("topic = 'orders'")
             .select(from_json(col("value").cast("string"), json_schema).alias("v"))
             .select("v.*")
             .filter("quantity > 0")
             # After select("v.*") only the parsed order fields remain, so the
             # bronze `timestamp` column no longer exists; the date rule has to
             # be applied to the order's own `order_timestamp`.
             .where(col("order_timestamp") >= '2020-01-01')
             .writeStream
             .option("checkpointLocation", "dbfs:/mnt/checkpoints/silver")
             .trigger(availableNow=True)
             .table("silver_table"))

    query.awaitTermination()

In [0]:
process_silver()

In [0]:
%sql
select * from silver_table;

In [0]:
%sql
alter table silver_table drop constraint timestamp_within_range;

In [0]:
%sql
describe extended silver_table;

In [0]:
%sql
drop table silver_table;
drop table bronze_table;

In [0]:
%sh
# ls /dbfs/mnt/checkpoints/
# rm -rf /dbfs/mnt/demo_pro_new/checkpoints/
# rm -rf /dbfs/mnt/checkpoints/

In [0]:
(spark.read
        .table("bronze_table")
        .filter("topic = 'orders'")
        .count())



In [0]:
batch_df.printSchema()

In [0]:
json_schema = "order_id STRING, order_timestamp Timestamp, customer_id STRING, quantity BIGINT, total BIGINT, books ARRAY<STRUCT<book_id STRING, quantity BIGINT, subtotal BIGINT>>"

def eliminate_duplicates():
    """Return the number of distinct orders currently in ``bronze_table``.

    Performs a batch (non-streaming) read of ``bronze_table``, keeps rows
    whose ``topic`` is ``'orders'``, parses the binary ``value`` column as
    JSON using the notebook-level ``json_schema``, and drops rows that repeat
    the same ``(order_id, order_timestamp)`` pair.

    Returns:
        The row count after de-duplication, as returned by ``DataFrame.count()``.
    """
    deduped_orders = (spark.read.table("bronze_table")
                      .filter("topic = 'orders'")
                      .select(from_json(col("value").cast("string"), json_schema).alias("v"))
                      .select("v.*")
                      .dropDuplicates(["order_id", "order_timestamp"]))

    # count() is a batch action that runs synchronously, so there is no
    # streaming query to await; the result must be returned for callers that
    # print or compare it.
    return deduped_orders.count()

In [0]:
print(eliminate_duplicates())

In [0]:
def process_bronze():
    """Ingest the DE-Pro kafka-raw JSON files into ``bronze_table``.

    Reads the directory as a stream through the ``cloudFiles`` source with an
    explicit schema, replaces the LONG ``timestamp`` column with
    ``timestamp / 1000`` cast to a timestamp, adds a ``year_month`` column
    formatted as ``yyyy-MM``, and writes to ``bronze_table`` partitioned by
    ``topic`` and ``year_month``.

    Runs with an ``availableNow`` trigger and blocks until the query
    terminates. Returns ``None``.
    """
    schema = "key BINARY, value BINARY, topic STRING, partition LONG, offset LONG, timestamp LONG"

    source = (
        spark.readStream
        .format("cloudFiles")
        .option("cloudFiles.format", "json")
        .schema(schema)
        .load("dbfs:/mnt/demo-datasets/DE-Pro/bookstore/kafka-raw/")
    )

    # Replace the numeric timestamp first; year_month is derived from the
    # already-converted column.
    converted = source.withColumn("timestamp", (col("timestamp") / 1000).cast("timestamp"))
    partitioned = converted.withColumn("year_month", date_format("timestamp", "yyyy-MM"))

    sink = (
        partitioned.writeStream
        .option("checkpointLocation", "dbfs:/mnt/demo_pro/checkpoints/final_draft/bronze_table")
        .option("mergeSchema", "True")
        .partitionBy("topic", "year_month")
        .trigger(availableNow=True)
    )

    running_query = sink.table("bronze_table")
    running_query.awaitTermination()

In [0]:
%sh
ls /dbfs/mnt/demo-datasets/

In [0]:
%sh
# rm -rf /dbfs/mnt/demo-datasets/DE-Pro/bookstore/kafka-raw/03.json

In [0]:
%sql
create or replace table books_silver
(book_id string, title string, author string, price double, current boolean, effective_date timestamp, end_date timestamp)

In [0]:
def type2_upsert(microBatchDF, batch):
    """Merge one micro-batch of book updates into ``books_silver`` as history rows.

    The micro-batch is registered as the temp view ``updates`` and a single
    MERGE statement is run on the micro-batch's own Spark session:

    * every update is staged once keyed by its ``book_id``; an update whose
      price differs from the current ``books_silver`` row is staged a second
      time with a NULL ``merge_key``;
    * a keyed row that matches a current record with a different price closes
      that record (``current = false``, ``end_date`` = the update's ``updated``);
    * rows that match nothing -- including the NULL-keyed copies -- are inserted
      as the new current version with ``effective_date`` = ``updated``.

    Args:
        microBatchDF: DataFrame holding the updates of this micro-batch; it must
            expose ``book_id``, ``title``, ``author``, ``price`` and ``updated``.
        batch: second argument of the ``foreachBatch`` callback; not used here.

    Returns:
        None.
    """
    # Statement text is kept exactly as authored; only the Python around it differs.
    scd2_merge = """
        MERGE INTO books_silver
        USING (
            SELECT updates.book_id as merge_key, updates.*
            FROM updates

            UNION ALL

            SELECT NULL as merge_key, updates.*
            FROM updates
            JOIN books_silver ON updates.book_id = books_silver.book_id
            WHERE books_silver.current = true AND updates.price <> books_silver.price
          ) staged_updates
        ON books_silver.book_id = merge_key 
        WHEN MATCHED AND books_silver.current = true AND books_silver.price <> staged_updates.price THEN
          UPDATE SET current = false, end_date = staged_updates.updated
        WHEN NOT MATCHED THEN
          INSERT (books_silver.book_id, books_silver.title, books_silver.author, books_silver.price, books_silver.current, books_silver.effective_date, books_silver.end_date)
          VALUES (staged_updates.book_id, staged_updates.title, staged_updates.author, staged_updates.price, true, staged_updates.updated, NULL)
    """

    # The view must exist before the MERGE runs, and the MERGE must use the
    # session that owns the view.
    microBatchDF.createOrReplaceTempView("updates")
    batch_session = microBatchDF.sparkSession
    batch_session.sql(scd2_merge)

In [0]:
def process_books_new():
    """Stream book updates from ``bronze_table_alpha`` into ``books_silver``.

    Keeps rows whose ``topic`` is ``'books'``, parses the binary ``value``
    column as JSON with a local schema, flattens the parsed struct and hands
    each micro-batch to ``type2_upsert`` through ``foreachBatch``.

    Runs with an ``availableNow`` trigger and blocks until the query
    terminates. ``bronze_table_alpha`` must already exist when this is called.
    Returns ``None``.
    """
    json_schema = "book_id string, title string, author string, price double, current boolean, updated timestamp, end_date timestamp"

    # Column expression that decodes the payload into a struct named "v".
    book_payload = from_json(col("value").cast("string"), json_schema).alias("v")

    book_updates = (
        spark.readStream
        .table("bronze_table_alpha")
        .filter("topic = 'books'")
        .select(book_payload)
        .select("v.*")
    )

    upsert_writer = (
        book_updates.writeStream
        .foreachBatch(type2_upsert)
        .option("checkpointLocation", "dbfs:/mnt/demo_pro/checkpoints/final_draft/bronze_orders")
        .trigger(availableNow=True)
    )
    upsert_writer.start().awaitTermination()

In [0]:
%sql
-- drop  table bronze_table;
-- drop table books_silver;

In [0]:
process_books_new()

In [0]:
%sql
select * from books_silver;

In [0]:
%sql
-- IF EXISTS keeps this cleanup cell re-runnable: on a fresh session the
-- bronze_view temp view has not been created yet and a plain DROP would fail.
drop view if exists bronze_view;

In [0]:
# Columns expected in the raw JSON files; `timestamp` arrives as a LONG.
schema = "key BINARY, value BINARY, topic STRING, partition LONG, offset LONG, timestamp LONG"

# Expose the books-updates directory as the streaming temp view `bronze_view`:
# the LONG timestamp is divided by 1000 and cast to a timestamp, and a
# `year_month` column (yyyy-MM) is derived from the converted value.
(
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .schema(schema)
    .load("dbfs:/mnt/demo-datasets/DE-Pro/bookstore/kafka-raw/books-updates/")
    .withColumn("timestamp", (col("timestamp")/1000).cast("timestamp"))
    .withColumn("year_month", date_format("timestamp", "yyyy-MM"))
    .createOrReplaceTempView("bronze_view")
)

In [0]:
%sh
ls /dbfs/mnt/demo-datasets/DE-Pro/bookstore/kafka-raw/books-updates/

In [0]:
%sql
select * from bronze_view;

In [0]:
# Write the streaming `bronze_view` into `bronze_table_alpha`, partitioned by
# topic and year_month. The availableNow run is awaited so that the cells below,
# which query bronze_table_alpha, only execute once the write has finished --
# the same pattern the process_* functions in this notebook use.
(spark.table("bronze_view")
    .writeStream
    .option("checkpointLocation","dbfs:/mnt/demo_pro/checkpoints/revamp/bronze-climax")
    .option("mergeSchema","True")
    .partitionBy("topic","year_month")
    .trigger(availableNow=True)
    .table("bronze_table_alpha")
    .awaitTermination())

In [0]:
%sql
select * from bronze_table_alpha
where topic='books';
-- 1200, 21, 2000
-- 2700,21,4500

In [0]:
%sql
select * from bronze_table_alpha;

In [0]:
%sh
cp /dbfs/mnt/demo-datasets/DE-Pro/bookstore/kafka-streaming/04.json /dbfs/mnt/demo-datasets/DE-Pro/bookstore/kafka-raw/books-updates

In [0]:
%sh
ls /dbfs/mnt/demo-datasets/DE-Pro/bookstore/