In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [2]:
# On Databricks a `spark` session is already running and the `%fs` magic can be
# used instead, e.g.:
#   %fs
#   ls /databricks-datasets

# Outside Databricks the session has to be created (or fetched) explicitly.
spark = (
    SparkSession.builder
    .master("local")
    .appName("lambda")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/03 15:57:20 WARN util.Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/04/03 15:57:20 WARN util.Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [3]:
# Batch-read a single orders file with schema inference; the next cells take
# the column definitions from this DataFrame's schema.
orders_df = spark.read.csv(
    "/datalake/bronze/dshop/orders/spark_stream/orders_2022-02-24.csv",
    header=True,
    inferSchema=True,
)

[Stage 1:>                                                          (0 + 1) / 1]                                                                                

In [4]:
# Copy the inferred StructField objects into a plain list.
orders_schema = list(orders_df.schema)

In [5]:
# Echo the inferred field list so the column names and types can be checked.
orders_schema

[StructField(order_id,IntegerType,true),
 StructField(product_id,IntegerType,true),
 StructField(client_id,IntegerType,true),
 StructField(store_id,IntegerType,true),
 StructField(quantity,IntegerType,true),
 StructField(order_date,StringType,true)]

In [6]:
# Wrap the inferred fields in a StructType so they can be passed to a reader
# as an explicit schema.
csv_schema = StructType(orders_schema)

In [7]:
# Streaming read of the whole orders directory using the explicit schema;
# maxFilesPerTrigger=1 is the per-trigger file limit set on the source.
orders_data = (
    spark.readStream
    .schema(csv_schema)
    .option("maxFilesPerTrigger", 1)
    .csv("/datalake/bronze/dshop/orders/spark_stream/")
)

In [8]:
# Keep store 1 only, scale quantity by 3/2 (rounded to 2 decimals) and expose it
# as `order_quantity`, and cast the string order_date to a date.
# NOTE: this rebinds `orders_df`, which earlier held the single-file batch read.
orders_df = (
    orders_data
    .filter(F.col('store_id') == 1)
    .withColumn('quantity', F.round(F.col('quantity') * 3 / 2, 2))
    .withColumnRenamed('quantity', 'order_quantity')
    .withColumn('order_date', F.col('order_date').cast('date'))
)

In [9]:
# In this kernel display() prints just the DataFrame's column names and types.
display(orders_df)

DataFrame[order_id: int, product_id: int, client_id: int, store_id: int, order_quantity: double, order_date: date]

In [10]:
# Stream the transformed orders into an in-memory table named "orders_stream"
# so that later cells can query it through spark.sql.
orders = (
    orders_df.writeStream
    .format('memory')
    .queryName('orders_stream')
    .outputMode("append")
    .start()
)
# start() returns immediately while micro-batches keep running in the
# background, so a query issued right afterwards may see a partially filled
# table. Block until all data currently available in the source has been
# processed and committed to the sink, which makes the SELECT / count cells
# that follow reproducible under "Restart & Run All".
orders.processAllAvailable()

22/04/03 15:57:36 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-8013bbc6-a9fe-4d90-a48d-6fa38e5e998a. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
                                                                                

In [11]:
# orders_stream_df = spark.table("orders_stream").where(F.col('store_id') == 1)
# orders_stream_df.show()

In [12]:
# Show the rows the stream has written to the in-memory table so far.
spark.sql("SELECT * FROM orders_stream").show()
# Databricks alternative using the SQL magic:
# %sql
# select count(*) from orders_stream where store_id = 1

+--------+----------+---------+--------+--------------+----------+
|order_id|product_id|client_id|store_id|order_quantity|order_date|
+--------+----------+---------+--------+--------------+----------+
|      17|     22637|      569|       1|           4.5|2021-01-01|
|      17|     27577|      569|       1|           6.0|2021-01-01|
|      17|      4168|      569|       1|           6.0|2021-01-01|
|      17|     19135|      569|       1|           3.0|2021-01-01|
|      17|      8650|      569|       1|           9.0|2021-01-01|
|      17|     11289|      569|       1|           1.5|2021-01-01|
|      17|      2056|      569|       1|           9.0|2021-01-01|
|      17|     44526|      569|       1|           1.5|2021-01-01|
|      17|     17653|      569|       1|           6.0|2021-01-01|
|      17|      5302|      569|       1|           7.5|2021-01-01|
|      17|     14680|      569|       1|           3.0|2021-01-01|
|      21|     17569|     1130|       1|           7.5|2021-01

22/04/03 15:57:43 WARN scheduler.TaskSetManager: Stage 11 contains a task of very large size (8670 KiB). The maximum recommended task size is 1000 KiB.


In [13]:
# Number of rows currently held by the in-memory orders_stream table.
spark.sql("SELECT count(*) FROM orders_stream").show()

+--------+
|count(1)|
+--------+
|  136764|
+--------+



22/04/03 15:57:51 WARN scheduler.TaskSetManager: Stage 12 contains a task of very large size (9754 KiB). The maximum recommended task size is 1000 KiB.


In [14]:
# Shut down the orders_stream streaming query started earlier.
orders.stop()

In [15]:
# Explicit two-column schema for the clients CSV.
# NOTE: this rebinds `csv_schema`, which previously held the orders schema.
# NOTE(review): the reader logs "Header length: 3, schema size: 2" for this
# file, so the CSV carries a third column that this schema does not name --
# confirm whether it should be declared here.
csv_schema = StructType([
    StructField("client_id", IntegerType(), False),
    StructField("client_name", StringType(), False),
])

In [16]:
# Static (batch) read of the clients file with the explicit schema.
clients_df = spark.read.csv(
    "/datalake/bronze/dshop/clients/2022-03-04/clients.csv",
    schema=csv_schema,
    header=True,
)

In [17]:
# clients_df = spark.table("dim_client")  # in Databricks it is possible to use the "dim_client" table from another session without defining and populating it here

In [18]:
# Preview the first 20 client rows.
clients_df.show()

+---------+---------------+
|client_id|    client_name|
+---------+---------------+
|        1| Anthony Reilly|
|        2|Christina Boyle|
|        3|  Andrew Walker|
|        4|Emily Rodriguez|
|        5|    Glen Travis|
|        6|     Adam Mayer|
|        7|  Lydia Griffin|
|        8|     Marcus Cox|
|        9|   Sean Johnson|
|       10|    Joel Mullen|
|       11|   Laurie Brown|
|       12|  Kevin Johnson|
|       13|    Paula Lopez|
|       14|Michelle Hodges|
|       15|   Jerome Russo|
|       16|Jonathon Porter|
|       17| Andrew Jackson|
|       18|   Emma Jackson|
|       19|     Pam Wilson|
|       20| Andrea Sanders|
+---------+---------------+
only showing top 20 rows



22/04/03 15:58:00 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv


In [19]:
# Stream-static inner join: attach the client name to every streamed order by
# matching client_id, then keep only the columns needed downstream.
enriched_df = (
    orders_df
    .join(
        clients_df,
        on=orders_df.client_id == clients_df.client_id,
        how='inner',
    )
    .select(
        orders_df.order_id,
        orders_df.product_id,
        clients_df.client_name,
        orders_df.order_quantity,
        orders_df.order_date,
    )
)

In [20]:
# In this kernel display() prints just the DataFrame's column names and types.
display(enriched_df)

DataFrame[order_id: int, product_id: int, client_name: string, order_quantity: double, order_date: date]

In [21]:
# Stream the enriched orders into an in-memory table named "enriched_stream".
enriched = (
    enriched_df.writeStream
    .format('memory')
    .queryName('enriched_stream')
    .outputMode("append")
    .start()
)
# start() does not wait for any micro-batch to finish. Block until all data
# currently available in the source has been processed and committed, so the
# SELECT / count cells that follow see a complete table regardless of how
# quickly they are executed.
enriched.processAllAvailable()

22/04/03 15:58:05 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-2f9df43c-d860-4f15-9b93-411be4a9379e. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/04/03 15:58:05 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:58:06 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:58:06 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:


In [22]:
# Show up to 100 rows from the in-memory enriched_stream table.
spark.sql("SELECT * FROM enriched_stream").show(100)

+--------+----------+--------------------+--------------+----------+
|order_id|product_id|         client_name|order_quantity|order_date|
+--------+----------+--------------------+--------------+----------+
|      17|     22637|       Hannah Cooper|           4.5|2021-01-01|
|      17|     27577|       Hannah Cooper|           6.0|2021-01-01|
|      17|      4168|       Hannah Cooper|           6.0|2021-01-01|
|      17|     19135|       Hannah Cooper|           3.0|2021-01-01|
|      17|      8650|       Hannah Cooper|           9.0|2021-01-01|
|      17|     11289|       Hannah Cooper|           1.5|2021-01-01|
|      17|      2056|       Hannah Cooper|           9.0|2021-01-01|
|      17|     44526|       Hannah Cooper|           1.5|2021-01-01|
|      17|     17653|       Hannah Cooper|           6.0|2021-01-01|
|      17|      5302|       Hannah Cooper|           7.5|2021-01-01|
|      17|     14680|       Hannah Cooper|           3.0|2021-01-01|
|      21|     17569|       Julie 

22/04/03 15:58:15 WARN scheduler.TaskSetManager: Stage 33 contains a task of very large size (10880 KiB). The maximum recommended task size is 1000 KiB.


In [23]:
# Number of rows currently held by the in-memory enriched_stream table.
spark.sql("SELECT count(*) FROM enriched_stream").show()

+--------+
|count(1)|
+--------+
|  136764|
+--------+



22/04/03 15:58:20 WARN scheduler.TaskSetManager: Stage 34 contains a task of very large size (10880 KiB). The maximum recommended task size is 1000 KiB.


In [24]:
# Shut down the enriched_stream streaming query started earlier.
enriched.stop()

In [25]:
# %sql
# CREATE OR REPLACE TABLE tmp.rules(
#    user_gender varchar(2),
#    age_less_than int,
#    age_greater_than int,
#    weight_less_than int,
#    weight_greater_than int,
#    user_is_smoker varchar(2),
#    km_walked_greater_than int
# );

# INSERT into tmp.rules VALUES ('M', NULL, 52, NULL, 147, 'Y', 2);

In [26]:
# %sql
# select * from tmp.rules

In [27]:
# rules_df = spark.table("tmp.rules")

In [28]:
# Schema of the rules file: an exclusive lower/upper bound for the order
# quantity and for the order date (all columns nullable).
# NOTE: this rebinds `csv_schema`, which previously held the clients schema.
csv_schema = StructType([
    StructField("quantity_less_than", DoubleType(), True),
    StructField("quantity_greater_than", DoubleType(), True),
    StructField("date_less_than", DateType(), True),
    StructField("date_greater_than", DateType(), True),
])

In [29]:
# Static (batch) read of the notification rules with the explicit schema.
rules_df = spark.read.csv(
    "/datalake/bronze/samples/rules.csv",
    schema=csv_schema,
    header=True,
)

In [30]:
# Print the rule rows, then the DataFrame's column/type summary.
rules_df.show()
display(rules_df)

+------------------+---------------------+--------------+-----------------+
|quantity_less_than|quantity_greater_than|date_less_than|date_greater_than|
+------------------+---------------------+--------------+-----------------+
|               9.0|                  2.0|    2021-02-15|       2021-02-01|
+------------------+---------------------+--------------+-----------------+



DataFrame[quantity_less_than: double, quantity_greater_than: double, date_less_than: date, date_greater_than: date]

In [31]:
# Join condition: an order matches a rule when its quantity lies strictly
# between the rule's quantity bounds AND its date lies strictly between the
# rule's date bounds.
cond = [
    (
        (enriched_df.order_quantity > rules_df.quantity_greater_than)
        & (enriched_df.order_quantity < rules_df.quantity_less_than)
    )
    & (
        (enriched_df.order_date > rules_df.date_greater_than)
        & (enriched_df.order_date < rules_df.date_less_than)
    )
]

In [32]:
# Keep only the enriched orders that satisfy at least one rule row
# (inner join on the non-equi condition built above).
notifications_df = enriched_df.join(rules_df, on=cond, how='inner')

In [33]:
# Drop the rule columns again; only the order/client fields are published.
users_notify_df = notifications_df.select(
    'order_id',
    'product_id',
    'client_name',
    'order_quantity',
    'order_date',
)

In [34]:
# Stream the rule-matched orders into an in-memory table named
# "users_notify_stream".
users_notify = (
    users_notify_df.writeStream
    .format('memory')
    .queryName('users_notify_stream')
    .outputMode("append")
    .start()
)
# start() does not wait for any micro-batch to finish. Block until all data
# currently available in the source has been processed and committed, so the
# SELECT / count cells that follow do not depend on execution timing.
users_notify.processAllAvailable()

22/04/03 15:58:43 WARN streaming.StreamingQueryManager: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-440f9402-b642-446f-bcb3-87f7d434a6ee. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
22/04/03 15:58:43 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:58:44 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:58:45 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:


In [35]:
# Show up to 100 rows from the in-memory users_notify_stream table.
spark.sql("SELECT * FROM users_notify_stream").show(100)

+--------+----------+---------------+--------------+----------+
|order_id|product_id|    client_name|order_quantity|order_date|
+--------+----------+---------------+--------------+----------+
|    8150|     16276|    Megan Baker|           7.5|2021-02-02|
|    8150|     37617|    Megan Baker|           3.0|2021-02-02|
|    8150|     43342|    Megan Baker|           4.5|2021-02-02|
|    8150|     36527|    Megan Baker|           3.0|2021-02-02|
|    8150|     19601|    Megan Baker|           4.5|2021-02-02|
|    8176|     43510|  Andrew Walker|           3.0|2021-02-02|
|    8176|     32445|  Andrew Walker|           3.0|2021-02-02|
|    8176|     38084|  Andrew Walker|           7.5|2021-02-02|
|    8176|     37158|  Andrew Walker|           4.5|2021-02-02|
|    8176|     20513|  Andrew Walker|           7.5|2021-02-02|
|    8176|     39468|  Andrew Walker|           3.0|2021-02-02|
|    8176|     31677|  Andrew Walker|           3.0|2021-02-02|
|    8176|     14963|  Andrew Walker|   

22/04/03 15:58:52 WARN scheduler.TaskSetManager: Stage 64 contains a task of very large size (1111 KiB). The maximum recommended task size is 1000 KiB.


In [36]:
# Number of rows currently held by the in-memory users_notify_stream table
# (the 100 passed to show() is only a display cap on this one-row result).
spark.sql("SELECT count(*) FROM users_notify_stream").show(100)

+--------+
|count(1)|
+--------+
|   13932|
+--------+



22/04/03 15:59:10 WARN scheduler.TaskSetManager: Stage 65 contains a task of very large size (1111 KiB). The maximum recommended task size is 1000 KiB.


In [37]:
# Shut down the users_notify_stream streaming query started earlier.
users_notify.stop()

In [38]:
# users_notify_delta = users_notify_df.writeStream\
#    .format('delta')\
#    .outputMode("append")\
#    .option("checkpointlocation", "/tmp/tmp/checkpoint")\
#    .start("/tmp/tmp/16_delta_stream")

In [None]:
# %sql
# CREATE TABLE tmp.users_notify
#    USING DELTA
#    LOCATION '/tmp/tmp/16_delta_stream'

In [None]:
# %sql
# select * from tmp.users_notify

In [39]:
# Write the rule-matched orders as parquet files under /test/16_delta_stream,
# using an explicit checkpoint location.
users_notify_parquet = (
    users_notify_df.writeStream
    .format('parquet')
    .outputMode("append")
    .option("checkpointlocation", "/test/checkpoint")
    .start("/test/16_delta_stream")
)
# The following cell stops this query. start() returns immediately, so in a
# top-to-bottom run the query could be stopped before the available input has
# been written out. Block until all data currently available in the source has
# been processed and committed to the sink.
users_notify_parquet.processAllAvailable()

22/04/03 15:59:16 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:59:17 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:59:18 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:59:19 WARN csv.CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 3, schema size: 2
CSV file: hdfs://master:9000/datalake/bronze/dshop/clients/2022-03-04/clients.csv
22/04/03 15:59:19 WARN csv.CSVHeaderChec

In [40]:
# Shut down the parquet-writing streaming query.
users_notify_parquet.stop()