Spark Streaming

In [27]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as sqlfn
from pyspark.sql import types as sqlt

In [28]:
# Run Spark locally with two threads ('local[2]'): one to stream the data and
# another one to consume it.
spark = (
    SparkSession.builder
    .master('local[2]')
    .appName('SparkStreamingLab1')
    .getOrCreate()
)

In [36]:
# Declare the employee schema explicitly so the stream does not depend on
# inferred (and possibly changing) column types. Every column is nullable.
schema_employee = (
    sqlt.StructType()
    .add('employee_id', sqlt.IntegerType(), True)
    .add('department_name', sqlt.StringType(), True)
    .add('name', sqlt.StringType(), True)
    .add('last_name', sqlt.StringType(), True)
    .add('hire_timestamp', sqlt.TimestampType(), True)
)

In [37]:
# Open a streaming read over the CSV directory using the explicit employee
# schema. Files have a header row, and at most one file is picked up per trigger.
df_employees = (
    spark.readStream
    .schema(schema_employee)
    .format('csv')
    .options(header=True, maxFilesPerTrigger=1)
    .load(r'/home/sasa/Downloads/Code/notebooks/datasets/csv')
)

In [38]:
# Check that df_employees is a streaming DataFrame (it was built with readStream).
# Note: this reports the kind of DataFrame, not whether a query is currently running.
df_employees.isStreaming

True

In [39]:
# Print the schema of the streaming DataFrame to confirm that the column names
# and types match the explicitly declared schema_employee.
df_employees.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- department_name: string (nullable = true)
 |-- name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- hire_timestamp: timestamp (nullable = true)



In [40]:
# Aggregate the stream: put a 10-minute watermark on hire_timestamp, group by
# (department_name, hire_timestamp), count the employee_id values in each group
# and keep only the groups whose count is greater than 1.
df_large_teams = (
    df_employees
    .withWatermark("hire_timestamp", "10 minutes")
    .groupBy('department_name', 'hire_timestamp')
    .agg(
        sqlfn.count('employee_id').alias('count'),
        sqlfn.max('hire_timestamp'),
    )
    .filter('count >1')
)

In [41]:
# Start a streaming query that prints the complete aggregated result to the
# console after every micro-batch.
df_stream_large_teams = (
    df_large_teams.writeStream
    .outputMode('complete')
    .format('console')
    .start()
)

24/03/18 12:49:31 WARN ResolveWriteToStream: Temporary checkpoint location created which is deleted normally when the query didn't fail: /tmp/temporary-090ee51b-fd59-4717-a6d7-1d16e2ff9f78. If it's required to delete it under any circumstances, please set spark.sql.streaming.forceDeleteTempCheckpointLocation to true. Important to know deleting temp checkpoint folder is best effort.
24/03/18 12:49:31 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

-------------------------------------------
Batch: 0
-------------------------------------------
+---------------+--------------+-----+-------------------+
|department_name|hire_timestamp|count|max(hire_timestamp)|
+---------------+--------------+-----+-------------------+
+---------------+--------------+-----+-------------------+



                                                                                

-------------------------------------------
Batch: 1
-------------------------------------------
+--------------------+-------------------+-----+-------------------+
|     department_name|     hire_timestamp|count|max(hire_timestamp)|
+--------------------+-------------------+-----+-------------------+
|           Marketing|2023-08-20 13:20:13|    2|2023-08-20 13:20:13|
|            Big Data|2023-08-20 13:40:17|    2|2023-08-20 13:40:17|
|              Design|2023-08-20 13:20:14|    2|2023-08-20 13:20:14|
|            Big Data|2023-08-20 13:20:16|    2|2023-08-20 13:20:16|
|Software Engineering|2023-08-20 13:20:15|    2|2023-08-20 13:20:15|
|             Finance|2023-08-20 13:20:12|    2|2023-08-20 13:20:12|
|            Big Data|2023-08-20 13:20:10|    2|2023-08-20 13:20:10|
+--------------------+-------------------+-----+-------------------+



                                                                                

-------------------------------------------
Batch: 2
-------------------------------------------
+--------------------+-------------------+-----+-------------------+
|     department_name|     hire_timestamp|count|max(hire_timestamp)|
+--------------------+-------------------+-----+-------------------+
|           Marketing|2023-08-20 13:20:13|    3|2023-08-20 13:20:13|
|            Big Data|2023-08-20 13:40:17|    3|2023-08-20 13:40:17|
|              Design|2023-08-20 13:20:14|    3|2023-08-20 13:20:14|
|            Big Data|2023-08-20 13:20:16|    3|2023-08-20 13:20:16|
|Software Engineering|2023-08-20 13:20:15|    3|2023-08-20 13:20:15|
|             Finance|2023-08-20 13:20:12|    3|2023-08-20 13:20:12|
|            Big Data|2023-08-20 13:20:10|    3|2023-08-20 13:20:10|
+--------------------+-------------------+-----+-------------------+



In [42]:
# Start a second streaming query that appends the aggregated rows as CSV files
# under output/large_depts/, tracking progress in a checkpoint directory.
# NOTE: this rebinds df_stream_large_teams, so the handle of the console query
# started earlier is no longer reachable through this name.
df_stream_large_teams = (
    df_large_teams.writeStream
    .outputMode('append')
    .format('csv')
    .option("checkpointLocation", "/home/sasa/Downloads/Code/notebooks/datasets/checkpoints/")
    .option("path", "output/large_depts/")
    .start()
)

24/03/18 12:59:10 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

In [43]:
# Stop every streaming query that is still active on this Spark session.
# df_stream_large_teams was assigned twice (first the console query, then the
# csv query), so calling df_stream_large_teams.stop() alone would stop only the
# csv query and leave the console query running in the background.
for active_query in spark.streams.active:
    active_query.stop()