In [0]:
"""
            Checkpoint
- a checkpoint helps protect a query's intermediate results against failure
The example below implements the following steps:
    1. read data using auto loader
    2. run a query and save the data to an in-memory table without a checkpoint
    3. run a query and save the data to an in-memory table with a checkpoint (intermediate results are saved in dbfs)

    4. cancel the two running queries
    5. add new files to the path the readStream query reads from
    6. run the 2 queries again

Conclusion:
the query without a checkpoint starts from scratch when it is run again and re-reads the previously processed data
"""

In [0]:
# List the contents of the Auto Loader source directory.
attrition_source_dir = "dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/"
dbutils.fs.ls(attrition_source_dir)

Out[1]: [FileInfo(path='dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/_schemas/', name='_schemas/', size=0, modificationTime=1694498223000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/attrition_04.csv', name='attrition_04.csv', size=11523, modificationTime=1694502481000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/attrition_05.csv', name='attrition_05.csv', size=11447, modificationTime=1694673949000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/attrition_06.csv', name='attrition_06.csv', size=11608, modificationTime=1694673949000)]

In [0]:
# Remove attrition_05.csv and attrition_06.csv from the source directory.
# Presumably this resets the demo so that only attrition_04.csv is present and
# the other files can be added back later as "new" files -- confirm.
stale_file_05 = "dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/attrition_05.csv"
stale_file_06 = "dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/attrition_06.csv"

dbutils.fs.rm(stale_file_05)
# Kept as the last expression so the cell still displays the result of this call.
dbutils.fs.rm(stale_file_06)

Out[8]: True

In [0]:
# List the source directory again to check which files remain.
attrition_source_dir = "dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/"
dbutils.fs.ls(attrition_source_dir)

Out[9]: [FileInfo(path='dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/_schemas/', name='_schemas/', size=0, modificationTime=1694498223000),
 FileInfo(path='dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/attrition_04.csv', name='attrition_04.csv', size=11523, modificationTime=1694502481000)]

In [0]:
"""
                                Auto Loader
- format("cloudFiles"): Auto Loader provides a Structured Streaming source called cloudFiles;
when it is specified, new files are processed automatically and incrementally as they arrive
- cloudFiles.schemaLocation: specifies the location used to track the schema of the files
- load: specifies the directory the data is read from
- cloudFiles.schemaHints: without hints all columns are read as strings, but some columns need to be int or float
to be able to make calculations
"""
# Build the Auto Loader ("cloudFiles") streaming reader over the csv files in the
# attrition source directory, with integer type hints for the numeric columns.
# NOTE(review): the schema location is the same directory as the data being
# loaded -- confirm that keeping the tracked schema next to the source files is intended.
attr_stream_data = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", "dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/")
    .option("cloudFiles.schemaHints", "Age int, DailyRate int, DistanceFromHome int, HourlyRate int, MonthlyIncome int, PercentSalaryHike int, YearsSinceLastPromotion int, YearsWithCurrManager int")
    .load("dbfs:/FileStore/shared_uploads/auto_loader_streaming/attrition_source_stream/")
)

attr_stream_data.display()

Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,EducationField,Gender,HourlyRate,JobLevel,JobRole,JobSatisfaction,MaritalStatus,MonthlyIncome,OverTime,PercentSalaryHike,PerformanceRating,YearsSinceLastPromotion,YearsWithCurrManager,_rescued_data
41,No,Travel_Rarely,334,Sales,2,Life Sciences,Male,88,4,Manager,2,Single,16015,No,19,3,0,4,
18,No,Travel_Rarely,812,Sales,10,Medical,Female,69,1,Sales Representative,3,Single,1200,No,12,3,0,0,
28,No,Travel_Rarely,1476,Research & Development,16,Medical,Male,68,2,Healthcare Representative,1,Single,5661,No,19,3,0,7,
31,No,Travel_Rarely,218,Sales,7,Technical Degree,Male,100,2,Sales Executive,4,Married,6929,No,11,3,7,7,
39,No,Travel_Rarely,1132,Research & Development,1,Medical,Male,48,3,Healthcare Representative,4,Divorced,9613,No,17,3,3,7,
36,No,Non-Travel,1105,Research & Development,24,Life Sciences,Female,47,2,Laboratory Technician,2,Married,5674,No,15,3,0,8,
32,No,Travel_Rarely,906,Sales,7,Life Sciences,Male,91,2,Sales Executive,3,Married,5484,No,14,3,4,8,
38,No,Travel_Rarely,849,Research & Development,25,Life Sciences,Female,81,3,Research Director,2,Married,12061,No,17,3,0,1,
58,No,Non-Travel,390,Research & Development,1,Life Sciences,Male,32,2,Healthcare Representative,3,Divorced,5660,Yes,13,3,1,2,
31,No,Travel_Rarely,691,Research & Development,5,Technical Degree,Male,86,1,Research Scientist,4,Married,4821,Yes,12,3,0,3,


In [0]:
# Streaming aggregation: average MonthlyIncome for each JobRole in the stream.
monthly_income_groupby_jobRole = (
    attr_stream_data
    .groupBy("JobRole")
    .agg({"MonthlyIncome": "avg"})
)
monthly_income_groupby_jobRole.display()

JobRole,avg(MonthlyIncome)
Sales Executive,6105.518518518518
Manufacturing Director,6290.125
Laboratory Technician,3338.866666666667
Sales Representative,2429.0
Healthcare Representative,8267.454545454546
Research Scientist,3675.222222222222
Manager,15566.0
Research Director,15267.5
Human Resources,4553.0


In [0]:
# Write the aggregation to the in-memory table "query_without_checkpoint".
# No checkpointLocation is configured for this query.
# The StreamingQuery handle returned by start() is kept in a variable so that
# later cells can inspect or cancel the query (e.g. without_checkpoint_query.stop())
# instead of depending on the notebook UI.
without_checkpoint_query = (
    monthly_income_groupby_jobRole.writeStream
    .queryName("query_without_checkpoint")
    .outputMode("complete")
    .format("memory")
    .start()
)
# Last expression: still display the query handle as the cell output.
without_checkpoint_query

Out[12]: <pyspark.sql.streaming.query.StreamingQuery at 0x7fb4fce4a790>

In [0]:
# Read the current contents of the in-memory table filled by the query without a checkpoint.
without_checkpoint_rows = spark.sql("select * from query_without_checkpoint")
without_checkpoint_rows.display()

JobRole,avg(MonthlyIncome)
Sales Executive,6105.518518518518
Manufacturing Director,6290.125
Laboratory Technician,3338.866666666667
Sales Representative,2429.0
Healthcare Representative,8267.454545454546
Research Scientist,3675.222222222222
Manager,15566.0
Research Director,15267.5
Human Resources,4553.0


In [0]:
# Write the aggregation to the in-memory table "query_checkpoint", with the
# query's checkpoint stored in "dbfs:/FileStore/checkpoint_1/".
# It is the checkpoint (not the in-memory table itself) that is persisted; it lets
# a restarted query resume from its recorded progress instead of starting over.
# The StreamingQuery handle returned by start() is kept in a variable so that
# later cells can inspect or cancel the query (e.g. checkpoint_query.stop()).
checkpoint_query = (
    monthly_income_groupby_jobRole.writeStream
    .queryName("query_checkpoint")
    .outputMode("complete")
    .option("checkpointLocation", "dbfs:/FileStore/checkpoint_1/")
    .format("memory")
    .start()
)
# Last expression: still display the query handle as the cell output.
checkpoint_query

Out[14]: <pyspark.sql.streaming.query.StreamingQuery at 0x7fb4fc900af0>

In [0]:
# Read the current contents of the in-memory table filled by the checkpointed query.
# NOTE(review): this can come back empty, presumably when it runs before the
# streaming query has written a batch to the table -- confirm and re-run if needed.
checkpoint_rows = spark.sql("select * from query_checkpoint")
checkpoint_rows.display()

JobRole,avg(MonthlyIncome)
