In [0]:
"""
                    Performing Streaming Transformation on Data    
1. Create sub directory called "car_source_stream"          
2. Upload the first file
3. read data as cloudFiles use Auto Loader(check incrememently if has new files & process them)
4. make projection(selection), filtering & aggregation
"""

In [0]:
# Step 1: create the "car_source_stream" subdirectory that the streaming read
# in this notebook uses as its source directory.
# The recorded output of this cell is True.
dbutils.fs.mkdirs("dbfs:/FileStore/shared_uploads/auto_loader_streaming/car_source_stream")

Out[1]: True

In [0]:
# List the source directory to check which files have been uploaded so far
# (the recorded output shows a single file, car_ad_01.csv).
dbutils.fs.ls("dbfs:/FileStore/shared_uploads/auto_loader_streaming/car_source_stream")

Out[2]: [FileInfo(path='dbfs:/FileStore/shared_uploads/auto_loader_streaming/car_source_stream/car_ad_01.csv', name='car_ad_01.csv', size=1112, modificationTime=1693036370000)]

In [0]:
"""
                                Auto Loader
- format("cloudFiles"): Auto loader provides structured streaming source called cloud files
when specify, automatically process new file when arrive
- schema: specify location to track the schema of files 
- load: specify dir location which we read data from 
- schemaHints: without specify it make all columns string but need some columns to be int or float
to able to make calculalations
"""
car_stream_data = spark.readStream.format("cloudFiles")\
    .option("cloudFiles.format", "csv")\
        .option("cloudFiles.schemaLocation", "dbfs:/FileStore/shared_uploads/auto_loader_streaming/car_source_stream")\
            .option("cloudFiles.schemaHints", "price float, mileage int, engV float, year int")\
                .load("dbfs:/FileStore/shared_uploads/auto_loader_streaming/car_source_stream")

In [0]:
# Render the streaming DataFrame; the recorded output shows the source columns
# (price, mileage, engV and year as numbers) plus a _rescued_data column.
display(car_stream_data)

car,price,body,mileage,engV,engType,registration,year,model,drive,_rescued_data
Ford,15500.0,crossover,68,2.5,Gas,yes,2010,Kuga,full,
Mercedes-Benz,20500.0,sedan,173,1.8,Gas,yes,2011,E-Class,rear,
Mercedes-Benz,35000.0,other,135,5.5,Petrol,yes,2008,CL 550,rear,
Mercedes-Benz,17800.0,van,162,1.8,Diesel,yes,2012,B 180,front,
Nissan,16600.0,crossover,83,2.0,Petrol,yes,2013,X-Trail,full,
Honda,6500.0,sedan,199,2.0,Petrol,yes,2003,Accord,front,
Renault,10500.0,vagon,185,1.5,Diesel,yes,2011,Megane,front,
Mercedes-Benz,21500.0,sedan,146,1.8,Gas,yes,2012,E-Class,rear,
Mercedes-Benz,22700.0,sedan,125,2.2,Diesel,yes,2010,E-Class,rear,
Nissan,20447.154,crossover,0,1.2,Petrol,yes,2016,Qashqai,front,


In [0]:
# Projection only: show just the year and price columns of the stream,
# using the same display(...) call form as the other cells.
display(car_stream_data.select("year", "price"))

year,price
2010,15500.0
2011,20500.0
2008,35000.0
2012,17800.0
2013,16600.0
2003,6500.0
2011,10500.0
2012,21500.0
2010,22700.0
2016,20447.154


Databricks visualization. Run in Databricks to view.

In [0]:
# Projection + filter: keep the descriptive columns and only the ads whose
# price is above 10000.
car_stream_transformed_1 = (
    car_stream_data
    .select("car", "model", "body", "year", "price")
    .filter("price > 10000")
)
display(car_stream_transformed_1)

car,model,body,year,price
Ford,Kuga,crossover,2010,15500.0
Mercedes-Benz,E-Class,sedan,2011,20500.0
Mercedes-Benz,CL 550,other,2008,35000.0
Mercedes-Benz,B 180,van,2012,17800.0
Nissan,X-Trail,crossover,2013,16600.0
Renault,Megane,vagon,2011,10500.0
Mercedes-Benz,E-Class,sedan,2012,21500.0
Mercedes-Benz,E-Class,sedan,2010,22700.0
Nissan,Qashqai,crossover,2016,20447.154
Mercedes-Benz,E-Class,sedan,2011,20400.0


In [0]:
# Narrow the price-filtered stream to Mercedes-Benz and BMW ads whose model
# year is after 2010. The filter is applied directly to the upstream
# DataFrame: a preceding select("*") keeps every column and is a no-op.
car_stream_transformed_2 = car_stream_transformed_1.filter(
    car_stream_transformed_1["car"].isin(["Mercedes-Benz", "BMW"])
    & (car_stream_transformed_1["year"] > 2010)
)

display(car_stream_transformed_2)

car,model,body,year,price
Mercedes-Benz,E-Class,sedan,2011,20500.0
Mercedes-Benz,B 180,van,2012,17800.0
Mercedes-Benz,E-Class,sedan,2012,21500.0
Mercedes-Benz,E-Class,sedan,2011,20400.0
Mercedes-Benz,E-Class,sedan,2012,22500.0
Mercedes-Benz,E-Class,sedan,2012,21500.0
BMW,750,sedan,2016,129222.0
Mercedes-Benz,GLE-Class,crossover,2016,99999.0
Mercedes-Benz,GLE-Class,crossover,2016,104999.0
Mercedes-Benz,Sprinter 324 пасс.,van,2013,200000.0


In [0]:
# Aggregation: count the brand/year-filtered ads per model year and expose the
# count under the column name "total". No select("*") is needed before the
# grouping because it would keep every column unchanged.
car_stream_transformed_3 = (
    car_stream_transformed_2
    .groupBy("year")
    .count()
    .withColumnRenamed("count", "total")
)

display(car_stream_transformed_3)

year,total
2015,1
2013,2
2014,1
2012,5
2016,15
2011,6


In [0]:
# Aggregation per body type: average price and number of ads.
# The dict form of agg() yields columns named "avg(price)" and "count(year)",
# which are renamed to "price_avg" and "total" for readability. No select("*")
# is needed before the grouping because it would keep every column unchanged.
car_stream_transformed_4 = (
    car_stream_transformed_2
    .groupBy("body")
    .agg({"price": "avg", "year": "count"})
    .withColumnRenamed("count(year)", "total")
    .withColumnRenamed("avg(price)", "price_avg")
)

display(car_stream_transformed_4)

body,total,price_avg
van,2,108900.0
crossover,15,82849.73333333334
sedan,13,36227.307692307695


In [0]:
"""
                conclusion:
- the transformation you make on batch data, also the same on stream data
- practice to add new stream data into sub dir "car_source_stream", and monitor the updated results of your transformations
"""