In [0]:
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [0]:
dbutils.fs.mkdirs("dbfs:/FileStore/datasets/airline_data")

In [0]:
schema = StructType([StructField("id",IntegerType(),True),
                     StructField("Gender",StringType(),True),
                     StructField("Customer Type",StringType(),True),
                     StructField("Age",IntegerType(),True),
                     StructField("Type of Travel",StringType(),True),
                     StructField("Class",StringType(),True),
                     StructField("Flight Distance",IntegerType(),True),
                     StructField("Inflight wifi service",IntegerType(),True),
                     StructField("Departure/Arrival time convenient",IntegerType(),True),
                     StructField("Ease of Online booking",IntegerType(),True),
                     StructField("Gate location",IntegerType(),True),
                     StructField("Food and drink",IntegerType(),True),
                     StructField("Seat comfort",IntegerType(),True),
                     StructField("Online boarding",IntegerType(),True),
                     StructField("Inflight entertainment",IntegerType(),True),
                     StructField("On-board service",IntegerType(),True),
                     StructField("Leg room service",IntegerType(),True),
                     StructField("Baggage handling",IntegerType(),True),
                     StructField("Checkin service",IntegerType(),True),
                     StructField("Inflight service",IntegerType(),True),
                     StructField("Cleanliness",IntegerType(),True),
                     StructField("Departure Delay in Minutes",IntegerType(),True),
                     StructField("Arrival Delay in Minutes",IntegerType(),True),
                     StructField("satisfaction",StringType(),True),
])

id	Gender	Customer Type	Age	Type of Travel	Class	Flight Distance	Inflight wifi service	Departure/Arrival time convenient	Ease of Online booking	Gate location	Food and drink	Online boarding	Seat comfort	Inflight entertainment	On-board service	Leg room service	Baggage handling	Checkin service	Inflight service	Cleanliness	Departure Delay in Minutes	Arrival Delay in Minutes	satisfaction

In [0]:
airline_data_full = spark.readStream \
                        .format("csv") \
                            .option("header",True) \
                                .schema(schema) \
                                    .load("dbfs:/FileStore/datasets/airline_data")

airline_data_full.display()

id,Gender,Customer Type,Age,Type of Travel,Class,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Seat comfort,Online boarding,Inflight entertainment,On-board service,Leg room service,Baggage handling,Checkin service,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction
19556,Female,Loyal Customer,52,Business travel,Eco,160,5,4,3,4,3,4,3,5,5,5,5,2,5,5,50,44,satisfied
90035,Female,Loyal Customer,36,Business travel,Business,2863,1,1,3,1,5,4,5,4,4,4,4,3,4,5,0,0,satisfied
12360,Male,disloyal Customer,20,Business travel,Eco,192,2,0,2,4,2,2,2,2,4,1,3,2,2,2,0,0,neutral or dissatisfied
77959,Male,Loyal Customer,44,Business travel,Business,3377,0,0,0,2,3,4,4,1,1,1,1,3,1,4,0,6,satisfied
36875,Female,Loyal Customer,49,Business travel,Eco,1182,2,3,4,3,4,1,2,2,2,2,2,4,2,4,0,20,satisfied
39177,Male,Loyal Customer,16,Business travel,Eco,311,3,3,3,3,5,5,3,5,4,3,1,1,2,5,0,0,satisfied
79433,Female,Loyal Customer,77,Business travel,Business,3987,5,5,5,5,3,5,5,5,5,5,5,4,5,3,0,0,satisfied
97286,Female,Loyal Customer,43,Business travel,Business,2556,2,2,2,2,4,4,5,4,4,4,4,5,4,3,77,65,satisfied
27508,Male,Loyal Customer,47,Business travel,Eco,556,5,2,2,2,5,5,5,5,2,2,5,3,3,5,1,0,satisfied
62482,Female,Loyal Customer,46,Business travel,Business,1744,2,2,2,2,3,4,4,4,4,4,4,5,4,4,28,14,satisfied


In [0]:
airline_data = airline_data_full.select("Gender","Age","Type of Travel","Class","Baggage handling","Checkin service","Departure Delay in Minutes","Arrival Delay in Minutes")

In [0]:
airline_data = airline_data.withColumn("Timestamp",current_timestamp())

airline_data.display()

Gender,Age,Type of Travel,Class,Baggage handling,Checkin service,Departure Delay in Minutes,Arrival Delay in Minutes,Timestamp
Female,52,Business travel,Eco,5,2,50,44,2023-09-09T12:10:49.794+0000
Female,36,Business travel,Business,4,3,0,0,2023-09-09T12:10:49.794+0000
Male,20,Business travel,Eco,3,2,0,0,2023-09-09T12:10:49.794+0000
Male,44,Business travel,Business,1,3,0,6,2023-09-09T12:10:49.794+0000
Female,49,Business travel,Eco,2,4,0,20,2023-09-09T12:10:49.794+0000
Male,16,Business travel,Eco,1,1,0,0,2023-09-09T12:10:49.794+0000
Female,77,Business travel,Business,5,4,0,0,2023-09-09T12:10:49.794+0000
Female,43,Business travel,Business,4,5,77,65,2023-09-09T12:10:49.794+0000
Male,47,Business travel,Eco,5,3,1,0,2023-09-09T12:10:49.794+0000
Female,46,Business travel,Business,4,5,28,14,2023-09-09T12:10:49.794+0000


In [0]:
flight_class_age_df = airline_data.groupBy(airline_data.Class) \
                            .agg({"Age":"avg"})

flight_class_age_df.display()

Class,avg(Age)
Eco Plus,42.375
Business,42.01923076923077
Eco,42.40677966101695


In [0]:
flight_class_baggage_df = airline_data.groupBy(airline_data.Class) \
                            .agg({"Baggage Handling":"avg"})

flight_class_baggage_df.display()

Class,avg(Baggage Handling)
Eco Plus,2.875
Business,3.769230769230769
Eco,3.6440677966101696


Applying Tumbling Windows

In [0]:
avg_dep_delay_window_df = airline_data.groupBy(window(airline_data.Timestamp,"2 minutes")) \
                                        .agg({"Departure Delay in Minutes":"avg"})

In [0]:
avg_dep_delay_window_df.display()

window,avg(Departure Delay in Minutes)
"List(2023-09-09T12:38:00.000+0000, 2023-09-09T12:40:00.000+0000)",11.175
"List(2023-09-09T12:26:00.000+0000, 2023-09-09T12:28:00.000+0000)",17.25
"List(2023-09-09T12:22:00.000+0000, 2023-09-09T12:24:00.000+0000)",14.666666666666666
"List(2023-09-09T12:28:00.000+0000, 2023-09-09T12:30:00.000+0000)",11.2


In [0]:
avg_checking_score_window_df = airline_data.groupBy(window(airline_data.Timestamp,"2 minutes"), "Class") \
                                            .agg({"Checkin service":"avg"})

In [0]:
avg_checking_score_window_df.display()

window,Class,avg(Checkin service)
"List(2023-09-09T12:32:00.000+0000, 2023-09-09T12:34:00.000+0000)",Business,3.6666666666666665
"List(2023-09-09T12:38:00.000+0000, 2023-09-09T12:40:00.000+0000)",Eco,3.272727272727273
"List(2023-09-09T12:32:00.000+0000, 2023-09-09T12:34:00.000+0000)",Eco,3.027027027027027
"List(2023-09-09T12:38:00.000+0000, 2023-09-09T12:40:00.000+0000)",Business,3.5
"List(2023-09-09T12:32:00.000+0000, 2023-09-09T12:34:00.000+0000)",Eco Plus,3.333333333333333
"List(2023-09-09T12:38:00.000+0000, 2023-09-09T12:40:00.000+0000)",Eco Plus,3.5


In [0]:
avg_checkin_score_df = airline_data.groupBy(window(airline_data.Timestamp,"30 seconds"),airline_data["Type of Travel"]).agg({"Checkin service":"avg"})

display(avg_checkin_score_df)

window,Type of Travel,avg(Checkin service)
"List(2023-09-09T12:42:30.000+0000, 2023-09-09T12:43:00.000+0000)",Personal Travel,3.702702702702703
"List(2023-09-09T12:42:30.000+0000, 2023-09-09T12:43:00.000+0000)",Business travel,3.1951219512195124


Databricks visualization. Run in Databricks to view.

In [0]:
# Explore options of sliding and session window in similar fashion