## Imports

In [None]:
# Import Spark
from pyspark.sql import SparkSession, functions as F, types as T

## Instantiate a Spark session

In [7]:
# Build (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    # Run Spark locally, using all the cores available on this machine
    .master("local[*]")
    # Name under which the application is registered
    .appName("spark-fundamentals-lab")
    # Explicitly turn on Adaptive Query Execution (runtime tuning of shuffles and joins)
    .config("spark.sql.adaptive.enabled", "true")
    # Return the session that is already active, or start a new one if there is none
    .getOrCreate()
)

In [8]:
# Display which Spark version the session is running on
print(f"Spark version: {spark.version}")

Spark version: 4.0.0


## Load the dataframe

In [None]:
# Declare the schema explicitly instead of using inferSchema: inference makes
# Spark scan the whole CSV once just to guess the types, so the "lazy" read
# would already trigger a job, and the resulting types would depend on the data.
# Column names and types below match the schema printed in the next cell.
flights_schema = T.StructType([
    T.StructField("index", T.IntegerType(), True),
    T.StructField("airline", T.StringType(), True),
    T.StructField("flight", T.StringType(), True),
    T.StructField("source_city", T.StringType(), True),
    T.StructField("departure_time", T.StringType(), True),
    T.StructField("stops", T.StringType(), True),
    T.StructField("arrival_time", T.StringType(), True),
    T.StructField("destination_city", T.StringType(), True),
    T.StructField("class", T.StringType(), True),
    T.StructField("duration", T.DoubleType(), True),
    T.StructField("days_left", T.IntegerType(), True),
    T.StructField("price", T.IntegerType(), True),
])

# Create a plan to load the csv (the rows are only parsed once an action is performed)
df = (
    spark.read
    # Treat the first row as the header, not as data
    .option("header", True)
    # Apply the declared column types (no inference pass over the file)
    .schema(flights_schema)
    # Point the reader at the csv file
    .csv("data/airlines_flights_data.csv")
)

In [11]:
# Print the schema as a tree: column names, types and nullability
df.printSchema()

root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)



In [None]:
# show is an action: it executes the read plan and prints the first 5 rows of the dataframe
# truncate=False prints the entire content of each cell instead of cutting long values
df.show(5, truncate=False)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index|airline |flight |source_city|departure_time|stops|arrival_time |destination_city|class  |duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|0    |SpiceJet|SG-8709|Delhi      |Evening       |zero |Night        |Mumbai          |Economy|2.17    |1        |5953 |
|1    |SpiceJet|SG-8157|Delhi      |Early_Morning |zero |Morning      |Mumbai          |Economy|2.33    |1        |5953 |
|2    |AirAsia |I5-764 |Delhi      |Early_Morning |zero |Early_Morning|Mumbai          |Economy|2.17    |1        |5956 |
|3    |Vistara |UK-995 |Delhi      |Morning       |zero |Afternoon    |Mumbai          |Economy|2.25    |1        |5955 |
|4    |Vistara |UK-963 |Delhi      |Morning       |zero |Morning      |Mumbai          |Economy|2.33    |1        |5955 |
+-----+--------+-------+

## Write the dataframe to Parquet

In [None]:
# Export the dataframe as Parquet files under the data folder.
# mode="overwrite" replaces whatever already exists at the target path,
# so re-running this cell does not fail or append duplicate rows.
df.write.parquet("data/silver/airlines_flights_data", mode="overwrite")

                                                                                

## Reload the data from Parquet

In [17]:
# Read back the Parquet dataset written above; dfp is used for all the exploration below
dfp = spark.read.parquet("data/silver/airlines_flights_data")

In [19]:
# Print the schema (columns, their respective types and nullability) of the Parquet-backed dataframe
dfp.printSchema()

root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: double (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)



In [None]:
# count is an action: it runs a job and returns the number of rows
# in the dataframe (equivalent of len in pandas)
dfp.count()

300153

In [None]:
# Print the main statistics of every column: count, mean, stddev, ... (equivalent of describe in pandas)
# String columns have no meaningful mean/stddev, so they mostly show NULL for those rows
dfp.summary().show()

25/09/04 20:35:32 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 11:>                                                         (0 + 6) / 6]

+-------+-----------------+-------+--------------------+-----------+--------------+------+------------+----------------+--------+------------------+------------------+------------------+
|summary|            index|airline|              flight|source_city|departure_time| stops|arrival_time|destination_city|   class|          duration|         days_left|             price|
+-------+-----------------+-------+--------------------+-----------+--------------+------+------------+----------------+--------+------------------+------------------+------------------+
|  count|           300153| 300153|              300153|     300153|        300153|300153|      300153|          300153|  300153|            300153|            300153|            300153|
|   mean|         150076.0|   NULL|5.427411873908628...|       NULL|          NULL|  NULL|        NULL|            NULL|    NULL|12.221020812718939|26.004750910369044|20889.660523133203|
| stddev|86646.85201148395|   NULL|1.803651814074487...|       NU

                                                                                

## Explore the data

In [22]:
# Preview the first 5 rows of the reloaded dataframe, without truncating cell contents
dfp.show(5, truncate=False)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index|airline |flight |source_city|departure_time|stops|arrival_time |destination_city|class  |duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|0    |SpiceJet|SG-8709|Delhi      |Evening       |zero |Night        |Mumbai          |Economy|2.17    |1        |5953 |
|1    |SpiceJet|SG-8157|Delhi      |Early_Morning |zero |Morning      |Mumbai          |Economy|2.33    |1        |5953 |
|2    |AirAsia |I5-764 |Delhi      |Early_Morning |zero |Early_Morning|Mumbai          |Economy|2.17    |1        |5956 |
|3    |Vistara |UK-995 |Delhi      |Morning       |zero |Afternoon    |Mumbai          |Economy|2.25    |1        |5955 |
|4    |Vistara |UK-963 |Delhi      |Morning       |zero |Morning      |Mumbai          |Economy|2.33    |1        |5955 |
+-----+--------+-------+

In [30]:
# filter is a transformation --> this line only builds a query plan, no job is run yet.
# The plan is stored once and reused below instead of being rebuilt for every action.
spicejet_flights = dfp.filter(F.col("airline") == "SpiceJet")

# By adding the action show, we execute the query and print the first 5 matching rows
spicejet_flights.show(5, truncate=False)

# Explain the execution plan of the query
spicejet_flights.explain()

# Number of rows matching the condition (last expression of the cell, so the notebook displays it)
spicejet_flights.count()

+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|index|airline |flight |source_city|departure_time|stops|arrival_time|destination_city|class  |duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|0    |SpiceJet|SG-8709|Delhi      |Evening       |zero |Night       |Mumbai          |Economy|2.17    |1        |5953 |
|1    |SpiceJet|SG-8157|Delhi      |Early_Morning |zero |Morning     |Mumbai          |Economy|2.33    |1        |5953 |
|28   |SpiceJet|SG-8169|Delhi      |Evening       |zero |Night       |Mumbai          |Economy|2.33    |1        |10260|
|38   |SpiceJet|SG-2976|Delhi      |Evening       |one  |Night       |Mumbai          |Economy|4.5     |1        |12123|
|39   |SpiceJet|SG-2976|Delhi      |Evening       |one  |Morning     |Mumbai          |Economy|15.25   |1        |12123|
+-----+--------+-------+--------

9011