# In [None]:
# Import required PySpark modules
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, avg

# Start Spark session (getOrCreate reuses an already-running session if any)
spark = SparkSession.builder.appName("Airline Passenger Satisfaction").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# always stopped, even when the CSV cannot be read or a column name used
# below does not exist in the file.
try:
    # Load CSV file (make sure the file is in the same folder).
    # The first row is used as the header and column types are inferred.
    df = (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv("airline_passenger_satisfaction.csv")
    )

    # Show schema and sample data
    df.printSchema()
    df.show(5)

    # 1. Total flights per airline, busiest airline first
    flights_per_airline = (
        df.groupBy("Airline Name")
        .count()
        .orderBy("count", ascending=False)
    )
    print("Total Flights per Airline:")
    flights_per_airline.show()

    # 2. Average departure delay per airline, smallest delay first.
    # The sort key is the column name that GroupedData.avg() generates.
    avg_dep_delay = (
        df.groupBy("Airline Name")
        .avg("Departure Delay in Minutes")
        .orderBy("avg(Departure Delay in Minutes)")
    )
    print("Average Departure Delay (minutes) per Airline:")
    avg_dep_delay.show()

    # 3. Cancellation rate per airline.
    # NOTE: `sum` is the pyspark.sql.functions aggregate imported at the top
    # of the file, not the Python builtin. The comparison is cast to int so
    # that rows equal to "Yes" contribute 1 and the aggregate counts them.
    is_cancelled = (col("Flight cancelled") == "Yes").cast("int")
    cancellation_rate = df.groupBy("Airline Name").agg(
        count("*").alias("Total Flights"),
        sum(is_cancelled).alias("Cancelled Flights"),
    ).withColumn(
        "Cancellation Rate (%)",
        (col("Cancelled Flights") / col("Total Flights")) * 100,
    )

    print("Cancellation Rate by Airline:")
    cancellation_rate.orderBy("Cancellation Rate (%)", ascending=False).show()

    # 4. Average flight distance per airline, longest average first
    avg_distance = (
        df.groupBy("Airline Name")
        .avg("Flight Distance")
        .orderBy("avg(Flight Distance)", ascending=False)
    )
    print("Average Flight Distance per Airline:")
    avg_distance.show()

    # 5. Flights per origin airport (only the 10 busiest are printed)
    flights_by_origin = (
        df.groupBy("Origin Airport")
        .count()
        .orderBy("count", ascending=False)
    )
    print("Flights per Origin Airport:")
    flights_by_origin.show(10)
finally:
    # Stop Spark
    spark.stop()
