# In [None]:
# Step 1: Start Spark Session
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, sum, avg, round

spark = SparkSession.builder.appName("Airline_PySpark_Analysis").getOrCreate()

# Everything after session creation runs inside try/finally so the Spark
# session is always stopped, even when one of the steps raises.
# NOTE: `sum` and `round` used below are the pyspark.sql.functions versions
# imported at the top of this file (they shadow the Python builtins).
try:
    # Step 2: Load the dataset
    # Make sure 'airline_passenger_satisfaction.csv' is in the same folder
    df = (
        spark.read
        .option("header", True)
        .option("inferSchema", True)
        .csv("airline_passenger_satisfaction.csv")
    )

    # Step 3: View schema and sample rows
    df.printSchema()
    df.show(5)

    # Step 4: Total number of flights per airline (one row per flight is
    # assumed -- TODO confirm against the dataset), largest count first
    flights_per_airline = (
        df.groupBy("Airline Name")
        .count()
        .orderBy(col("count").desc())
    )
    flights_per_airline.show()

    # Step 5: Average departure delay per airline, rounded to 2 decimals,
    # highest average first
    avg_dep_delay = df.groupBy("Airline Name").agg(
        round(avg("Departure Delay in Minutes"), 2).alias("Avg_Departure_Delay")
    ).orderBy("Avg_Departure_Delay", ascending=False)
    avg_dep_delay.show()

    # Step 6: Cancellation rate per airline
    # The comparison with "Yes" yields a boolean column; casting it to int
    # lets sum() count the matching rows.
    cancel_rate = df.groupBy("Airline Name").agg(
        count("*").alias("Total_Flights"),
        sum((col("Flight cancelled") == "Yes").cast("int")).alias("Cancelled_Flights")
    ).withColumn(
        "Cancellation_Rate(%)",
        round((col("Cancelled_Flights") / col("Total_Flights")) * 100, 2)
    )
    cancel_rate.orderBy("Cancellation_Rate(%)", ascending=False).show()

    # Step 7: Average flight distance per airline, rounded to 2 decimals,
    # highest average first
    avg_distance = df.groupBy("Airline Name").agg(
        round(avg("Flight Distance"), 2).alias("Avg_Flight_Distance")
    ).orderBy("Avg_Flight_Distance", ascending=False)
    avg_distance.show()

    # Step 8: Flight count per origin airport (top 10 rows shown)
    flights_by_origin = (
        df.groupBy("Origin Airport")
        .count()
        .orderBy("count", ascending=False)
    )
    flights_by_origin.show(10)

    # Step 9: Summary
    print("✅ Analysis Complete")
finally:
    # Step 10: Stop Spark Session (runs on success and on failure)
    spark.stop()
