In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, avg, count, max as spark_max, when


# Start a Spark session for this notebook, or reuse the one that is already
# running in this kernel (getOrCreate does not create a second session).
spark = (
    SparkSession.builder
    .appName("FlightDelayAnalysis")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/28 15:36:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/10/28 15:36:58 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Load dataset
# header=True: the first CSV row supplies the column names.
# inferSchema=True: Spark samples the data to pick column types instead of
# reading every column as a string.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("Flight Dataset - CSV(in).csv")
)

                                                                                

In [12]:
# Preview the first five rows as a list of Row objects to check the column
# names and inferred types before running the tasks below.
df.take(5)

[Row(FL_DATE='1/1/2006', DEP_DELAY=5, ARR_DELAY=19, AIR_TIME=350, DISTANCE=2475, DEP_TIME=9.083333, ARR_TIME=12.483334),
 Row(FL_DATE='1/2/2006', DEP_DELAY=167, ARR_DELAY=216, AIR_TIME=343, DISTANCE=2475, DEP_TIME=11.783334, ARR_TIME=15.766666),
 Row(FL_DATE='1/3/2006', DEP_DELAY=-7, ARR_DELAY=-2, AIR_TIME=344, DISTANCE=2475, DEP_TIME=8.883333, ARR_TIME=12.133333),
 Row(FL_DATE='1/4/2006', DEP_DELAY=-5, ARR_DELAY=-13, AIR_TIME=331, DISTANCE=2475, DEP_TIME=8.916667, ARR_TIME=11.95),
 Row(FL_DATE='1/5/2006', DEP_DELAY=-3, ARR_DELAY=-17, AIR_TIME=321, DISTANCE=2475, DEP_TIME=8.95, ARR_TIME=11.883333)]

In [6]:
# --- Task 1: Flights that arrived earlier than expected ---
def flights_arrived_early(df):
    """Count the rows whose ARR_DELAY is strictly negative.

    Args:
        df: Spark DataFrame with an ARR_DELAY column.

    Returns:
        int: number of matching rows. Rows with a null ARR_DELAY do not
        satisfy the comparison and are therefore not counted.
    """
    arrived_early = col("ARR_DELAY") < 0
    early_flights = df.where(arrived_early)  # where() is an alias of filter()
    return early_flights.count()
print("Task 1:", flights_arrived_early(df))

Task 1: 534655


In [7]:
# --- Task 2: Typical departure time for flights over 2000 miles ---
def typical_departure_time_long_flights(df):
    """Return the mean DEP_TIME over rows whose DISTANCE exceeds 2000.

    "Typical" is implemented as the arithmetic mean. DEP_TIME appears to be
    expressed in decimal hours (the sample rows show values such as
    9.083333) -- confirm against the dataset description.

    Args:
        df: Spark DataFrame with DISTANCE and DEP_TIME columns.

    Returns:
        The average as a float, or None when no row matches the filter.
    """
    long_haul = df.where(col("DISTANCE") > 2000)
    summary = long_haul.agg(avg("DEP_TIME").alias("avg_departure_time"))
    # A global aggregation always yields exactly one row.
    return summary.first()["avg_departure_time"]
print("Task 2:", typical_departure_time_long_flights(df))

Task 2: 13.973233947624635


In [8]:
# --- Task 3: Proportion of flights with arrival delays > 60 minutes ---
def proportion_long_delays(df):
    """Return the fraction of rows whose ARR_DELAY is greater than 60.

    Both counts are computed in a single aggregation, so the underlying data
    is scanned once instead of once per count.

    Args:
        df: Spark DataFrame with an ARR_DELAY column.

    Returns:
        float: delayed rows divided by all rows. Rows with a null ARR_DELAY
        count toward the total but never toward the delayed rows. Returns
        0.0 when the DataFrame has no rows.
    """
    counts = df.agg(
        count("*").alias("total"),
        # when() without otherwise() yields NULL for non-matching rows and
        # count() skips NULLs, so this counts only the rows above 60.
        count(when(col("ARR_DELAY") > 60, 1)).alias("delayed"),
    ).first()
    total = counts["total"]
    return counts["delayed"] / total if total > 0 else 0.0

print("Task 3:", proportion_long_delays(df))

Task 3: 0.053066


In [9]:
# --- Task 4: Average airtime for flights that left earlier than 9:00 AM ---
def average_airtime_morning_flights(df):
    """Return the mean AIR_TIME over rows whose DEP_TIME is below 9.0.

    The 9.0 threshold stands for 9:00 AM, which presumes DEP_TIME is stored
    in decimal hours (the sample rows show values such as 8.883333) --
    confirm against the dataset description.

    Args:
        df: Spark DataFrame with DEP_TIME and AIR_TIME columns.

    Returns:
        The average as a float, or None when no row matches the filter.
    """
    before_nine = df.where(col("DEP_TIME") < 9.0)
    summary = before_nine.agg(avg("AIR_TIME").alias("avg_airtime"))
    # A global aggregation always yields exactly one row.
    return summary.first()["avg_airtime"]

print("Task 4:", average_airtime_morning_flights(df))

Task 4: 111.36120276990287


In [10]:
# --- Task 5: Max arrival delay for flights without departure delay ---
def max_arrival_delay_no_dep_delay(df):
    """Return the largest ARR_DELAY among rows whose DEP_DELAY is <= 0.

    "Without departure delay" is taken to mean on time or early, so a
    DEP_DELAY of exactly 0 is included.

    Args:
        df: Spark DataFrame with DEP_DELAY and ARR_DELAY columns.

    Returns:
        The maximum ARR_DELAY value, or None when no row matches the filter.
    """
    departed_on_time = df.where(col("DEP_DELAY") <= 0)
    summary = departed_on_time.agg(spark_max("ARR_DELAY").alias("max_arr_delay"))
    # A global aggregation always yields exactly one row.
    return summary.first()["max_arr_delay"]
print("Task 5:", max_arrival_delay_no_dep_delay(df))

Task 5: 701
