In [25]:
!pip install pyspark



In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg,sum,max

spark=SparkSession.builder.appName('airline_flight_data').getOrCreate()

airline_df=spark.read.csv("/content/airline_data.csv", header=True, inferSchema=True)

In [27]:
# 1.Find the Total Distance Traveled by Each Airline
total_distance_travelled = airline_df.groupBy("airline").agg(sum("distance").alias("total_distance"))
print("Total Distance Traveled by Each Airline: ")
total_distance_travelled.show()

Total Distance Traveled by Each Airline: 
+---------+--------------+
|  airline|total_distance|
+---------+--------------+
|    Delta|         11840|
|   United|          5920|
|  JetBlue|          4180|
|Southwest|          2300|
| American|          5540|
+---------+--------------+



In [28]:
# 2. Filter Flights with Delays Greater than 30 Minutes
delayed_flights = airline_df.filter(airline_df["delay_min"] > 30)
print("Flights with Delays Greater than 30 Minutes: ")
delayed_flights.show()

Flights with Delays Greater than 30 Minutes: 
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|flight_id|airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|      date|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|        2| United|        UA456|   SFO|        ORD|2024-09-09 09:30:00|2024-09-09 15:00:00|       45|    2960|2023-07-01|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+



In [29]:
# 3. Find the Flight with the Longest Distance
longest_flight = airline_df.orderBy(airline_df["distance"].desc()).limit(1)
print("Flight with the Longest Distance: ")
longest_flight.show()

Flight with the Longest Distance: 
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|flight_id|airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|      date|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|        7|JetBlue|        JB302|   BOS|        LAX|2024-09-09 06:30:00|2024-09-09 09:45:00|       10|    4180|2023-07-03|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+



In [30]:
# 4. Calculate the Average Delay Time for Each Airline
average_delay = airline_df.groupBy("airline").agg(avg("delay_min").alias("average_delay"))
print("Average Delay Time for Each Airline: ")
average_delay.show()

Average Delay Time for Each Airline: 
+---------+------------------+
|  airline|     average_delay|
+---------+------------------+
|    Delta|16.666666666666668|
|   United|              22.5|
|  JetBlue|              10.0|
|Southwest|               2.5|
| American|              20.0|
+---------+------------------+



In [31]:
# 5. Identify Flights That Were Not Delayed
not_delayed_flights = airline_df.filter(airline_df["delay_min"] == 0)
print("Flights That Were Not Delayed: ")
not_delayed_flights.show()

Flights That Were Not Delayed: 
+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|flight_id|  airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|      date|
+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|        3|Southwest|        SW789|   DAL|        ATL|2024-09-09 06:00:00|2024-09-09 08:30:00|        0|    1150|2023-07-01|
|        6|   United|        UA457|   ORD|        SFO|2024-09-09 11:00:00|2024-09-09 14:30:00|        0|    2960|2023-07-02|
|       10|    Delta|        DL125|   JFK|        SEA|2024-09-09 13:00:00|2024-09-09 17:00:00|        0|    3900|2023-07-04|
+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+



In [32]:
# 6. Find the Top 3 Most Frequent Routes
top_3_routes = airline_df.groupBy("origin", "destination").count().orderBy("count", ascending=False).limit(3)
print("Top 3 Most Frequent Routes: ")
top_3_routes.show()

Top 3 Most Frequent Routes: 
+------+-----------+-----+
|origin|destination|count|
+------+-----------+-----+
|   SFO|        ORD|    1|
|   LAX|        JFK|    1|
|   DEN|        MIA|    1|
+------+-----------+-----+



In [34]:
# 7. Calculate the Total Number of Flights per Day
flights_per_day = airline_df.groupBy("date").count()
print("Total Number of Flights per Day: ")
flights_per_day.show()

Total Number of Flights per Day: 
+----------+-----+
|      date|count|
+----------+-----+
|2023-07-04|    1|
|2023-07-02|    3|
|2023-07-03|    3|
|2023-07-01|    3|
+----------+-----+



In [35]:
# 8. Find the Airline with the Most Flights
most_flights_airline = airline_df.groupBy("airline").count().orderBy("count", ascending=False).limit(1)
print("Airline with the Most Flights: ")
most_flights_airline.show()

Airline with the Most Flights: 
+-------+-----+
|airline|count|
+-------+-----+
|  Delta|    3|
+-------+-----+



In [36]:
# 9. Calculate the Average Flight Distance per Day
average_distance_per_day = airline_df.groupBy("date").agg(avg("distance").alias("average_distance"))
print("Average Flight Distance per Day: ")
average_distance_per_day.show()

Average Flight Distance per Day: 
+----------+------------------+
|      date|  average_distance|
+----------+------------------+
|2023-07-04|            3900.0|
|2023-07-02|3233.3333333333335|
|2023-07-03|            2700.0|
|2023-07-01|2693.3333333333335|
+----------+------------------+



In [37]:
# 10. Create a New Column for On-Time Status
from pyspark.sql.functions import when
data_with_on_time = airline_df.withColumn("on_time", when(airline_df["delay_min"] == 0, True).otherwise(False))
print(" added colum on-time status: ")
data_with_on_time.show()

 added colum on-time status: 
+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+-------+
|flight_id|  airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|      date|on_time|
+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+-------+
|        1|    Delta|        DL123|   JFK|        LAX|2024-09-09 08:00:00|2024-09-09 11:00:00|       30|    3970|2023-07-01|  false|
|        2|   United|        UA456|   SFO|        ORD|2024-09-09 09:30:00|2024-09-09 15:00:00|       45|    2960|2023-07-01|  false|
|        3|Southwest|        SW789|   DAL|        ATL|2024-09-09 06:00:00|2024-09-09 08:30:00|        0|    1150|2023-07-01|   true|
|        4|    Delta|        DL124|   LAX|        JFK|2024-09-09 12:00:00|2024-09-09 20:00:00|       20|    3970|2023-07-02|  false|
|        5| American|        AA101|   M