In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=c5e2b5f5e6fc86ee2000d5415033054164e0c9b2b024244471af5dfc0c6b16e7
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [7]:
from pyspark.sql import SparkSession
# NOTE: `sum` and `max` imported here shadow the Python builtins for the rest of the notebook.
from pyspark.sql.functions import avg, max, sum, when

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('airline_flight_data')
    .getOrCreate()
)

# Load the flight records. The first CSV row supplies the column names and the
# column types are inferred from the data instead of being read as strings.
airline_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/airline_flight_data.csv")
)

In [22]:
# 1. Total distance flown by each airline.
# `sum` is the Spark aggregate imported from pyspark.sql.functions, not the builtin.
total_distance_travelled = (
    airline_df
    .groupBy("airline")
    .agg(sum(airline_df["distance"]).alias("total_distance"))
)
print("Total Distance Traveled by Each Airline: ")
total_distance_travelled.show()

# 2. Flights whose delay exceeds 30 minutes.
# The comparison is strict, so a delay of exactly 30 is not included.
delayed_flights = airline_df.where(airline_df["delay_min"] > 30)
print("Flights with Delays Greater than 30 Minutes: ")
delayed_flights.show()

# 3. Find the Flight with the Longest Distance
# Sort by distance (largest first) and keep the top row. flight_id is added as a
# secondary, ascending sort key so that when several flights share the maximum
# distance the same row is returned on every run; with distance alone, the row
# chosen among ties is not guaranteed.
# NOTE(review): assumes flight_id is unique per flight - confirm against the data.
longest_flight = airline_df.orderBy(
    ["distance", "flight_id"], ascending=[False, True]
).limit(1)
print("Flight with the Longest Distance: ")
longest_flight.show()

# 4. Mean of delay_min for each airline.
average_delay = (
    airline_df
    .groupBy("airline")
    .agg(avg(airline_df["delay_min"]).alias("average_delay"))
)
print("Average Delay Time for Each Airline: ")
average_delay.show()

# 5. Flights that were not delayed, i.e. rows where delay_min is exactly 0.
not_delayed_flights = airline_df.where(airline_df["delay_min"] == 0)
print("Flights That Were Not Delayed: ")
not_delayed_flights.show()

# 6. Find the Top 3 Most Frequent Routes
# A route is an (origin, destination) pair. Routes are ranked by flight count,
# highest first. origin and destination are ascending tie-breakers so that the
# three rows kept by limit(3) are the same on every run when counts are equal;
# ordering by count alone leaves the choice among tied routes unspecified.
top_3_routes = (
    airline_df
    .groupBy("origin", "destination")
    .count()
    .orderBy(["count", "origin", "destination"], ascending=[False, True, True])
    .limit(3)
)
print("Top 3 Most Frequent Routes: ")
top_3_routes.show()

# 7. Number of flights on each day.
# The key is spelled "start_date " with a trailing space on purpose: that is the
# column name as it appears in the loaded DataFrame.
flights_per_day = airline_df.groupBy(airline_df["start_date "]).count()
print("Total Number of Flights per Day: ")
flights_per_day.show()

# 8. Find the Airline with the Most Flights
# Count flights per airline and keep the one with the highest count. The airline
# name is an ascending tie-breaker so that, if two airlines have the same number
# of flights, limit(1) returns the same airline on every run instead of an
# arbitrary one.
most_flights_airline = (
    airline_df
    .groupBy("airline")
    .count()
    .orderBy(["count", "airline"], ascending=[False, True])
    .limit(1)
)
print("Airline with the Most Flights: ")
most_flights_airline.show()

# 9. Mean flight distance on each day.
# "start_date " keeps its trailing space because that is the column's actual name
# in the loaded DataFrame.
average_distance_per_day = (
    airline_df
    .groupBy(airline_df["start_date "])
    .agg(avg(airline_df["distance"]).alias("average_distance"))
)
print("Average Flight Distance per Day: ")
average_distance_per_day.show()

# 10. Create a New Column for On-Time Status
# `when` is provided by the notebook's import cell rather than imported here, so
# all dependencies are declared in one place.
# on_time is True only when delay_min is exactly 0. Every other row, including
# one whose delay_min is null (the comparison is then not true), falls through
# to otherwise() and gets False, so the new column never contains nulls.
data_with_on_time = airline_df.withColumn(
    "on_time",
    when(airline_df["delay_min"] == 0, True).otherwise(False),
)
print(" added colum on-time status: ")
data_with_on_time.show()


Total Distance Traveled by Each Airline: 
+---------+--------------+
|  airline|total_distance|
+---------+--------------+
|    Delta|         11840|
|   United|          5920|
|  JetBlue|          4180|
|Southwest|          2300|
| American|          5540|
+---------+--------------+

Flights with Delays Greater than 30 Minutes: 
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+-----------+
|flight_id|airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|start_date |
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+-----------+
|        2| United|        UA456|   SFO|        ORD|2024-09-08 09:30:00|2024-09-08 15:00:00|       45|    2960| 2023-07-01|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+-----------+

Flight with the Longest Distance: 
+---------+-