<a href="https://colab.research.google.com/github/TanishqLambhate/Data-Science-Training/blob/pyspark/Pyspark_Ex_Airline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark


In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

spark = SparkSession.builder.appName('test').getOrCreate()
csv_file="/content/Airline.csv"
df=spark.read.csv(csv_file,header=True,inferSchema=True)
df.show()

+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|flight_id|  airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|      date|
+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|        1|    Delta|        DL123|   JFK|        LAX|2024-09-09 08:00:00|2024-09-09 11:00:00|       30|    3970|01-07-2023|
|        2|   United|        UA456|   SFO|        ORD|2024-09-09 09:30:00|2024-09-09 15:00:00|       45|    2960|01-07-2023|
|        3|Southwest|        SW789|   DAL|        ATL|2024-09-09 06:00:00|2024-09-09 08:30:00|        0|    1150|01-07-2023|
|        4|    Delta|        DL124|   LAX|        JFK|2024-09-09 12:00:00|2024-09-09 20:00:00|       20|    3970|02-07-2023|
|        5| American|        AA101|   MIA|        DEN|2024-09-09 07:00:00|2024-09-09 10:00:00|       15|    2770|02-07-2023|


In [16]:
# 1. Find the Total Distance Traveled by Each Airline
# Group the data by airline and calculate the total distance traveled for
# each airline.
df.groupBy("airline").agg(sum("distance").alias("TotalDistance")).show()

# 2. Filter Flights with Delays Greater than 30 Minutes
# Filter the dataset to show only flights where the delay was greater than
# 30 minutes.
df.filter(col("delay_min") > 30).show()

# 3. Find the Flight with the Longest Distance
# Identify the flight that covered the longest distance.
df.orderBy(col("distance").desc()).first()

# 4. Calculate the Average Delay Time for Each Airline
# Group the data by airline and calculate the average delay time in
# minutes for each airline.
df.groupBy("airline").agg(avg("delay_min").alias("AverageDelay")).show()

# 5. Identify Flights That Were Not Delayed
# Filter the dataset to show only flights with delay_minutes = 0 .
df.filter(col("delay_min") == 0).show()


+---------+-------------+
|  airline|TotalDistance|
+---------+-------------+
|    Delta|        11840|
|   United|         5920|
|  JetBlue|         4180|
|Southwest|         2300|
| American|         5540|
+---------+-------------+

+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|flight_id|airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|      date|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+
|        2| United|        UA456|   SFO|        ORD|2024-09-09 09:30:00|2024-09-09 15:00:00|       45|    2960|01-07-2023|
+---------+-------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+

+---------+------------------+
|  airline|      AverageDelay|
+---------+------------------+
|    Delta|16.666666666666668|
|   United|              

In [19]:
# 6. Find the Top 3 Most Frequent Routes
# Group the data by origin and destination to find the top 3 most
# frequent flight routes.
df.groupBy("origin", "destination").count().orderBy(col("count").desc()).limit(3).show()

# 7. Calculate the Total Number of Flights per Day
# Group the data by date and calculate the total number of flights on
# each day.
df.groupBy("date").count().show()

# 8. Find the Airline with the Most Flights
# Identify the airline that operated the most flights.
df.groupBy("airline").count().orderBy(col("count").desc()).first()

# 9. Calculate the Average Flight Distance per Day
# Group the data by date and calculate the average flight distance for
# each day.
df.groupBy("date").agg(avg("distance").alias("AverageDistance")).show()

# 10. Create a New Column for On-Time Status
# Add a new column called on_time that indicates whether a flight was on
# time ( True if delay_minutes = 0 , otherwise False ).
df.withColumn("on_time", when(col("delay_min") == 0, True).otherwise(False)).show()


+------+-----------+-----+
|origin|destination|count|
+------+-----------+-----+
|   SFO|        ORD|    1|
|   LAX|        JFK|    1|
|   DEN|        MIA|    1|
+------+-----------+-----+

+----------+-----+
|      date|count|
+----------+-----+
|01-07-2023|    3|
|02-07-2023|    3|
|03-07-2023|    3|
|04-07-2023|    1|
+----------+-----+

+----------+------------------+
|      date|   AverageDistance|
+----------+------------------+
|01-07-2023|2693.3333333333335|
|02-07-2023|3233.3333333333335|
|03-07-2023|            2700.0|
|04-07-2023|            3900.0|
+----------+------------------+

+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+-------+
|flight_id|  airline|flight_number|origin|destination|     departure_time|       arrival_time|delay_min|distance|      date|on_time|
+---------+---------+-------------+------+-----------+-------------------+-------------------+---------+--------+----------+-------+
|