In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc, count, window
# Start (or reuse) the Spark session shared by every cell of this notebook.
spark = (
    SparkSession.builder
    .appName('trial_mini_project')
    .getOrCreate()
)

# Import the flight records from the dataset: the first row provides the
# column names and Spark infers each column's type from the values.
df = spark.read.csv('Data/2007.csv', header=True, inferSchema=True)

# Display the inferred column names and types.
df.printSchema()

root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- DayofMonth: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- DepTime: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- ArrTime: string (nullable = true)
 |-- CRSArrTime: integer (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNum: integer (nullable = true)
 |-- TailNum: string (nullable = true)
 |-- ActualElapsedTime: string (nullable = true)
 |-- CRSElapsedTime: string (nullable = true)
 |-- AirTime: string (nullable = true)
 |-- ArrDelay: string (nullable = true)
 |-- DepDelay: string (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Dest: string (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- TaxiIn: integer (nullable = true)
 |-- TaxiOut: integer (nullable = true)
 |-- Cancelled: integer (nullable = true)
 |-- CancellationCode: string (nullable = true)
 |-- Diverted: integer (nullable = true)
 |-- C

# 1) Find the most frequent tail number arriving at each destination


# ----------------------------------------------------------------------

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc, count, countDistinct, row_number
from pyspark.sql.window import Window

# Load the 2007 flight records from the local data folder.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True)

# DataFrames are immutable: na.drop() returns a NEW frame, so its result has to
# be kept for the rows with a null TailNum/Dest/Month to actually be removed.
df_q1 = df.na.drop(subset=['TailNum', 'Dest', 'Month'])

# Count how often each tail number arrived at each destination, looking only
# at flights that were not cancelled.
df_q11 = (
    df_q1.filter(df_q1.Cancelled == 0)
    .groupBy('Dest', 'TailNum')
    .agg(count('TailNum').alias("Count_TailNum"))
    .sort(desc('Count_TailNum'))
)

# Rank the tail numbers inside every destination, most arrivals first.
maxDest = Window.partitionBy('Dest').orderBy(col("Count_TailNum").desc())

# Keep only the top-ranked tail number of each destination.
# row_number() picks a single row per destination, so when two tail numbers
# tie for first place only one of them is reported.
flight_with_maxDest = (
    df_q11.withColumn("row", row_number().over(maxDest))
    .filter(col("row") == 1)
    .drop("row")
)

# NOTE(review): the banner says "month wise" although nothing here is grouped
# by month; the text is left unchanged so it still matches the recorded output.
print("Flights that covered maximum Dest month wise")

# Destinations listed by how often their most frequent tail number arrived.
flight_with_maxDest.sort(desc('Count_TailNum')).show()

Flights that covered maximum Dest month wise
+----+-------+-------------+
|Dest|TailNum|Count_TailNum|
+----+-------+-------------+
| HNL| N655BR|         2227|
| DEN| N455YV|         1377|
| LAX| N313AE|         1250|
| ORD| N680AE|         1177|
| DFW| N286AE|         1161|
| PHX| N988HA|         1079|
| SEA| N556AS|         1076|
| ATL| N634AS|         1017|
| LGA| N916DE|         1003|
| IAH| N15941|          994|
| SLC| N457SW|          982|
| OGG| N479HA|          853|
| SJC| N841AE|          835|
| CVG| N656CA|          760|
| CLT| N906FJ|          706|
| JFK| N197JB|          702|
| ANC| N768AS|          701|
| IAD| N859MJ|          676|
| SFO| N293SW|          668|
| PHL| N944UW|          636|
+----+-------+-------------+
only showing top 20 rows



In [3]:
# Disabled draft: the code below is wrapped in a bare triple-quoted string, so
# none of it runs. The string is the cell's only expression, which is why the
# notebook echoes it back as the cell output.
# NOTE(review): appears to be an earlier attempt at counting arrivals per
# (TailNum, Dest) pair — confirm it is no longer needed and delete the cell.
"""from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc, count, window
#importing data from local data folder
df = spark.read.csv('Data/2007.csv',inferSchema=True,header=True)

#selecting the required columns , this makes processing faster
df_q2 = df.select('Dest','TailNum', 'Cancelled')

df_q2_no_null = df_q2.filter((df_q2["TailNum"] != "0") & (df_q2["TailNum"] != "000000") & (df_q2["Cancelled"] != 1)) #removing rows with TailNum = 0/000000

#print(type(df_q2))
print(" +++++ "*10)
most_frequent = df_q2_no_null.groupBy("TailNum",'Dest').agg(count("TailNum").alias("Count_TailNum")).sort(col("Count_TailNum").desc()).show(1000)


#most_frequent.distinct().show()"""

'from pyspark.sql import SparkSession\nfrom pyspark.sql.functions import avg, col, desc, count, window\n#importing data from local data folder\ndf = spark.read.csv(\'Data/2007.csv\',inferSchema=True,header=True)\n\n#selecting the required columns , this makes processing faster\ndf_q2 = df.select(\'Dest\',\'TailNum\', \'Cancelled\')\n\ndf_q2_no_null = df_q2.filter((df_q2["TailNum"] != "0") & (df_q2["TailNum"] != "000000") & (df_q2["Cancelled"] != 1)) #removing rows with TailNum = 0/000000\n\n#print(type(df_q2))\nprint(" +++++ "*10)\nmost_frequent = df_q2_no_null.groupBy("TailNum",\'Dest\').agg(count("TailNum").alias("Count_TailNum")).sort(col("Count_TailNum").desc()).show(1000)\n\n\n#most_frequent.distinct().show()'

# ----------------------------------------------------------------------

# 2) Find the details of the flights cancelled in the last quarter of 2007



# ----------------------------------------------------------------------

In [4]:
# Load the 2007 flight records.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True)

# The original cell called df.na.drop() here and threw the result away, which
# does nothing because DataFrames are immutable. The call is removed rather
# than assigned: dropping every row that has a null in ANY column is not what
# this question needs and could discard the very rows being looked for.

# Flights cancelled during the last quarter (October to December); only the
# first five matching rows are displayed.
df.filter((df.Month >= 10) & (df.Month <= 12) & (df.Cancelled == 1)).show(5)


+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|2007|   10|         2|        2|     NA|      1930|     NA|      2150|           WN|      195

# ----------------------------------------------------------------------

# 3) Find out the average weather delays for a particular flight per month


# ----------------------------------------------------------------------

In [5]:
from pyspark.sql.functions import avg, col, desc

# Re-load the 2007 flight records (header row gives the column names).
df = spark.read.csv('Data/2007.csv', header=True, inferSchema=True)

# Mean WeatherDelay for each calendar month, listed in month order.
# NOTE(review): the section heading asks for the delay "for a particular
# flight per month", but this aggregates over all flights of the month —
# confirm whether FlightNum should also be a grouping key.
(
    df.groupBy("Month")
    .agg(avg('WeatherDelay'))
    .orderBy("Month")
    .show()
)

+-----+-------------------+
|Month|  avg(WeatherDelay)|
+-----+-------------------+
|    1| 0.8126742594025668|
|    2| 1.1426651862433788|
|    3| 0.6333765638468795|
|    4|   0.51643216930666|
|    5| 0.6052272846017077|
|    6| 1.2763936562420544|
|    7| 1.0766004687307265|
|    8| 0.8375915956275956|
|    9|0.41135346150449775|
|   10|0.45674389516057345|
|   11| 0.3357768086867862|
|   12| 1.1352771929481762|
+-----+-------------------+



# ----------------------------------------------------------------------

# 4) In spite of NASDelay, SecurityDelay, LateAircraftDelay or WeatherDelay, which flights arrived on time



# ----------------------------------------------------------------------

In [6]:
# ArrDelay is inferred as a string column by a plain read (see the schema
# printed in the first cell), presumably because missing values are written as
# the literal text "NA". Declaring "NA" as the null marker lets Spark infer a
# numeric column, so the comparison below no longer depends on implicit
# string-to-number coercion.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True, nullValue='NA')

# At least one of the four delay causes was recorded for the flight ...
had_delay_cause = (
    (df.NASDelay > 0)
    | (df.SecurityDelay > 0)
    | (df.LateAircraftDelay > 0)
    | (df.WeatherDelay > 0)
)

# ... and it still arrived on time or early.
arrived_on_time = df.ArrDelay <= 0

# Number of flights satisfying both conditions.
df.filter(had_delay_cause & arrived_on_time).count()

0

# ----------------------------------------------------------------------

# 5) Month wise total distance travelled by each flight number in every month

# ----------------------------------------------------------------------

In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc


# Load the 2007 flight records.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True)

# Remove the placeholder tail numbers ("0" / "000000") BEFORE projecting: the
# original filtered on TailNum after a select() that had already dropped that
# column, which only works because Spark resolves it through the parent frame.
df_q2 = (
    df.filter((df["TailNum"] != "0") & (df["TailNum"] != "000000"))
    .select('Month', 'FlightNum', 'Distance')
)

print(type(df_q2))

# Total distance flown by each flight number in each month, ordered by month.
# The DataFrame itself is kept in total_distance; show() returns None, so
# assigning its result (as the original did) left nothing to reuse.
total_distance = df_q2.groupBy("Month", "FlightNum").sum("Distance").sort(col("Month"))
total_distance.show()


<class 'pyspark.sql.dataframe.DataFrame'>
+-----+---------+-------------+
|Month|FlightNum|sum(Distance)|
+-----+---------+-------------+
|    1|      739|       218313|
|    1|     2344|        71799|
|    1|     2285|        39326|
|    1|     2847|        73732|
|    1|      547|       215454|
|    1|     1726|       139989|
|    1|     2367|       103377|
|    1|     2478|        76863|
|    1|      847|       103350|
|    1|      381|       263587|
|    1|      152|       347567|
|    1|      541|       170169|
|    1|     2215|        54846|
|    1|     2682|        32871|
|    1|     2250|        36816|
|    1|     1207|       130163|
|    1|     2267|        28446|
|    1|     2699|        12831|
|    1|     7187|         5328|
|    1|     7174|        29747|
+-----+---------+-------------+
only showing top 20 rows



# ----------------------------------------------------------------------

# 6) Month-wise count of diverted flights (origin to destination)



# ----------------------------------------------------------------------

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc


# Load the 2007 flight records.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True)

# Keep the columns needed here and discard the placeholder tail numbers
# ("0" / "000000").
df_q2 = (
    df.select('Origin', 'Month', 'Dest', 'TailNum', 'Diverted')
    .filter((df["TailNum"] != "0") & (df["TailNum"] != "000000"))
)

print(" ++++ "*10)

# Month-wise total of diverted flights. The DataFrame is kept and shown in a
# separate step because show() returns None.
total_sum_diverted = df_q2.groupBy("Month").sum("Diverted").sort(col("Month"))
total_sum_diverted.show()

print(" ++++ "*10)

# Month-wise total of diverted flights for every origin/destination pair
# (first 12 rows). The original stored this under the unrelated name
# total_distance and bound it to the None returned by show().
diverted_by_route = df_q2.groupBy("Month", 'Origin', 'Dest').sum("Diverted").sort(col("Month"))
diverted_by_route.show(12)

print(" ++++ "*10)

 ++++  ++++  ++++  ++++  ++++  ++++  ++++  ++++  ++++  ++++ 
+-----+-------------+
|Month|sum(Diverted)|
+-----+-------------+
|    1|         1200|
|    2|         1261|
|    3|         1275|
|    4|         1193|
|    5|         1442|
|    6|         2199|
|    7|         2150|
|    8|         2100|
|    9|          942|
|   10|          977|
|   11|          845|
|   12|         1488|
+-----+-------------+

 ++++  ++++  ++++  ++++  ++++  ++++  ++++  ++++  ++++  ++++ 
+-----+------+----+-------------+
|Month|Origin|Dest|sum(Diverted)|
+-----+------+----+-------------+
|    1|   STL| TUL|            0|
|    1|   TPA| PHX|            1|
|    1|   DEN| LAS|            0|
|    1|   MCI| SEA|            0|
|    1|   MCO| ISP|            1|
|    1|   PDX| GEG|            0|
|    1|   IAH| MOB|            0|
|    1|   SDF| IAH|            0|
|    1|   CMH| DCA|            0|
|    1|   ATL| LEX|            0|
|    1|   CVG| IAD|            0|
|    1|   HSV| DCA|            0|
+-----+------+-

# ----------------------------------------------------------------------

# 7) Week and month wise number of trips in all the flights



# ----------------------------------------------------------------------

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc


# Load the 2007 flight records.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True)

# Only flights that actually operated count as trips.
df_no_cancel = df.filter('Cancelled== 0')

# Discard the placeholder tail numbers ("0" / "000000") while TailNum is still
# in the frame, then keep the three columns needed. The original repeated the
# Cancelled == 0 test here and referenced columns of `df` after the projection;
# both are unnecessary once the filter runs on df_no_cancel itself.
df_q2 = (
    df_no_cancel
    .filter((df_no_cancel["TailNum"] != "0") & (df_no_cancel["TailNum"] != "000000"))
    .select('Month', 'DayOfWeek', 'TailNum')
)

# Number of trips of every tail number per month and day of week.
# NOTE(review): the heading says "week wise" while the grouping key is
# DayOfWeek — confirm that this is the intended reading.
# The DataFrame is kept under a descriptive name (the original bound the None
# returned by show() to `total_distance`).
trips_per_tail = (
    df_q2.groupBy("Month", 'DayOfWeek', 'TailNum')
    .agg(count("TailNum").alias("total_trips"))
    .sort(['Month', 'DayOfWeek'])
)
trips_per_tail.show()

+-----+---------+-------+-----------+
|Month|DayOfWeek|TailNum|total_trips|
+-----+---------+-------+-----------+
|    1|        1| N938UA|         21|
|    1|        1|   N793|         17|
|    1|        1| N27506|         13|
|    1|        1| N444YV|         30|
|    1|        1| N611SW|         39|
|    1|        1| N594SW|         17|
|    1|        1| N915EV|         24|
|    1|        1| N509AA|         21|
|    1|        1| N489UA|         14|
|    1|        1| N679DA|         20|
|    1|        1|  N6701|         11|
|    1|        1| N919DL|         20|
|    1|        1| N907EV|         23|
|    1|        1| N981AT|         25|
|    1|        1| N583NW|          9|
|    1|        1| N304US|         20|
|    1|        1| N372NW|         19|
|    1|        1| N601NW|         25|
|    1|        1| N5EFAA|         15|
|    1|        1| N4YBAA|         16|
+-----+---------+-------+-----------+
only showing top 20 rows



# ----------------------------------------------------------------------

In [10]:
#total_distance.sort(['Month','DayOfWeek']).show()

# 8) Which flights covered maximum origin and destination by month wise


# ----------------------------------------------------------------------

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc,expr

# Load the 2007 flight records from the local data folder.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True)

# Select only the required columns and discard the placeholder tail numbers
# ("0" / "000000").
df_q2 = (
    df.select('Origin', 'Month', 'Dest', 'TailNum', 'FlightNum', 'Cancelled')
    .filter((df["TailNum"] != "0") & (df["TailNum"] != "000000"))
)

# Remove rows without an origin or destination, and cancelled flights.
# isNotNull() states the intent directly; the original compared against the
# text "null", which is a string comparison and only excluded real nulls as a
# side effect of SQL null semantics.
df_q2_no_null = df_q2.filter(
    df_q2["Origin"].isNotNull()
    & df_q2["Dest"].isNotNull()
    & (df_q2["Cancelled"] != 1)
)

print(type(df_q2))

print(" --++--  "*10 + " \n With respect to Tail Number \n" + " --++-- "*10)

# Trips per (month, route, tail number), busiest first. The DataFrame is kept
# and shown separately because show() returns None.
most_frequent_TailNum = (
    df_q2_no_null.groupBy('Month', 'Origin', 'Dest', 'TailNum')
    .agg(count("TailNum").alias("Total_trips_TailNum"))
    .sort(col("Total_trips_TailNum").desc())
)
most_frequent_TailNum.show()

print(" --++--  "*10 + " \n With respect to Flight Number \n" + " --++-- "*10)

# Trips per (month, route, flight number), busiest first. This frame is reused
# by the next cell.
most_frequent_FlightNum = (
    df_q2_no_null.groupBy('Month', 'Origin', 'Dest', 'FlightNum')
    .agg(count("FlightNum").alias("Total_trips_FlightNum"))
    .sort(col("Total_trips_FlightNum").desc())
)

most_frequent_FlightNum.show()

<class 'pyspark.sql.dataframe.DataFrame'>
 --++--   --++--   --++--   --++--   --++--   --++--   --++--   --++--   --++--   --++--   
 With respect to Tail Number 
 --++--  --++--  --++--  --++--  --++--  --++--  --++--  --++--  --++--  --++-- 
+-----+------+----+-------+-------------------+
|Month|Origin|Dest|TailNum|Total_trips_TailNum|
+-----+------+----+-------+-------------------+
|    8|   HNL| OGG| N841AL|                 94|
|    7|   HNL| OGG| N841AL|                 86|
|   12|   OGG| HNL| N841AL|                 86|
|   12|   HNL| OGG| N841AL|                 86|
|    8|   OGG| HNL| N485HA|                 85|
|   10|   HNL| KOA| N655BR|                 84|
|   10|   KOA| HNL| N655BR|                 84|
|    1|   KOA| HNL| N646BR|                 83|
|    1|   HNL| KOA| N646BR|                 82|
|    7|   LIH| HNL| N841AL|                 80|
|    5|   HNL| OGG| N836AL|                 79|
|    5|   BOS| LGA| N908DE|                 78|
|   10|   LGA| BOS| N911DE|        

# ----------------------------------------------------------------------

In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc, count, countDistinct, row_number
from pyspark.sql.window import Window

# Reuse the per-month route counts computed in the previous cell. They get
# their own name instead of being assigned to `df`, which every other cell
# uses for the raw flight records.
route_trip_counts = most_frequent_FlightNum

# Rank the (month, flight number) rows of every origin/destination pair by
# their trip count, highest first.
maxorigin = Window.partitionBy('Origin', 'Dest').orderBy(col("Total_trips_FlightNum").desc())

# Keep the single top-ranked row of each route. row_number() reports exactly
# one row per route, so ties for first place are resolved arbitrarily.
flight_with_maxorigin = (
    route_trip_counts.withColumn("row", row_number().over(maxorigin))
    .filter(col("row") == 1)
    .drop("row")
)

# NOTE(review): the banner mentions "maximum origins month wise", but the
# window is partitioned by route (Origin, Dest), not by month; the text is
# left unchanged so it still matches the recorded output.
print("Flights that covered maximum origins month wise")

flight_with_maxorigin.show()


Flights that covered maximum origins month wise
+-----+------+----+---------+---------------------+
|Month|Origin|Dest|FlightNum|Total_trips_FlightNum|
+-----+------+----+---------+---------------------+
|    5|   ADK| AKN|      139|                    9|
|   10|   ATL| GSP|      678|                   31|
|    2|   AVP| JFK|     4941|                    1|
|    8|   BFL| SAN|      446|                   31|
|    5|   BQN| MCO|      724|                   31|
|    3|   CLE| SJU|     1412|                    5|
|    1|   EWR| STT|     1884|                   31|
|    5|   FSD| ATL|     5344|                   31|
|    8|   LAS| LIT|     1045|                   31|
|    3|   LAX| OXR|     6103|                   31|
|    1|   MCI| IAH|     2547|                   31|
|   12|   MLI| MCO|      392|                   15|
|    3|   MSP| AVL|     2900|                   31|
|    7|   ORD| PDX|      671|                   31|
|    8|   PBI| DCA|     1282|                   31|
|   10|   PHL| M

In [13]:
#8.Which flights covered maximum origin and destination by month wise
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc,expr

# Load the 2007 flight records from the local data folder.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True)

# Remove rows with a null in any column this analysis relies on. na.drop()
# returns a NEW DataFrame; the original called it without keeping the result,
# so no row was ever removed.
df8 = df.na.drop(subset=['Origin', 'Dest', 'Month', 'FlightNum', 'Cancelled'])

# --- Flight number serving the most distinct origins in each month ---

# Distinct origins per (month, flight number), non-cancelled flights only.
df81 = (
    df8.filter(df8.Cancelled == 0)
    .groupBy('Month', 'FlightNum')
    .agg(countDistinct('Origin').alias("Max_Origins"))
    .sort('Month', desc('Max_Origins'))
)

# Rank the flight numbers inside each month, most origins first.
maxorigin = Window.partitionBy('Month').orderBy(col("Max_Origins").desc())

# Keep the top-ranked flight number of every month. row_number() reports one
# row per month, so ties for first place are resolved arbitrarily.
flight_with_maxorigin = (
    df81.withColumn("row", row_number().over(maxorigin))
    .filter(col("row") == 1)
    .drop("row")
)

print("Flights that covered maximum origins month wise")

flight_with_maxorigin.show()

# --- Flight number serving the most distinct destinations in each month ---

# Distinct destinations per (month, flight number), non-cancelled flights only.
df82 = (
    df8.filter(df8.Cancelled == 0)
    .groupBy('Month', 'FlightNum')
    .agg(countDistinct('Dest').alias("Max_Destinations"))
    .sort('Month', desc('Max_Destinations'))
)

# Rank the flight numbers inside each month, most destinations first.
maxdest = Window.partitionBy('Month').orderBy(col("Max_Destinations").desc())

# Keep the top-ranked flight number of every month (same tie caveat as above).
flight_with_maxdest = (
    df82.withColumn("row", row_number().over(maxdest))
    .filter(col("row") == 1)
    .drop("row")
)

print("Flights that covered maximum destinations month wise")

flight_with_maxdest.show()




Flights that covered maximum origins month wise
+-----+---------+-----------+
|Month|FlightNum|Max_Origins|
+-----+---------+-----------+
|   12|      151|         22|
|    1|      433|         18|
|    6|      226|         18|
|    3|      644|         18|
|    5|      644|         17|
|    9|       62|         20|
|    4|      644|         17|
|    8|       67|         18|
|    7|      425|         17|
|   10|       66|         20|
|   11|      303|         21|
|    2|      500|         18|
+-----+---------+-----------+

Flights that covered maximum destinations month wise
+-----+---------+----------------+
|Month|FlightNum|Max_Destinations|
+-----+---------+----------------+
|   12|      151|              20|
|    1|      372|              18|
|    6|      308|              18|
|    3|      432|              17|
|    5|      644|              19|
|    9|      385|              18|
|    4|      473|              17|
|    8|       67|              18|
|    7|      425|              17

# ----------------------------------------------------------------------

# 9) Average month wise arrival delay (flightnum wise)



# ----------------------------------------------------------------------

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc


# ArrDelay is inferred as a string column by a plain read (see the schema
# printed in the first cell), presumably because missing values are written as
# the literal text "NA". Declaring "NA" as the null marker lets Spark infer a
# numeric column, so avg() works on real numbers instead of relying on an
# implicit string-to-number cast.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True, nullValue='NA')

# Cancelled flights never arrive, so they are excluded from the average.
df_no_cancel = df.filter('Cancelled== 0')
df_q2 = df_no_cancel.select('Month', 'ArrDelay', 'FlightNum')

# Average arrival delay of each flight number in each month, ordered by month.
# The DataFrame is kept under a descriptive name (the original bound the None
# returned by show() to `total_distance`).
avg_arrival_delay = (
    df_q2.groupBy("Month", "FlightNum")
    .agg(avg("ArrDelay").alias("avg_monthly_arrival_delay"))
    .sort(['Month'])
)
avg_arrival_delay.show()

+-----+---------+-------------------------+
|Month|FlightNum|avg_monthly_arrival_delay|
+-----+---------+-------------------------+
|    1|       59|        6.102941176470588|
|    1|      239|        6.021739130434782|
|    1|     1972|        6.168539325842697|
|    1|       96|       10.427631578947368|
|    1|     2543|        5.857142857142857|
|    1|     1670|        6.069767441860465|
|    1|     1170|        19.62439024390244|
|    1|     1839|       2.7282608695652173|
|    1|     1703|         9.09727626459144|
|    1|      318|        8.606666666666667|
|    1|     2592|        1.911764705882353|
|    1|     1185|        12.43661971830986|
|    1|     1430|       15.057692307692308|
|    1|     1799|        6.566666666666666|
|    1|     2759|        42.69565217391305|
|    1|     3058|       10.226666666666667|
|    1|     3014|       1.8641975308641976|
|    1|     2898|       0.7391304347826086|
|    1|     7109|       16.894736842105264|
|    1|     5448|       1.425925

# ----------------------------------------------------------------------

# 10) Average month wise departure delay (flightnum wise)

# ----------------------------------------------------------------------

In [15]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, desc


# DepDelay is inferred as a string column by a plain read (see the schema
# printed in the first cell), presumably because missing values are written as
# the literal text "NA". Declaring "NA" as the null marker lets Spark infer a
# numeric column, so avg() works on real numbers instead of relying on an
# implicit string-to-number cast.
df = spark.read.csv('Data/2007.csv', inferSchema=True, header=True, nullValue='NA')

# Cancelled flights never depart, so they are excluded from the average.
df_no_cancel = df.filter('Cancelled== 0')
df_q2 = df_no_cancel.select('Month', 'DepDelay', 'FlightNum')

# Average departure delay of each flight number in each month, ordered by
# month. The DataFrame is kept under a descriptive name (the original bound
# the None returned by show() to `total_distance`).
avg_departure_delay = (
    df_q2.groupBy("Month", "FlightNum")
    .agg(avg("DepDelay").alias("avg_monthly_departure_delay"))
    .sort(['Month'])
)
avg_departure_delay.show()

+-----+---------+---------------------------+
|Month|FlightNum|avg_monthly_departure_delay|
+-----+---------+---------------------------+
|    1|      739|         3.3737864077669903|
|    1|     2344|           7.40952380952381|
|    1|     2285|         -4.232558139534884|
|    1|     2847|          5.572327044025157|
|    1|      547|          17.09205020920502|
|    1|     1726|          5.578231292517007|
|    1|     2367|         3.8255813953488373|
|    1|     2478|       0.033707865168539325|
|    1|      847|         12.486842105263158|
|    1|      381|         4.7611940298507465|
|    1|      152|                  4.8515625|
|    1|      541|         12.436681222707424|
|    1|     2215|         1.4193548387096775|
|    1|     2682|          8.419354838709678|
|    1|     2250|         2.6056338028169015|
|    1|     1207|          9.425531914893616|
|    1|     2267|         31.345454545454544|
|    1|     2699|          5.810344827586207|
|    1|     7187|                 

# ----------------------------------------------------------------------