In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [None]:
#For Using in Spark RDD
# getOrCreate() returns the already-running SparkContext if there is one, so
# re-running this cell (or running it after a SparkSession has been built) does
# not fail with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate()

In [None]:
#For Using in Spark DataFrame
# Build the session used by the DataFrame / SQL cells, or reuse it if one is
# already active.
spark = (
    SparkSession.builder
    .appName("Airline Performance")
    .getOrCreate()
)

In [None]:
#day of week delay mapper
def delayMap(s):
    """Turn one comma-separated flight line into a ``(key, delay)`` pair.

    The line is split on every comma (no CSV quote handling), so the column
    indices below are positions in that naive split.

    Args:
        s: One text line of the flights file.

    Returns:
        ``((col0, col2, col4), delay)`` where the three key columns have
        leading/trailing double quotes removed. ``delay`` is
        ``float(column 44)`` when columns 49 and 51 are both exactly
        ``"0.00"`` and column 44 is non-empty; otherwise it is the int ``0``.

    Raises:
        IndexError: If the line has fewer than 52 comma-separated fields.
        ValueError: If column 44 is selected but is not a valid float.
    """
    fields = s.split(",")

    # NOTE(review): columns 49/51 look like cancelled/diverted flags and
    # column 44 like the arrival delay -- confirm against the dataset layout.
    usable = fields[49] == "0.00" and fields[51] == "0.00" and fields[44] != ""

    # Key columns are presumably Year, Month, DayOfWeek (matches the SQL cells).
    key = tuple(fields[i].strip('"') for i in (0, 2, 4))

    return key, (float(fields[44]) if usable else 0)

In [None]:
#Spark RDD
%%timeit  -n1 -r1
flights = sc.textFile("hdfs:///airline/dataset")
delays = flights.map(delayMap)
sumCount = (0,0)
sumCountRDD = delays.aggregateByKey(sumCount, lambda a,b: (a[0] + b, a[1] + 1), lambda a,b: (a[0] + b[0], a[1] + b[1]))
delayAvg = sumCountRDD.mapValues(lambda v: v[0]/v[1])
delayAvg = delayAvg.sortBy(lambda x: x[1], ascending=False).collect()

In [None]:
#Spark InferSchema
%%timeit -n1 -r1

flights = spark.read.option("header",True).option("inferSchema" , True).csv('hdfs:///airline_performance/fft00')
flights.createOrReplaceTempView("flights")

result = spark.sql("select Year, Month, DayOfWeek , avg(ArrDelay) as Arrival_Delay from flights group by Year, Month, DayOfWeek order by Arrival_Delay desc").collect()

In [None]:
#Spark Schema Specified
%%timeit -n1 -r1
schema = StructType([
    StructField("Year", StringType()),
    StructField("Month", StringType()),
    StructField("DayOfWeek", StringType()),
    StructField("Arrival_Delay", IntegerType())
])

flights = spark.read.csv('hdfs:///airline/dataset', header=True, schema=schema)
flights.createOrReplaceTempView("flights")

result = spark.sql("select Year, Month, DayOfWeek , avg(Arrival_Delay) as Arrival_Delay from flights group by Year, Month, DayOfWeek order by Arrival_Delay desc").collect()

In [None]:
#Spark SQL
%%timeit -n1 -r1
flights = spark.read.option("header",True).csv('hdfs:///airline_performance/fft00')
flights.createOrReplaceTempView("flights")

result = spark.sql("select Year, Month, DayOfWeek , avg(cast(ArrDelay as int)) as Arrival_Delay from flights group by Year, Month, DayOfWeek order by Arrival_Delay desc").collect()