# Query 3

Compute the percentage of flights belonging to a given "distance group" that were able to
halve their departure delays by the time they arrived at their destinations.
Distance groups classify flights by their total distance in miles. Flights with distances
that are less than 200 miles belong in group 1, flights with distances that are between
200 and 399 miles belong in group 2, flights with distances that are between 400 and 599
miles belong in group 3, and so on. The last group (group 13) contains flights whose
distances are between 2400 and 2599 miles.

### The dataset contains the following columns:

<br>1 Year -> 1994-2008 
<br>2 Month -> 1-12
<br>3 DayofMonth -> 1-31
<br>4 DayOfWeek -> 1 (Monday) - 7 (Sunday)
<br>5 DepTime -> actual departure time (local, hhmm)
<br>6 CRSDepTime -> scheduled departure time (local, hhmm)
<br>7 ArrTime -> actual arrival time (local, hhmm)
<br>8 CRSArrTime -> scheduled arrival time (local, hhmm)
<br>9 UniqueCarrier -> unique carrier code
<br>10 FlightNum -> flight number
<br>11 TailNum -> plane tail number
<br>12 ActualElapsedTime -> in minutes
<br>13 CRSElapsedTime -> in minutes
<br>14 AirTime -> in minutes
<br>15 ArrDelay -> arrival delay, in minutes
<br>16 DepDelay -> departure delay, in minutes
<br>17 Origin -> origin IATA airport code
<br>18 Dest -> destination IATA airport code
<br>19 Distance -> in miles
<br>20 TaxiIn -> taxi in time, in minutes
<br>21 TaxiOut -> taxi out time, in minutes
<br>22 Cancelled -> was the flight cancelled?
<br>23 CancellationCode -> reason for cancellation
        (A = carrier, B = weather, C = NAS, D = security)
<br>24 Diverted -> 1 = yes, 0 = no
<br>25 CarrierDelay -> in minutes
<br>26 WeatherDelay -> in minutes
<br>27 NASDelay -> in minutes
<br>28 SecurityDelay -> in minutes
<br>29 LateAircraftDelay -> in minutes

In [1]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import when
from pyspark.sql.functions import udf
from pyspark.sql.functions import to_date, col, lit, concat, weekofyear
from pyspark.sql.types import IntegerType
import pandas as pd
import math

# Shared SQLContext; loadYearCsv uses it to read the yearly CSV files.
# NOTE(review): `sc` is not defined in this notebook -- presumably the
# SparkContext pre-created by the PySpark kernel/shell; confirm before running elsewhere.
sqlContext = SQLContext(sc)

In [2]:
def loadYearCsv(year):
    """Load one year of flight data and keep only the rows usable for this query.

    Reads "./Data/<year>.csv" (first line treated as the header) through the
    module-level `sqlContext`.

    Rows are kept only when DepDelay and ArrDelay are both different from the
    literal 'NA' and Distance is at most 2599.

    Returns:
        DataFrame[Distance, DepDelay, ArrDelay]
    """
    csv_path = "./Data/{}.csv".format(year)
    flights = sqlContext.read.csv(csv_path, header='true')

    # Flights without a recorded departure or arrival delay cannot be evaluated.
    delays_known = (flights["DepDelay"] != 'NA') & (flights["ArrDelay"] != 'NA')
    # 2599 miles is the upper bound of the last distance group.
    # NOTE(review): no schema is given to read.csv, so Distance is presumably a
    # string column and this comparison relies on Spark's implicit cast -- confirm.
    within_last_group = flights["Distance"] <= 2599

    return flights.where(delays_known & within_last_group) \
                  .select("Distance", "DepDelay", "ArrDelay")

In [3]:
# Assigns a flight to its 200-mile distance group
def distanceGroup(dist):
    """Return the 1-based distance group for a distance given in miles.

    Group 1 covers 0-199 miles, group 2 covers 200-399 miles, and so on.

    Args:
        dist: the distance, as an int or anything int() accepts (e.g. the
            numeric string read from the CSV).

    Returns:
        The group number as an int.

    Raises:
        ValueError / TypeError: if `dist` cannot be converted by int().
    """
    # Integer floor division stays in exact integer arithmetic and always
    # yields an int (which the IntegerType UDF wrapping this function needs),
    # unlike math.floor() applied to a float true division.
    return int(dist) // 200 + 1

In [4]:
# Flags flights whose arrival delay is at most half of their departure delay
def delayDifference(depDelay, arrDelay):
    """Return 1 if the flight at least halved its departure delay, else 0.

    Both arguments are converted with int(), so numeric strings are accepted.
    A negative departure delay always yields 0; otherwise the result is 1
    exactly when depDelay >= 2 * arrDelay.

    NOTE(review): a departure delay of exactly 0 passes the guard, so
    depDelay == 0 combined with arrDelay <= 0 yields 1 -- confirm this is
    the intended treatment of on-time departures.
    """
    dep_minutes = int(depDelay)
    arr_minutes = int(arrDelay)

    not_early = dep_minutes >= 0
    at_least_halved = dep_minutes >= 2 * arr_minutes
    return 1 if (not_early and at_least_halved) else 0

In [7]:
def halvesDelay(year):
    """Count, per distance group, the flights of `year` and those that halved their delay.

    Loads the year's data with loadYearCsv, tags every flight with its
    distance group (distanceGroup) and with a 0/1 flag telling whether it
    halved its departure delay (delayDifference), then aggregates per group.

    Args:
        year: the year whose CSV file is processed.

    Returns:
        DataFrame[DistGroup, FlightsPerDistGroup, HalvedPerDistGroup] with one
        row for every distance group present in the year's data.
    """
    df = loadYearCsv(year)
    udf_distanceGroup = udf(distanceGroup, IntegerType())
    udf_delayDifference = udf(delayDifference, IntegerType())

    # DataFrame[Distance, DepDelay, ArrDelay, DistGroup, HalvesDelay]
    df = df.withColumn('DistGroup', udf_distanceGroup("Distance"))
    df = df.withColumn('HalvesDelay', udf_delayDifference(df.DepDelay, df.ArrDelay))

    # Total number of flights per distance group
    # DataFrame[DistGroup, FlightsPerDistGroup]
    totals = df.groupBy("DistGroup").count() \
               .withColumnRenamed("count", "FlightsPerDistGroup")

    # Number of flights that halved their delay per distance group
    # DataFrame[DistGroup, HalvedPerDistGroup]
    halved = df.filter(df["HalvesDelay"] == 1).groupBy("DistGroup").count() \
               .withColumnRenamed("count", "HalvedPerDistGroup")

    # Left join + fill with 0: a group in which no flight halved its delay has
    # no row in `halved`. An inner join would drop that group entirely, and its
    # flights would then be missing from the totals summed over the years.
    # DataFrame[DistGroup, FlightsPerDistGroup, HalvedPerDistGroup]
    return totals.join(halved, "DistGroup", "left") \
                 .fillna(0, subset=["HalvedPerDistGroup"])

In [6]:
finalPath = "./Results/Query3/query3.csv"
years = list(range(1994, 2009,1))

# For every year, compute per distance group the total number of flights and
# the number of flights that managed to halve their delay, and stack the
# yearly results into dfFinal.
# dfFinal is seeded from None rather than by testing for the literal year 1994,
# so the loop stays correct when `years` is changed.
dfFinal = None
for y in years:
    tmpDf = halvesDelay(y)
    dfFinal = tmpDf if dfFinal is None else dfFinal.union(tmpDf)

# Sums the two counters over all years for every distance group
dfFinal = dfFinal.groupBy("DistGroup").sum("HalvedPerDistGroup", "FlightsPerDistGroup") \
                .withColumnRenamed("sum(HalvedPerDistGroup)", "HalvedPerDistGroup") \
                .withColumnRenamed("sum(FlightsPerDistGroup)", "FlightsPerDistGroup")
# PercHalved is stored as a fraction (halved / total); it is not multiplied by 100.
dfFinal = dfFinal.withColumn("PercHalved", dfFinal["HalvedPerDistGroup"]/dfFinal["FlightsPerDistGroup"])
# groupBy gives no ordering guarantee; sort so the output file is deterministic.
dfFinal = dfFinal.orderBy("DistGroup")

# Writes the result to csv. Passing the path lets pandas open the file in write
# mode, so re-running the cell replaces the previous result instead of
# appending a second header and a duplicate set of rows to it.
dfFinal.toPandas().to_csv(finalPath, header=True)
        