# Query 5

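For each year, compute the share of cancelled flights attributable to each cancellation reason (A = carrier, B = weather, C = NAS, D = security). The share is taken over the flights that carry a cancellation code.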

### The dataset is composed in this way:

<br>1 Year -> 1994-2008 
<br>2 Month -> 1-12
<br>3 DayofMonth -> 1-31
<br>4 DayOfWeek -> 1 (Monday) - 7 (Sunday)
<br>5 DepTime -> actual departure time (local, hhmm)
<br>6 CRSDepTime -> scheduled departure time (local, hhmm)
<br>7 ArrTime -> actual arrival time (local, hhmm)
<br>8 CRSArrTime -> scheduled arrival time (local, hhmm)
<br>9 UniqueCarrier -> unique carrier code
<br>10 FlightNum -> flight number
<br>11 TailNum -> plane tail number
<br>12 ActualElapsedTime -> in minutes
<br>13 CRSElapsedTime -> in minutes
<br>14 AirTime -> in minutes
<br>15 ArrDelay -> arrival delay, in minutes
<br>16 DepDelay -> departure delay, in minutes
<br>17 Origin -> origin IATA airport code
<br>18 Dest -> destination IATA airport code
<br>19 Distance -> in miles
<br>20 TaxiIn -> taxi in time, in minutes
<br>21 TaxiOut -> taxi out time, in minutes
<br>22 Cancelled -> whether the flight was cancelled (presumably 1 = yes, 0 = no, as for Diverted)
<br>23 CancellationCode -> reason for cancellation
        (A = carrier, B = weather, C = NAS, D = security)
<br>24 Diverted -> 1 = yes, 0 = no
<br>25 CarrierDelay -> in minutes
<br>26 WeatherDelay -> in minutes
<br>27 NASDelay -> in minutes
<br>28 SecurityDelay -> in minutes
<br>29 LateAircraftDelay -> in minutes

In [1]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import when
from pyspark.sql.functions import to_date, col, lit, concat, weekofyear
from pyspark.sql.types import IntegerType
import pandas as pd
import math

# Shared SQL entry point used by the loader below to read the CSV files.
# NOTE(review): `sc` is not defined in this file — presumably the SparkContext
# injected by the pyspark shell / notebook kernel; confirm before running
# this as a standalone script.
sqlContext = SQLContext(sc)

In [2]:
def loadYearCsv(year):
    """Read one year's flight CSV and keep only the cancellation columns.

    The file is looked up at ``./Data/<year>.csv`` and its first row is
    treated as the header.

    :param year: value converted with ``str()`` to build the file name.
    :return: DataFrame[Year, CancellationCode, Cancelled], ordered by Year.
    """
    csvPath = "./Data/" + str(year) + ".csv"
    wantedColumns = ["Year", "CancellationCode", "Cancelled"]

    # No schema is passed to the reader
    # (presumably every column comes back as a string — TODO confirm).
    return (
        sqlContext.read.csv(csvPath, header='true')
        .select(*wantedColumns)
        .orderBy("Year")
    )

In [8]:
def reasonsCancellation(year):
    """Compute, for one year, how cancellations split across reason codes.

    Only rows whose CancellationCode is neither 'NA' nor 'null' are
    considered. For those rows the function counts the total and the number
    of rows with each code (A, B, C, D), then divides each per-code count by
    the total.

    :param year: year forwarded to ``loadYearCsv`` to select the input file.
    :return: DataFrame with columns Year, Total, ACanc, BCanc, CCanc, DCanc,
        PercA, PercB, PercC, PercD. The Perc* columns are fractions of Total
        (count / Total), not values multiplied by 100. If no row carries a
        cancellation code the DataFrame is empty.
    """
    # Local import: `count` is not part of the file-level imports.
    from pyspark.sql.functions import count

    # DataFrame[Year, CancellationCode, Cancelled]
    df = loadYearCsv(year)

    code = col("CancellationCode")
    reasons = ["A", "B", "C", "D"]

    # Keep only the flights that carry a cancellation code.
    coded = df.filter(code != 'NA').filter(code != 'null')

    # Single aggregation instead of one filtered count per reason joined on
    # Year: with inner joins, a reason with zero cancellations has no row for
    # the year and the join drops the whole year from the result. A
    # conditional count simply yields 0 for that reason. `when` without
    # `otherwise` produces null for non-matching rows, and `count` skips nulls.
    perReason = [count(when(code == r, 1)).alias(r + "Canc") for r in reasons]
    dfFinal = coded.groupBy("Year").agg(count(lit(1)).alias("Total"), *perReason)

    # Share of each reason over the total of coded cancellations.
    for r in reasons:
        dfFinal = dfFinal.withColumn("Perc" + r, dfFinal[r + "Canc"] / dfFinal["Total"])

    return dfFinal


In [9]:
# Runs the query for every year and collects the per-year rows in one CSV.
finalPath = "./Results/Query5/query5.csv"
years = list(range(1994, 2009, 1))

# Open in write mode so that re-running this cell replaces the previous
# results instead of appending a second copy (and a second header) to the
# same file. The handle stays open across the loop, so every year still ends
# up in the file.
with open(finalPath, 'w') as f:
    for y in years:
        tmpDf = reasonsCancellation(y)
        # Only the first year writes the CSV header row.
        tmpDf.toPandas().to_csv(f, header=(y == years[0]))
