# QUERY 1


The percentage of cancelled flights per day, computed for every day in the entire data set.

### The dataset is composed of the following columns:

<br>1 Year -> 1994-2008 
<br>2 Month -> 1-12
<br>3 DayofMonth -> 1-31
<br>4 DayOfWeek -> 1 (Monday) - 7 (Sunday)
<br>5 DepTime -> actual departure time (local, hhmm)
<br>6 CRSDepTime -> scheduled departure time (local, hhmm)
<br>7 ArrTime -> actual arrival time (local, hhmm)
<br>8 CRSArrTime -> scheduled arrival time (local, hhmm)
<br>9 UniqueCarrier -> unique carrier code
<br>10 FlightNum -> flight number
<br>11 TailNum -> plane tail number
<br>12 ActualElapsedTime -> in minutes
<br>13 CRSElapsedTime -> in minutes
<br>14 AirTime -> in minutes
<br>15 ArrDelay -> arrival delay, in minutes
<br>16 DepDelay -> departure delay, in minutes
<br>17 Origin -> origin IATA airport code
<br>18 Dest -> destination IATA airport code
<br>19 Distance -> in miles
<br>20 TaxiIn -> taxi in time, in minutes
<br>21 TaxiOut -> taxi out time, in minutes
<br>22 Cancelled -> was the flight cancelled? (1 = yes, 0 = no)
<br>23 CancellationCode -> reason for cancellation
        (A = carrier, B = weather, C = NAS, D = security)
<br>24 Diverted -> 1 = yes, 0 = no
<br>25 CarrierDelay -> in minutes
<br>26 WeatherDelay -> in minutes
<br>27 NASDelay -> in minutes
<br>28 SecurityDelay -> in minutes
<br>29 LateAircraftDelay -> in minutes

In [1]:
from pyspark.sql import SQLContext
from pyspark.sql import functions as F
import pandas as pd
# SQL entry point used by the cells below to read the yearly CSV files.
# NOTE(review): `sc` is not defined anywhere in this file -- presumably it is the
# SparkContext injected by the PySpark shell / notebook kernel; confirm before
# running this outside such an environment.
sqlContext = SQLContext(sc)

In [2]:
# Given a year, it returns a dataframe like this:
# DataFrame[Year, Month, Day, FlightsPerDay, CancFlightsPerDay, PercCancelled]
def percentageCancelledFlights(year):
    """Compute, for every day of one year, the share of cancelled flights.

    The data is read from ``./Data/<year>.csv`` (first row is the header)
    through the module-level ``sqlContext``.

    Args:
        year: Year to process. It is converted with ``str()`` to build the
            CSV file name, so both ``1994`` and ``"1994"`` work.

    Returns:
        A Spark DataFrame with the columns Year, Month, Day, FlightsPerDay,
        CancFlightsPerDay and PercCancelled, ordered by Year, Month, Day.
        PercCancelled is the ratio CancFlightsPerDay / FlightsPerDay, i.e. a
        fraction that is NOT multiplied by 100.
    """
    filePath = "./Data/" + str(year) + ".csv"

    # Load the CSV and keep only the columns needed for this query.
    # DataFrame[Year, Month, Day, Cancelled]
    flights = (
        sqlContext.read.csv(filePath, header='true')
        .select("Year", "Month", "DayofMonth", "Cancelled")
        .withColumnRenamed("DayofMonth", "Day")
    )

    # 1 for a cancelled flight, 0 otherwise (including a missing value).
    cancelledFlag = F.when(F.col("Cancelled") == 1, 1).otherwise(0)

    # A single aggregation yields both counters. Joining a separate
    # "cancelled only" aggregate with an inner join would silently drop every
    # day on which no flight was cancelled; here such a day is kept with
    # CancFlightsPerDay = 0 and PercCancelled = 0.0.
    # DataFrame[Year, Month, Day, FlightsPerDay, CancFlightsPerDay]
    perDay = flights.groupBy("Year", "Month", "Day").agg(
        F.count(F.lit(1)).alias("FlightsPerDay"),
        F.sum(cancelledFlag).alias("CancFlightsPerDay"),
    )

    # DataFrame[Year, Month, Day, FlightsPerDay, CancFlightsPerDay, PercCancelled]
    perDay = perDay.withColumn(
        "PercCancelled", F.col("CancFlightsPerDay") / F.col("FlightsPerDay")
    )

    # The CSV is read without schema inference, so Month and Day are strings;
    # cast them to int so the ordering is chronological rather than
    # lexicographic ("10" would otherwise sort before "2").
    perDay = perDay.withColumn("Month", F.col("Month").cast("int"))
    perDay = perDay.withColumn("Day", F.col("Day").cast("int"))

    return perDay.orderBy("Year", "Month", "Day")

In [4]:
finalPath = "./Results/Query1/query1.csv"
years = list(range(1994, 2009,1))

# Write one CSV containing the per-day results of every year, one year after
# the other.
# - The file is opened in write mode so that re-running this cell replaces the
#   previous result; append mode would add a second header and a duplicate
#   copy of the data on every run.
# - newline='' is what pandas recommends for a file object handed to to_csv,
#   so that line endings are not translated a second time.
with open(finalPath, 'w', newline='') as f:
    for y in years:
        tmpDf = percentageCancelledFlights(y)
        # Only the first year emits the header row. The pandas index is still
        # written as the first, unnamed column (it restarts at 0 for each
        # year), which keeps the file layout unchanged.
        tmpDf.toPandas().to_csv(f, header=(y == years[0]))
