# QUERY 2

Weekly percentage of flights delayed by weather, computed over the entire data set.

### The dataset has the following columns:

<br>1 Year -> 1994-2008 
<br>2 Month -> 1-12
<br>3 DayofMonth -> 1-31
<br>4 DayOfWeek -> 1 (Monday) - 7 (Sunday)
<br>5 DepTime -> actual departure time (local, hhmm)
<br>6 CRSDepTime -> scheduled departure time (local, hhmm)
<br>7 ArrTime -> actual arrival time (local, hhmm)
<br>8 CRSArrTime -> scheduled arrival time (local, hhmm)
<br>9 UniqueCarrier -> unique carrier code
<br>10 FlightNum -> flight number
<br>11 TailNum -> plane tail number
<br>12 ActualElapsedTime -> in minutes
<br>13 CRSElapsedTime -> in minutes
<br>14 AirTime -> in minutes
<br>15 ArrDelay -> arrival delay, in minutes
<br>16 DepDelay -> departure delay, in minutes
<br>17 Origin -> origin IATA airport code
<br>18 Dest -> destination IATA airport code
<br>19 Distance -> in miles
<br>20 TaxiIn -> taxi in time, in minutes
<br>21 TaxiOut -> taxi out time in minutes
<br>22 Cancelled -> was the flight cancelled?
<br>23 CancellationCode -> reason for cancellation
        (A = carrier, B = weather, C = NAS, D = security)
<br>24 Diverted -> 1 = yes, 0 = no
<br>25 CarrierDelay -> in minutes
<br>26 WeatherDelay -> in minutes
<br>27 NASDelay -> in minutes
<br>28 SecurityDelay -> in minutes
<br>29 LateAircraftDelay -> in minutes

In [9]:
from pyspark.sql import SQLContext
from pyspark.sql.functions import when
from pyspark.sql.functions import to_date, col, lit, concat, weekofyear
import pandas as pd

# SQL entry point used to read the yearly CSV files below.
# NOTE(review): `sc` is not defined in this file — presumably the SparkContext
# provided by the PySpark notebook/shell session; confirm before running elsewhere.
sqlContext = SQLContext(sc)

In [1]:
def loadYearCsv(year):
    """Load one year of flight records and tag every row with its date and week.

    Reads ``./Data/<year>.csv`` (first line treated as the header) through the
    module-level ``sqlContext``, keeps only the columns needed for the
    weather-delay query, and adds a ``Date`` column (built with ``to_date``
    from Year-Month-Day) and a ``Week`` column (``weekofyear`` of ``Date``).

    Args:
        year: the year whose CSV file is loaded; rendered with ``str()``.

    Returns:
        DataFrame[Year, Month, Day, DayOfWeek, WeatherDelay, Date, Week],
        sorted chronologically. The CSV is read without schema inference, so
        Year, Month, Day, DayOfWeek and WeatherDelay remain string columns.
    """
    filePath = "./Data/" + str(year) + ".csv"

    # Load the dataset from the CSV file into a dataframe.
    df = sqlContext.read.csv(filePath, header='true')

    # DataFrame[Year, Month, Day, DayOfWeek, WeatherDelay]
    df = df.select("Year", "Month", "DayofMonth", "DayOfWeek", "WeatherDelay") \
           .withColumnRenamed('DayofMonth', 'Day')

    # The columns are strings, so a plain orderBy would sort them
    # lexicographically (1, 10, 11, 12, 2, ...). Sort on integer casts to get
    # real chronological order while leaving the column types untouched.
    df = df.orderBy(col("Year").cast("int"),
                    col("Month").cast("int"),
                    col("Day").cast("int"))

    # Add column Date (YYYY-MM-DD).
    # DataFrame[Year, Month, Day, DayOfWeek, WeatherDelay, Date]
    df = df.withColumn(
        'Date',
        to_date(concat(col("Year"), lit('-'), col("Month"), lit('-'), col("Day"))))

    # Add column Week (week of the year of Date).
    # DataFrame[Year, Month, Day, DayOfWeek, WeatherDelay, Date, Week]
    df = df.withColumn("Week", weekofyear(df["Date"]))
    return df

In [2]:
# True when the first row of df falls on Monday..Thursday, False otherwise.
def fromMonToThu(df):
    """Tell whether the first row of ``df`` has DayOfWeek between 1 and 4.

    Only ``df.first()`` is inspected, so the caller decides which day that is
    by the way ``df`` is sorted. DayOfWeek is converted with ``int()`` before
    the comparison.
    """
    firstRow = df.first()
    return int(firstRow.DayOfWeek) <= 4

In [3]:
# True when the first row of df falls on Thursday..Sunday, False otherwise.
# To test the last day of a year, pass a dataframe sorted newest-first.
def fromThuToSun(df):
    """Tell whether the first row of ``df`` has DayOfWeek between 4 and 7.

    Only ``df.first()`` is inspected; the function does no sorting of its own.
    DayOfWeek is converted with ``int()`` before the comparison.
    """
    firstRow = df.first()
    return int(firstRow.DayOfWeek) >= 4

In [4]:
# The analysis works on whole weeks (numbered 1 to 52/53) rather than on
# calendar years, so the rows at the start of a year have to be aligned to a
# week boundary. This function performs that adjustment.

def fixFirstRows(df, year):
    """Align the beginning of a year's dataframe to the start of its week 1.

    Args:
        df: dataframe returned by ``loadYearCsv(year)``; its first row is
            taken as the first day of the year.
        year: the year ``df`` was loaded for (int).

    Returns:
        * first day is Fri-Sun: ``df`` without the January rows whose Week is
          52 or more, ordered by Date;
        * first day is a Monday: ``df`` unchanged;
        * first day is Tue-Thu: the December rows of the previous year that
          carry Week 1, followed by ``df``.
    """
    if not fromMonToThu(df):
        # The opening days of January carry a week number >= 52: drop them.
        notJanuary = col("Month") != '1'
        regularWeek = col("Week") < 52
        trimmed = df.where(notJanuary | regularWeek)
        return trimmed.orderBy("Date")

    if df.first().DayOfWeek == '1':
        # The year starts exactly on a Monday: nothing to adjust.
        return df

    # Week 1 started in the previous December: prepend those days.
    # No problem for year = 1994
    previousYear = loadYearCsv(year-1)
    inDecember = col("Month") == '12'
    inWeekOne = col("Week") == 1
    inPreviousYear = col("Year") == str(year-1)
    previousYear = previousYear.where(inDecember & inWeekOne & inPreviousYear)
    return previousYear.union(df)


In [5]:
# The analysis works on whole weeks (numbered 1 to 52/53) rather than on
# calendar years, so the rows at the end of a year have to be aligned to a
# week boundary. This function performs that adjustment.

def fixLastRows(df, year):
    """Align the end of a year's dataframe to the end of its last week.

    Args:
        df: dataframe for ``year``, possibly already extended with rows of the
            previous year by ``fixFirstRows``.
        year: the year being processed (int).

    Returns:
        * last day of ``year`` is Mon-Wed: ``df`` without the December rows
          that carry Week 1 (rows whose Year is ``year-1`` are always kept);
        * last day is a Sunday: ``df`` unchanged;
        * last day is Thu-Sat: ``df`` followed by the January rows of the next
          year whose Week is 52 or more.
    """
    # Newest-first view of this year's rows, used to read the weekday of the
    # last day present (31st of December).
    newestFirst = df.where(col("Year") == str(year)).orderBy(["Date"], ascending = False)

    if not fromThuToSun(newestFirst):
        # --> Week = 1
        notDecember = col("Month") != '12'
        notWeekOne = col("Week") != 1
        fromPreviousYear = col("Year") == str(year-1)
        return df.where(notDecember | notWeekOne | fromPreviousYear)

    if newestFirst.first().DayOfWeek == '7':
        # The year ends exactly on a Sunday: nothing to adjust.
        return df

    # --> week >= 52
    # no problem for year = 2008
    nextYear = loadYearCsv(year+1)
    nextYear = nextYear.where((col("Month") == '1') & (col("Week") >= 52))
    return df.union(nextYear)

In [6]:
def weeklyDelays(year):
    """Compute, per week of ``year``, the share of flights delayed by weather.

    The year's data is loaded with ``loadYearCsv`` and aligned to whole weeks
    with ``fixFirstRows`` / ``fixLastRows``. A flight counts as weather-delayed
    when its WeatherDelay is not 'NA' and is greater than 0.

    Args:
        year: the year to process (int).

    Returns:
        DataFrame[YearWeek, FlightsPerWeek, DelaysPerWeek, PercDelaysPerWeek]
        ordered by week number. ``YearWeek`` is the string "<year>-<week>";
        ``PercDelaysPerWeek`` is DelaysPerWeek / FlightsPerWeek (a ratio, not
        multiplied by 100). Weeks with no weather delay are left out.
    """
    # DataFrame[Year, Month, Day, DayOfWeek, WeatherDelay, Date, Week]
    df = loadYearCsv(year)
    df = fixFirstRows(df, year)
    df = fixLastRows(df, year)

    # DataFrame[Date, DayOfWeek, Week]
    d0 = df.select("Date", "DayOfWeek", "Week")

    # DataFrame[Date, Week, WeatherDelay]
    df = df.select("Date", "Week", "WeatherDelay")

    # DataFrame[Week, FlightsPerWeek]
    # Counting rows per week directly gives the same totals as counting per
    # day and then summing the daily counts.
    flightsPerWeek = df.groupBy("Week").count() \
        .withColumnRenamed("count", "FlightsPerWeek")

    # DataFrame[Week, DelaysPerWeek]
    # Delays are filtered so that there are no 0 or NA values.
    delaysPerWeek = df.filter(df["WeatherDelay"] != 'NA') \
        .filter(df["WeatherDelay"] > 0) \
        .groupBy("Week").count() \
        .withColumnRenamed("count", "DelaysPerWeek")

    # DataFrame[Week, FlightsPerWeek, DelaysPerWeek]
    # Left join so weeks without any weather delay survive with a 0 count.
    df = flightsPerWeek.join(delaysPerWeek, ["Week"], how='left')
    df = df.na.fill(0)

    # Share of delayed flights per week.
    # DataFrame[Week, FlightsPerWeek, DelaysPerWeek, PercDelaysPerWeek]
    df = df.withColumn('PercDelaysPerWeek', df["DelaysPerWeek"] / df["FlightsPerWeek"])

    # Week start date (Monday) and end date (Sunday).
    startDay = d0.filter(col("DayOfWeek") == '1').drop("DayOfWeek").distinct() \
        .withColumnRenamed("Date", "StartDay")
    endDay = d0.filter(col("DayOfWeek") == '7').drop("DayOfWeek").distinct() \
        .withColumnRenamed("Date", "EndDay")

    # DataFrame[Week, StartDay, EndDay]
    d0 = startDay.join(endDay, "Week")

    # DataFrame[Week, FlightsPerWeek, DelaysPerWeek, PercDelaysPerWeek, StartDay, EndDay]
    # StartDay/EndDay are not part of the returned columns; the inner join is
    # kept because it restricts the output to weeks that have both a Monday
    # and a Sunday row in the data.
    df = df.join(d0, "Week")

    # Comment out the line below to also output weeks with no delay.
    df = df.where(col("DelaysPerWeek") != 0)
    df = df.withColumn("YearWeek", concat(lit(year), lit('-'), col("Week")))

    # Sort on the numeric Week column before projecting it away: sorting on
    # the YearWeek string would place e.g. "1994-10" before "1994-2".
    df = df.orderBy("Week")
    return df.select("YearWeek", "FlightsPerWeek", "DelaysPerWeek", "PercDelaysPerWeek")



In [8]:
# Run the weekly weather-delay query for every year and collect the results
# into a single CSV file.
finalPath = "./Results/Query2/query2.csv"
years = list(range(1994, 2009))

# Open with 'w' rather than 'a': a header is written for the first year, so
# the file is meant to be produced from scratch. In append mode, re-running
# this cell duplicated every row and left a second header mid-file.
with open(finalPath, 'w') as f:
    for y in years:
        tmpDf = weeklyDelays(y)
        # Only the first year writes the column header.
        tmpDf.toPandas().to_csv(f, header=(y == years[0]))

