### 0 - MODULES

all modules, libraries, imports and constant used in this program

In [458]:
import pandas as pd

FLIGHT_DELAY_INPUT = "FlightSchedule.csv" 
TOL = 0.0001

<h3> 1 - READ FLIGHT DETAIL</h3>

This function reads the flight detail from the csv file, <br> the file is organized as:
<ol>
<li>FL_DATE: day of the flight in format YYYY-mm-dd.</li>
<li>TAIL_NUM: aircraft registration number, unique to a single aircraft.</li>
<li>UNIQUE_CARRIER: flight carrier id.</li>
<li>FL_NUM: number of the flight</li>
<li>ORIGIN: departure airport code.</li>
<li>DEST: destination airport code.</li>
<li>CRS_DEP_TIME: scheduled departure time (local time: HHMM) shown in the carriers’ Computerized
Reservations Systems (CRS)</li>
<li>DEP_TIME: actual departure time (local time: HHMM)</li>
<li>DEP_DELAY: overall delay at departure. Difference in minutes (floating point number) between scheduled and actual departure time. Early departures set to 0.</li>
<li>CRS_ARR_TIME: scheduled arrival time (local time: HHMM) shown in the carriers’ Computerized
Reservations Systems (CRS)</li>
<li>ARR_TIME: actual arrival time (local time: HHMM)</li>
<li>ARR_DELAY: overall delay. Difference in minutes (floating point number) between scheduled and
actual arrival time. Early arrivals show negative numbers.</li>
<li>CARRIER_DELAY: delay in minutes (floating point number) caused by the carrier</li>
<li>WEATHER_DELAY: delay in minutes (floating point number) caused by the weather.</li>
<li>NAS_DELAY: delay in minutes (floating point number) caused by the National Air System (NAS).</li>
<li>SECURITY_DELAY: delay in minutes (floating point number) caused by the security</li>
<li>LATE_AIRCRAFT_DELAY: delay in minutes (floating point number) caused by the aircraft</li>
<li>There are some other fields in this dataset</li>
</ol>
Returns the pandas dataFrame filtering out the last column

In [459]:
import numpy as np
def readFileInputFlight(file:str)->pd.DataFrame:
    return pd.read_csv(file).iloc[:, :-1]

### 2 - ANSWER THE FOLLOWING QUESTION

Use the info() and describe() methods to analyze how your records are distributed. <br>
Before continuing, try to answer the following questions:
<ol>
<li>which type does each column have?</li>
<li>are there any missing values?</li>
<li>how many unique carriers are present?</li>
<li>how many unique airports are present?</li>
<li>from which time interval data were collected?</li>
</ol>

In [460]:
def answerQuestion(df:pd.DataFrame)->None:
    df.info() # 1

    df.describe() #2
    
    temp = df.loc[:, "UNIQUE_CARRIER"].unique()
    display(temp)
    print(len(temp))

    temp = (df.loc[:, "ORIGIN_AIRPORT_ID"] + df.loc[:, "DEST_AIRPORT_ID"]).unique() #4
    display(temp)
    print(len(temp))

    print(min(df.loc[:, "FL_DATE"]), max(df.loc[:, "FL_DATE"])) # 5
    
    

### 3 - Filter out all canceled flight

This function filters out all the canceled flight from the dataFrame

In [461]:
def filterCancelFlight(df:pd.DataFrame)->pd.DataFrame:
    return df[df.loc[:, 'CANCELLED'] < TOL].drop(columns=['CANCELLED', 'CANCELLATION_CODE'])

### 4 - QUERIES 
Use any pandas method and functionality to answer the following queries:
- how many flights had each carrier operated?
- for each carrier, compute the mean delay considering all possible reasons (due to the carrier,
weather, etc.)


In [462]:
def meanCarriersDelay(df: pd.DataFrame, vers:bool)->pd.Series:
    if vers:
        return (df.loc[:, [col for col in df.columns if 'delay' in col.strip().lower()]]
            .set_index(keys=df.loc[:, 'UNIQUE_CARRIER']).sum(axis=1, numeric_only=True)
            .groupby(['UNIQUE_CARRIER'], axis=0)
            .mean(numeric_only=True)) 
    else:
        return (df.loc[:, ['UNIQUE_CARRIER']+[col for col in df.columns if 'delay' in col.strip().lower()]]
            .groupby(['UNIQUE_CARRIER']).mean(numeric_only=True)) 


def queries(df:pd.DataFrame) -> None:
    print("Unique Carriers flight")
    ans = df.loc[:, 'UNIQUE_CARRIER'].value_counts()
    for k in ans.index:
        print(k, '\t', ans[k])
        
    print("Mean delay for carriers ")
    display(meanCarriersDelay(df, False))

### 5 - Adding columns
add two new columns to your DataFrame:
- weekday: it is the day of the week expressed as an integer number. Check out Pandas dayofweek
attribute.
- delaydelta: it is the difference between the arrival delay and the departure one.

In [463]:
def addWeekDay(df:pd.DataFrame)->pd.DataFrame:
    df.insert(loc=1, column='DayOfWeek', value=pd.Series(pd.to_datetime(df.loc[:, "FL_DATE"].values)).dt.day_name())

### MAIN FUNCTION
This is the main function of our program, it does:
<ol>
<li>Read the input flight file</li>
<li>Answers a set of questions</li>
<li>Filters out the cancelled flights</li>
<li>Answers a set of queries</li>
<li>Adds two new columns to the dataFrame</li>
</ol>

In [464]:
def main()->None:
    df = readFileInputFlight(FLIGHT_DELAY_INPUT) # 1
    #answerQuestion(df) # 2
    df = filterCancelFlight(df) # 3
    df = df.reindex(index=list(range(len(df))))
    # queries(df) # 4
    addWeekDay(df) # 5
    #addDelayDelta(df) # 5
    display(df)
    
main()

Unnamed: 0,FL_DATE,DayOfWeek,UNIQUE_CARRIER,AIRLINE_ID,TAIL_NUM,FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN_CITY_MARKET_ID,ORIGIN,...,WHEELS_ON,TAXI_IN,CRS_ARR_TIME,ARR_TIME,ARR_DELAY,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY
0,2017-01-01,Sunday,AA,19805.0,N787AA,1.0,12478.0,1247803.0,31703.0,JFK,...,1143.0,26.0,1142.0,1209.0,27.0,27.0,0.0,0.0,0.0,0.0
1,2017-01-01,Sunday,AA,19805.0,N783AA,2.0,12892.0,1289204.0,32575.0,LAX,...,1757.0,12.0,1727.0,1809.0,42.0,34.0,0.0,8.0,0.0,0.0
2,2017-01-01,Sunday,AA,19805.0,N791AA,4.0,12892.0,1289204.0,32575.0,LAX,...,2025.0,15.0,1958.0,2040.0,42.0,7.0,0.0,0.0,0.0,35.0
3,2017-01-01,Sunday,AA,19805.0,N391AA,5.0,11298.0,1129804.0,30194.0,DFW,...,1744.0,5.0,1612.0,1749.0,97.0,77.0,0.0,20.0,0.0,0.0
4,2017-01-01,Sunday,AA,19805.0,N346AA,6.0,13830.0,1383002.0,33830.0,OGG,...,631.0,11.0,600.0,642.0,42.0,0.0,0.0,42.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
441126,2017-01-31,Tuesday,DL,19790.0,N862DN,1074.0,14869.0,1486903.0,34614.0,SLC,...,2027.0,5.0,2053.0,2032.0,-21.0,,,,,
441127,2017-01-31,Tuesday,DL,19790.0,N933DL,1075.0,10397.0,1039705.0,30397.0,ATL,...,1004.0,4.0,1019.0,1008.0,-11.0,,,,,
441128,2017-01-31,Tuesday,DL,19790.0,N933DL,1075.0,12217.0,1221702.0,30255.0,HSV,...,1242.0,4.0,1311.0,1246.0,-25.0,,,,,
441129,2017-01-31,Tuesday,DL,19790.0,N968AT,1076.0,12951.0,1295104.0,32951.0,LFT,...,816.0,7.0,840.0,823.0,-17.0,,,,,
