### Importing Libraries : 

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import calendar

### Data Reading and Cleaning : 

In [7]:
from pathlib import Path

# Directory that holds the downloaded CSV files. Every read below goes through
# this single constant, so pointing the notebook at another machine or folder
# only requires editing this line.
DATA_DIR = Path('/Users/rak/Desktop')

# Flight records, one file per year:
flight_06 = pd.read_csv(DATA_DIR / '2006.csv')
flight_07 = pd.read_csv(DATA_DIR / '2007.csv')

# Supplementary airport, carrier and plane tables:
airports = pd.read_csv(DATA_DIR / 'airports.csv')
carriers = pd.read_csv(DATA_DIR / 'carriers.csv')
plane_data = pd.read_csv(DATA_DIR / 'plane-data.csv')

In [8]:
# Keep only the flights that were neither cancelled nor diverted.
# Both conditions are combined into a single boolean mask per year.

flight_06 = flight_06.loc[(flight_06['Cancelled'] == 0) & (flight_06['Diverted'] == 0)]

flight_07 = flight_07.loc[(flight_07['Cancelled'] == 0) & (flight_07['Diverted'] == 0)]

In [9]:
# Stacking the 2006 and 2007 records into one frame; the row index is rebuilt
# so it runs from 0 to n-1 across both years:

flight_data = pd.concat([flight_06, flight_07], axis=0, ignore_index=True)

### Question 4 :
#### Can you detect cascading failures as delays in one airport cause delays in others?

In order to detect cascading failures, the effect of a delayed flight on a subsequent flight needed to be analysed. Grouping by tail number shows how many flights each aircraft made in the two-year period (2006–2007). Sorting by the number of flights, we find that the aircraft with the highest number of flights flew around 8,000 flights in that period. This indicates that these planes make multiple flights a day, so tracking their path from one airport to another can help us analyse the impact of delays on successive flights.

In [10]:
# 'TotalDelay' is the arrival delay plus the departure delay of each flight:
flight_data['TotalDelay'] = flight_data['ArrDelay'].add(flight_data['DepDelay'])

# 'isDelay' flags a flight as delayed (1) when its total delay is positive, else 0:
flight_data['isDelay'] = flight_data['TotalDelay'].gt(0).astype(int)

In [11]:
# Counting the flights made by each aircraft (tail number); value_counts
# returns the counts ordered from most to least frequent:
flights = flight_data.loc[:, 'TailNum'].value_counts()
print(flights)

N308SW    8560
N478HA    8195
N479HA    8079
N480HA    8078
N485HA    8055
          ... 
N601QX       2
N194JB       1
N623US       1
N78009       1
N857NW       1
Name: TailNum, Length: 5811, dtype: int64


In [12]:
# Showing the 10 aircraft with the most flights:
flights.head(10)

N308SW    8560
N478HA    8195
N479HA    8079
N480HA    8078
N485HA    8055
N484HA    7953
N481HA    7947
N487HA    7861
N475HA    7844
N477HA    7842
Name: TailNum, dtype: int64

Here we can see the aircraft with the highest number of flights. The top 10 aircraft will be used to analyse cascading failures:

In [13]:
# Creating a list of the top 10 flights (tail numbers with the most flights).
# The list is taken directly from the 'flights' counts, which are already
# ordered from most to least frequent, so it stays in sync with the data
# instead of being copied by hand from the output above.
top_flights = flights.head(10).index.tolist()


In [14]:
# NOTE: this cell overwrites 'DepTime' and renames 'DayofMonth'; run it once per
# fresh kernel, after the data has been loaded.

# 'DepTime' encodes the departure clock time as the number hhmm (726 -> 07:26).
# Parsing that value with the '%H%M' format is ambiguous when it has fewer than
# four digits ('105' is read as 10:05 instead of 01:05, '45' as 04:05), and a
# float such as 726.0 does not match the format at all. The hours and minutes
# are therefore split arithmetically instead.
dep_hhmm = pd.to_numeric(flight_data['DepTime'], errors='coerce')
dep_minutes = dep_hhmm // 100 * 60 + dep_hhmm % 100

# Values outside 0000-2400 or with a minutes part above 59 are treated as missing.
# 2400 is kept and rolls over to 00:00 of the following day.
dep_valid = dep_hhmm.between(0, 2400) & (dep_hhmm % 100).between(0, 59)
dep_offset = pd.to_timedelta(dep_minutes.where(dep_valid), unit='m')

# Renaming 'DayofMonth' to 'Day' so that pd.to_datetime can assemble a date
# from the 'Year', 'Month' and 'Day' columns:
flight_data = flight_data.rename(columns={'DayofMonth': 'Day'})

# Creating 'Date' column from year, month and day:
flight_data['Date'] = pd.to_datetime(flight_data[['Year', 'Month', 'Day']])

# Creating 'DateTime' column holding the date together with the departure time.
# It is a real datetime column, so sorting on it is chronological; rows with a
# missing or invalid 'DepTime' get NaT and are placed last by sort_values.
flight_data['DateTime'] = flight_data['Date'] + dep_offset

# Changing 'DepTime' to a time-of-day object (NaT where the time is missing):
flight_data['DepTime'] = flight_data['DateTime'].dt.time

In [15]:
# Building the 'cascade' dataframe: every flight flown by aircraft N485HA,
# ordered by the 'DateTime' column created above
cascade = flight_data[flight_data['TailNum'].eq("N485HA")].sort_values(by='DateTime')

In [16]:
# Number of rows (flights) and columns in the 'cascade' dataframe:
cascade.shape

(8055, 33)

In [17]:
# Keeping only the columns that are relevant for the analysis:
cascade = cascade.loc[:, ['TailNum', 'ArrDelay', 'DepDelay', 'LateAircraftDelay','isDelay','DateTime', 'DepTime','Origin','Dest']]

In [18]:
# Previewing the first 20 flights of the sorted subset:
cascade.head(20)

Unnamed: 0,TailNum,ArrDelay,DepDelay,LateAircraftDelay,isDelay,DateTime,DepTime,Origin,Dest
335229,N485HA,-4.0,-4.0,0,0,2006-01-01 07:26:00,07:26:00,HNL,LIH
335260,N485HA,-5.0,-4.0,0,0,2006-01-01 08:28:00,08:28:00,LIH,HNL
337521,N485HA,-12.0,-11.0,0,0,2006-01-01 09:19:00,09:19:00,HNL,OGG
337552,N485HA,-6.0,-8.0,0,0,2006-01-01 10:27:00,10:27:00,OGG,HNL
336093,N485HA,-6.0,-7.0,0,0,2006-01-01 11:33:00,11:33:00,HNL,ITO
336062,N485HA,-4.0,-6.0,0,0,2006-01-01 12:54:00,12:54:00,ITO,HNL
337056,N485HA,-2.0,-2.0,0,0,2006-01-01 14:18:00,14:18:00,HNL,KOA
337025,N485HA,-4.0,-5.0,0,0,2006-01-01 15:28:00,15:28:00,KOA,HNL
336362,N485HA,-4.0,-4.0,0,0,2006-01-01 16:41:00,16:41:00,HNL,LIH
336393,N485HA,-9.0,-6.0,0,0,2006-01-01 17:44:00,17:44:00,LIH,HNL


Cascading delays were determined on the following criteria:

1. There was a delay in departure from the first airport
2. There was a delay in arrival in the following airport 
3. Along with the arrival delay, a late aircraft delay was detected in the second airport

If all three conditions were met, that particular flight would be deemed to have caused a cascading delay

In [19]:

def _percent(part, whole):
    """Return ``part / whole * 100``, or NaN when ``whole`` is zero."""
    return part / whole * 100 if whole else float('nan')


def _cascade_summary(tail_num, tail_flights):
    """Summarise delays and cascading delays for a single aircraft.

    Parameters
    ----------
    tail_num : str
        Tail number used to label the result row.
    tail_flights : pandas.DataFrame
        Flights of that aircraft. Must contain the columns 'DateTime',
        'ArrDelay', 'DepDelay', 'LateAircraftDelay' and 'isDelay'.

    Returns
    -------
    dict
        One record with the keys 'TailNum', 'Flights', 'Delayed', '% Delayed',
        '% Cascade' and '% cascade of Delayed flights'.
    """
    # Put the aircraft's flights in chronological order so that shift(-1)
    # looks at the flight that was flown next.
    ordered = tail_flights.sort_values(by=['DateTime'])

    # Arrival delay and late-aircraft delay recorded on the next flight
    # (NaN for the aircraft's last flight).
    next_arr = ordered['ArrDelay'].shift(-1)
    next_lateaircraft = ordered['LateAircraftDelay'].shift(-1)

    # A flight is counted as causing a cascading delay when it departed late and
    # the next flight has both a positive arrival delay and a positive
    # late-aircraft delay. Comparisons against NaN are False, so the last
    # flight is never counted.
    caused_cascade = (ordered['DepDelay'] > 0) & (next_arr > 0) & (next_lateaircraft > 0)

    n_flights = len(ordered)
    n_delayed = int(ordered['isDelay'].sum())
    n_cascade = int(caused_cascade.sum())

    return {
        'TailNum': tail_num,
        'Flights': n_flights,
        'Delayed': n_delayed,
        '% Delayed': _percent(n_delayed, n_flights),
        '% Cascade': _percent(n_cascade, n_flights),
        '% cascade of Delayed flights': _percent(n_cascade, n_delayed),
    }


# One summary record per aircraft in 'top_flights'; the records are turned into
# the results dataframe in a single step.
cascade_results = pd.DataFrame(
    [_cascade_summary(tail_num, flight_data.loc[flight_data['TailNum'] == tail_num])
     for tail_num in top_flights],
    columns=['TailNum', 'Flights', 'Delayed', '% Delayed', '% Cascade', '% cascade of Delayed flights'],
)

cascade_results
    


    

Unnamed: 0,TailNum,Flights,Delayed,% Delayed,% Cascade,% cascade of Delayed flights
0,N308SW,8560,3864,45.140187,6.530374,14.466874
1,N478HA,8195,1196,14.594265,1.366687,9.364548
2,N479HA,8079,1147,14.197302,1.027355,7.236269
3,N480HA,8078,1220,15.102748,1.609309,10.655738
4,N485HA,8055,1298,16.114215,1.477343,9.167951
5,N484HA,7953,1158,14.560543,1.257387,8.635579
6,N481HA,7947,1237,15.565622,1.321253,8.488278
7,N487HA,7861,1142,14.527414,1.424755,9.807356
8,N475HA,7844,1130,14.405915,1.236614,8.584071
9,N477HA,7842,1132,14.435093,1.275185,8.833922


In [22]:
# Averaging, over the top 10 flights, the per-aircraft share (in %) of delayed
# flights that were counted as cascading delays:
del_cas = cascade_results.loc[:, '% cascade of Delayed flights'].mean()

In [24]:
# Reporting the average computed above, rounded to two decimals:
rounded_del_cas = round(del_cas, 2)
print("The percentage of flights that were delayed that had cascading failures is:", rounded_del_cas)

The percentage of flights that were delayed that had cascading failures is: 9.52
