### Importing Libraries:

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Data Reading and Cleaning :

In [2]:
flight_05 = pd.read_csv('/Users/rak/Desktop/2005.csv')
flight_06 = pd.read_csv('/Users/rak/Desktop/2006.csv')
flight_07 = pd.read_csv('/Users/rak/Desktop/2007.csv')
flight_08 = pd.read_csv('/Users/rak/Desktop/2008.csv')
airports = pd.read_csv('/Users/rak/Desktop/airports.csv')
carriers = pd.read_csv('/Users/rak/Desktop/carriers.csv')
plane_data = pd.read_csv('/Users/rak/Desktop/plane-data.csv')

In [3]:
print(flight_05.shape)
print(flight_06.shape)
print(flight_07.shape)
print(flight_08.shape)

(7140596, 29)
(7141922, 29)
(7453215, 29)
(2389217, 29)


2008 has fewer entries and therefore will not be used


In [4]:
#Checking for null values

print(flight_05.isna().sum())
print(flight_06.isna().sum())
print(flight_07.isna().sum())

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               133730
CRSDepTime                 0
ArrTime               147758
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                    0
ActualElapsedTime     147758
CRSElapsedTime             0
AirTime               147758
ArrDelay              147758
DepDelay              133730
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     7006865
Diverted                   0
CarrierDelay               0
WeatherDelay               0
NASDelay                   0
SecurityDelay              0
LateAircraftDelay          0
dtype: int64
Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime               121934
C

Flights that have been delayed or diverted will affect the calculations of delay times and hence have been dropped

In [5]:
#Dropping rows with cancelled or diverted flights:

flight_05=flight_05[flight_05['Cancelled']==0] 
flight_05=flight_05[flight_05['Diverted']==0] 

flight_06=flight_06[flight_06['Cancelled']==0] 
flight_06=flight_06[flight_06['Diverted']==0] 

flight_07=flight_07[flight_07['Cancelled']==0] 
flight_07=flight_07[flight_07['Diverted']==0] 

In [6]:
#Checking for null values

print(flight_05.isna().sum())
print(flight_06.isna().sum())
print(flight_07.isna().sum())

Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
CRSDepTime                 0
ArrTime                    0
CRSArrTime                 0
UniqueCarrier              0
FlightNum                  0
TailNum                    0
ActualElapsedTime          0
CRSElapsedTime             0
AirTime                    0
ArrDelay                   0
DepDelay                   0
Origin                     0
Dest                       0
Distance                   0
TaxiIn                     0
TaxiOut                    0
Cancelled                  0
CancellationCode     6992837
Diverted                   0
CarrierDelay               0
WeatherDelay               0
NASDelay                   0
SecurityDelay              0
LateAircraftDelay          0
dtype: int64
Year                       0
Month                      0
DayofMonth                 0
DayOfWeek                  0
DepTime                    0
C

In [7]:
#Merging the Dataframes:

flight_data = pd.concat([flight_05, flight_06, flight_07],ignore_index=True)

In [8]:
flight_data

Unnamed: 0,Year,Month,DayofMonth,DayOfWeek,DepTime,CRSDepTime,ArrTime,CRSArrTime,UniqueCarrier,FlightNum,...,TaxiIn,TaxiOut,Cancelled,CancellationCode,Diverted,CarrierDelay,WeatherDelay,NASDelay,SecurityDelay,LateAircraftDelay
0,2005,1,28,5,1603.0,1605,1741.0,1759,UA,541,...,4,23,0,,0,0,0,0,0,0
1,2005,1,29,6,1559.0,1605,1736.0,1759,UA,541,...,6,15,0,,0,0,0,0,0,0
2,2005,1,30,7,1603.0,1610,1741.0,1805,UA,541,...,9,18,0,,0,0,0,0,0,0
3,2005,1,31,1,1556.0,1605,1726.0,1759,UA,541,...,11,10,0,,0,0,0,0,0,0
4,2005,1,2,7,1934.0,1900,2235.0,2232,UA,542,...,5,10,0,,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21271923,2007,12,15,6,1558.0,1605,1749.0,1736,DL,58,...,14,17,0,,0,0,0,0,0,0
21271924,2007,12,15,6,1902.0,1851,2110.0,2105,DL,59,...,6,21,0,,0,0,0,0,0,0
21271925,2007,12,15,6,1024.0,1025,1750.0,1735,DL,61,...,14,19,0,,0,0,0,15,0,0
21271926,2007,12,15,6,1353.0,1315,1658.0,1622,DL,62,...,11,14,0,,0,0,0,0,0,36


In [9]:
flight_data.shape

(21271928, 29)

In [10]:
print((flight_data['ArrDelay'] > 0).sum())

9714043


Converting Arrival and departure time to 'DateTime' format : 

In [11]:
flight_data['DepTime'] = pd.to_datetime(flight_data['DepTime'], format='%H%M', errors='coerce')
flight_data['ArrTime'] = pd.to_datetime(flight_data['ArrTime'], format='%H%M', errors='coerce')

### Question 1 : 

In [12]:
#Creating new columns for time of day, day of week and month

time = pd.cut(flight_data['DepTime'].dt.hour, bins=[0, 6, 12, 18, 24], labels=['Night', 'Morning', 'Afternoon', 'Evening'])
day = flight_data['DayOfWeek'].replace({1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', 4: 'Thursday', 5: 'Friday', 6: 'Saturday', 7: 'Sunday'})
month = flight_data['Month'].replace({1: 'January', 2: 'February', 3: 'March', 4: 'April', 5: 'May', 6: 'June', 7: 'July', 8: 'August', 9: 'September', 10: 'October', 11: 'November', 12: 'December'})

In [13]:
#Grouping the data by time of day, day of week and month

flight_data['TimeOfDay'] = time
flight_data['DayOfWeek'] = day
flight_data['Month'] = month

grouped_day = flight_data.groupby('TimeOfDay')
grouped_week = flight_data.groupby('DayOfWeek')
grouped_month = flight_data.groupby('Month')

In [14]:
#Calculating the number of flights in each time period:

time_flights = grouped_day.size()
day_flights = grouped_week.size()
month_flights = grouped_month.size()

In [15]:
#Calculating the total delay

flight_data['TotalDelay']= flight_data['ArrDelay'] + flight_data['DepDelay']

In [16]:
#Calculating adn displaying the Total average delay across each time period :
grouped_day.apply(lambda x: x.sort_values('TotalDelay'))
print(grouped_day['TotalDelay'].mean())

TimeOfDay
Night        -0.977566
Morning       7.397389
Afternoon    22.273795
Evening      47.039560
Name: TotalDelay, dtype: float64


In [None]:
grouped_week.apply(lambda x: x.sort_values('TotalDelay').mean())
print(grouped_week['TotalDelay'].mean())

  grouped_week.apply(lambda x: x.sort_values('TotalDelay').mean())


In [None]:
grouped_month.apply(lambda x: x.sort_values('TotalDelay').mean())
print(grouped_month['TotalDelay'].mean())

In [None]:


# group by time of day and calculate mean delay
grouped_time = flight_data.groupby('TimeOfDay')['TotalDelay'].mean()

# plot the results
grouped_time.plot.bar(x='TimeOfDay', y='TotalDelay', color='blue')
plt.title('Average delay by time of day')
plt.xlabel('Time of day')
plt.ylabel('Average delay (minutes)')
plt.show()

In [None]:
import calendar

# Create new columns for day of week and month name
flight_data['DayOfWeek'] = flight_data['DayOfWeek'].apply(lambda x: calendar.day_name[x])
flight_data['MonthName'] = flight_data['Month'].apply(lambda x: calendar.month_name[x])

# Group flights by day of week and month
grouped_day = flight_data.groupby('DayOfWeek')
grouped_month = flight_data.groupby('MonthName')

# Calculate average delay for each group
day_avg_delay = grouped_day['TotalDelay'].mean()
month_avg_delay = grouped_month['TotalDelay'].mean()

# Sort groups in chronological order
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_avg_delay = day_avg_delay.reindex(day_order)

month_order = list(calendar.month_name)[1:]
month_avg_delay = month_avg_delay.reindex(month_order)

print(day_avg_delay)
print(month_avg_delay)