In [25]:
import math
import numpy as np # linear algebra
import pandas as pd # read_csv and such
import matplotlib.pyplot as plt # plotting and such
from io import StringIO # convert strings to buffers or something like that.

### Get the data and start messing with it
We read the data in. It's rather large so it would be better not to read all of it into memory at once!

In [2]:
# Flight-delay CSV files for 2015: airlines, airports and the flights themselves.
airlines_path = 'datasets/flight_delays_2015/airlines.csv'
airports_path = 'datasets/flight_delays_2015/airports.csv'
flights_path = 'datasets/flight_delays_2015/flights.csv'

# Weather CSV file.
weather_path = 'datasets/weather/2015.csv'

# Load the airlines table into a dataframe and preview its first rows.
# low_memory=False is passed so that column types can be inferred/converted.
airlines_df = pd.read_csv(filepath_or_buffer=airlines_path, low_memory=False)
airlines_df.head()

Unnamed: 0,IATA_CODE,AIRLINE
0,UA,United Air Lines Inc.
1,AA,American Airlines Inc.
2,US,US Airways Inc.
3,F9,Frontier Airlines Inc.
4,B6,JetBlue Airways


In [3]:
# Load the airports table into a dataframe and preview its first rows.
airports_df = pd.read_csv(filepath_or_buffer=airports_path, low_memory=False)
airports_df.head()

Unnamed: 0,IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE
0,ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.4404
1,ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.6819
2,ABQ,Albuquerque International Sunport,Albuquerque,NM,USA,35.04022,-106.60919
3,ABR,Aberdeen Regional Airport,Aberdeen,SD,USA,45.44906,-98.42183
4,ABY,Southwest Georgia Regional Airport,Albany,GA,USA,31.53552,-84.19447


In [4]:
# Load the flights table (every column) into a dataframe and preview its first rows.
flights_df = pd.read_csv(filepath_or_buffer=flights_path, low_memory=False)
flights_df.head()

Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,2015,1,1,4,AS,98,N407AS,ANC,SEA,5,...,408.0,-22.0,0,0,,,,,,
1,2015,1,1,4,AA,2336,N3KUAA,LAX,PBI,10,...,741.0,-9.0,0,0,,,,,,
2,2015,1,1,4,US,840,N171US,SFO,CLT,20,...,811.0,5.0,0,0,,,,,,
3,2015,1,1,4,AA,258,N3HYAA,LAX,MIA,20,...,756.0,-9.0,0,0,,,,,,
4,2015,1,1,4,AS,135,N527AS,SEA,ANC,25,...,259.0,-21.0,0,0,,,,,,


### We have to do some cleaning of the dataset
You can see that some of the values in the flights dataset come in as NaN.
This is because the CSV file contains some empty fields (`''`), which are read in as NaN.
We will have to treat these as 0 when doing maths on the delay times.

### Types of delays
Each flight's total delay time is the sum of the following columns:
`AIR_SYSTEM_DELAY`, `SECURITY_DELAY`, `AIRLINE_DELAY`, `LATE_AIRCRAFT_DELAY`, `WEATHER_DELAY`

### Let's plot some data
Let's see which airlines have the highest total departure delay time, with a barplot!
We see in the data that some of the departure delay times have negative values. This means that the
flight actually departed earlier than scheduled. From an initial look over the data we can
see that the time granularity is minutes.

In [5]:
# Read the flights CSV again, this time keeping only the airline code and the
# departure delay, which are the two columns used for the per-airline totals.
headers = ['AIRLINE', 'DEPARTURE_DELAY']
airline_delays_df = pd.read_csv(
    filepath_or_buffer=flights_path,
    usecols=headers,
    low_memory=False,
)
airline_delays_df.head()

Unnamed: 0,AIRLINE,DEPARTURE_DELAY
0,AS,-11.0
1,AA,-8.0
2,US,-2.0
3,AA,-5.0
4,AS,-1.0


So we can load just the columns that we want to plot. Now let's build the data for the plot.

In [47]:
# Map every airline's IATA code to a running total delay time, starting at 0.
delay_sums = {iata_code: 0 for iata_code in airlines_df['IATA_CODE']}

Now we take the first 500 rows of the flights dataset and add up the departure delay for each airline, storing the totals in our dictionary keyed by IATA_CODE.

In [50]:
# Total the departure delay per airline over the first 500 flights.
#
# The totals are rebuilt from scratch each time this cell runs, so executing
# it more than once no longer adds the same delays into delay_sums again.
sample_df = airline_delays_df.head(500)

# Missing delays (NaN) count as 0. Grouping by the AIRLINE column pairs each
# delay with the airline on its own row, so no hand-maintained row counter is
# needed and the result does not depend on the dataframe's index labels.
sample_totals = (
    sample_df['DEPARTURE_DELAY']
    .fillna(0)
    .groupby(sample_df['AIRLINE'])
    .sum()
)

# Start every airline listed in airlines_df at 0 so that carriers absent from
# the sample still appear, then overlay the computed totals. An airline code
# that occurs in the flights data but not in airlines_df is kept as well,
# rather than raising a KeyError.
delay_sums = {iata_code: 0 for iata_code in airlines_df['IATA_CODE']}
delay_sums.update(
    {iata_code: float(total) for iata_code, total in sample_totals.items()}
)
delay_sums

{'AA': 263.0,
 'AS': -175.0,
 'B6': 224.0,
 'DL': 333.0,
 'EV': 592.0,
 'F9': 100.0,
 'HA': -56.0,
 'MQ': 127.0,
 'NK': 142.0,
 'OO': 350.0,
 'UA': 254.0,
 'US': 10.0,
 'VX': 0,
 'WN': 131.0}