In [1]:
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt

## Airline Data

In [2]:
# Load a single month of raw on-time data to see which columns are available.
sample_month_csv = 'data/ONTIME_2017.08.csv'
df_read = pd.read_csv(sample_month_csv)
df_read.columns

Index(['YEAR', 'QUARTER', 'MONTH', 'DAY_OF_MONTH', 'DAY_OF_WEEK', 'FL_DATE',
       'OP_UNIQUE_CARRIER', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER', 'TAIL_NUM',
       'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID', 'ORIGIN_AIRPORT_SEQ_ID',
       'ORIGIN_CITY_MARKET_ID', 'ORIGIN', 'ORIGIN_CITY_NAME',
       'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID', 'DEST_CITY_MARKET_ID', 'DEST',
       'DEST_CITY_NAME', 'DEST_STATE_ABR', 'CRS_DEP_TIME', 'DEP_TIME',
       'DEP_DELAY', 'WHEELS_OFF', 'WHEELS_ON', 'CRS_ARR_TIME', 'ARR_TIME',
       'CANCELLED', 'CANCELLATION_CODE', 'DIVERTED', 'CRS_ELAPSED_TIME',
       'ACTUAL_ELAPSED_TIME', 'AIR_TIME', 'FLIGHTS', 'DISTANCE',
       'CARRIER_DELAY', 'WEATHER_DELAY', 'NAS_DELAY', 'SECURITY_DELAY',
       'LATE_AIRCRAFT_DELAY', 'Unnamed: 42'],
      dtype='object')

In [4]:
# Show five randomly sampled rows of the raw file (no seed is set, so the rows
# differ from run to run).
df_read.sample(5)

Unnamed: 0,YEAR,QUARTER,MONTH,DAY_OF_MONTH,DAY_OF_WEEK,FL_DATE,OP_UNIQUE_CARRIER,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,...,ACTUAL_ELAPSED_TIME,AIR_TIME,FLIGHTS,DISTANCE,CARRIER_DELAY,WEATHER_DELAY,NAS_DELAY,SECURITY_DELAY,LATE_AIRCRAFT_DELAY,Unnamed: 42
27153,2017,3,8,30,3,2017-08-30,DL,19790,DL,N918DH,...,94.0,75.0,1,546,,,,,,
9210,2017,3,8,16,3,2017-08-16,AS,19930,AS,N264AK,...,332.0,295.0,1,2496,0.0,0.0,16.0,0.0,0.0,
14857,2017,3,8,3,4,2017-08-03,WN,19393,WN,N254WN,...,,,1,287,,,,,,
10578,2017,3,8,20,7,2017-08-20,AS,19930,AS,N472AS,...,266.0,241.0,1,1721,,,,,,
14299,2017,3,8,30,3,2017-08-30,AS,19930,AS,N557AS,...,364.0,327.0,1,2640,,,,,,


In [5]:
# One entry per monthly flight-data file, formatted 'YYYY.MM':
# August 2017 through July 2018 (twelve consecutive months).
FlightDataDates = (
    ['2017.{:02d}'.format(month) for month in range(8, 13)]
    + ['2018.{:02d}'.format(month) for month in range(1, 8)]
)

# For a quick trial run, restrict the list to the first two months:
#FlightDataDates = ['2017.08',
#                   '2017.09']

In [6]:
# Columns kept from each monthly flight file, in the order they appear in the
# frame returned by ReadFlightData.
FLIGHT_COLUMNS = ['FL_DATE','YEAR','MONTH','DAY_OF_MONTH','DAY_OF_WEEK','OP_UNIQUE_CARRIER',
                  'OP_CARRIER_FL_NUM','ORIGIN',
                  'DEST','CRS_ARR_TIME','ARR_TIME','DISTANCE','CANCELLED',
                  'CARRIER_DELAY','WEATHER_DELAY','NAS_DELAY', 'SECURITY_DELAY',
                  'LATE_AIRCRAFT_DELAY']

def ReadFlightData(DataDate):
    """Read one monthly flight file and strip out the unused columns.

    Parameters
    ----------
    DataDate : str
        Month identifier in the form '2018.04'; the file read is
        'data/ONTIME_<DataDate>.csv'.

    Returns
    -------
    pandas.DataFrame
        Only the columns listed in FLIGHT_COLUMNS, in that order.
    """
    # usecols makes read_csv skip the columns that would be discarded anyway,
    # which saves time and memory on every monthly file.
    df_read = pd.read_csv('data/ONTIME_'+DataDate+'.csv', usecols=FLIGHT_COLUMNS)
    # read_csv returns the selected columns in file order, so reindex them to
    # the documented order.
    df = df_read[FLIGHT_COLUMNS]
    return df

In [7]:
# Load every monthly file first, then combine them with a single concat.
# (DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies all accumulated rows on every iteration.)
monthly_frames = []
for DataDate in FlightDataDates:
    print(DataDate)
    monthly_frames.append(ReadFlightData(DataDate))

# ignore_index=True gives the combined frame a unique 0..n-1 row index instead
# of repeating each file's own row numbers.
df = pd.concat(monthly_frames, ignore_index=True)

2017.08
2017.09
2017.10
2017.11
2017.12
2018.01
2018.02
2018.03
2018.04
2018.05
2018.06
2018.07


In [None]:
# Preview the first rows of the combined flight table.
# (Debug shortcut, left disabled: load a single month instead of the full list.)
#df = ReadFlightData('2017.09')
df.head()

In [None]:
# Number of flights per origin airport code, most frequent first.
df['ORIGIN'].value_counts()

In [None]:
# (rows, columns) of the combined flight table.
df.shape

In [None]:
# Distinct airline (carrier) codes present in the data, in sorted order.
carrier_codes = df['OP_UNIQUE_CARRIER'].unique()
sorted(carrier_codes)

In [None]:
# Number of flights per FL_DATE value (one entry per distinct date).
df2 = df.groupby(['FL_DATE']).size()
print(df2.head())
print(df2.size)
# Take the x positions from the actual number of dates: a hard-coded 365 makes
# plt.plot raise a shape-mismatch error whenever the loaded data does not span
# exactly 365 distinct dates (e.g. when only a few months are loaded).
plt.plot(range(len(df2)), df2)
plt.xlabel('Day index (in FL_DATE order)')
plt.ylabel('Number of flights')
plt.title('Flights per day');

In [None]:
# Check for NaN: compare each column's non-null count with the total row count.
# NOTE(review): pandas omits the non-null counts for very large frames unless
# show_counts=True is passed — confirm the counts appear in the output.
df.info()

In [8]:
# Fill NaN in ARR_TIME with the value in CRS_ARR_TIME.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: under pandas Copy-on-Write the in-place call only modifies a
# temporary object and leaves df unchanged (pandas 2.2 already warns about it).
df['ARR_TIME'] = df['ARR_TIME'].fillna(df['CRS_ARR_TIME'])

### Add Delay column
By definition, a flight is late if its actual arrival time (ARR_TIME) is 15 or more minutes after its scheduled arrival time (CRS_ARR_TIME; CRS = Computerized Reservations System).

In [None]:
def _hhmm_to_minutes(hhmm):
    """Convert clock times encoded as hhmm numbers (e.g. 1455) to minutes after midnight."""
    return (hhmm // 100) * 60 + hhmm % 100

# ARR_TIME and CRS_ARR_TIME are local clock times encoded as hhmm (per the BTS
# on-time field descriptions), so subtracting the raw values does not give
# minutes: 1405 - 1355 is 50, but the real gap is 10 minutes.
arr_minutes = _hhmm_to_minutes(df['ARR_TIME'])
crs_minutes = _hhmm_to_minutes(df['CRS_ARR_TIME'])

# Wrap the difference into [-720, 720) so an arrival shortly after midnight
# against a schedule shortly before midnight counts as a small positive delay.
# Limitation: a delay of 12 hours or more wraps around and is not detected.
late_minutes = (arr_minutes - crs_minutes + 720) % 1440 - 720

# 1 = arrived 15 or more minutes after the scheduled time, 0 = otherwise.
df['Delay'] = (late_minutes >= 15).astype(int)

df.head()

In [None]:
# How many flights are on-time (0) and how many are delayed (1)?
df['Delay'].value_counts()

In [None]:
# Save the combined flight table to disk. With pandas' default settings the
# row index is written as the first, unnamed column.
df.to_csv('data/FlightData.csv')

In [None]:
# Tally flights per FL_DATE, split by the Delay flag. A vectorized groupby
# replaces the row-by-row iterrows() loop, which is very slow on a frame this
# size. Dictionary keys come out in sorted FL_DATE order.
is_delayed = df['Delay'].astype(bool)

# FL_DATE -> number of flights flagged as delayed (Delay is truthy)
DelayCount = df[is_delayed].groupby('FL_DATE').size().to_dict()
# FL_DATE -> number of flights not flagged as delayed (Delay is falsy)
OnTimeCount = df[~is_delayed].groupby('FL_DATE').size().to_dict()

In [None]:
# Display the per-date tallies of flights whose Delay flag is falsy (0).
OnTimeCount

In [None]:
# Bar chart of delayed flights per date (red).
delay_dates, delay_totals = DelayCount.keys(), DelayCount.values()
plt.bar(delay_dates, delay_totals, color='r');

In [None]:
# Bar chart of on-time flights per date (green).
ontime_dates, ontime_totals = OnTimeCount.keys(), OnTimeCount.values()
plt.bar(ontime_dates, ontime_totals, color='g');

## Weather Data

In [None]:
# Load the weather CSV into a DataFrame.
weather_csv = 'data/WeatherDayAverage.csv'
WeatherData = pd.read_csv(weather_csv)

In [None]:
# List the columns available in the weather table.
WeatherData.columns

In [None]:
# Preview the first rows of the weather table.
WeatherData.head()