In [None]:
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# Apply seaborn's default theme to the matplotlib figures created in this notebook.
sns.set()

# Set Papermill Variables

In [None]:
# Papermill parameters (see the section heading): keep these as plain literal
# assignments so that they can be overridden per run.
# Input: zip-compressed CSV of flight records, read with pd.read_csv below.
input_path = "../data/flights/2019-01-flights.csv.zip"
# Output: destination of the preprocessed CSV written by the last cell.
output_path = "../data/output/2019-01-flights-preprocessed.csv"

# Read the data and inspect it

In [None]:
# Load the raw flight records from the configured input file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=input_path)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

- Look at `DEP_TIME`. It is stored in a strange format: a number whose digits encode the time as `hhmm` (e.g. `1133` for 11:33). We need to parse it.
- Implement a parser for this format. We should write tests for it, but that is beyond the scope of this tutorial.

In [None]:
# Show the dtype pandas inferred for each column.
df.dtypes

In [None]:
# DEP_TIME was read as float64 -- is that because some values are missing?
# Count the rows whose DEP_TIME is NaN.
int(df['DEP_TIME'].isna().sum())

In [None]:
# Restrict to flights that were neither cancelled nor diverted, then count how
# many missing departure and arrival times remain among them.
tdf = df.loc[df['CANCELLED'].eq(0) & df['DIVERTED'].eq(0)]
tuple(int(tdf[col].isna().sum()) for col in ('DEP_TIME', 'ARR_TIME'))

In [None]:
# For the moment, keep only the flights that were neither cancelled nor diverted.
df = df.loc[df['CANCELLED'].eq(0) & df['DIVERTED'].eq(0)]

# Fix Table Format

## Convert types of time fields and drop irrelevant fields

In [None]:
# Cast the actual departure/arrival times to int64 to make the later conversion
# to time values simpler, and drop the fields we do not care about.
# The columns are dropped via `columns=` rather than a positional axis argument:
# passing the axis positionally raises a TypeError on pandas >= 2.0.
df = (
    df
    .astype({'DEP_TIME': 'int64', 'ARR_TIME': 'int64'})
    .drop(columns=['CANCELLED', 'CANCELLATION_CODE', 'DIVERTED'])
)
df.head()

## Convert FL_DATE to datetime

In [None]:
# Parse FL_DATE into pandas datetime values, then preview the result.
df['FL_DATE'] = df['FL_DATE'].pipe(pd.to_datetime)
df.head(n=5)

## Convert CRS_*x*_TIME fields to time fields

### Convert the time fields by directly parsing them

This does not work correctly. It seems that some of the results are off by 24h.

In [None]:
# import datetime
# def int_to_time(inttime):
#     hours = inttime // 100
#     mins = inttime % 100
#     return datetime.time(hours, mins)
# # Should write tests here
# int_to_time(100), int_to_time(1133), int_to_time(2356)

In [None]:
# Convert int times to times
# df['CRS_DEP_TIME'] = df['CRS_DEP_TIME'].apply(int_to_time)

In [None]:
# df['DEP_TIME'].apply(int_to_time).head()
# whoops -- this fails (presumably some DEP_TIME values have an hour of 24, which datetime.time rejects)

In [None]:
# import datetime
# def int_to_time(inttime):
#     hours = inttime // 100 % 24
#     mins = inttime % 100
#     return datetime.time(hours, mins)
# # Should write tests here
# int_to_time(100), int_to_time(1133), int_to_time(2356), int_to_time(2405)

In [None]:
# df['DEP_TIME'] = df['DEP_TIME'].apply(int_to_time)
# df['CRS_ARR_TIME'] = df['CRS_ARR_TIME'].apply(int_to_time)
# df['ARR_TIME'] = df['ARR_TIME'].apply(int_to_time)

### New approach to convert data

We were definitely handling times whose hour part is 24 or more incorrectly. Instead of converting the times to `datetime.time` objects, let us convert them to `timedelta` objects, which makes them offsets from midnight on the takeoff date.

This also does not work.

In [None]:
# import datetime
# def int_to_timedelta(inttime):
#     hours = inttime // 100 * 60
#     mins = inttime % 100
#     return pd.to_timedelta(hours + mins, unit='m')
# # Should write tests here
# int_to_timedelta(100), int_to_timedelta(1133), int_to_timedelta(2356), int_to_timedelta(2405)

In [None]:
# df['CRS_DEP_TIME'] = int_to_timedelta(df['CRS_DEP_TIME'])
# df['DEP_TIME'] = int_to_timedelta(df['DEP_TIME'])
# df['CRS_ARR_TIME'] = int_to_timedelta(df['CRS_ARR_TIME'])
# df['ARR_TIME'] = int_to_timedelta(df['ARR_TIME'])

In [None]:
# df.head()

# Compute Delay

Delay is no longer the difference between two datetime objects, nor is it the difference between two timedeltas.

Let us just use the delay reported in the `ARR_DELAY` column directly.

In [None]:
# def row_to_delay(row):
#     row_date = row['FL_DATE']
#     planned_arrival = row['CRS_ARR_TIME']
#     arrival = row['ARR_TIME']
#     diff = datetime.datetime(row_date.year, row_date.month, row_date.day, 
#                              arrival.hour, arrival.minute) - \
#         datetime.datetime(row_date.year, row_date.month, row_date.day, 
#                           planned_arrival.hour, planned_arrival.minute)
#     return diff
# df['delay_minutes'] = df.apply(row_to_delay, 1).apply(lambda x: x.total_seconds() / 60)

In [None]:
# df['delay_minutes'] = ((df['ARR_TIME'] - df['CRS_ARR_TIME']) / np.timedelta64(1, 'm'))

In [None]:
# df['delay_minutes'] = ((df['CRS_DEP_TIME'] + df['CRS_ELAPSED_TIME']) - 
#     (df['DEP_TIME'] + df['ACTUAL_ELAPSED_TIME'])) / np.timedelta64(1, 'm')

In [None]:
# Take the delay straight from the ARR_DELAY column instead of deriving it from
# the time columns (the derivation attempts are commented out above).
# NOTE(review): assumes ARR_DELAY is already expressed in minutes -- confirm
# against the dataset documentation.
df['delay_minutes'] = df['ARR_DELAY']

In [None]:
# Preview the first five rows, now including the delay_minutes column.
df.head(n=5)

# Which rows have strange delays?

In [None]:
# Columns that help explain an unusually large delay.
cols = [
    'FL_DATE',
    'OP_CARRIER',
    'ORIGIN',
    'DEST',
    'CRS_DEP_TIME',
    'DEP_TIME',
    'CRS_ELAPSED_TIME',
    'ACTUAL_ELAPSED_TIME',
    'delay_minutes',
]
# Show the first few flights whose delay_minutes value exceeds 300.
df.loc[df['delay_minutes'].gt(300), cols].head(n=5)

# Let's save the result

In [None]:
# Only these columns are carried over into the preprocessed output file.
analysis_columns = [
    'FL_DATE',
    'OP_CARRIER',
    'ORIGIN',
    'DEST',
    'delay_minutes',
]
# Write the selection as CSV; with the to_csv defaults the DataFrame index is
# written too, as the first (unnamed) column.
df.loc[:, analysis_columns].to_csv(path_or_buf=output_path)