In [44]:
import numpy as np
import scipy
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Apply seaborn's default plot styling to figures drawn later in the notebook.
sns.set()

# Set Papermill Variables

In [45]:
# Default file locations for this stage. The section heading calls these
# Papermill variables, so presumably this cell is tagged `parameters` and the
# values can be overridden at execution time -- TODO confirm the cell tag.
# CSV loaded by the "Read the data" cell.
input_path = "../data/output/2019-01-flights-cleaned.csv"
# CSV written by the final cell of the notebook.
output_path = "../data/output/2019-01-flights-delays.csv"

# Read the data

In [50]:
# Load the flights CSV and parse FL_DATE into real timestamps in one pass,
# then preview the first rows.
df = (
    pd.read_csv(input_path)
    .assign(FL_DATE=lambda frame: pd.to_datetime(frame['FL_DATE']))
)
df.head()

Unnamed: 0.1,Unnamed: 0,FL_DATE,CRS_ARR_TIME,ARR_TIME
0,238,2019-01-01,1250,1301
1,412,2019-01-01,1854,1854
2,481,2019-01-01,1503,1457
3,532,2019-01-01,2259,2349
4,657,2019-01-01,2015,2036


# Compute Delay

### Attempt 1: Convert (int) time to a time object and compute delay from that

In [48]:
import datetime
def int_to_time(inttime):
    """Convert an integer clock reading written as HHMM into a ``datetime.time``.

    The last two decimal digits are the minutes and the remaining leading
    digits are the hours. Hours are reduced modulo 24, so ``2400`` becomes
    ``00:00``.

    Args:
        inttime: Integer such as ``1301`` (meaning 13:01).

    Returns:
        datetime.time: The corresponding time of day (seconds are zero).

    Raises:
        ValueError: If the minutes part is not in ``0..59`` (raised by
            ``datetime.time``).
    """
    hour_part, minute_part = divmod(inttime, 100)
    return datetime.time(hour_part % 24, minute_part)


def row_to_datetime(col):
    """Build a row-wise converter that merges ``FL_DATE`` with a time column.

    Args:
        col: Name of the column holding a time-of-day value (an object with
            ``hour`` and ``minute`` attributes).

    Returns:
        A function taking one row (anything indexable by column name) and
        returning a ``datetime.datetime`` made from the year/month/day of
        ``row['FL_DATE']`` and the hour/minute of ``row[col]``. Seconds and
        smaller units are always zero.
    """
    def combine_date_and_clock(row):
        flight_date, clock = row['FL_DATE'], row[col]
        return datetime.datetime(
            year=flight_date.year,
            month=flight_date.month,
            day=flight_date.day,
            hour=clock.hour,
            minute=clock.minute,
        )

    return combine_date_and_clock

# Replace the integer HHMM columns with datetime.time values.
# NOTE: this overwrites the columns in place, so the cell cannot simply be
# re-run: int_to_time does integer arithmetic and fails on the datetime.time
# values left behind by a previous run. Re-run the "Read the data" cell first.
df['CRS_ARR_TIME'] = df['CRS_ARR_TIME'].apply(int_to_time)
df['ARR_TIME'] = df['ARR_TIME'].apply(int_to_time)

# Attach both clock times to the row's FL_DATE so they can be subtracted.
arr_dt = df.apply(row_to_datetime('ARR_TIME'), axis=1)
crs_arr_dt = df.apply(row_to_datetime('CRS_ARR_TIME'), axis=1)
raw_delay_minutes = (arr_dt - crs_arr_dt) / np.timedelta64(1, 'm')

# Both timestamps were given the same calendar date, so an arrival that crosses
# midnight is off by a whole day (scheduled 23:50, actual 00:10 -> -1420 rather
# than +20; scheduled 00:10, actual 23:55 -> +1425 rather than -15). Wrap the
# difference into [-720, 720) minutes to recover the intended value.
# Assumption: no flight is more than 12 hours early or late; such a flight
# cannot be told apart from a midnight crossing using these two columns alone.
minutes_per_day = 24 * 60
half_day = minutes_per_day / 2
df['delay_minutes'] = (raw_delay_minutes + half_day) % minutes_per_day - half_day

### Attempt 2: Convert to timedelta

Instead of converting the times to `datetime.time` objects, let us convert them to `timedelta` objects, so that each time becomes an offset from midnight on the takeoff date (`FL_DATE`).

In [51]:
# import datetime
# def int_to_timedelta(inttime):
#     hours = inttime // 100 * 60
#     mins = inttime % 100
#     return pd.to_timedelta(hours + mins, unit='m')
# df['CRS_ARR_TIME'] = int_to_timedelta(df['CRS_ARR_TIME'])
# df['ARR_TIME'] = int_to_timedelta(df['ARR_TIME'])
# df['delay_minutes'] = ((df['ARR_TIME'] - df['CRS_ARR_TIME']) / np.timedelta64(1, 'm'))

### Attempt 3: Use the delay from BTS

In [None]:
# df['delay_minutes'] = df['ARR_DELAY']

In [49]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,FL_DATE,CRS_ARR_TIME,ARR_TIME,delay_minutes
0,238,2019-01-01,12:50:00,13:01:00,11.0
1,412,2019-01-01,18:54:00,18:54:00,0.0
2,481,2019-01-01,15:03:00,14:57:00,-6.0
3,532,2019-01-01,22:59:00,23:49:00,50.0
4,657,2019-01-01,20:15:00,20:36:00,21.0


# Let's save the result

In [None]:
# Keep only the flight date and the computed delay in the saved file.
analysis_columns = ['FL_DATE', 'delay_minutes']
# index=False: do not write the row index. The input file already carries
# "Unnamed: 0" and "Unnamed: 0.1" columns, which is what happens when an index
# is written by to_csv and then read back as data by read_csv; writing it again
# here would add one more such column for the next stage.
df[analysis_columns].to_csv(output_path, index=False)