### Flight Linker

In [1]:
from __future__ import division, print_function # because we're running Python 2
import os                                       # to make file paths independent of the operating system
import numpy as np                              # scientific computing package
import pandas as pd                             # data analysis library
import time                                     # library to handle time
import re                                       # regular expressions
import datetime as dt                           # library to handle dates
import itertools                                # iteration functions
import pdb

In [2]:
def date_range(date1,date2):
    ''' Generator yielding each day from date1 up to and including date2.
        Yields nothing when date1 is later than date2.'''

    one_day = dt.timedelta(days=1)
    current = date1
    while current <= date2:
        yield current
        current = current + one_day
    
    
def make_datetime(row):
    ''' Combine a flight's date and clock time into a single datetime.

        row: mapping (e.g. a DataFrame row) providing
             'FLIGHT_DATE' - value the time offset is added to
             'TIME'        - clock time encoded as the number HHMM
                             (930 -> 09:30, 5 -> 00:05, 1415 -> 14:15)

        Returns row['FLIGHT_DATE'] plus the hours and minutes encoded in 'TIME'.
        Raises ValueError/TypeError if 'TIME' cannot be converted with int().'''

    # The last two decimal digits are the minutes, everything above them the hours.
    # (Avoids a local called `time`, which would shadow the imported module.)
    hour, minute = divmod(int(row['TIME']), 100)

    return row['FLIGHT_DATE'] + dt.timedelta(hours=hour, minutes=minute)


def select_dates(row):
    ''' Collect the operating dates of one service (row).

        Every day from row['START'] to row['END'] (both included) is checked;
        a day is kept when its weekday number, written as a string with
        '1' = Monday ... '7' = Sunday, is contained in row['FREQ'].
        Returns the kept dates as a list in chronological order.'''

    selected = []
    for day in date_range(row['START'], row['END']):
        # weekday() counts from Monday = 0, the frequency field from Monday = 1
        weekday_code = str(day.weekday() + 1)
        if weekday_code in row['FREQ']:
            selected.append(day)
    return selected

In [3]:
# To be replaced with user input
start_date_str = '08/04/2018'
end_date_str   = '08/04/2018'

# Parse the day/month/year strings into timestamps
input_format = '%d/%m/%Y'
start_date = pd.to_datetime(start_date_str, format=input_format)
end_date   = pd.to_datetime(end_date_str,   format=input_format)

# Build the path of the schedule file in an OS-independent way
data_path = os.path.join(os.getcwd(), 'data', '180312_ACL.csv')

# Read the ACL schedule
acl_original = pd.read_csv(data_path, sep=',')

service_count = len(acl_original) - 1
print("There are %s services in this file" % service_count)
print("Cleaning the file...")

# Drop the first data row (per the original note it repeats the column names),
# renumber the rows, and discard the columns that are not used further on
unused_columns = ['REQ','CLEAR','TERM','WKY','REC','L/N','T/R','LAST']
acl_services = (acl_original.drop(0)
                            .reset_index(drop=True)
                            .drop(unused_columns, axis=1))

# Parse the START and END columns into datetimes
for date_column in ('START', 'END'):
    acl_services[date_column] = pd.to_datetime(acl_services[date_column])

# Turn the frequency text into a list of its digits (one per operating weekday)
digit_pattern = re.compile(r'\d')
acl_services['FREQ'] = [digit_pattern.findall(freq_text) for freq_text in acl_services['FREQ']]

# Insert the operator, taken as the first three characters of the service code
operators = [service_code[:3] for service_code in acl_services['SERVICE']]
acl_services.insert(1, 'OPERATOR', operators)

There are 6045 services in this file
Cleaning the file...


In [4]:
# Initialise lists
flight_dates, number_dates = [], []

print("Working out all the flight dates of each service...")

# For every service (row), work out the list of dates on which it operates
flight_dates = acl_services.apply(select_dates, axis=1)

# Record how many dates each service operates on in a new column
acl_services['DATE_COUNT'] = flight_dates.apply(len)

Working out all the flight dates of each service...


In [12]:
print("Repeating each service as many times as dates it flies...")

# Build one row per individual flight: each service's index label is repeated
# DATE_COUNT times and the services table is re-indexed with those labels
repeated_labels = np.repeat(acl_services.index.values, acl_services['DATE_COUNT'])
acl_flights = acl_services.loc[repeated_labels]

# Switch off pandas' chained-assignment warning (applies to the whole session)
pd.options.mode.chained_assignment = None

# Flatten the per-service date lists into one list and attach it as the flight date
acl_flights['FLIGHT_DATE'] = list(itertools.chain.from_iterable(flight_dates))

# The per-service scheduling columns are no longer needed
acl_flights = acl_flights.drop(['FREQ','DATE_COUNT','START','END'], axis=1)

Repeating each service as many times as dates it flies...


In [13]:
# Split the flights by the arrival/departure flag in column 'A/D'
is_departure = acl_flights['A/D'] == 'D'
is_arrival   = acl_flights['A/D'] == 'A'

departures = acl_flights[is_departure]
arrivals   = acl_flights[is_arrival]

In [14]:
# Discard flights that certainly cannot matter for the stand plan window:
# arrivals are kept from 15 days before the start up to the end date,
# departures from the start date up to 15 days after the end
window_margin    = dt.timedelta(days=15)
earliest_arrival = start_date - window_margin
latest_departure = end_date + window_margin

arrivals   = arrivals[arrivals['FLIGHT_DATE'].between(earliest_arrival, end_date)]
departures = departures[departures['FLIGHT_DATE'].between(start_date, latest_departure)]

In [15]:
print("Cleaning dates and times and adding auxiliary fields...")

# Reset indices so both frames are labelled 0..n-1 after the date filtering.
# Re-binding the names (rather than inplace=True) gives fresh frames, so the
# column assignments below do not act on a slice of acl_flights.
departures = departures.reset_index(drop=True)
arrivals   = arrivals.reset_index(drop=True)

# Create a column with date and time combining FLIGHT_DATE and TIME
arrivals  ['DATETIME'] = arrivals.apply  (make_datetime, axis=1)
departures['DATETIME'] = departures.apply(make_datetime, axis=1)

# Add a unique ID ('D' + row number) to the departures dataframe only;
# arrivals refer to it through their LINK field.
# np.char is the public name of the string routines; np.core is a private path.
departures['ID'] = np.char.add('D', departures.index.values.astype(str))

# Initialise 'LINK' field for connecting arrivals with departures:
#   arrivals['LINK']   - ID of the linked departure, '' while unlinked
#   departures['LINK'] - 1 once an arrival has been linked to it, else 0
arrivals  ['LINK'] = ''
departures['LINK'] = 0

Cleaning dates and times and adding auxiliary fields...


In [16]:
# Put both tables in chronological order and renumber their rows from 0
arrivals   = arrivals.sort_values('DATETIME').reset_index(drop=True)
departures = departures.sort_values('DATETIME').reset_index(drop=True)

In [None]:
# LINKING FLIGHTS: STEP 1:
# ------------------------

# Find linking flight for departures IN stand plan week
# only for flights with turnaround information

# for dep_idx, dep in departures.iterrows():
#     # If the turnaround field is not empty and if the flight departs inside the stand plan window
#     if (dep['TROUND'] != ' '*8) & (dep['FLIGHT_DATE'] <= end_date):
        
#         # Subset of arrivals that arrive before the time of departure and whose flight number
#         # matches the turnaround flight for this arrival
#         subset = arrivals[(arrivals['SERVICE'] == dep['TROUND']) &
#                           (arrivals['DATETIME'] < dep['DATETIME'])]
        
#         # Get the index of the arriving flight with the latest time
#         arr_idx = subset['DATETIME'].idxmax()
#         departures.loc[dep_idx,'LINK'] = 1
#         arrivals.loc  [arr_idx,'LINK'] = departures.iloc[dep_idx]['ID']

# Link arrivals that carry turnaround information: each such arrival is paired
# with the earliest later departure whose SERVICE equals the arrival's TROUND.
#
# NOTE(review): candidate departures are not restricted to LINK == 0 here (the
# first in - first out step does restrict them), so several arrivals can end up
# pointing at the same departure -- confirm whether that is intended.

blank_tround = ' '*8    # TROUND value of a flight without turnaround information

for arr_idx, arr in arrivals.iterrows():
    # Only arrivals that name a turnaround flight and are not linked yet.
    # arrivals['LINK'] holds '' or a departure ID (never 1), so test against ''.
    if arr['TROUND'] != blank_tround and arr['LINK'] == '':

        # Departures that leave after this arrival and whose flight number
        # matches the turnaround flight of this arrival
        subset = departures[(departures['SERVICE'] == arr['TROUND']) &
                            (departures['DATETIME'] > arr['DATETIME'])]

        if not subset.empty:
            # idxmin() returns the index *label* of the earliest departure,
            # so look the row up with .loc (label-based), not .iloc (positional)
            dep_idx = subset['DATETIME'].idxmin()

            arrivals.loc  [arr_idx,'LINK'] = departures.loc[dep_idx,'ID']
            departures.loc[dep_idx,'LINK'] = 1

In [None]:
# Display the arrivals table to inspect the LINK values assigned so far
arrivals

In [None]:
# LINKING FLIGHTS: STEP 2:
# ------------------------

# Find linking flight for departures IN stand plan week
# without turnaround information: first in - first out

# for dep_idx, dep in departures.iterrows():
    
#     # If the turnaround field is not empty and if the flight departs inside the stand plan window
#     if (dep['LINK'] == 0) & (dep['FLIGHT_DATE'] <= end_date):
#         # Subset of arrivals candidates
#         subset = arrivals[(arrivals['AC']       == dep['AC'])    &    # must have the same aircraft
#                           (arrivals['OPERATOR'] == dep['OPERATOR']) & # operated by the same airline                           
#                           (arrivals['DATETIME'] <= dep['DATETIME'] - dt.timedelta(minutes=20)) &
#                           (arrivals['LINK']     == '')]               # cannot be linked to another dep
        
#         #if dep['OPERATOR'] == 'BA ': pdb.set_trace()
        
#         # Get the index of the arriving flight with the latest time
#         arr_idx = subset['DATETIME'].idxmax()
#         departures.loc[dep_idx,'LINK'] = 1
#         arrivals.loc  [arr_idx,'LINK'] = departures.iloc[dep_idx]['ID']

In [None]:
# First in - first out linking for arrivals that are still unlinked: each one
# takes the earliest still-free departure of the same aircraft and operator
# that leaves at least 20 minutes after the arrival.

min_turnaround = dt.timedelta(minutes=20)

for arr_idx, arr in arrivals.iterrows():

    # Skip arrivals that already have a linked departure
    if arr['LINK'] != '':
        continue

    # Candidate departures for this arrival
    subset = departures[(departures['AC']       == arr['AC'])       &   # must have the same aircraft
                        (departures['OPERATOR'] == arr['OPERATOR']) &   # operated by the same airline
                        (departures['DATETIME'] >= arr['DATETIME'] + min_turnaround) &
                        (departures['LINK']     == 0)]                  # not linked to another arrival

    # No candidate: leave the arrival unlinked instead of failing in idxmin()
    if subset.empty:
        continue

    # idxmin() returns the index *label* of the earliest candidate departure,
    # so look the row up with .loc (label-based), not .iloc (positional)
    dep_idx = subset['DATETIME'].idxmin()
    arrivals.loc  [arr_idx,'LINK'] = departures.loc[dep_idx,'ID']
    departures.loc[dep_idx,'LINK'] = 1

In [None]:
# Display the departures table to inspect which departures have been linked (LINK == 1)
departures

In [None]:
# TIME and FLIGHT_DATE are already combined in DATETIME, so drop them.
# errors='ignore' keeps this cell re-runnable once the columns are gone.
arrivals.drop  (['TIME','FLIGHT_DATE'], axis=1, inplace=True, errors='ignore')
departures.drop(['TIME','FLIGHT_DATE'], axis=1, inplace=True, errors='ignore')

# Perform inner join of departures and arrivals: only arrivals whose LINK
# matches a departure ID survive. Shared column names get the suffixes
# _x (arrival side) and _y (departure side).
linked = arrivals.merge(departures, left_on='LINK', right_on='ID', how='inner')
linked.drop(['LINK_x'], axis=1, inplace=True)

# Turnaround length in whole minutes. total_seconds() covers the full
# difference; timedelta.seconds would drop the days part and under-report
# any turnaround of 24 hours or more.
turnaround = linked['DATETIME_y'] - linked['DATETIME_x']
linked['TURN_MINUTES'] = (turnaround.dt.total_seconds() // 60).astype(int)

In [None]:
# Write the three tables to the 'out' folder (relative to the working directory),
# creating the folder first so the export does not fail on a fresh checkout
out_dir = 'out'
if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# os.path.join keeps the paths independent of the operating system
arrivals.to_csv(os.path.join(out_dir, 'arrivals.csv'))
departures.to_csv(os.path.join(out_dir, 'departures.csv'))
linked.to_csv(os.path.join(out_dir, 'inner_turns_for_checks.csv'))

In [None]:
# Build a trimmed copy of the linked table without the listed columns;
# `linked` itself is left untouched
redundant_columns = ['A/D_x','A/D_y','AC_y','TROUND_y','ID','LINK_y','OPERATOR_y']
linked_clean = linked.drop(redundant_columns, axis=1)

In [None]:
# Preview the first rows of the trimmed linked-turns table
linked_clean.head()