<h3> <font face="arial" size =4 color="green"> The dataset below looks like a classic time series data, while it is a time series data giving a pure time series treatment won't help in certain cases. </font> </h3>

In [2]:
'''

## Generate synthetic arrivals data
## Arrivals Information

    DOA (TS)
    # of Flights
    # of Non-transit (NT) passengers
    Terminal id
    # People in the Taxi-Queue
    # Taxis availed
'''


import datetime
import random

COMMA = ','

class SyntheticArrivals():
    terminals = 0
    startdate = None
    enddate = None
    city = ''

    def __init__(self, terminals,city, startdate, enddate=datetime.datetime.now()):
        self.startdate = startdate
        self.enddate = enddate
        self.terminals = terminals
        self.city = city

    def getRandom(self, start_range, end_range):
        return random.randint(start_range, end_range)

    #def isHoliday(current_date):

    def generateData(self):

        next_date_str = datetime.datetime.strftime(self.startdate,'%Y-%m-%d %H:%M:%S')
        next_date = self.startdate
        csv_data = []

        while(True):
            number_of_flights = self.getRandom(1,20)
            number_of_nt_passengers = sum([ i * self.getRandom(350, 853) for i in range (1, number_of_flights + 1)]) # based on A340 (350) - A380 (853) capacity
            terms = ["T"+str(i) for i in range(1,self.terminals+1)]
            #print terms
            terminal = self.getRandom(1, self.terminals)
            #print terminal
            
            # Grave shift for public transport varies for each country so plugin meaningful values here
            if next_date.hour >= 01 and next_date.hour <= 05:
                taxi_share_pct = 0.95
            else:
                taxi_share_pct = .30
                
            # Roughly 30% taxi-share; heavily depends on time of arrival and city    
            # Every city's public transportation system shutdown beyond midnight until early morning. Taxi-share during those hours can spike.
            # For Bali, Indonesia or HCMC Vietnam taxi-share would be higher because there is no public transportation
            
            number_of_people_in_queue = int(number_of_nt_passengers * taxi_share_pct) 
            number_of_taxis = int(number_of_people_in_queue * 0.90)  # sometimes a queue may have a family of 4 that takes a single taxi so there NO 1:1 mapping
            csv_data.append(next_date_str + COMMA + str(number_of_flights) + COMMA + str(number_of_nt_passengers) + COMMA + terms[terminal-1] + COMMA + str(number_of_people_in_queue) + COMMA + str(number_of_taxis))

            #print csv_record
            next_date = next_date + datetime.timedelta(hours=1)
            next_date_str = datetime.datetime.strftime(next_date,'%Y-%m-%d %H:%M:%S')

            if next_date >= self.enddate:
                break
        return csv_data       

sa = SyntheticArrivals(4,'US', datetime.datetime.now() - datetime.timedelta(days=365), datetime.datetime.now())
csv_data =  sa.generateData()

import  pandas as pd
arrivalsdf = pd.DataFrame([record.split(",") for record in csv_data])
arrivalsdf.columns = ['Date of Arrival', 'No of Flights', 'No of NT Passengers', 'Terminal', 'People in queue', 'Taxis in queue']
arrivalsdf['Date of Arrival'] = pd.to_datetime(arrivalsdf['Date of Arrival']) #.astype('datetime64')
arrivalsdf.head(10)


Unnamed: 0,Date of Arrival,No of Flights,No of NT Passengers,Terminal,People in queue,Taxis in queue
0,2017-11-02 11:57:15,15,68742,T2,20622,18559
1,2017-11-02 12:57:15,10,36865,T1,11059,9953
2,2017-11-02 13:57:15,13,51825,T2,15547,13992
3,2017-11-02 14:57:15,16,75476,T1,22642,20377
4,2017-11-02 15:57:15,9,30710,T1,9213,8291
5,2017-11-02 16:57:15,7,14579,T2,4373,3935
6,2017-11-02 17:57:15,5,8850,T2,2655,2389
7,2017-11-02 18:57:15,8,21034,T3,6310,5679
8,2017-11-02 19:57:15,15,74597,T4,22379,20141
9,2017-11-02 20:57:15,14,67533,T4,20259,18233


In [3]:
# Causal Factor Matrix

# Day of week
# Day of month
# Day of year
# Week of year
# Month of year
# Quarter of year
# Hour of the day
# Terminal
# Holidays specific to the Geography
# Chinese New year
# Christmas
# Good friday
# Weather temp, rainy/storm



arrivalsdf['dow'] = arrivalsdf['Date of Arrival'].dt.weekday
arrivalsdf['dom'] = arrivalsdf['Date of Arrival'].dt.day
arrivalsdf['doy'] = arrivalsdf['Date of Arrival'].dt.dayofyear
arrivalsdf['woy'] = arrivalsdf['Date of Arrival'].dt.week
arrivalsdf['moy'] = arrivalsdf['Date of Arrival'].dt.month
arrivalsdf['qoy'] = arrivalsdf['Date of Arrival'].dt.quarter
arrivalsdf['hod'] = arrivalsdf['Date of Arrival'].dt.hour

from datetime import date
import holidays


# This works with any country , lets try and extract features for US.
declared_holidays = {}
for date, name in sorted(holidays.US(years=2018).items()):
    declared_holidays[date] =  name
    
# Let's add a couple of custom holidays which are not in the holidays package
custom_holidays = holidays.HolidayBase()
custom_holidays.append({datetime.date(2018, 2, 16): "Chinese New Year's Day"})
declared_holidays.update(custom_holidays)


def get_holidays(row):
    #print type(row['Date of Arrival'])
    d = datetime.datetime.date(row['Date of Arrival'])
    if d in declared_holidays:
        return declared_holidays[d]
    else:
        return "-"

arrivalsdf['holidays'] = arrivalsdf.apply(get_holidays, axis=1)





In [4]:
holidays_df =  pd.get_dummies(arrivalsdf['holidays'])
holidays_df.drop("-",axis = 1, inplace=True)
holidays_df.columns = ["is_"+col for col in holidays_df.columns]
arrivalsdf = pd.concat([arrivalsdf, holidays_df], axis=1)
arrivalsdf.drop("holidays",axis = 1, inplace=True)

In [5]:
arrivalsdf.head()

Unnamed: 0,Date of Arrival,No of Flights,No of NT Passengers,Terminal,People in queue,Taxis in queue,dow,dom,doy,woy,...,qoy,hod,is_Chinese New Year's Day,is_Columbus Day,is_Independence Day,is_Labor Day,"is_Martin Luther King, Jr. Day",is_Memorial Day,is_New Year's Day,is_Washington's Birthday
0,2017-11-02 11:57:15,15,68742,T2,20622,18559,3,2,306,44,...,4,11,0,0,0,0,0,0,0,0
1,2017-11-02 12:57:15,10,36865,T1,11059,9953,3,2,306,44,...,4,12,0,0,0,0,0,0,0,0
2,2017-11-02 13:57:15,13,51825,T2,15547,13992,3,2,306,44,...,4,13,0,0,0,0,0,0,0,0
3,2017-11-02 14:57:15,16,75476,T1,22642,20377,3,2,306,44,...,4,14,0,0,0,0,0,0,0,0
4,2017-11-02 15:57:15,9,30710,T1,9213,8291,3,2,306,44,...,4,15,0,0,0,0,0,0,0,0
