In [1]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import math

## Dataset Generation - MultiFlight Version

We don't have a single data set containing all the information we need for the algorithm so we will generate a dataset that fits our needs based on real data.<br>The following datasets appear to have matching observations based on the ItineraryID column, but none contain all the information we need. <br><br>Ticket has a distance, FarePerMile and Itinerary fare per flight, however it does not include the destinations for some reason, while the coupon dataset does.<br>Similarly the time dataset contains the appropriate delay and time data for flights from point A and point B. Note that it had 120 columns with a bunch of useless columns and empty data so I pruned it manually when I had a quick look. <br><br> Using all this we will create a single data set simulating all the data required to model the problem for our algorithm.<br><br>

This particular notebook generates a similar dataset to the averaged version, but instead of averaging out the price and all times associated with all the individual flights between an origin and a destination airport, effectively creating a single flight between these airports, we take up to 10 flights per day and associate a random set of times and delays to each price in the data set.

In [2]:
ticket = pd.read_csv('Data/ticket_2019_1/ticket_2019_1.csv')
coupon = pd.read_csv('Data/coupon_2019_1/coupon_2019_1.csv')
time = pd.read_csv('Data/time_2019_1/time_2019_1.csv')

Drop columns we don't need. Note all data points are from the US.

In [3]:
ticket = ticket.drop(
    columns=[
        'Coupons',
        'Year',
        'Quarter',
        'OriginAirportID',
        'OriginAirportSeqID',
        'OriginCityMarketID',
        'OriginStateFips',
        'OriginStateName',
        'OriginState',
        'OriginCountry',
        'OriginWac',
        'DollarCred',
        'Passengers',
        'BulkFare',
        'DistanceGroup',
        'ItinGeoType',
        'RPCarrier',
        'OnLine',
        'MilesFlown',
        'FarePerMile',
        'Distance'
    ]
)

coupon = coupon.drop(
    columns=[
        'MktID',
        'SeqNum',
        'Coupons',
        'OriginAirportID',
        'OriginAirportSeqID',
        'OriginCityMarketID',
        'Quarter',
        'OriginCountry',
        'OriginStateFips',
        'OriginWac',
        'DestAirportID',
        'DestAirportSeqID',
        'DestCityMarketID',
        'DestCountry',
        'DestStateFips',
        'DestWac',
        'CouponType',
        'Passengers',
        'FareClass',
        'Gateway',
        'ItinGeoType',
        'RPCarrier',
        'CouponGeoType',
        'TkCarrier',
        'OpCarrier',
        'DistanceGroup',
        'Year',
        'Break',
        'OriginStateName',
        'OriginState',
        'DestStateName',
        'DestState',
        'Distance'
    ]
)


In [4]:
coupon.head()

Unnamed: 0,ItinID,Origin,Dest
0,201915.0,ABE,ATL
1,201916.0,ABE,ATL
2,201917.0,ABE,ATL
3,201919.0,ABE,ATL
4,2019130.0,ABE,ATL


In [5]:
ticket.head()

Unnamed: 0,ItinID,Origin,RoundTrip,ItinFare
0,201914.0,ABE,0,203
1,201915.0,ABE,0,211
2,201916.0,ABE,0,241
3,201917.0,ABE,0,266
4,201919.0,ABE,0,314


We'll remove the round trips, since we're only interested in flights from A to B.

In [6]:
print("Percent of Round Trips : {:.2f}%".format((len(ticket.loc[ticket['RoundTrip'] == 1]) / len(ticket['RoundTrip']) * 100), 2))

Percent of Round Trips : 50.01%


In [7]:
ticket = ticket[ticket['RoundTrip']==0]
ticket = ticket.drop(columns=['RoundTrip'])
df = pd.merge(ticket, coupon, on=['ItinID', 'Origin']) 
df

Unnamed: 0,ItinID,Origin,ItinFare,Dest
0,2.019150e+05,ABE,211,ATL
1,2.019160e+05,ABE,241,ATL
2,2.019170e+05,ABE,266,ATL
3,2.019190e+05,ABE,314,ATL
4,2.019191e+06,ABE,148,ATL
...,...,...,...,...
53997,2.019200e+10,SFO,142,LAX
53998,2.019200e+10,SFO,262,LAX
53999,2.019200e+10,SFO,339,LAX
54000,2.019200e+10,SFO,107,LAX


In [8]:
df = df.drop(columns=['ItinID'])
df.head()

Unnamed: 0,Origin,ItinFare,Dest
0,ABE,211,ATL
1,ABE,241,ATL
2,ABE,266,ATL
3,ABE,314,ATL
4,ABE,148,ATL


In [9]:
df

Unnamed: 0,Origin,ItinFare,Dest
0,ABE,211,ATL
1,ABE,241,ATL
2,ABE,266,ATL
3,ABE,314,ATL
4,ABE,148,ATL
...,...,...,...
53997,SFO,142,LAX
53998,SFO,262,LAX
53999,SFO,339,LAX
54000,SFO,107,LAX


In [10]:
df = df[df['ItinFare'] < 10000]
df = df[df['ItinFare'] > 50]
df.head

<bound method NDFrame.head of       Origin  ItinFare Dest
0        ABE       211  ATL
1        ABE       241  ATL
2        ABE       266  ATL
3        ABE       314  ATL
4        ABE       148  ATL
...      ...       ...  ...
53997    SFO       142  LAX
53998    SFO       262  LAX
53999    SFO       339  LAX
54000    SFO       107  LAX
54001    SFO      1980  DFW

[53407 rows x 3 columns]>

Now to add the time data into it.

In [11]:
time

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,SBP,SFO,1353.0,-7.0,1444.0,-26.0,51.0
1,IAH,XNA,930.0,-5.0,1119.0,1.0,109.0
2,SGF,IAH,637.0,-6.0,838.0,-17.0,121.0
3,ISN,DEN,1314.0,-21.0,1404.0,-29.0,110.0
4,MKE,DEN,826.0,-10.0,1009.0,-31.0,163.0
...,...,...,...,...,...,...,...
638644,MEM,IAH,634.0,-6.0,853.0,13.0,139.0
638645,MEM,IAH,631.0,-9.0,830.0,-10.0,119.0
638646,MEM,IAH,632.0,-8.0,828.0,-12.0,116.0
638647,MEM,IAH,630.0,-10.0,831.0,-9.0,121.0


In [12]:
time.sample()[['DepTime', 'DepDelay', 'ArrTime', 'ArrTime', 'ArrDelay', 'ActualElapsedTime']]

Unnamed: 0,DepTime,DepDelay,ArrTime,ArrTime.1,ArrDelay,ActualElapsedTime
625269,1837.0,-15.0,2022.0,2022.0,-18.0,105.0


**NOTE:** This is one place where we can change input size as well, since it limits the maximum number of flights between Origin and Destination pairs.

In [13]:
time = time.groupby(['Origin', 'Dest']).head(15).reset_index(drop=True)

In [14]:
time

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,SBP,SFO,1353.0,-7.0,1444.0,-26.0,51.0
1,IAH,XNA,930.0,-5.0,1119.0,1.0,109.0
2,SGF,IAH,637.0,-6.0,838.0,-17.0,121.0
3,ISN,DEN,1314.0,-21.0,1404.0,-29.0,110.0
4,MKE,DEN,826.0,-10.0,1009.0,-31.0,163.0
...,...,...,...,...,...,...,...
80109,CAE,IAH,641.0,-4.0,833.0,-13.0,172.0
80110,CAE,IAH,641.0,-4.0,829.0,-17.0,168.0
80111,CAE,IAH,641.0,-4.0,822.0,-24.0,161.0
80112,CAE,IAH,641.0,-4.0,834.0,-12.0,173.0


We'll assign a random price from the itinerary fare dataset to each flight. Takes a while, there's probably a more efficient way to do this out there.

In [15]:
time['ItinFare'] = np.zeros(len(time))

for index, row in time.iterrows():
    chosen_idx = np.random.choice(len(df), replace = True, size = 1) 
    val = df.iloc[chosen_idx]["ItinFare"].values[0]
    time.at[index,'ItinFare'] = val

time

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime,ItinFare
0,SBP,SFO,1353.0,-7.0,1444.0,-26.0,51.0,229.0
1,IAH,XNA,930.0,-5.0,1119.0,1.0,109.0,116.0
2,SGF,IAH,637.0,-6.0,838.0,-17.0,121.0,64.0
3,ISN,DEN,1314.0,-21.0,1404.0,-29.0,110.0,219.0
4,MKE,DEN,826.0,-10.0,1009.0,-31.0,163.0,139.0
...,...,...,...,...,...,...,...,...
80109,CAE,IAH,641.0,-4.0,833.0,-13.0,172.0,107.0
80110,CAE,IAH,641.0,-4.0,829.0,-17.0,168.0,107.0
80111,CAE,IAH,641.0,-4.0,822.0,-24.0,161.0,167.0
80112,CAE,IAH,641.0,-4.0,834.0,-12.0,173.0,93.0


Just to make working with the data easier I'll convert the times to actual time datatype. 
***NOTE:*** this actually made it harder lol - has been removed in latest dataset used in algorithm 1.

In [16]:
time.dropna(inplace=True)
df = time

In [17]:
"""
df['DepTime'] = df['DepTime'].astype(int)
df['DepDelay'] = df['DepDelay'].astype(int)
df['ArrTime'] = df['ArrTime'].astype(int)
df['DepDelay'] = df['DepDelay'].astype(int)
df['ArrDelay'] = df['ArrDelay'].astype(int)
df['ActualElapsedTime'] = df['ActualElapsedTime'].astype(int)

def help(time):
    if time >= 2400:
        time -= 100
    if time < 100:
        time = 100
        
    str_time = str(time)
    
    if len(str_time) < 4:
        hours = int(str_time[0]) 
        mins = int(str_time[1:3])
        if(hours > 23): hours = 23
        if(hours < 0): hours = 1
        if(mins > 59): mins = 59
        if(mins < 0): hours = 1
        return dt.time(hour=hours, minute=mins)
    else:
        hours = int(str_time[0:2]) 
        mins = int(str_time[2:4])
        if(hours > 23): hours = 23
        if(hours < 0): hours = 1
        if(mins > 59): mins = 59
        if(mins < 0): hours = 1
        return dt.time(hour=hours, minute=mins)

def converTime(df, col):
    return pd.Series(
        [ help(time) for time in df[col] ]
    )
"""

In [18]:
"""
df['DepTime'] = converTime(df, 'DepTime')
df['ArrTime'] = converTime(df, 'ArrTime')
"""
df.dropna(inplace=True)

In [20]:
import os
df.to_csv(os.getcwd() + '\\Data\\processed\\multiFlightData.csv', index=False)

Creating a seperate csv that includes longitude and lattitude data for visualization

In [21]:
import chardet
with open('Data/OpenFlights/airports.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large
    
airports = pd.read_csv('Data/OpenFlights/airports.csv', encoding=result['encoding'])

In [22]:
# aiports = airports[airports['Country']=='United States']
airports.drop(airports.loc[airports['Country']!='United States'].index, inplace=True)
airports.reset_index(drop=True)
aiports = airports.drop(columns=['Name', 'City', 'ICAO', 'Country'])

In [23]:
locations = df.merge(airports,on=['Origin'])
locations = locations.dropna()
locations = locations.drop(columns=['Dest', 'ItinFare', 'DepDelay', 'ArrDelay', 'ActualElapsedTime', 'Name', 'City', 'Country', 'ICAO', 'DepTime', 'ArrTime'])


In [24]:
locations = locations[(locations['Origin'].isin(df['Origin']))|(locations['Origin'].isin(df['Dest']))]
locations = locations.drop_duplicates().reset_index(drop=True)
locations.to_csv(os.getcwd() + '\\Data\\processed\\airportLocations.csv', index=False)

In [25]:
df = df[(df['Origin'].isin(locations['Origin']))&(df['Dest'].isin(locations['Origin']))]
df.dropna(inplace=True)
df.to_csv(os.getcwd() + '\\Data\\processed\\multiFlightData.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [26]:
df

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime,ItinFare
0,SBP,SFO,1353,-7,1444,-26,51,229.0
1,IAH,XNA,930,-5,1119,1,109,116.0
2,SGF,IAH,637,-6,838,-17,121,64.0
3,ISN,DEN,1314,-21,1404,-29,110,219.0
4,MKE,DEN,826,-10,1009,-31,163,139.0
...,...,...,...,...,...,...,...,...
80109,CAE,IAH,641,-4,833,-13,172,107.0
80110,CAE,IAH,641,-4,829,-17,168,107.0
80111,CAE,IAH,641,-4,822,-24,161,167.0
80112,CAE,IAH,641,-4,834,-12,173,93.0
