In [112]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import math

## Dataset Generation - MultiFlight Version

We don't have a single data set containing all the information we need for the algorithm so we will generate a dataset that fits our needs based on real data.<br>The following datasets appear to have matching observations based on the ItineraryID column, but none contain all the information we need. <br><br>Ticket has a distance, FarePerMile and Itinerary fare per flight, however it does not include the destinations for some reason, while the coupon dataset does.<br>Similarly the time dataset contains the appropriate delay and time data for flights from point A and point B. Note that it had 120 columns with a bunch of useless columns and empty data so I pruned it manually when I had a quick look. <br><br> Using all this we will create a single data set simulating all the data required to model the problem for our algorithm.<br><br>

This particular notebook generates a similar dataset to the averaged version, but instead of averaging out the price and all times associated with all the individual flights between an origin and a destination airport, effectively creating a single flight between these airports, we take up to 10 flights per day and associate a random set of times and delays to each price in the data set.

In [113]:
ticket = pd.read_csv('Data/ticket_2019_1/ticket_2019_1.csv')
coupon = pd.read_csv('Data/coupon_2019_1/coupon_2019_1.csv')
time = pd.read_csv('Data/time_2019_1/time_2019_1.csv')

Drop columns we don't need. Note all data points are from the US.

In [114]:
ticket = ticket.drop(
    columns=[
        'Coupons',
        'Year',
        'Quarter',
        'OriginAirportID',
        'OriginAirportSeqID',
        'OriginCityMarketID',
        'OriginStateFips',
        'OriginStateName',
        'OriginState',
        'OriginCountry',
        'OriginWac',
        'DollarCred',
        'Passengers',
        'BulkFare',
        'DistanceGroup',
        'ItinGeoType',
        'RPCarrier',
        'OnLine',
        'MilesFlown',
        'FarePerMile',
        'Distance'
    ]
)

coupon = coupon.drop(
    columns=[
        'MktID',
        'SeqNum',
        'Coupons',
        'OriginAirportID',
        'OriginAirportSeqID',
        'OriginCityMarketID',
        'Quarter',
        'OriginCountry',
        'OriginStateFips',
        'OriginWac',
        'DestAirportID',
        'DestAirportSeqID',
        'DestCityMarketID',
        'DestCountry',
        'DestStateFips',
        'DestWac',
        'CouponType',
        'Passengers',
        'FareClass',
        'Gateway',
        'ItinGeoType',
        'RPCarrier',
        'CouponGeoType',
        'TkCarrier',
        'OpCarrier',
        'DistanceGroup',
        'Year',
        'Break',
        'OriginStateName',
        'OriginState',
        'DestStateName',
        'DestState',
        'Distance'
    ]
)


In [115]:
coupon.head()

Unnamed: 0,ItinID,Origin,Dest
0,201915.0,ABE,ATL
1,201916.0,ABE,ATL
2,201917.0,ABE,ATL
3,201919.0,ABE,ATL
4,2019130.0,ABE,ATL


In [116]:
ticket.head()

Unnamed: 0,ItinID,Origin,RoundTrip,ItinFare
0,201914.0,ABE,0,203
1,201915.0,ABE,0,211
2,201916.0,ABE,0,241
3,201917.0,ABE,0,266
4,201919.0,ABE,0,314


We'll remove the round trips, since we're only interested in flights from A to B.

In [117]:
print("Percent of Round Trips : {:.2f}%".format((len(ticket.loc[ticket['RoundTrip'] == 1]) / len(ticket['RoundTrip']) * 100), 2))

Percent of Round Trips : 50.01%


In [118]:
ticket = ticket[ticket['RoundTrip']==0]
ticket = ticket.drop(columns=['RoundTrip'])
df = pd.merge(ticket, coupon, on=['ItinID', 'Origin']) 
df

Unnamed: 0,ItinID,Origin,ItinFare,Dest
0,2.019150e+05,ABE,211,ATL
1,2.019160e+05,ABE,241,ATL
2,2.019170e+05,ABE,266,ATL
3,2.019190e+05,ABE,314,ATL
4,2.019191e+06,ABE,148,ATL
...,...,...,...,...
53997,2.019200e+10,SFO,142,LAX
53998,2.019200e+10,SFO,262,LAX
53999,2.019200e+10,SFO,339,LAX
54000,2.019200e+10,SFO,107,LAX


In [119]:
df = df.drop(columns=['ItinID'])
df.head()

Unnamed: 0,Origin,ItinFare,Dest
0,ABE,211,ATL
1,ABE,241,ATL
2,ABE,266,ATL
3,ABE,314,ATL
4,ABE,148,ATL


In [120]:
df

Unnamed: 0,Origin,ItinFare,Dest
0,ABE,211,ATL
1,ABE,241,ATL
2,ABE,266,ATL
3,ABE,314,ATL
4,ABE,148,ATL
...,...,...,...
53997,SFO,142,LAX
53998,SFO,262,LAX
53999,SFO,339,LAX
54000,SFO,107,LAX


In [121]:
df = df[df['ItinFare'] < 10000]
df = df[df['ItinFare'] > 50]
df.head

<bound method NDFrame.head of       Origin  ItinFare Dest
0        ABE       211  ATL
1        ABE       241  ATL
2        ABE       266  ATL
3        ABE       314  ATL
4        ABE       148  ATL
...      ...       ...  ...
53997    SFO       142  LAX
53998    SFO       262  LAX
53999    SFO       339  LAX
54000    SFO       107  LAX
54001    SFO      1980  DFW

[53407 rows x 3 columns]>

Now to add the time data into it.

In [122]:
time

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,SBP,SFO,1353.0,-7.0,1444.0,-26.0,51.0
1,IAH,XNA,930.0,-5.0,1119.0,1.0,109.0
2,SGF,IAH,637.0,-6.0,838.0,-17.0,121.0
3,ISN,DEN,1314.0,-21.0,1404.0,-29.0,110.0
4,MKE,DEN,826.0,-10.0,1009.0,-31.0,163.0
...,...,...,...,...,...,...,...
638644,MEM,IAH,634.0,-6.0,853.0,13.0,139.0
638645,MEM,IAH,631.0,-9.0,830.0,-10.0,119.0
638646,MEM,IAH,632.0,-8.0,828.0,-12.0,116.0
638647,MEM,IAH,630.0,-10.0,831.0,-9.0,121.0


In [123]:
time.sample()[['DepTime', 'DepDelay', 'ArrTime', 'ArrTime', 'ArrDelay', 'ActualElapsedTime']]

Unnamed: 0,DepTime,DepDelay,ArrTime,ArrTime.1,ArrDelay,ActualElapsedTime
624472,1745.0,15.0,2030.0,2030.0,23.0,225.0


**NOTE:** This is one place where we can change input size as well, since it limits the maximum number of flights between Origin and Destination pairs.

In [125]:
time = time.groupby(['Origin', 'Dest']).head(2000).reset_index(drop=True)

In [126]:
time

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,SBP,SFO,1353.0,-7.0,1444.0,-26.0,51.0
1,IAH,XNA,930.0,-5.0,1119.0,1.0,109.0
2,SGF,IAH,637.0,-6.0,838.0,-17.0,121.0
3,ISN,DEN,1314.0,-21.0,1404.0,-29.0,110.0
4,MKE,DEN,826.0,-10.0,1009.0,-31.0,163.0
...,...,...,...,...,...,...,...
104313,CAE,IAH,644.0,-1.0,859.0,13.0,195.0
104314,CAE,IAH,641.0,-4.0,838.0,-8.0,177.0
104315,CAE,IAH,641.0,-4.0,901.0,15.0,200.0
104316,CAE,IAH,,,,,


We'll assign a random price from the itinerary fare dataset to each flight. Takes a while, there's probably a more efficient way to do this out there.

In [127]:
time['ItinFare'] = np.zeros(len(time))

for index, row in time.iterrows():
    chosen_idx = np.random.choice(len(df), replace = True, size = 1) 
    val = df.iloc[chosen_idx]["ItinFare"].values[0]
    time.at[index,'ItinFare'] = val

time

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime,ItinFare
0,SBP,SFO,1353.0,-7.0,1444.0,-26.0,51.0,163.0
1,IAH,XNA,930.0,-5.0,1119.0,1.0,109.0,325.0
2,SGF,IAH,637.0,-6.0,838.0,-17.0,121.0,185.0
3,ISN,DEN,1314.0,-21.0,1404.0,-29.0,110.0,277.0
4,MKE,DEN,826.0,-10.0,1009.0,-31.0,163.0,328.0
...,...,...,...,...,...,...,...,...
104313,CAE,IAH,644.0,-1.0,859.0,13.0,195.0,169.0
104314,CAE,IAH,641.0,-4.0,838.0,-8.0,177.0,107.0
104315,CAE,IAH,641.0,-4.0,901.0,15.0,200.0,156.0
104316,CAE,IAH,,,,,,211.0


Just to make working with the data easier I'll convert the times to actual time datatype. 
***NOTE 1:*** this actually made it harder lol - has been removed in latest dataset used in algorithm 1.

In [128]:
time.dropna(inplace=True)
df = time

In [129]:
df['DepTime'] = df['DepTime'].astype(int)
df['DepDelay'] = df['DepDelay'].astype(int)
df['ArrTime'] = df['ArrTime'].astype(int)
df['DepDelay'] = df['DepDelay'].astype(int)
df['ArrDelay'] = df['ArrDelay'].astype(int)
df['ActualElapsedTime'] = df['ActualElapsedTime'].astype(int)

def help(time):
    if time >= 2400:
        time -= 100
    if time < 100:
        time = 100
        
    str_time = str(time)
    
    if len(str_time) < 4:
        hours = int(str_time[0]) 
        mins = int(str_time[1:3])
        if(hours > 23): hours = 23
        if(hours < 0): hours = 1
        if(mins > 59): mins = 59
        if(mins < 0): mins = 1
        return mins + (hours * 60)
    else:
        hours = int(str_time[0:2]) 
        mins = int(str_time[2:4])
        if(hours > 23): hours = 23
        if(hours < 0): hours = 1
        if(mins > 59): mins = 59
        if(mins < 0): mins = 1
        return mins + (hours * 60) 

def converTime(df, col):
    return pd.Series(
        [ help(time) for time in df[col] ]
    )


In [130]:
df['DepTime'] = converTime(df, 'DepTime')
df['ArrTime'] = converTime(df, 'ArrTime')

df.dropna(inplace=True)

In [131]:
import os
df.to_csv(os.getcwd() + '\\Data\\processed\\multiFlightData_600k_flights.csv', index=False)

Creating a seperate csv that includes longitude and lattitude data for visualization

In [132]:
import chardet
with open('Data/OpenFlights/airports.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large
    
airports = pd.read_csv('Data/OpenFlights/airports.csv', encoding=result['encoding'])

In [133]:
# aiports = airports[airports['Country']=='United States']
airports.drop(airports.loc[airports['Country']!='United States'].index, inplace=True)
airports.reset_index(drop=True)
aiports = airports.drop(columns=['Name', 'City', 'ICAO', 'Country'])

In [134]:
locations = df.merge(airports,on=['Origin'])
locations = locations.dropna()
locations = locations.drop(columns=['Dest', 'ItinFare', 'DepDelay', 'ArrDelay', 'ActualElapsedTime', 'Name', 'City', 'Country', 'ICAO', 'DepTime', 'ArrTime'])


In [135]:
locations = locations[(locations['Origin'].isin(df['Origin']))|(locations['Origin'].isin(df['Dest']))]
locations = locations.drop_duplicates().reset_index(drop=True)
locations.to_csv(os.getcwd() + '\\Data\\processed\\airportLocations.csv', index=False)

In [136]:
df = df[(df['Origin'].isin(locations['Origin']))&(df['Dest'].isin(locations['Origin']))]
df.dropna(inplace=True)
df.to_csv(os.getcwd() + '\\Data\\processed\\multiFlightData.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [137]:
df

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime,ItinFare
0,SBP,SFO,833.0,-7,884.0,-26,51,163.0
1,IAH,XNA,570.0,-5,679.0,1,109,325.0
2,SGF,IAH,397.0,-6,518.0,-17,121,185.0
3,ISN,DEN,794.0,-21,844.0,-29,110,277.0
4,MKE,DEN,506.0,-10,609.0,-31,163,328.0
...,...,...,...,...,...,...,...,...
101191,MKE,SLC,402.0,25,521.0,27,228,377.0
101192,SLC,MKE,404.0,-7,539.0,-18,176,160.0
101193,EUG,SLC,401.0,-10,518.0,-14,120,2505.0
101194,LAX,GEG,401.0,19,541.0,12,152,203.0
