In [1]:
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import datetime as dt
import math

## Dataset Generation - Full Version

We don't have a singular data set containing all the information we need for the algorithm so we will generate a dataset that fits our needs based on real data.<br>The following datasets appear to have matching observations based on the ItineraryID column, but none contain all the information we need. <br><br>Ticket has a distance, FarePerMile and Itinerary fare per flight, however it does not include the destinations for some reason, while the coupon dataset does.<br>Similarly the time dataset contains the appropriate delay and time data for flights from point A and point B. Note that it had 120 columns with a bunch of useless columns and empty data so I pruned it manually when I had a quick look. <br><br> Using all this we will create a single data set simulating all the data required to model the problem for our algorithm.<br><br>This particular notebook generates a large dataset of several flights between each origin and destination airport.

In [2]:
ticket = pd.read_csv('Data/ticket_2019_1/ticket_2019_1.csv')
coupon = pd.read_csv('Data/coupon_2019_1/coupon_2019_1.csv')
time = pd.read_csv('Data/time_2019_1/time_2019_1.csv')

Drop columns we don't need. Note all data points are from the US.

In [3]:
ticket = ticket.drop(
    columns=[
        'Coupons',
        'Year',
        'Quarter',
        'OriginAirportID',
        'OriginAirportSeqID',
        'OriginCityMarketID',
        'OriginStateFips',
        'OriginStateName',
        'OriginState',
        'OriginCountry',
        'OriginWac',
        'DollarCred',
        'Passengers',
        'BulkFare',
        'DistanceGroup',
        'ItinGeoType',
        'RPCarrier',
        'OnLine',
        'MilesFlown',
        'FarePerMile',
        'Distance'
    ]
)

coupon = coupon.drop(
    columns=[
        'MktID',
        'SeqNum',
        'Coupons',
        'OriginAirportID',
        'OriginAirportSeqID',
        'OriginCityMarketID',
        'Quarter',
        'OriginCountry',
        'OriginStateFips',
        'OriginWac',
        'DestAirportID',
        'DestAirportSeqID',
        'DestCityMarketID',
        'DestCountry',
        'DestStateFips',
        'DestWac',
        'CouponType',
        'Passengers',
        'FareClass',
        'Gateway',
        'ItinGeoType',
        'RPCarrier',
        'CouponGeoType',
        'TkCarrier',
        'OpCarrier',
        'DistanceGroup',
        'Year',
        'Break',
        'OriginStateName',
        'OriginState',
        'DestStateName',
        'DestState',
        'Distance'
    ]
)


In [4]:
coupon.head()

Unnamed: 0,ItinID,Origin,Dest
0,201915.0,ABE,ATL
1,201916.0,ABE,ATL
2,201917.0,ABE,ATL
3,201919.0,ABE,ATL
4,2019130.0,ABE,ATL


In [5]:
ticket.head()

Unnamed: 0,ItinID,Origin,RoundTrip,ItinFare
0,201914.0,ABE,0,203
1,201915.0,ABE,0,211
2,201916.0,ABE,0,241
3,201917.0,ABE,0,266
4,201919.0,ABE,0,314


We'll remove the round trips, since we're only interested in flights from A to B.

In [6]:
print("Percent of Round Trips : {:.2f}%".format((len(ticket.loc[ticket['RoundTrip'] == 1]) / len(ticket['RoundTrip']) * 100), 2))

Percent of Round Trips : 50.01%


In [7]:
ticket = ticket.loc[ticket['RoundTrip']==0]
ticket = ticket.drop(columns=['RoundTrip'])
df = pd.merge(ticket, coupon, on=['ItinID', 'Origin']) 
df

Unnamed: 0,ItinID,Origin,ItinFare,Dest
0,2.019150e+05,ABE,211,ATL
1,2.019160e+05,ABE,241,ATL
2,2.019170e+05,ABE,266,ATL
3,2.019190e+05,ABE,314,ATL
4,2.019191e+06,ABE,148,ATL
...,...,...,...,...
53997,2.019200e+10,SFO,142,LAX
53998,2.019200e+10,SFO,262,LAX
53999,2.019200e+10,SFO,339,LAX
54000,2.019200e+10,SFO,107,LAX


In [8]:
df = df.drop(columns=['ItinID'])
df.head()

Unnamed: 0,Origin,ItinFare,Dest
0,ABE,211,ATL
1,ABE,241,ATL
2,ABE,266,ATL
3,ABE,314,ATL
4,ABE,148,ATL


Now to add the time data into it. Note that this just generates a lot of data by an inner join. There's probably a better way of doing this IF we actually wanted to generate a massive data set to use in the project.

In [9]:
time.head()

Unnamed: 0,Origin,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,SBP,SFO,1353.0,-7.0,1444.0,-26.0,51.0
1,IAH,XNA,930.0,-5.0,1119.0,1.0,109.0
2,SGF,IAH,637.0,-6.0,838.0,-17.0,121.0
3,ISN,DEN,1314.0,-21.0,1404.0,-29.0,110.0
4,MKE,DEN,826.0,-10.0,1009.0,-31.0,163.0


In [10]:
# df = pd.concat([df, time], axis=1, join="inner")
df = pd.merge(df, time, on=['Origin', 'Dest'], how="inner") 
df.dropna(inplace=True)
df

Unnamed: 0,Origin,ItinFare,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,ABE,211,ATL,557.0,-3.0,819.0,-9.0,142.0
1,ABE,211,ATL,600.0,0.0,825.0,-7.0,145.0
2,ABE,211,ATL,554.0,-6.0,803.0,-28.0,129.0
3,ABE,211,ATL,558.0,-2.0,824.0,-7.0,146.0
4,ABE,211,ATL,555.0,-5.0,813.0,-18.0,138.0
...,...,...,...,...,...,...,...,...
15988182,SFO,107,LAX,643.0,-2.0,802.0,-17.0,79.0
15988183,SFO,107,LAX,1720.0,-10.0,1847.0,-20.0,87.0
15988184,SFO,107,LAX,1338.0,38.0,1445.0,15.0,67.0
15988185,SFO,107,LAX,1509.0,-5.0,1631.0,-16.0,82.0


Just to make working with the data easier I'll convert the times to actual time datatype. 

In [11]:
df['DepTime'] = df['DepTime'].astype(int)
df['DepDelay'] = df['DepDelay'].astype(int)
df['ArrTime'] = df['ArrTime'].astype(int)
df['DepDelay'] = df['DepDelay'].astype(int)
df['ArrDelay'] = df['ArrDelay'].astype(int)
df['ActualElapsedTime'] = df['ActualElapsedTime'].astype(int)

def help(time):
    if time >= 2400:
        time -= 100
    if time < 100:
        time = 100
        
    str_time = str(time)
    
    if len(str_time) < 4:
        hours = int(str_time[0]) 
        mins = int(str_time[1:3])
        if(hours > 23): hours = 23
        if(hours < 0): hours = 1
        if(mins > 59): mins = 59
        if(mins < 0): mins = 1
        return mins + (hours * 60)
    else:
        hours = int(str_time[0:2]) 
        mins = int(str_time[2:4])
        if(hours > 23): hours = 23
        if(hours < 0): hours = 1
        if(mins > 59): mins = 59
        if(mins < 0): mins = 1
        return mins + (hours * 60) 

def converTime(df, col):
    return pd.Series(
        [ help(time) for time in df[col] ]
    )

In [12]:
df['DepTime'] = converTime(df, 'DepTime')
df['ArrTime'] = converTime(df, 'ArrTime')
df.dropna(inplace=True)

In [13]:
df

Unnamed: 0,Origin,ItinFare,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,ABE,211,ATL,357.0,-3,499.0,-9,142
1,ABE,211,ATL,360.0,0,505.0,-7,145
2,ABE,211,ATL,354.0,-6,483.0,-28,129
3,ABE,211,ATL,358.0,-2,504.0,-7,146
4,ABE,211,ATL,355.0,-5,493.0,-18,138
...,...,...,...,...,...,...,...,...
15547001,SEA,188,PHX,403.0,-13,482.0,-22,160
15547002,SEA,188,PHX,1040.0,1,1127.0,8,173
15547003,SEA,188,PHX,818.0,4,885.0,16,185
15547004,SEA,188,PHX,909.0,0,991.0,22,188


In [14]:
import os
df.to_csv(os.getcwd() + '\\Data\\processed\\FullflightData.csv', index=False)

Creating a seperate csv that includes longitude and lattitude data for visualization

In [15]:
import chardet
with open('Data/OpenFlights/airports.csv', 'rb') as f:
    result = chardet.detect(f.read())  # or readline if the file is large
    
airports = pd.read_csv('Data/OpenFlights/airports.csv', encoding=result['encoding'])

In [16]:
# aiports = airports[airports['Country']=='United States']
airports.drop(airports.loc[airports['Country']!='United States'].index, inplace=True)
airports.reset_index(drop=True)
aiports = airports.drop(columns=['Name', 'City', 'ICAO', 'Country'])

In [17]:
locations = df.merge(airports,on=['Origin'])
locations = locations.dropna()
locations = locations.drop(columns=['Dest', 'ItinFare', 'DepDelay', 'ArrDelay', 'ActualElapsedTime', 'Name', 'City', 'Country', 'ICAO', 'DepTime', 'ArrTime'])


In [18]:
locations = locations[(locations['Origin'].isin(df['Origin']))|(locations['Origin'].isin(df['Dest']))]
locations = locations.drop_duplicates().reset_index(drop=True)
locations.to_csv(os.getcwd() + '\\Data\\processed\\airportLocations.csv', index=False)

In [19]:
df = df[(df['Origin'].isin(locations['Origin']))&(df['Dest'].isin(locations['Origin']))]
df.dropna(inplace=True)
df.to_csv(os.getcwd() + '\\Data\\processed\\multiFlightData.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [20]:
df

Unnamed: 0,Origin,ItinFare,Dest,DepTime,DepDelay,ArrTime,ArrDelay,ActualElapsedTime
0,ABE,211,ATL,357.0,-3,499.0,-9,142
1,ABE,211,ATL,360.0,0,505.0,-7,145
2,ABE,211,ATL,354.0,-6,483.0,-28,129
3,ABE,211,ATL,358.0,-2,504.0,-7,146
4,ABE,211,ATL,355.0,-5,493.0,-18,138
...,...,...,...,...,...,...,...,...
15547001,SEA,188,PHX,403.0,-13,482.0,-22,160
15547002,SEA,188,PHX,1040.0,1,1127.0,8,173
15547003,SEA,188,PHX,818.0,4,885.0,16,185
15547004,SEA,188,PHX,909.0,0,991.0,22,188
