### Prepping Data Challenge: Longest Flights (Week 24)
 
### Requirements
- Input the data
- Remove the airport names from the From and To fields
  - e.g. New York-JFK should just read New York
- Create a Route field which concatenates the From and To fields with a hyphen
  - e.g. Dubai - Dallas
- Split out the Distance field so that we have one field for the Distance in km and one field for the Distance in miles
  - Ensure these fields are numeric
- Rank the flights based on Distance
  - Use a dense rank in order to match the Wikipedia page
- The Scheduled duration is a Date/Time data type. Change this to a string so that we only keep the time element
- Update the First flight field to be a date
- Join on the latitudes and longitudes for the From and To cities
- Output the data

In [1]:
import pandas as pd
import numpy as np

In [2]:
#input the data
# Open the workbook once and read both worksheets from the same handle.
# 'First flight' is parsed as a date while loading.
with pd.ExcelFile('wk24-Input.xlsx') as workbook:
    ns = pd.read_excel(workbook, sheet_name='Non-stop flights', parse_dates=['First flight'])
    world = pd.read_excel(workbook, sheet_name='World Cities')

In [3]:
# Preview the first five rows of the non-stop flights sheet as loaded
ns.head(5)

Unnamed: 0,From,To,Airline,Flight number,Distance,Scheduled duration,Aircraft,First flight
0,New York–JFK,Singapore,Singapore Airlines,SQ 23,"15,349 km (9,537 mi; 8,288 nmi)",18:50:00,A350-900ULR,2020-11-09
1,Newark,Singapore,Singapore Airlines,SQ 21,"15,344 km (9,534 mi; 8,285 nmi)",18:45:00,A350-900ULR,2022-03-27
2,Auckland,Doha,Qatar Airways,QR 921,"14,535 km (9,032 mi; 7,848 nmi)",18:05:00,777-200LR,2017-02-05
3,Perth,London–Heathrow,Qantas,QF 9,"14,499 km (9,009 mi; 7,829 nmi)",17:25:00,787-9,2018-03-24
4,Auckland,Dubai,Emirates,EK 449,"14,200 km (8,823 mi; 7,667 nmi)",17:10:00,777-200LR,2016-03-02


In [4]:
# Preview the first five rows of the city coordinate lookup
world.head(5)

Unnamed: 0,City,Lat,Lng
0,New York,40.6943,-73.9249
1,Newark,40.7245,-74.1725
2,Auckland,-36.85,174.7833
3,Los Angeles,34.1139,-118.4068
4,San Francisco,37.7562,-122.443


In [5]:
#Remove the airport names from the From and To fields (e.g. New York-JFK should just read New York)
# Everything from the first hyphen, en dash or slash onwards is treated as the
# airport qualifier and dropped. The same pattern is applied to BOTH columns so
# that a city is cleaned identically whether it is the origin or the destination;
# otherwise a slash-qualified destination would keep its suffix and fail to match
# the city lookup in the later joins.
AIRPORT_SUFFIX = r'[-–/].*'
for city_col in ['From', 'To']:
    ns[city_col] = (
        ns[city_col]
        .str.replace(AIRPORT_SUFFIX, '', regex=True)
        .str.strip()  # drop any whitespace left in front of the separator
    )

In [6]:
#Create a Route field which concatenates the From and To fields with a hyphen (e.g. Dubai - Dallas)
# Origin and destination are joined with a spaced hyphen between them.
route_separator = ' - '
origin_city = ns['From']
destination_city = ns['To']
ns['Route'] = origin_city + route_separator + destination_city

In [7]:
#Split out the Distance field so that we have one field for the Distance in km and one field for the Distance in miles
#Ensure these fields are numeric
# The raw text looks like "15,349 km (9,537 mi; 8,288 nmi)".
# - Raw-string patterns avoid invalid-escape warnings for \( and \s.
# - \s* tolerates any whitespace (including non-breaking spaces) before the unit.
# - expand=False returns a Series, so the string accessor can be chained directly.
km_text = ns['Distance'].str.extract(r'([\d,.]+)\s*km', expand=False)
mi_text = ns['Distance'].str.extract(r'km\s*\(([\d,.]+)\s*mi', expand=False)

# Strip the thousands separators and convert to real numbers so that downstream
# ranking and sorting compare values numerically rather than as text.
ns['Distance - km'] = pd.to_numeric(km_text.str.replace(',', '', regex=False))
ns['Distance - mi'] = pd.to_numeric(mi_text.str.replace(',', '', regex=False))

In [8]:
#Rank the flights based on Distance (Use a dense rank in order to match the wikipedia page)
# Dense ranking: equal distances share a rank and the next rank follows without a gap.
# The longest distance gets rank 1 (descending order).
# NOTE(review): this relies on 'Distance - mi' holding numeric values; text values
# would be ordered as strings - confirm the column dtype.
distance_rank = ns['Distance - mi'].rank(method='dense', ascending=False)
ns['Rank'] = distance_rank.astype(int)  # rank() yields floats; store whole numbers

In [9]:
# Preview the flights again now that Route, the distance columns and Rank are added
ns.head(5)

Unnamed: 0,From,To,Airline,Flight number,Distance,Scheduled duration,Aircraft,First flight,Route,Distance - km,Distance - mi,Rank
0,New York,Singapore,Singapore Airlines,SQ 23,"15,349 km (9,537 mi; 8,288 nmi)",18:50:00,A350-900ULR,2020-11-09,New York - Singapore,15349,9537,1
1,Newark,Singapore,Singapore Airlines,SQ 21,"15,344 km (9,534 mi; 8,285 nmi)",18:45:00,A350-900ULR,2022-03-27,Newark - Singapore,15344,9534,2
2,Auckland,Doha,Qatar Airways,QR 921,"14,535 km (9,032 mi; 7,848 nmi)",18:05:00,777-200LR,2017-02-05,Auckland - Doha,14535,9032,3
3,Perth,London,Qantas,QF 9,"14,499 km (9,009 mi; 7,829 nmi)",17:25:00,787-9,2018-03-24,Perth - London,14499,9009,4
4,Auckland,Dubai,Emirates,EK 449,"14,200 km (8,823 mi; 7,667 nmi)",17:10:00,777-200LR,2016-03-02,Auckland - Dubai,14200,8823,5


In [10]:
#Join on the lat & longs for the From and To cities
# Left join keeps every flight row even when the origin city has no match in the lookup.
# The generic coordinate columns are then labelled as belonging to the origin.
from_coord_names = {'Lat': 'From Lat', 'Lng': 'From Lng'}
output1 = ns.merge(world, how='left', left_on='From', right_on='City')
output1 = output1.rename(columns=from_coord_names)

In [11]:
# Same lookup again, this time keyed on the destination city; the coordinate
# columns brought in by this join are labelled as belonging to the destination.
to_coord_names = {'Lat': 'To Lat', 'Lng': 'To Lng'}
output = output1.merge(world, how='left', left_on='To', right_on='City')
output = output.rename(columns=to_coord_names)

In [12]:
# Keep only the fields required in the final output, in the required order.
# .copy() gives an independent frame so the column assignment below does not
# operate on a slice of the merged frame.
OUTPUT_COLUMNS = ['Rank', 'From', 'To', 'Route', 'Airline', 'Flight number',
                  'Distance - mi', 'Distance - km', 'Scheduled duration',
                  'Aircraft', 'First flight', 'From Lat', 'From Lng', 'To Lat', 'To Lng']
output = output[OUTPUT_COLUMNS].copy()

# Requirement: Scheduled duration must be a string holding only the time element.
# Converting to text and extracting the HH:MM:SS part works whether the value was
# loaded as a time, a full date/time, or is already a string; missing values stay missing.
output['Scheduled duration'] = (
    output['Scheduled duration']
    .astype(str)
    .str.extract(r'(\d{1,2}:\d{2}:\d{2})', expand=False)
)

In [13]:
# Preview the ten top-ranked rows of the final output
output.head(n=10)

Unnamed: 0,Rank,From,To,Route,Airline,Flight number,Distance - mi,Distance - km,Scheduled duration,Aircraft,First flight,From Lat,From Lng,To Lat,To Lng
0,1,New York,Singapore,New York - Singapore,Singapore Airlines,SQ 23,9537,15349,18:50:00,A350-900ULR,2020-11-09,40.6943,-73.9249,1.3,103.8
1,2,Newark,Singapore,Newark - Singapore,Singapore Airlines,SQ 21,9534,15344,18:45:00,A350-900ULR,2022-03-27,40.7245,-74.1725,1.3,103.8
2,3,Auckland,Doha,Auckland - Doha,Qatar Airways,QR 921,9032,14535,18:05:00,777-200LR,2017-02-05,-36.85,174.7833,25.3,51.5333
3,4,Perth,London,Perth - London,Qantas,QF 9,9009,14499,17:25:00,787-9,2018-03-24,-31.953512,115.957048,51.5072,-0.1275
4,5,Auckland,Dubai,Auckland - Dubai,Emirates,EK 449,8823,14200,17:10:00,777-200LR,2016-03-02,-36.85,174.7833,25.2697,55.3094
5,6,Los Angeles,Singapore,Los Angeles - Singapore,Singapore Airlines,"SQ 37, SQ 35",8770,14114,17:50:00,A350-900,2018-11-02,34.1139,-118.4068,1.3,103.8
6,7,San Francisco,Bengaluru,San Francisco - Bengaluru,Air India,AI 176,8702,14004,17:45:00,777-200LR,2021-01-09,37.7562,-122.443,12.9699,77.598
7,8,Darwin,London,Darwin - London,Qantas,QF 1,8620,13872,17:25:00,787-9,2021-11-01,-12.4381,130.8411,51.5072,-0.1275
8,9,Houston,Sydney,Houston - Sydney,United Airlines,UA 101,8596,13834,17:35:00,787-9,2018-01-18,29.7863,-95.3889,-33.865,151.2094
9,10,Dallas,Sydney,Dallas - Sydney,Qantas,QF 8,8577,13804,17:20:00,787-9,2014-09-29,32.7936,-96.7662,-33.865,151.2094


In [14]:
#output the data
# Write the prepared table to an Excel workbook; index=False leaves out the row index.
output_path = 'wk24-output.xlsx'
output.to_excel(output_path, index=False)