## Imports

In [112]:
import pandas as pd

### Import Changi data

In [113]:
# Read the Changi traffic file (comma-separated) into a DataFrame.
changi = pd.read_csv(
    'data/Changi_Traffic.csv',
    sep=',',
)

### Import Japan data

In [114]:
# Read the Japan airports file (comma-separated) into a DataFrame.
japan = pd.read_csv(
    'data/Japan_Airports.csv',
    sep=',',
)

### Import Worldwide data

In [115]:
# Read the worldwide traffic ranking file (comma-separated) into a DataFrame.
worldwide = pd.read_csv(
    'data/Traffic - Worldwide_Ranking.csv',
    sep=',',
)

### Import Airports data

In [116]:
# Column labels for the airports file, listed in the order the columns appear in the file.
column_names = [
    "Airport_ID", "Name", "City", "Country",
    "IATA", "ICAO",
    "Latitude", "Longitude", "Altitude",
    "Timezone", "DST", "Tz_database_time_zone",
    "Type", "Source",
]

# header=None tells pandas not to treat the first row as a header;
# the labels above are supplied through `names` instead.
airports = pd.read_csv("data/airports.dat.txt", names=column_names, header=None)

## Process data

### Remove NaN values

In [117]:
# Drop every row that contains at least one missing value, in both frames.
changi = changi.dropna()
japan = japan.dropna()

### Changi Process

#### Rename the column in Changi to indicate that it contains data related to the number of passengers

In [118]:
# DataFrame.rename returns a new frame, so the result has to be assigned back;
# without the assignment `changi` keeps the original 'Unnamed: 1' label.
# NOTE(review): the displayed values (e.g. "5 430") look like thousands rather
# than millions - confirm the unit stated in the new column label.
changi = changi.rename(columns={'Unnamed: 1' : 'Number of Passengers (in Millions)'})

# Show the renamed frame as the cell output.
changi

Unnamed: 0,Month,Number of Passengers (in Millions)
0,January 2024,5 430
1,February 2024,5 350
2,March 2024,5 730
3,April 2024,5 400
4,May 2024,5 480
5,June 2024,5 620
6,July 2024,5 700
7,August 2024,5 730
8,September 2024,5 400
9,October 2024,5 650


### Worldwide Process

#### Rename all the columns

In [119]:
# Replace the original header with short, consistent labels. The assignment is
# positional: the list needs exactly one entry per existing column.
worldwide.columns = [
    'Rank',
    'Airport_Name',
    'Location',
    'Country',
    'Code',
    'Number_of_Passengers',
    'Rank_Change',
    '%_Change',
]

#### Remove useless columns

In [120]:
# Discard the columns this notebook treats as unneeded.
worldwide = worldwide.drop(columns=['Code', 'Rank_Change', '%_Change'])

#### Clean the column Number_of_Passengers

#### Remove the number between []

In [121]:
# Strip bracketed numeric markers such as "[3]" from the passenger counts.
worldwide['Number_of_Passengers'] = worldwide['Number_of_Passengers'].str.replace(
    pat=r'\[\d+\]',
    repl='',
    regex=True,
)

#### Format the passenger counts in millions (e.g. 108.07M) instead of raw numbers

In [122]:
# Drop the "," thousands separators, then convert the cleaned text to integers.
# regex=False makes the comma a literal match regardless of the pandas version
# (the default value of `regex` changed between pandas 1.x and 2.x).
worldwide['Number_of_Passengers'] = (
    worldwide['Number_of_Passengers']
    .str.replace(',', '', regex=False)
    .astype(int)
)

In [123]:
def _as_millions(count):
    """Render a raw passenger count in millions with two decimals, e.g. '108.07M'."""
    return f"{count / 1_000_000:.2f}M"


# Replace each integer count with its text representation in millions.
worldwide['Number_of_Passengers'] = worldwide['Number_of_Passengers'].map(_as_millions)

#### Merge worldwide and airports to get the Latitude information and the Longitude information

In [124]:
# Count the airport names that appear in both the airports and worldwide dataframes
# len(set(airports['Name']).intersection(set(worldwide['Airport_Name'])))

In [125]:
# Get all the airports that have different names between airports and worldwide dataframes
# set(worldwide['Airport_Name']).difference(set(airports['Name']))

In [126]:
# Check for each airport name
# airports[airports['Name'].str.contains('Reid')]

In [127]:
# Name-equivalence table used to line up the two data sources.
# Keys: airport names as they appear in the airports dataframe.
# Values: the corresponding names used in the worldwide dataframe.
# NOTE(review): the Chengdu entry maps "Shuangliu" to "Tianfu"; these look like
# two different airports, so the coordinates taken from the airports dataframe
# may not be the right ones for that row - confirm.
mapping_airports = {
    'Charles de Gaulle International Airport': 'Charles de Gaulle Airport',
    'Chengdu Shuangliu International Airport': 'Chengdu Tianfu International Airport',
    'Chhatrapati Shivaji International Airport': 'Chhatrapati Shivaji Maharaj International Airport',
    'Frankfurt am Main Airport': 'Frankfurt Airport',
    'George Bush Intercontinental Houston Airport': 'George Bush Intercontinental Airport',
    'Hartsfield Jackson Atlanta International Airport': 'Hartsfield–Jackson Atlanta International Airport',
    'London Heathrow Airport': 'Heathrow Airport',
    'John F Kennedy International Airport': 'John F. Kennedy International Airport',
    'Barcelona International Airport': 'Josep Tarradellas Barcelona–El Prat Airport',
    "Chicago O'Hare International Airport": "O'Hare International Airport",
    'Roma Airport': 'Rome Fiumicino Airport',
    'Seattle Tacoma International Airport': 'Seattle–Tacoma International Airport',
    'Incheon International Airport': 'Seoul Incheon International Airport',
    'Soekarno-Hatta International Airport': 'Soekarno–Hatta International Airport',
    'Tokyo Haneda International Airport': 'Tokyo Haneda Airport',
    'Lester B. Pearson International Airport': 'Toronto Pearson International Airport',
    'McCarran International Airport': 'Harry Reid International Airport',
    'Licenciado Benito Juarez International Airport': 'Mexico City International Airport',
}

In [128]:
# Swap each airports name that has an entry in the mapping for its worldwide
# equivalent; names without an entry are left as they are.
airports['Name'] = airports['Name'].replace(to_replace=mapping_airports)

In [129]:
# Preview the first five rows of the airports frame.
airports.head(n=5)

Unnamed: 0,Airport_ID,Name,City,Country,IATA,ICAO,Latitude,Longitude,Altitude,Timezone,DST,Tz_database_time_zone,Type,Source
0,1,Goroka Airport,Goroka,Papua New Guinea,GKA,AYGA,-6.08169,145.391998,5282,10,U,Pacific/Port_Moresby,airport,OurAirports
1,2,Madang Airport,Madang,Papua New Guinea,MAG,AYMD,-5.20708,145.789001,20,10,U,Pacific/Port_Moresby,airport,OurAirports
2,3,Mount Hagen Kagamuga Airport,Mount Hagen,Papua New Guinea,HGU,AYMH,-5.82679,144.296005,5388,10,U,Pacific/Port_Moresby,airport,OurAirports
3,4,Nadzab Airport,Nadzab,Papua New Guinea,LAE,AYNZ,-6.569803,146.725977,239,10,U,Pacific/Port_Moresby,airport,OurAirports
4,5,Port Moresby Jacksons International Airport,Port Moresby,Papua New Guinea,POM,AYPY,-9.44338,147.220001,146,10,U,Pacific/Port_Moresby,airport,OurAirports


In [130]:
# Give the airport-name column the same label in both frames so it can serve as the join key.
worldwide = worldwide.rename(columns={'Airport_Name' : 'Name'})

# Left join on Name: every worldwide row is kept and the airports columns are
# filled in where a matching Name exists.
# NOTE(review): a Name that occurs more than once in airports would repeat the
# worldwide row in the result - confirm the row count is unchanged after the merge.
world_airports = worldwide.merge(airports, on='Name', how='left')

### Keep only the relevant columns

In [131]:
# Columns retained for the export. 'Country_x' is the worldwide-side Country
# column: both merged frames have a 'Country' column, so the merge suffixed them.
to_keep = ['Rank', 'Name', 'City', 'Country_x', 'Number_of_Passengers', 'Latitude', 'Longitude']

# Select and rename in a single chained expression. This avoids an in-place
# rename on a frame that was just derived by column selection, and restores
# the plain 'Country' label.
world_airports = world_airports[to_keep].rename(columns={'Country_x' : 'Country'})

## Export

### world_airports dataframe

In [135]:
# Display the final frame as a visual check before it is exported.
world_airports

Unnamed: 0,Rank,Name,City,Country,Number_of_Passengers,Latitude,Longitude
0,1,Hartsfield–Jackson Atlanta International Airport,Atlanta,United States,108.07M,33.6367,-84.428101
1,2,Dubai International Airport,Dubai,United Arab Emirates,92.30M,25.2528,55.364399
2,3,Dallas Fort Worth International Airport,Dallas-Fort Worth,United States,87.82M,32.896801,-97.038002
3,4,Tokyo Haneda Airport,Tokyo,Japan,85.00M,35.552299,139.779999
4,5,Heathrow Airport,London,United Kingdom,83.88M,51.4706,-0.461941
5,6,Denver International Airport,Denver,United States,82.36M,39.861698,-104.672997
6,7,O'Hare International Airport,Chicago,United States,80.04M,41.9786,-87.9048
7,8,Istanbul Airport,Istanbul,Turkey,79.99M,41.275278,28.751944
8,9,Indira Gandhi International Airport,Delhi,India,77.82M,28.5665,77.103104
9,10,Shanghai Pudong International Airport,Shanghai,China,76.79M,31.1434,121.805


In [None]:
# Write the final frame to a CSV file (relative path).
# index=True is the to_csv default, spelled out here: the row index is written
# as an unnamed first column.
# NOTE(review): switch to index=False if that extra column is not wanted downstream.
world_airports.to_csv('world_airports.csv', index=True)