In [1]:
import pandas as pd
import requests
import json
import folium
from collections import defaultdict

# DATA COLLECTION

To collect data about flights, FlightLabsAPI has been used. This web API provides flights' information for almost every European airport. These information include the source, destination, departure time, delay, airline's name, airline's IATA identifier... In particular, for the sake of this project, data for the 50 most crowded (on a yearly basis) European airports has been used (mostly taken from https://gettocenter.com/airports/continent/europe) and has be defined hereafter:

In [None]:
tag_airports = [
    "LHR", # London Heathrow Airport
    "CDG", # Paris-Charles de Gaulle Airport
    "AMS", # Amsterdam Airport Schiphol
    "FRA", # Frankfurt Airport
    "IST", # Istanbul Airport
    "BCN", # Barcelona-El Prat Airport
    "MAD", # Adolfo Suárez Madrid–Barajas Airport
    "MUC", # Munich Airport
    "SAW", # Sabiha Gökçen International Airport
    "FCO", # Leonardo da Vinci-Fiumicino Airport
    "LGW", # London Gatwick Airport
    "DME", # Moscow Domodedovo Airport
    "ORY", # Paris Orly Airport
    "VIE", # Vienna International Airport
    "ZRH", # Zurich Airport
    "OTP", # Henri Coandă International Airport
    "SVO", # Sheremetyevo International Airport
    "CPH", # Copenhagen Airport
    "ATH", # Athens International Airport
    "ARN", # Stockholm Arlanda Airport
    "DUB", # Dublin Airport
    "OSL", # Oslo Airport, Gardermoen
    "HAM", # Hamburg Airport
    "BRU", # Brussels Airport
    "LIS", # Lisbon Portela Airport
    "BGY", # Milan Bergamo Airport
    "HEL", # Helsinki Airport
    "MAN", # Manchester Airport
    "BUD", # Budapest Ferenc Liszt International Airport
    "PRG", # Prague Václav Havel Airport
    "WAW", # Warsaw Chopin Airport
    "EDI", # Edinburgh Airport
    "GVA", # Geneva Airport
    "DUS", # Düsseldorf Airport
    "SXF", # Berlin Schönefeld Airport
    "BLL", # Billund Airport
    "AGP", # Málaga Costa del Sol Airport
    "NCE", # Nice Côte d'Azur Airport
    "PMI", # Palma de Mallorca Airport
    "STR", # Stuttgart Airport
    "NAP", # Naples International Airport
    "LED", # Pulkovo Airport
    "LTN", # London Luton Airport
    "TLS", # Toulouse Blagnac Airport
    "BHX", # Birmingham Airport
    "LPA", # Gran Canaria Airport
    "KEF", # Keflavík International Airport
    "SZG", # Salzburg Airport
    "AYT", # Antalya Airport
    "MXP" # Milan Malpensa Airport
]

A function to calculate time intervals was then defined given that the API used only allows extraction for a time window of a maximum of ***** days for big airports, category in which many airports amongst the ones under consideration fall. The function was defined as follows:

In [None]:
import datetime
def time_period(date_from, date_to):
    interval_d = datetime.timedelta(days=1)
    interval_4d = datetime.timedelta(days=3)
    out = []
    while date_from <= date_to:
        i = datetime.datetime.strptime(date_from, "%Y-%m-%d")
        j = (i + interval_4d).strftime("%Y-%m-%d")
        date_from = (i+ (interval_d + interval_4d)).strftime("%Y-%m-%d")
        i = i.strftime("%Y-%m-%d")
        out.append((i,j))
    return out

The main information about all flights has then been extracted for the period under analysis (January 2023 - April 2023) and stored in a dataframe (the try/except architecture was necessary as sometimes FlightLabs does not provide some particular information, maybe beacuse of a mulfunctioning of the measuring instruments, though, amongst the flights APIs tried, FlightLabs seemed to be the most complete):

In [None]:
def get_flight(airports,date_from,date_to):
    df = defaultdict(list)
    interval = time_period(date_from,date_to)
    for w,k in interval:
        for j in airports:
            api_key='insert_API_key_here'
            response = requests.get(f'https://app.goflightlabs.com/historical/{w}?access_key={api_key}&code={j}&type=departure&date_to={k}')
            data=response.json()["data"]
            
            for i in range(len(data)):
                try:
                    df['dep_iata'].append(data[i]['departure']['iataCode'])
                    df['dep_sch'].append(data[i]['departure']['scheduledTime'])
                    if 'delay' in data[i]['departure']:
                        df['dep_delay'].append(data[i]['departure']['delay'])
                    else:
                        df['dep_delay'].append(None)
                    if 'estimatedTime' in data[i]['departure']:
                        df['dep_est'].append(data[i]['departure']['estimatedTime'])
                    else:
                        df['dep_est'].append(None)
                    
                    if 'actualTime' in data[i]['departure']:
                        df['dep_act'].append(data[i]['departure']['actualTime'])
                    else:
                        df['dep_act'].append(None)
                    if 'iataCode' in data[i]['arrival']:
                        df['arr_iata'].append(data[i]['arrival']['iataCode'])
                    else:
                        df['arr_iata'].append(None)
                    if 'scheduledTime' in data[i]['arrival']:
                        df['arr_sch'].append(data[i]['arrival']['scheduledTime'])
                    else:
                        df['arr_sch'].append(None)
                    if 'iataCode' in data[i]['airline']:
                        df['iata_airline'].append(data[i]['airline']['iataCode'])
                    else:
                        df['iata_airline'].append(None)
                    if 'name' in data[i]['airline']:
                        df['name_airline'].append(data[i]['airline']['name'])
                    else:
                        df['name_airline'].append(None)
                    if 'iataNumber' in data[i]['flight']:
                        df['iata_flight'].append(data[i]['flight']['iataNumber'])
                    else:
                        df['iata_flight'].append(None)
                except:
                    df['dep_iata'].append(None)
                    df['dep_sch'].append(None)
                    df['dep_delay'].append(None)
                    df['dep_est'].append(None)
                    df['dep_act'].append(None)
                    df['arr_iata'].append(None)
                    df['arr_sch'].append(None)
                    df['iata_airline'].append(None)
                    df['name_airline'].append(None)
                    df['iata_flight'].append(None)
                    
    return pd.DataFrame(df)

The above function was then called for all the months of the analysis (namely January, February, March and April 2023):

In [None]:
data_frame = get_flight(tag_airports,'2023-01-1','2023-01-31')

Then, only the flights departing and landing within the 50 airports under analysis were kept (this represented the vast majority):

In [None]:
final=data_frame[data_frame['arr_iata'].isin([x.lower() for x in tag_airports])].reset_index(drop=True)

At this point, there were some dublicated flights due to the presence of codeshares. Codesharing is a practice in the airline industry where two or more airlines jointly market and operate a flight under their own airline codes (typically referred to as the IATA airline codes). This means that passengers can book a flight with one airline, but the actual flight may be operated by another airline. In this analysis' case, the problem was that the dataframe created with the steps above was not able to distinguish between the two airlines. For example, if a flight from London to Paris was operated by both Air France and British Airways, the dataframe would have considered it as two different flights, one operated by Air France and one by British Airways. The following function was them defined to drop duplicates and keep only the "real flight" of the "real company" that has operated it:

In [None]:
l=[]
for i in range(0,len(final)-1):
    if final.loc[i,"dep_iata"]==final.loc[i+1,"dep_iata"] and final.loc[i,"arr_iata"]==final.loc[i+1,"arr_iata"] and final.loc[i,"dep_sch"]==final.loc[i+1,"dep_sch"] and final.loc[i,"arr_sch"]==final.loc[i+1,"arr_sch"]:
        l.append(i)
final=final.drop(l).reset_index(drop=True)

Then, information about the 50 airports under analysis was extracted from the airports endpoint of the FlightLabs API. This information include longitude and latitude necessary for the plotting procedure conducted below in the code. Once again, the function stores the result into a dataframe.

In [None]:
def get_airport(airports):
    df = defaultdict(list)
    for j in airports:
        api_key='insert_API_key_here'
        response = requests.get(f'https://app.goflightlabs.com/airports?access_key={api_key}&codeIataAirport={j}')
        data=response.json()["data"]
        df["iata"].append(data[0]["codeIataAirport"])
        df["longitude"].append(data[0]["longitudeAirport"])
        df["latitude"].append(data[0]["latitudeAirport"])
        df["name"].append(data[0]["nameAirport"])
        df["country"].append(data[0]["nameCountry"])         
    return pd.DataFrame(df)

In [None]:
arp=get_airport(tag_airports)

All the dataframes were then saved as csv files so as not to have to make other calls to the API again. As a matter of facts making extra requestes to the FlightLabs API is not only time consuming but is also costly.

In [None]:
arp.to_csv('../data/airports_top50.csv',index=False)

In [None]:
final.to_csv('../data/flights_january2023.csv',index=False)

Then, the databases about flights have been (one at a time) cleaned and the date columns have been formatted. Moreover, they have been saved once again in their final form ready for the epidemy simulation:

In [None]:
final=pd.read_csv('../data/flights_january2023.csv')

In [None]:
final["dep_sch_f"]=final["dep_sch"].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f'))
final["arr_sch_f"]=final["arr_sch"].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%S.%f'))
final["dep_date"] = final["dep_sch_f"].apply(lambda x: x.date())
final["dep_time"] = final["dep_sch_f"].apply(lambda x: x.time())
final["arr_date"] = final["arr_sch_f"].apply(lambda x: x.date())
final["arr_time"] = final["arr_sch_f"].apply(lambda x: x.time())

In [None]:
final.drop(columns=["dep_sch","arr_sch","dep_delay","dep_est","dep_act","dep_sch_f","arr_sch_f"],inplace=True)

In [None]:
final.to_csv('../data/flights_january2023_epidemy.csv',index=False)

Then, a visual plot of the network has been obtained with for the 1st January 2023:

In [None]:
final=pd.read_csv('../data/flights_january2023_epidemy.csv')

In [None]:
final_modd=final.groupby(['dep_iata','dep_date','arr_iata']).count().reset_index()

In [None]:
import math

def data_to_map(final_modd, arp, day):
    final_modd = final_modd[final_modd['dep_date'] == day]
    # Create map
    map = folium.Map(location=[51.5072, -0.1276], zoom_start=4)

    # Define color function based on count of flights
    color_scale = folium.LinearColormap(
        ["green", "yellow", "red"],
        vmin=final_modd["name_airline"].min(),
        vmax=final_modd["name_airline"].max(),
    )

    # Add markers for airports
    for index, airport in arp.iterrows():
        folium.Marker(
            location=[airport["latitude"], airport["longitude"]],
            popup=f"{airport['name']}, {airport['country']} ({airport['iata']})",
            icon=folium.Icon(color="blue", icon="plane"),
        ).add_to(map)

    # Add connections between airports
    for index, route in final_modd.iterrows():
        source = route["dep_iata"].upper()
        dest = route["arr_iata"].upper()
        if source and dest:
            coords = [
                (float(arp[arp["iata"]==source]["latitude"]), float(arp[arp["iata"]==source]["longitude"])),
                (float(arp[arp["iata"]==dest]["latitude"]), float(arp[arp["iata"]==dest]["longitude"])),
            ]
            count = route["name_airline"]
            color = color_scale(count)
            weight = 5 if count > 10 else 3
            arrow_pos = 1.0 - weight / 20
            arrow_angle = math.atan2(coords[1][1] - coords[0][1], coords[1][0] - coords[0][0]) * 180 / math.pi
            arrow_icon = folium.Icon(
                icon="arrow-up",
                prefix="fa",
                color=color,
                angle=arrow_angle,
            )
            folium.PolyLine(
                locations=coords,
                color=color,
                weight=weight,
                opacity=1,
                smooth_factor=1,
                tooltip=str(count),
                dash_array=[],
                dash_offset="0",
                fill=False,
                line_cap="round",
                line_join="round",
                show=True,
                plugin_data=None,
                clustered_marker=False,
                icon=arrow_icon,
                arrow_pos=arrow_pos,
                arrow_style="simple",
                arrow_size=10,
                arrow_fill_color=color,
                arrow_stroke_color=color,
            ).add_to(map)

    # Add color legend
    color_scale.add_to(map)

    # Return map object
    return map

In [None]:
map=data_to_map(final_modd,arp,'2023-01-01')
display(map)

  arrow_icon = folium.Icon(


___

## Final database preparation

First of all, the previously created databases have been loaded and merged into a single one which will be then the one used into the simulation:

In [None]:
fj=pd.read_csv("../data/flights_january2023_epidemy.csv")
ff=pd.read_csv("../data/flights_february2023_epidemy.csv")
fm=pd.read_csv("../data/flights_march2023_epidemy.csv")
fa=pd.read_csv("../data/flights_april2023_epidemy.csv")
ffinal=pd.concat([fj,ff,fm,fa]).reset_index(drop=True)

Then, given that the aim of the study is to analyze the spread of an epidemy, we will drop all the cargo flights which do not carry passengers and hence are not vectors for the disease. To do so, all those flights which have as "iata_airline" (the IATA code of the flight) one corresponding to a major cargo airline have been dropped:

In [None]:
to_drop=[]
for i in range(len(ffinal)):
    if ffinal.loc[i,"iata_airline"] in ["5y", "lh", "af", "cv", "tk", "d0", "ba", "ek", "ay", "lx", "ru", "fx"]:
        to_drop.append(i)
ffinal.drop(to_drop,inplace=True)

And then the newly obtained dataframe was saved so that it can later be used also for the other scenarios:

In [None]:
ffinal.to_csv("flights_2023_epidemy.csv",index=False)