# Components

### Packages & libraries

In [None]:
import numpy as np
import pandas as pd
import os
import datetime as dt
import json
import requests
from bs4 import BeautifulSoup

### Global variables

In [None]:
# Excel workbook with the flight schedule (read with pd.read_excel in the main program)
file_flights = 'data/flights_kabfusab1b231239bdahb312421.xlsx'
# Folder with the passenger lists, one JSON file per flight: <last 5 characters of flight code>.json
path_passengers = 'data/passengers/'
# Folder in which the resulting Excel workbook is written
path_output = 'output/'
# Worksheet names of the output workbook
sheet_flights = 'pax_per_flight'
sheet_pax = 'pax_data'
sheet_log = 'warnings-errors'
# Web page that is scraped for the IATA airport code table
url_iata = 'https://nl.wikipedia.org/wiki/Vliegvelden_gesorteerd_naar_IATA-code'

### Functions

In [None]:
def save_to_excel(filename, dfs):
    """
    Write a dict of dataframes to one Excel workbook, one worksheet per entry.

    Parameters:
    - filename: path of the .xlsx file to create
    - dfs: dict mapping worksheet name -> dataframe, for example:
      dfs = {'gadgets': df_gadgets, 'widgets': df_widgets}

    Result:
    - None; the workbook is written to `filename` and every column is
      widened to fit its longest value or its header, whichever is longer

    Function courtesy of T. Hellemans, course Data Science, Syntra, 2022-23
    """

    # The context manager saves and closes the workbook, also when an error
    # occurs halfway (ExcelWriter.save() no longer exists in pandas 2.x).
    with pd.ExcelWriter(filename, engine='xlsxwriter') as writer:
        for sheetname, df in dfs.items():  # loop through `dict` of dataframes
            df.to_excel(writer, sheet_name=sheetname, index=False)  # send df to writer
            worksheet = writer.sheets[sheetname]  # pull worksheet object
            for idx, col in enumerate(df.columns):  # loop through all columns
                # Positional access also works when column labels are duplicated
                series = df.iloc[:, idx]
                header_len = len(str(col))  # len of column name/header
                # An empty column has no largest item (max() would yield NaN)
                content_len = 0 if series.empty else int(series.astype(str).map(len).max())
                # adding a little extra space
                worksheet.set_column(idx, idx, max(content_len, header_len) + 1)


In [None]:
def build_datetime(date, time):
    """
    Combine a date string and a time string into one timestamp.

    Parameters:
    - date: string with date in format DD/MM/YYYY
    - time: string with time in format HH:MM

    Result:
    - datetime object (pandas Timestamp) with the given day, hour and minute;
      seconds are always zero

    Raises:
    - ValueError if `date` or `time` does not match its format
    """

    # Parse each input exactly once instead of once per extracted field
    parsed_date = pd.to_datetime(date, format='%d/%m/%Y')
    parsed_time = pd.to_datetime(time, format='%H:%M')

    # Only the clock fields of parsed_time are meaningful; its date part is a
    # placeholder, so the result is assembled from the individual fields.
    return pd.Timestamp(
        year=parsed_date.year,
        month=parsed_date.month,
        day=parsed_date.day,
        hour=parsed_time.hour,
        minute=parsed_time.minute,
    )

In [None]:
def iata_codes_from_wikipedia(url):
    """
    Scrape the sortable wiki table with IATA airport codes from a web page.

    Parameters:
    - url: URL to data source (nl.wikipedia.org)

    Result:
    - dataframe with IATA-codes and location information: the index holds
      the text of the first cell of every table row, the columns hold the
      remaining cells, named after the table's header cells

    Raises:
    - requests.RequestException if the page cannot be fetched (connection
      problem, timeout or HTTP error status)
    - ValueError if the page contains no 'wikitable sortable' table
    """
    # Get data from website; the timeout keeps the notebook from hanging
    # forever on a stalled connection
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")
    html_table = soup.find('table', attrs={'class': 'wikitable sortable'})
    if html_table is None:
        raise ValueError(f"No table with class 'wikitable sortable' found at {url}")

    html_table_body = html_table.find('tbody')  # tbody = table body
    if html_table_body is None:
        # html.parser does not insert a <tbody> that is absent from the markup
        html_table_body = html_table

    # Extract column headers ('strip' to remove a.o. line feed)
    column_headers = [cell.text.strip() for cell in html_table_body.find_all('th')]  # th = table header

    # Extract data
    data = []
    idx = []
    for row in html_table_body.find_all('tr'):  # tr = table row
        cells = row.find_all('td')              # td = table data
        if not cells:
            # table header is also a 'tr' line but without 'td'
            continue
        idx.append(cells[0].text.strip())
        # Sometimes apparently there is also a text for the flag shown
        # --> only keep the part after the last non-breaking space
        data.append({
            column_headers[i]: cell.text.strip().split('\xa0')[-1]
            for i, cell in enumerate(cells)
            if i > 0
        })

    # Build dataframe
    return pd.DataFrame(data, index=idx)

In [None]:
def map_iata(airport, df):
    """
    Look up the country that belongs to an airport code.

    Parameters:
    - airport: IATA airport code (a label of the index of `df`)
    - df: dataframe with IATA codes as index and the country in column 'Land'

    Result:
    - country of airport; the fixed value '#MV' when the lookup fails for
      whatever reason (e.g. unknown code or missing column)
    """
    try:
        return df.loc[airport]['Land']
    except Exception:
        # Best effort by design: any failure is reported as a missing value
        return '#MV'

# Main program

### Read flight data

In [None]:
# Load the flight schedule and add one combined timestamp per flight,
# built from the separate 'date' and 'time' columns
df_flights = pd.read_excel(file_flights)
df_flights['sched_datetime'] = df_flights.apply(
    lambda flight: build_datetime(flight['date'], flight['time']),
    axis='columns',
)
display(df_flights.head())
df_flights.dtypes

### Read passenger data & build worksheet <em>pax_data</em>

In [None]:
content = []   # one dict per passenger, becomes worksheet 'pax_data'
log = []       # one {'Note': text} dict per warning/error

# Column layout of the passenger dataframe; fixed up front so the dataframe
# also has these columns when not a single passenger could be loaded.
pax_columns = ['FlightCode', 'FirstName', 'LastName', 'Airline', 'A/D', 'Airport', 'PRM', 'sched_datetime']

for index, row in df_flights.iterrows():
    lv_code = row['code'][-5:]   # flight no. = last 5 characters of the flight code
    direction = 'arriving from' if row['A/D'] == 'A' else 'departing to'
    scheduled = row['sched_datetime'].strftime('%Y/%m/%d %H:%M')

    if not lv_code.isdigit():
        # Invalid flight no.; index + 2 = line in the Excel sheet
        # (header line + 1-based numbering, assuming the default 0-based index)
        log_line = ' '.join([f"Invalid flight no. for flight on line {index + 2} of airline {row['airline']}",
                             f"{direction}  {row['airport']}",
                             f"on {scheduled}."])
        log.append({'Note' : log_line})
        continue

    lv_path = path_passengers + lv_code + '.json'

    try:
        with open(lv_path, 'r') as pax_file:
            pax_list = json.load(pax_file)
    except (OSError, ValueError):
        # OSError: file missing/unreadable; ValueError: content is not valid JSON
        log_line = ' '.join([f"No passenger file found for flight {lv_code} of airline {row['airline']}",
                             f"{direction} {row['airport']}",
                             f"on {scheduled}."])
        log.append({'Note' : log_line})
        # Skip this flight: otherwise pax_list would be undefined (first flight)
        # or still hold the passengers of the previously processed flight.
        continue

    for pax_row in pax_list:
        content.append({'FlightCode' : row['code'],
                        'FirstName' : pax_row['first_name'], 'LastName' : pax_row['last_name'],
                        'Airline' : row['airline'], 'A/D' : row['A/D'], 'Airport' : row['airport'],
                        'PRM' : pax_row['needs_assistance'], 'sched_datetime' : row['sched_datetime']})

df_pax = pd.DataFrame(content, columns=pax_columns)
df_pax['PRM'] = df_pax['PRM'].astype(bool)
display(df_pax.head(5))
display(log)
df_pax.dtypes

In [None]:
# Reverse check: Do the flights of all passenger files exist in the main file ?
# The file name is expected to start with the 5-digit flight no., which is
# stored in the flight data as 'FLIGHT_<no.>'.
known_codes = set(df_flights['code'])   # built once; constant-time membership test
for flightno in sorted(os.listdir(path=path_passengers)):   # sorted -> reproducible log order
    if 'FLIGHT_' + flightno[0:5] not in known_codes:
        log_line = f"No flight data found for flight {flightno[0:5]}."
        log.append({'Note' : log_line})

display(log)

### Build worksheet <em>pax_per_flight</em>

In [None]:
# Total number of passengers per flight (column 'pax', indexed by FlightCode)
df_count = df_pax.groupby('FlightCode')['PRM'].count().to_frame('pax')

# Number of passengers needing assistance per flight, together with the flight details
flight_keys = ['FlightCode', 'sched_datetime', 'Airline', 'A/D', 'Airport']
df_prm = df_pax.groupby(flight_keys, as_index=False)['PRM'].sum()

# Combine both counts; the flight code itself is not part of the worksheet
df_ppf = pd.merge(df_prm, df_count, on='FlightCode').drop(columns='FlightCode')
display(df_ppf.head())
df_ppf.dtypes

### Complete IATA data

In [None]:
# Get additional IATA data: scrape the airport code table from the web page
# configured in url_iata (needs network access)
df_iata = iata_codes_from_wikipedia(url_iata)
display(df_iata)

In [None]:
# Add the country of every flight's airport; map_iata yields '#MV' when the lookup fails
df_ppf['Country'] = df_ppf.apply(
    lambda flight: map_iata(flight['Airport'], df_iata),
    axis='columns',
)
display(df_ppf)

### Prep output

In [None]:
# Warnings/errors collected so far -> dataframe (one 'Note' per row)
df_log = pd.DataFrame(log)

# Show the assistance flag as 'Y' / 'N'
df_pax['PRM'] = df_pax['PRM'].map(lambda needs_help: 'Y' if needs_help == True else 'N')

# Order both worksheets chronologically
df_ppf = df_ppf.sort_values(by=['sched_datetime', 'Airline', 'A/D', 'Airport'], ignore_index=True)
df_pax = df_pax.sort_values(by=['sched_datetime', 'FlightCode'], ignore_index=True)

# The workbook is named after the date of the first flight in the sorted schedule
first_flight = df_ppf['sched_datetime'].iloc[0]
file_output = f"{path_output}{first_flight.strftime('%d-%m-%Y')}.xlsx"

for frame in (df_log, df_ppf, df_pax):
    display(frame.head())

In [None]:
# datetime -> string, formatted for the whole column at once instead of
# calling strftime row by row through apply
df_ppf['ScheduledTime'] = df_ppf['sched_datetime'].dt.strftime('%Y/%m/%d %H:%M')
# Keep only the columns of worksheet 'pax_per_flight', in output order
df_ppf = df_ppf[['ScheduledTime', 'Country', 'Airline', 'A/D', 'Airport', 'PRM', 'pax']]

df_pax['ScheduledTime'] = df_pax['sched_datetime'].dt.strftime('%Y/%m/%d %H:%M')
df_pax = df_pax.drop(columns='sched_datetime')

In [None]:
# Preview the first rows of the three worksheets before writing them
for frame in (df_log, df_ppf, df_pax):
    display(frame.head(5))

### Output

In [None]:
# Generate Excel file
# Make sure the output folder exists; writing the workbook fails otherwise
os.makedirs(path_output, exist_ok=True)
# Worksheet order in the workbook: log, flights overview, passenger details
workbook = {sheet_log : df_log, sheet_flights : df_ppf, sheet_pax : df_pax}
save_to_excel(file_output, workbook)