# Data Pre-Processing

### Import and present data

In [1]:
import pandas as pd
import os
from io import StringIO
import re

# Function to read a CSV file, skip comment lines starting with '%', and stopping at lines starting with '#'
def read_csv_with_comments(file_path):
    with open(file_path, 'r') as file:
        lines = file.readlines()

    # Filter out comment lines and stop at the ending line
    data_lines = []
    for line in lines:
        if line.startswith('#'):
            break
        if not line.startswith('%'):
            data_lines.append(line.strip())

    # Check if data_lines is empty
    if len(data_lines) == 0:
        return None  # Return None if the file has no data

    return data_lines


# Select the data folder and import all the data files
data_folder = 'Data/ROADEF 2009/PoC-scenario'

aircraft_file = data_folder + '/aircraft.csv'
airports_file = data_folder + '/airports.csv'
alt_aircraft_file = data_folder + '/alt_aircraft.csv'
alt_airports_file = data_folder + '/alt_airports.csv'
alt_flights_file = data_folder + '/alt_flights.csv'
config_file = data_folder + '/config.csv'
dist_file = data_folder + '/dist.csv'
flights_file = data_folder + '/flights.csv'
itineraries_file = data_folder + '/itineraries.csv'
positions_file = data_folder + '/position.csv'
rotations_file = data_folder + '/rotations.csv'


# Printing data in this instance

for file in os.listdir(data_folder):
    if file.endswith('.csv'):
        file_path = os.path.join(data_folder, file)
        data_lines = read_csv_with_comments(file_path)
        if data_lines is not None:
            print(file)
            print(data_lines)
            print()
        else:
            # print the file name if it has no data
            print(file)
            print('No data')
            print()


aircraft.csv
['B767#1 B767 Boeing 10/20/160 12250 3000.0 75 50 CDG CDG-10/01/08-06:00-10/01/08-10:00-20', 'B767#3 B767 Boeing 10/20/160 12250 3000.0 75 50 BKK NULL', 'A320#1 A320 Airbus 0/20/160 5700 2000.0 40 40 NCE NCE-10/01/08-17:00-11/01/08-12:00-10', 'A320#2 A320 Airbus 0/20/160 5700 2000.0 40 40 NCE NULL', 'B777#1 B777 Boeing 20/45/310 14000 4000.0 90 60 CDG NULL', 'B777#4 B777 Boeing 20/45/310 14000 4000.0 90 60 LHR CDG-10/01/08-12:00-10/01/08-22:00-80']

alt_flights.csv
['3 10/01/08 60']

alt_airports.csv
No data

alt_aircraft.csv
No data

config.csv
['10/01/08 06:00 10/01/08 23:30', 'F D 0.75 F C 1.08 F I 1.5 B D 0.5 B C 0.75 B I 1 E D 0.33 E C 0.5 E I 0.66', 'F D 1000.0 F C 2000.0 F I 3000.0 B D 650.0 B C 1300.0 B I 2000.0 E D 350.0 E C 650.0 E I 1000.0', 'F D 3000.0 F C 6000.0 F I 9000.0 B D 2000.0 B C 4000.0 B I 6000.0 E D 1000.0 E C 2000.0 E I 3000.0', 'F B D 150.0 F B C 300.0 F B I 450.0 F E D 250.0 F E C 500.0 F E I 750.0 B E D 100.0 B E C 200.0 B E I 300.0', '20000.0 50

## Each file has its own format

### Config file

In [2]:
# Function to convert config file lines to a dictionary with corresponding values
def parse_config(data_lines):
    config_dict = {}
    config_dict['RecoveryPeriod'] = {
        'StartDate': data_lines[0].split()[0],
        'StartTime': data_lines[0].split()[1],
        'EndDate': data_lines[0].split()[2],
        'EndTime': data_lines[0].split()[3]
    }

    def parse_costs(line):
        parts = re.split(r'\s+', line)
        costs = []
        for i in range(0, len(parts), 3):
            costs.append({'Cabin': parts[i], 'Type': parts[i+1], 'Cost': float(parts[i+2])})
        return costs

    config_dict['DelayCosts'] = parse_costs(data_lines[1])
    config_dict['CancellationCostsOutbound'] = parse_costs(data_lines[2])
    config_dict['CancellationCostsInbound'] = parse_costs(data_lines[3])

    def parse_downgrading_costs(line):
        parts = re.split(r'\s+', line)
        costs = []
        for i in range(0, len(parts), 4):
            costs.append({'FromCabin': parts[i], 'ToCabin': parts[i+1], 'Type': parts[i+2], 'Cost': float(parts[i+3])})
        return costs

    config_dict['DowngradingCosts'] = parse_downgrading_costs(data_lines[4])
    config_dict['PenaltyCosts'] = [float(x) for x in re.split(r'\s+', data_lines[5])]
    config_dict['Weights'] = [float(x) for x in re.split(r'\s+', data_lines[6])]
    return config_dict

# Read the config file and parse the data
config_lines = read_csv_with_comments(config_file)
if config_lines:
    config_dict = parse_config(config_lines)
    print(config_dict)


{'RecoveryPeriod': {'StartDate': '10/01/08', 'StartTime': '06:00', 'EndDate': '10/01/08', 'EndTime': '23:30'}, 'DelayCosts': [{'Cabin': 'F', 'Type': 'D', 'Cost': 0.75}, {'Cabin': 'F', 'Type': 'C', 'Cost': 1.08}, {'Cabin': 'F', 'Type': 'I', 'Cost': 1.5}, {'Cabin': 'B', 'Type': 'D', 'Cost': 0.5}, {'Cabin': 'B', 'Type': 'C', 'Cost': 0.75}, {'Cabin': 'B', 'Type': 'I', 'Cost': 1.0}, {'Cabin': 'E', 'Type': 'D', 'Cost': 0.33}, {'Cabin': 'E', 'Type': 'C', 'Cost': 0.5}, {'Cabin': 'E', 'Type': 'I', 'Cost': 0.66}], 'CancellationCostsOutbound': [{'Cabin': 'F', 'Type': 'D', 'Cost': 1000.0}, {'Cabin': 'F', 'Type': 'C', 'Cost': 2000.0}, {'Cabin': 'F', 'Type': 'I', 'Cost': 3000.0}, {'Cabin': 'B', 'Type': 'D', 'Cost': 650.0}, {'Cabin': 'B', 'Type': 'C', 'Cost': 1300.0}, {'Cabin': 'B', 'Type': 'I', 'Cost': 2000.0}, {'Cabin': 'E', 'Type': 'D', 'Cost': 350.0}, {'Cabin': 'E', 'Type': 'C', 'Cost': 650.0}, {'Cabin': 'E', 'Type': 'I', 'Cost': 1000.0}], 'CancellationCostsInbound': [{'Cabin': 'F', 'Type': 'D', 

### Airports file

In [3]:
def parse_airports(data_lines):
    airports_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        airport = parts[0]
        capacities = []
        for i in range(1, len(parts), 4):
            capacities.append({
                'Dep/h': int(parts[i]),
                'Arr/h': int(parts[i+1]),
                'StartTime': parts[i+2],
                'EndTime': parts[i+3]
            })
        airports_dict[airport] = capacities
    return airports_dict


airports_lines = read_csv_with_comments(airports_file)
if airports_lines:
    airports_dict = parse_airports(airports_lines)
    print(airports_dict)




{'BKK': [{'Dep/h': 1, 'Arr/h': 1, 'StartTime': '00:00', 'EndTime': '03:00'}, {'Dep/h': 2, 'Arr/h': 2, 'StartTime': '03:00', 'EndTime': '05:00'}, {'Dep/h': 4, 'Arr/h': 4, 'StartTime': '05:00', 'EndTime': '00:00'}], 'CDG': [{'Dep/h': 1, 'Arr/h': 2, 'StartTime': '00:00', 'EndTime': '05:00'}, {'Dep/h': 4, 'Arr/h': 4, 'StartTime': '05:00', 'EndTime': '21:00'}, {'Dep/h': 2, 'Arr/h': 2, 'StartTime': '21:00', 'EndTime': '00:00'}], 'JFK': [{'Dep/h': 1, 'Arr/h': 2, 'StartTime': '00:00', 'EndTime': '04:00'}, {'Dep/h': 3, 'Arr/h': 3, 'StartTime': '04:00', 'EndTime': '00:00'}], 'LHR': [{'Dep/h': 3, 'Arr/h': 4, 'StartTime': '00:00', 'EndTime': '05:00'}, {'Dep/h': 6, 'Arr/h': 6, 'StartTime': '05:00', 'EndTime': '23:00'}, {'Dep/h': 3, 'Arr/h': 4, 'StartTime': '23:00', 'EndTime': '00:00'}], 'NCE': [{'Dep/h': 0, 'Arr/h': 0, 'StartTime': '00:00', 'EndTime': '04:00'}, {'Dep/h': 1, 'Arr/h': 2, 'StartTime': '04:00', 'EndTime': '06:00'}, {'Dep/h': 4, 'Arr/h': 4, 'StartTime': '06:00', 'EndTime': '22:00'}, {'D

### Distance file

In [4]:

def parse_dist(data_lines):
    dist_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        origin = parts[0]
        destination = parts[1]
        dist = int(parts[2])
        type_ = parts[3]
        dist_dict[(origin, destination)] = {'Dist': dist, 'Type': type_}
    return dist_dict



dist_lines = read_csv_with_comments(dist_file)
if dist_lines:
    dist_dict = parse_dist(dist_lines)
    print(dist_dict)


{('BKK', 'CDG'): {'Dist': 739, 'Type': 'I'}, ('BKK', 'JFK'): {'Dist': 1077, 'Type': 'I'}, ('BKK', 'LHR'): {'Dist': 749, 'Type': 'I'}, ('BKK', 'NCE'): {'Dist': 742, 'Type': 'I'}, ('BKK', 'SYD'): {'Dist': 594, 'Type': 'I'}, ('BKK', 'ORY'): {'Dist': 741, 'Type': 'I'}, ('CDG', 'BKK'): {'Dist': 739, 'Type': 'I'}, ('CDG', 'JFK'): {'Dist': 468, 'Type': 'I'}, ('CDG', 'LHR'): {'Dist': 55, 'Type': 'C'}, ('CDG', 'NCE'): {'Dist': 77, 'Type': 'D'}, ('CDG', 'SYD'): {'Dist': 1302, 'Type': 'I'}, ('CDG', 'ORY'): {'Dist': 34, 'Type': 'P'}, ('JFK', 'BKK'): {'Dist': 1077, 'Type': 'I'}, ('JFK', 'CDG'): {'Dist': 468, 'Type': 'I'}, ('JFK', 'LHR'): {'Dist': 446, 'Type': 'I'}, ('JFK', 'NCE'): {'Dist': 495, 'Type': 'I'}, ('JFK', 'SYD'): {'Dist': 1232, 'Type': 'I'}, ('JFK', 'ORY'): {'Dist': 468, 'Type': 'I'}, ('LHR', 'BKK'): {'Dist': 749, 'Type': 'I'}, ('LHR', 'CDG'): {'Dist': 55, 'Type': 'C'}, ('LHR', 'JFK'): {'Dist': 446, 'Type': 'I'}, ('LHR', 'NCE'): {'Dist': 100, 'Type': 'C'}, ('LHR', 'SYD'): {'Dist': 1308, 

### Flights file

In [5]:



def parse_flights(data_lines):
    flights_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        flight = int(parts[0])
        flights_dict[flight] = {
            'Orig': parts[1],
            'Dest': parts[2],
            'DepTime': parts[3],
            'ArrTime': parts[4],
            'PrevFlight': int(parts[5])
        }
    return flights_dict


flights_lines = read_csv_with_comments(flights_file)
if flights_lines:
    flights_dict = parse_flights(flights_lines)
    print(flights_dict)



{1: {'Orig': 'BKK', 'Dest': 'CDG', 'DepTime': '18:00', 'ArrTime': '06:15+1', 'PrevFlight': 0}, 2: {'Orig': 'NCE', 'Dest': 'CDG', 'DepTime': '08:00', 'ArrTime': '09:30', 'PrevFlight': 0}, 3: {'Orig': 'CDG', 'Dest': 'LHR', 'DepTime': '10:20', 'ArrTime': '11:20', 'PrevFlight': 0}, 4: {'Orig': 'CDG', 'Dest': 'LHR', 'DepTime': '08:30', 'ArrTime': '09:30', 'PrevFlight': 0}, 5: {'Orig': 'LHR', 'Dest': 'JFK', 'DepTime': '12:00', 'ArrTime': '19:30', 'PrevFlight': 0}, 6: {'Orig': 'NCE', 'Dest': 'LHR', 'DepTime': '09:30', 'ArrTime': '11:15', 'PrevFlight': 0}, 7: {'Orig': 'CDG', 'Dest': 'BKK', 'DepTime': '11:30', 'ArrTime': '21:45', 'PrevFlight': 0}, 8: {'Orig': 'LHR', 'Dest': 'CDG', 'DepTime': '12:30', 'ArrTime': '13:30', 'PrevFlight': 0}, 9: {'Orig': 'CDG', 'Dest': 'NCE', 'DepTime': '14:35', 'ArrTime': '16:05', 'PrevFlight': 0}, 10: {'Orig': 'LHR', 'Dest': 'NCE', 'DepTime': '13:10', 'ArrTime': '14:55', 'PrevFlight': 0}, 11: {'Orig': 'JFK', 'Dest': 'LHR', 'DepTime': '22:30', 'ArrTime': '06:00+1',

### Aircraft file

In [6]:


def parse_aircraft(data_lines):
    aircraft_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        aircraft = parts[0]
        aircraft_dict[aircraft] = {
            'Model': parts[1],
            'Family': parts[2],
            'Config': parts[3],
            'Dist': int(parts[4]),
            'Cost/h': float(parts[5]),
            'TurnRound': int(parts[6]),
            'Transit': int(parts[7]),
            'Orig': parts[8],
            'Maint': parts[9] if len(parts) > 9 else None
        }
    return aircraft_dict

aircraft_lines = read_csv_with_comments(aircraft_file)
if aircraft_lines:
    aircraft_dict = parse_aircraft(aircraft_lines)
    print(aircraft_dict)

    


{'B767#1': {'Model': 'B767', 'Family': 'Boeing', 'Config': '10/20/160', 'Dist': 12250, 'Cost/h': 3000.0, 'TurnRound': 75, 'Transit': 50, 'Orig': 'CDG', 'Maint': 'CDG-10/01/08-06:00-10/01/08-10:00-20'}, 'B767#3': {'Model': 'B767', 'Family': 'Boeing', 'Config': '10/20/160', 'Dist': 12250, 'Cost/h': 3000.0, 'TurnRound': 75, 'Transit': 50, 'Orig': 'BKK', 'Maint': 'NULL'}, 'A320#1': {'Model': 'A320', 'Family': 'Airbus', 'Config': '0/20/160', 'Dist': 5700, 'Cost/h': 2000.0, 'TurnRound': 40, 'Transit': 40, 'Orig': 'NCE', 'Maint': 'NCE-10/01/08-17:00-11/01/08-12:00-10'}, 'A320#2': {'Model': 'A320', 'Family': 'Airbus', 'Config': '0/20/160', 'Dist': 5700, 'Cost/h': 2000.0, 'TurnRound': 40, 'Transit': 40, 'Orig': 'NCE', 'Maint': 'NULL'}, 'B777#1': {'Model': 'B777', 'Family': 'Boeing', 'Config': '20/45/310', 'Dist': 14000, 'Cost/h': 4000.0, 'TurnRound': 90, 'Transit': 60, 'Orig': 'CDG', 'Maint': 'NULL'}, 'B777#4': {'Model': 'B777', 'Family': 'Boeing', 'Config': '20/45/310', 'Dist': 14000, 'Cost/h'

### Rotations file

In [7]:

def parse_rotations(data_lines):
    rotations_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        flight = int(parts[0])
        rotations_dict[flight] = {
            'DepDate': parts[1],
            'Aircraft': parts[2]
        }
    return rotations_dict

rotations_lines = read_csv_with_comments(rotations_file)
if rotations_lines:
    rotations_dict = parse_rotations(rotations_lines)
    print(rotations_dict)



{1: {'DepDate': '10/01/08', 'Aircraft': 'B767#3'}, 7: {'DepDate': '10/01/08', 'Aircraft': 'B767#1'}, 2: {'DepDate': '10/01/08', 'Aircraft': 'A320#1'}, 3: {'DepDate': '10/01/08', 'Aircraft': 'A320#1'}, 8: {'DepDate': '10/01/08', 'Aircraft': 'A320#1'}, 9: {'DepDate': '10/01/08', 'Aircraft': 'A320#1'}, 4: {'DepDate': '10/01/08', 'Aircraft': 'B777#1'}, 5: {'DepDate': '10/01/08', 'Aircraft': 'B777#1'}, 11: {'DepDate': '10/01/08', 'Aircraft': 'B777#1'}, 12: {'DepDate': '10/01/08', 'Aircraft': 'B777#4'}, 6: {'DepDate': '10/01/08', 'Aircraft': 'A320#2'}, 10: {'DepDate': '10/01/08', 'Aircraft': 'A320#2'}}


### Itineraries file

In [8]:

def parse_itineraries(data_lines):
    itineraries_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        ident = int(parts[0])
        itineraries_dict[ident] = {
            'Type': parts[1],
            'Price': float(parts[2]),
            'Count': int(parts[3]),
            'Flights': parts[4:]
        }
    return itineraries_dict

itineraries_lines = read_csv_with_comments(itineraries_file)
if itineraries_lines:
    itineraries_dict = parse_itineraries(itineraries_lines)
    print(itineraries_dict)



{1: {'Type': 'A', 'Price': 2800.0, 'Count': 5, 'Flights': ['1', '10/01/08', 'F']}, 2: {'Type': 'A', 'Price': 2200.5, 'Count': 7, 'Flights': ['1', '10/01/08', 'B']}, 3: {'Type': 'A', 'Price': 1000.2, 'Count': 75, 'Flights': ['1', '10/01/08', 'E']}, 4: {'Type': 'R', 'Price': 2200.5, 'Count': 9, 'Flights': ['1', '10/01/08', 'B']}, 5: {'Type': 'R', 'Price': 1000.2, 'Count': 60, 'Flights': ['1', '10/01/08', 'E']}, 6: {'Type': 'A', 'Price': 400.25, 'Count': 30, 'Flights': ['2', '10/01/08', 'E']}, 7: {'Type': 'R', 'Price': 400.25, 'Count': 20, 'Flights': ['2', '10/01/08', 'E']}, 8: {'Type': 'A', 'Price': 350.0, 'Count': 30, 'Flights': ['3', '10/01/08', 'E']}, 9: {'Type': 'R', 'Price': 1200.17, 'Count': 1, 'Flights': ['3', '10/01/08', 'B']}, 10: {'Type': 'R', 'Price': 350.4, 'Count': 30, 'Flights': ['3', '10/01/08', 'E']}, 11: {'Type': 'A', 'Price': 1700.0, 'Count': 3, 'Flights': ['2', '10/01/08', 'B', '3', '10/01/08', 'B']}, 12: {'Type': 'A', 'Price': 900.18, 'Count': 20, 'Flights': ['2', '10

### Positions file

In [9]:

def parse_positions(data_lines):
    positions_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        airport = parts[0]
        model = parts[1]
        config = parts[2]
        count = int(parts[3])
        if airport not in positions_dict:
            positions_dict[airport] = []
        positions_dict[airport].append({'Model': model, 'Config': config, 'Count': count})
    return positions_dict


positions_lines = read_csv_with_comments(positions_file)
if positions_lines:
    positions_dict = parse_positions(positions_lines)
    print(positions_dict)


    


{'CDG': [{'Model': 'B777', 'Config': '20/45/310', 'Count': 1}], 'LHR': [{'Model': 'B777', 'Config': '20/45/310', 'Count': 1}], 'NCE': [{'Model': 'A320', 'Config': '0/20/160', 'Count': 2}], 'BKK': [{'Model': 'B767', 'Config': '10/20/160', 'Count': 1}]}


### Flights disruptions file

In [10]:

def parse_alt_flights(data_lines):
    alt_flights_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        flight = int(parts[0])
        alt_flights_dict[flight] = {
            'DepDate': parts[1],
            'Delay': int(parts[2])
        }
    return alt_flights_dict

alt_flights_lines = read_csv_with_comments(alt_flights_file)
if alt_flights_lines:
    alt_flights_dict = parse_alt_flights(alt_flights_lines)
    print(alt_flights_dict)




{3: {'DepDate': '10/01/08', 'Delay': 60}}


### Aircraft disruptions file

In [11]:


def parse_alt_aircraft(data_lines):
    alt_aircraft_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        aircraft = parts[0]
        alt_aircraft_dict[aircraft] = {
            'StartDate': parts[1],
            'StartTime': parts[2],
            'EndDate': parts[3],
            'EndTime': parts[4]
        }
    return alt_aircraft_dict

alt_aircraft_lines = read_csv_with_comments(alt_aircraft_file)
if alt_aircraft_lines:
    alt_aircraft_dict = parse_alt_aircraft(alt_aircraft_lines)
    print(alt_aircraft_dict)



### Airport disruptions file

In [12]:


def parse_alt_airports(data_lines):
    alt_airports_dict = {}
    for line in data_lines:
        parts = re.split(r'\s+', line)
        airport = parts[0]
        alt_airports_dict[airport] = {
            'StartDate': parts[1],
            'StartTime': parts[2],
            'EndDate': parts[3],
            'EndTime': parts[4],
            'Dep/h': int(parts[5]),
            'Arr/h': int(parts[6])
        }
    return alt_airports_dict


alt_airports_lines = read_csv_with_comments(alt_airports_file)
if alt_airports_lines:
    alt_airports_dict = parse_alt_airports(alt_airports_lines)
    print(alt_airports_dict)
