# Comparing train and plane travel times

## Import libraries

In [1]:
import pandas as pd
from fuzzywuzzy import fuzz

## Import train times

In [2]:
trains = pd.read_csv('./Rail_transport_performance_in_Europe_2021.csv')
trains

Unnamed: 0,country_larger_city,HDC_code_larger,larger_city,pop_larger_city,country_smaller_city,HDC_code_smaller,smaller_city,pop_smaller_city,crossborder,distance_centroids,av_dist_opti,time_opti,speed_opti,av_dist_avt,time_avt,speed_avt
0,FR,GEOSTAT11_450,Paris,8900172,BE,GEOSTAT11_349,Bruxelles / Brussel,1233487,True,more than 120 km,260.8,1.44,182.1,261.6,1.71,153.5
1,FR,GEOSTAT11_450,Paris,8900172,FR,GEOSTAT11_361,Lille,911478,False,more than 120 km,202.0,1.00,202.1,201.9,1.29,157.3
2,FR,GEOSTAT11_450,Paris,8900172,FR,GEOSTAT11_418,Rouen,298297,False,114.6,110.0,1.24,88.7,110.0,1.67,65.9
3,FR,GEOSTAT11_450,Paris,8900172,FR,GEOSTAT11_434,Cergy-Pontoise,273138,False,30.6,4.8,0.07,67.9,4.8,0.17,27.9
4,FR,GEOSTAT11_450,Paris,8900172,FR,GEOSTAT11_499,Tours,211915,False,more than 120 km,187.7,0.86,218.2,200.4,1.54,130.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6594,HU,GEOSTAT11_506,Szolnok,51457,HU,GEOSTAT11_500,Érd,51285,False,99.6,,,,,,
6595,HU,GEOSTAT11_500,Érd,51285,HU,GEOSTAT11_516,Veszprém,50141,False,82.2,,,,,,
6596,IT,GEOSTAT11_770,Bitonto,50961,IT,GEOSTAT11_734,Manfredonia,50684,False,86.8,,,,,,
6597,IT,GEOSTAT11_770,Bitonto,50961,IT,GEOSTAT11_761,Cerignola,50074,False,68.5,,,,,,


## Import plane times

In [3]:
flights = pd.read_csv('/Users/slavicagjorgieva/Desktop/TU/WS23/DOPP/Ex2/flight_times.csv')
flights

Unnamed: 0.1,Unnamed: 0,route,time
0,0,"{'AAL', 'AMS'}",85.0
1,1,"{'AAQ', 'LED'}",170.0
2,2,"{'AMS', 'ABZ'}",85.0
3,3,"{'ABZ', 'LGW'}",95.0
4,4,"{'ACE', 'AGP'}",130.0
...,...,...,...
1464,1464,"{'ZRH', 'OTP'}",150.0
1465,1465,"{'ZRH', 'PRN'}",127.5
1466,1466,"{'ZRH', 'RVN'}",205.0
1467,1467,"{'ZRH', 'STR'}",45.0


In [4]:
flights.drop(flights.columns[0], axis=1, inplace=True)
flights

Unnamed: 0,route,time
0,"{'AAL', 'AMS'}",85.0
1,"{'AAQ', 'LED'}",170.0
2,"{'AMS', 'ABZ'}",85.0
3,"{'ABZ', 'LGW'}",95.0
4,"{'ACE', 'AGP'}",130.0
...,...,...
1464,"{'ZRH', 'OTP'}",150.0
1465,"{'ZRH', 'PRN'}",127.5
1466,"{'ZRH', 'RVN'}",205.0
1467,"{'ZRH', 'STR'}",45.0


In [5]:
flights['departure airport'] = flights['route'].apply(lambda x: x.split(',')[0].strip("{'"))
flights['arrival airport'] = flights['route'].apply(lambda x: x.split(',')[1].strip(" }'"))
flights

Unnamed: 0,route,time,departure airport,arrival airport
0,"{'AAL', 'AMS'}",85.0,AAL,AMS
1,"{'AAQ', 'LED'}",170.0,AAQ,LED
2,"{'AMS', 'ABZ'}",85.0,AMS,ABZ
3,"{'ABZ', 'LGW'}",95.0,ABZ,LGW
4,"{'ACE', 'AGP'}",130.0,ACE,AGP
...,...,...,...,...
1464,"{'ZRH', 'OTP'}",150.0,ZRH,OTP
1465,"{'ZRH', 'PRN'}",127.5,ZRH,PRN
1466,"{'ZRH', 'RVN'}",205.0,ZRH,RVN
1467,"{'ZRH', 'STR'}",45.0,ZRH,STR


## Import airport codes

In [8]:
# Check the delimiter in the 'trythis.csv' file
delimiter = ';'  # Make sure this delimiter matches the one in your CSV file

# Read the 'trythis.csv' file with the correct delimiter
airports = pd.read_csv('/Users/slavicagjorgieva/Desktop/TU/WS23/DOPP/Ex2/trythis.csv', encoding=encoding, delimiter=delimiter)

# Print the first few rows to verify the data
print(airports.head())

# Now select the columns 'IATA airport code' and 'Airport'
cities = airports[['IATA airport code', 'Airport']]

# Print the 'cities' DataFrame
print(cities.head())

      Airport  Country IATA airport code ICAO airport code
0      Tirana  Albania               TIA              LATI
1     Yerevan  Armenia               EVN              UDYZ
2        Graz  Austria               GRZ              LOWG
3   Innsbruck  Austria               INN              LOWI
4  Klagenfurt  Austria               KLU              LOWK
  IATA airport code     Airport
0               TIA      Tirana
1               EVN     Yerevan
2               GRZ        Graz
3               INN   Innsbruck
4               KLU  Klagenfurt


In [9]:
cities = airports[['IATA airport code', 'Airport']]
cities

Unnamed: 0,IATA airport code,Airport
0,TIA,Tirana
1,EVN,Yerevan
2,GRZ,Graz
3,INN,Innsbruck
4,KLU,Klagenfurt
...,...,...
259,SEN,London
260,STN,London
261,MAN,Manchester
262,NCL,Newcastle


In [10]:
# add departure city names
flights = pd.merge(flights, cities, left_on='departure airport', right_on='IATA airport code', how='left')
# remove the redundant column
flights.drop('IATA airport code', axis=1, inplace=True)
# rename the column
flights.rename(columns={'Airport': 'departure city'}, inplace=True)
# add arrival city names
flights = pd.merge(flights, cities, left_on='arrival airport', right_on='IATA airport code', how='left')
# remove the redundant column
flights.drop('IATA airport code', axis=1, inplace=True)
# rename the column
flights.rename(columns={'Airport': 'arrival city'}, inplace=True)
flights

Unnamed: 0,route,time,departure airport,arrival airport,departure city,arrival city
0,"{'AAL', 'AMS'}",85.0,AAL,AMS,Aalborg,Amsterdam
1,"{'AAQ', 'LED'}",170.0,AAQ,LED,,Saint
2,"{'AMS', 'ABZ'}",85.0,AMS,ABZ,Amsterdam,Aberdeen
3,"{'ABZ', 'LGW'}",95.0,ABZ,LGW,Aberdeen,London
4,"{'ACE', 'AGP'}",130.0,ACE,AGP,Lanzarote,Malaga
...,...,...,...,...,...,...
1464,"{'ZRH', 'OTP'}",150.0,ZRH,OTP,Zurich,Bucharest
1465,"{'ZRH', 'PRN'}",127.5,ZRH,PRN,Zurich,Pristina
1466,"{'ZRH', 'RVN'}",205.0,ZRH,RVN,Zurich,Rovaniemi
1467,"{'ZRH', 'STR'}",45.0,ZRH,STR,Zurich,Stuttgart


Does the 'municipality' variable refer to the actual administrative division that the airport is located in, or the big city it's next to? Let's check.

In [11]:
flights[flights['departure city'].str.contains('London')]

ValueError: Cannot mask with non-boolean array containing NA / NaN values

In [12]:
flights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1469 entries, 0 to 1468
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   route              1469 non-null   object 
 1   time               1469 non-null   float64
 2   departure airport  1469 non-null   object 
 3   arrival airport    1469 non-null   object 
 4   departure city     1333 non-null   object 
 5   arrival city       1366 non-null   object 
dtypes: float64(1), object(5)
memory usage: 80.3+ KB


Now we need to check that the train and plane datasets are using the same city names. We'll do this by checking that the city names in the plane dataset are a subset of the city names in the train dataset.

In [13]:
train_cities = set(trains['larger_city'].unique()) | set(trains['smaller_city'].unique())
print(f'Number of cities in the trains dataset: {len(train_cities)}')
train_cities

Number of cities in the trains dataset: 799


{"'s-Hertogenbosch",
 'A Coruña',
 'Aachen',
 'Aalborg',
 'Aberdeen',
 'Acerra',
 'Acireale',
 'Aix-en-Provence',
 'Albacete',
 'Alcalá de Guadaíra',
 'Alcalá de Henares',
 'Alcoi/Alcoy',
 'Aldwick / Felpham',
 'Alessandria',
 'Algeciras',
 'Alicante/Alacant',
 'Alkmaar',
 'Almada',
 'Almelo',
 'Almere',
 'Almería',
 'Alphen aan den Rijn',
 'Altamura',
 'Amersfoort',
 'Amiens',
 'Amora',
 'Amsterdam',
 'Ancona',
 'Andria',
 'Angers',
 'Annecy',
 'Antibes',
 'Antwerpen',
 'Anzio / Nettuno',
 'Apeldoorn',
 'Arad',
 'Arcozelo / Espinho',
 'Arezzo',
 'Arnhem',
 'Arras',
 'Ashford',
 'Assen',
 'Asti',
 'Augsburg',
 'Avellino',
 'Avignon',
 'Avilés',
 'Aylesbury',
 'Ayr / Prestwick',
 'Bacău',
 'Bad Soden am Taunus / Kelkheim',
 'Badajoz',
 'Bagheria',
 'Baia Mare',
 'Bamberg',
 'Bangor',
 'Banská Bystrica',
 'Barcelona',
 'Bari',
 'Barletta',
 'Barnsley',
 'Barreiro',
 'Basel',
 'Basildon',
 'Basingstoke and Deane',
 'Bath',
 'Battipaglia',
 'Bayonne',
 'Bayreuth',
 'Bedford',
 'Belchatow',

In [14]:
departure_cities = flights['departure city'].unique()
arrival_cities = flights['arrival city'].unique()

airport_cities = set(departure_cities) | set(arrival_cities)
print(f'Number of cities in the flights dataset: {len(airport_cities)}')
airport_cities

Number of cities in the flights dataset: 191


{'Aalborg',
 'Aarhus',
 'Aberdeen',
 'Ajaccio',
 'Alghero',
 'Alicante',
 'Allg\x8au',
 'Almeria',
 'Amsterdam',
 'Ancona',
 'Asturias',
 'Athens',
 'Barcelona',
 'Bari',
 'Basel',
 'Bastia',
 'Belfast',
 'Belgrade',
 'Bergamo',
 'Bergen',
 'Bilbao',
 'Billund',
 'Birmingham',
 'Bod¿',
 'Bologna',
 'Bordeaux',
 'Bratislava',
 'Bremen',
 'Brindisi',
 'Bristol',
 'Brussels',
 'Bucharest',
 'Budapest',
 'Burgas',
 'Cagliari',
 'Cardiff',
 'Catania',
 'Chania',
 'Charleroi',
 'Chisinau',
 'Cluj-Napoca',
 'Cologne',
 'Copenhagen',
 'Corfu',
 'Debrecen',
 'Dortmund',
 'Dresden',
 'Dubrovnik',
 'D\x9fsseldorf',
 'East',
 'Edinburgh',
 'Eindhoven',
 'Faro',
 'Figari',
 'Florence',
 'Frankfurt',
 'Frankfurt-Hahn',
 'Fuerteventura',
 'Gdansk',
 'Geneva',
 'Genoa',
 'Girona',
 'Glasgow',
 'Gothenburg',
 'Granada-Ja\x8en',
 'Graz',
 'Hamburg',
 'Hanover',
 'Haugesund',
 'Helsinki',
 'Iasi',
 'Ibiza',
 'Innsbruck',
 'Jerez',
 'Jersey',
 'Karlsruhe',
 'Katowice',
 'Kaunas',
 'Kittil\x8a',
 'Klagenfu

Which cities are in the airport dataset but not the train dataset?

In [15]:
len(airport_cities - train_cities)

118

211 out of 299 is a lot...

In [16]:
airport_cities - train_cities

{'Aarhus',
 'Ajaccio',
 'Alghero',
 'Alicante',
 'Allg\x8au',
 'Almeria',
 'Asturias',
 'Athens',
 'Bastia',
 'Belgrade',
 'Bergen',
 'Billund',
 'Bod¿',
 'Bordeaux',
 'Brussels',
 'Bucharest',
 'Cagliari',
 'Chania',
 'Chisinau',
 'Cologne',
 'Copenhagen',
 'Corfu',
 'Dortmund',
 'Dubrovnik',
 'D\x9fsseldorf',
 'East',
 'Faro',
 'Figari',
 'Florence',
 'Frankfurt',
 'Frankfurt-Hahn',
 'Fuerteventura',
 'Gdansk',
 'Geneva',
 'Genoa',
 'Gothenburg',
 'Granada-Ja\x8en',
 'Hanover',
 'Haugesund',
 'Iasi',
 'Ibiza',
 'Jerez',
 'Jersey',
 'Katowice',
 'Kittil\x8a',
 'Ko_ice',
 'Kos',
 'Krak\x97w',
 'Krasnodar',
 'Kristiansand',
 'La',
 'Lamezia',
 'Lanzarote',
 'Leeds',
 'Lesvos',
 'Liege',
 'Lisbon',
 'Lyon-Saint',
 'Madeira',
 'Malaga',
 'Malm\x9a',
 'Malta',
 'Manchester',
 'Menorca',
 'Milan',
 'Minsk',
 'Moscow',
 'Munich',
 'Mykonos',
 'M\x9fnster',
 'Naples',
 'Newcastle',
 'Ni_',
 'Nuremberg',
 'Olbia',
 'Oulu',
 'Podgorica',
 'Ponta',
 'Poznan',
 'Prague',
 'Pristina',
 'Pula',
 'R

In [31]:
'Genève' in train_cities


True

In [44]:
matched_train_cities = []


def find_best_match(train_city):
    best_match = None
    highest_score = 0
    
    # Iterate through the airport cities and calculate the fuzzy similarity score
    for airport_city in airports['municipality']:
        score = fuzz.token_sort_ratio(train_city, airport_city)
        
        # If the score is higher than the current highest score, update the best match
        if score > highest_score:
            best_match = airport_city
            highest_score = score
    
    return best_match

# Iterate through the train cities and find the best match for each
for train_city in train_cities:
    best_match = find_best_match(train_city)
    matched_train_cities.append((train_city, best_match))

# Create a DataFrame to store the matched cities
matched_cities_df = pd.DataFrame(matched_train_cities, columns=['Train City', 'Matched Airport City'])

# Display the matched cities
print(matched_cities_df.head())

      Train City Matched Airport City
0   Stalowa Wola         Stalowa Wola
1     Winterthur           Winterthur
2     Darlington           Darlington
3        Manresa              Manresa
4  Kidderminster            Ilminster


In [46]:
print(matched_cities_df)


        Train City Matched Airport City
0     Stalowa Wola         Stalowa Wola
1       Winterthur           Winterthur
2       Darlington           Darlington
3          Manresa              Manresa
4    Kidderminster            Ilminster
..             ...                  ...
794      Kecskemét           KecskemÃ©t
795       Enschede             Enschede
796   Herzogenrath       Herzogenaurach
797    Delmenhorst             Elmhurst
798         Monaco               Monaco

[799 rows x 2 columns]
