# Importing Libraries

In [49]:
import pandas as pd

In [50]:
import plotly.express as px
import plotly.graph_objects as go

# Preprocessing Data

#### Dataset Choice:

    Chosen Datasets: We worked with datasets containing information on flight times, train times, and IATA airport codes, which we created using APIs.
    Why?: There was no accessible dataset that contained all the information we needed, so we had to fetch it ourselves. These datasets were fundamental for comparing travel times between different modes of transportation and for analyzing the geographical distribution based on airports.

#### Data Cleaning/Transformation & Handling missing values:

    Actions Taken: We merged the datasets on city pairs, converted time units (from seconds to minutes), handled NaN values, and renamed columns for clarity. There were very few missing values, and we decided to exclude them since they were not significant compared to the amount of data we had. We focused on finding reliable API sources to avoid such issues from the beginning. We also removed the few duplicate rows.
    Why?: These steps were necessary to make the data consistent and suitable for comparison and analysis, ensuring accuracy in our findings.


In [54]:
# Load the flight routes table; the bare name on the last line renders it as the cell output.
flights = pd.read_csv('./datasets/final_flight_routes.csv')
flights

Unnamed: 0,route,time,departure airport,arrival airport,departure city,arrival city
0,"{'AAL', 'AMS'}",85.0,AAL,AMS,Aalborg,Amsterdam
1,"{'AAQ', 'LED'}",170.0,AAQ,LED,Novorossiysk,Saint Petersburg
2,"{'AMS', 'ABZ'}",85.0,AMS,ABZ,Amsterdam,Aberdeen
3,"{'ABZ', 'LGW'}",95.0,ABZ,LGW,Aberdeen,London
4,"{'ACE', 'AGP'}",130.0,ACE,AGP,Arrecife,Málaga
...,...,...,...,...,...,...
1333,"{'ZRH', 'OTP'}",150.0,ZRH,OTP,Zürich,Bucharest
1334,"{'ZRH', 'PRN'}",127.5,ZRH,PRN,Zürich,Pristina
1335,"{'ZRH', 'RVN'}",205.0,ZRH,RVN,Zürich,Rovaniemi
1336,"{'ZRH', 'STR'}",45.0,ZRH,STR,Zürich,Stuttgart


In [56]:
# Load the combined train table (its columns carry transit distance in metres
# and transit time in seconds, per the column names) and display it.
trains =  pd.read_csv('./datasets/train_distances3/combined_train_distances3.csv')
trains

Unnamed: 0,departure city,arrival city,transit_distance [m],transit_time [s]
0,Arrecife,Barcelona,-1,-1
1,Alicante,Barcelona,525005,19080
2,Amsterdam,Barcelona,1627336,44340
3,Athens,Barcelona,-1,-1
4,Málaga,Barcelona,1110453,22380
...,...,...,...,...
1333,Vienna,Santander,-1,-1
1334,Vienna,Tromsø,3970237,197151
1335,Moscow,Groznyy,3147441,126600
1336,Moscow,Samara,1569205,52860


In [57]:
# Report the size of both source tables before they are merged.
print("Number of rows in flights:", len(flights))
print("Number of rows in trains:", len(trains))

Number of rows in flights: 1338
Number of rows in trains: 1338


In [58]:
# Inner-join flights and trains on the city pair. A city pair that occurs more
# than once in both tables is matched in every combination, so the result can
# contain more rows than either input.
city_pair_keys = ['departure city', 'arrival city']
merged_data = flights.merge(trains, on=city_pair_keys)
print("Number of rows in merged data:", len(merged_data))

Number of rows in merged data: 1572


In [59]:
# Display the merged flight/train table.
merged_data

Unnamed: 0,route,time,departure airport,arrival airport,departure city,arrival city,transit_distance [m],transit_time [s]
0,"{'AAL', 'AMS'}",85.0,AAL,AMS,Aalborg,Amsterdam,1014627,43500
1,"{'AAQ', 'LED'}",170.0,AAQ,LED,Novorossiysk,Saint Petersburg,2538536,110220
2,"{'AAQ', 'LED'}",170.0,AAQ,LED,Novorossiysk,Saint Petersburg,2538536,110220
3,"{'GDZ', 'LED'}",180.0,GDZ,LED,Novorossiysk,Saint Petersburg,2538536,110220
4,"{'GDZ', 'LED'}",180.0,GDZ,LED,Novorossiysk,Saint Petersburg,2538536,110220
...,...,...,...,...,...,...,...,...
1567,"{'ZRH', 'OTP'}",150.0,ZRH,OTP,Zürich,Bucharest,1923309,95580
1568,"{'ZRH', 'PRN'}",127.5,ZRH,PRN,Zürich,Pristina,1622886,254489
1569,"{'ZRH', 'RVN'}",205.0,ZRH,RVN,Zürich,Rovaniemi,-1,-1
1570,"{'ZRH', 'STR'}",45.0,ZRH,STR,Zürich,Stuttgart,418160,15960


## Removing duplicates

In [60]:
# Keep only the first occurrence of every fully identical row, then report the
# remaining row count and the column overview.
cleaned_data = merged_data.drop_duplicates()
print("Number of rows after removing duplicates:", len(cleaned_data))
cleaned_data.info()

Number of rows after removing duplicates: 1338
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1338 entries, 0 to 1571
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   route                 1338 non-null   object 
 1   time                  1338 non-null   float64
 2   departure airport     1338 non-null   object 
 3   arrival airport       1338 non-null   object 
 4   departure city        1338 non-null   object 
 5   arrival city          1338 non-null   object 
 6   transit_distance [m]  1338 non-null   int64  
 7   transit_time [s]      1338 non-null   int64  
dtypes: float64(1), int64(2), object(5)
memory usage: 94.1+ KB


In [61]:
# Summary statistics for the numeric columns of the de-duplicated table.
cleaned_data.describe()

Unnamed: 0,time,transit_distance [m],transit_time [s]
count,1338.0,1338.0,1338.0
mean,127.338254,1356371.0,60274.839312
std,51.661933,1119806.0,54533.294224
min,10.0,-1.0,-1.0
25%,90.0,487764.2,15711.0
50%,120.0,1229288.0,45990.0
75%,160.0,2069596.0,95055.0
max,402.5,8877409.0,301620.0


### Removing -1 (NaN) values for distance & time

In [117]:
# -1 is the placeholder used in the train columns when no value is available.
# Drop every row where EITHER train column holds that placeholder. The same
# fully filtered rows are bound to both names: later cells continue from
# `filtered_data`, while `cleaned_df` is what gets written to disk and shown.
has_train_distance = cleaned_data['transit_distance [m]'] != -1
has_train_time = cleaned_data['transit_time [s]'] != -1
filtered_data = cleaned_data[has_train_distance & has_train_time].copy()
cleaned_df = filtered_data.copy()
cleaned_df.to_csv('datasets/merged_df.csv')
cleaned_df

Unnamed: 0,route,time,departure airport,arrival airport,departure city,arrival city,transit_distance [m],transit_time [s]
0,"{'AAL', 'AMS'}",85.0,AAL,AMS,Aalborg,Amsterdam,1014627,43500
1,"{'AAQ', 'LED'}",170.0,AAQ,LED,Novorossiysk,Saint Petersburg,2538536,110220
3,"{'GDZ', 'LED'}",180.0,GDZ,LED,Novorossiysk,Saint Petersburg,2538536,110220
5,"{'AMS', 'ABZ'}",85.0,AMS,ABZ,Amsterdam,Aberdeen,1435877,46020
6,"{'ABZ', 'LGW'}",95.0,ABZ,LGW,Aberdeen,London,841037,27540
...,...,...,...,...,...,...,...,...
1565,"{'ZRH', 'MUC'}",55.0,ZRH,MUC,Zürich,Munich,331898,12660
1566,"{'ZRH', 'OPO'}",160.0,ZRH,OPO,Zürich,Porto,2960675,191400
1567,"{'ZRH', 'OTP'}",150.0,ZRH,OTP,Zürich,Bucharest,1923309,95580
1568,"{'ZRH', 'PRN'}",127.5,ZRH,PRN,Zürich,Pristina,1622886,254489


In [118]:
# Count real NaN cells across the whole merged table (missing train values are
# stored as -1, so they are not counted here).
total_nans = merged_data.isna().to_numpy().sum()
print(f"Total NaN values in merged_data: {total_nans}")

Total NaN values in merged_data: 0


## Preprocessing Data

In [63]:
# Short analysis names for the flight/train columns used in the rest of the notebook.
column_renames = {
    'time': 'flight_time',
    'transit_time [s]': 'train_time',
    'transit_distance [m]': 'train_distance',
}
filtered_data = filtered_data.rename(columns=column_renames)


In [64]:
# Convert train time from seconds to minutes. Plain column assignment replaces
# the integer column with a new float column; writing the float results through
# `.loc[:, 'train_time']` would instead try to store them in the existing
# integer column.
# NOTE: run this cell once per kernel session - every re-run divides by 60 again.
filtered_data['train_time'] = filtered_data['train_time'] / 60

print(filtered_data.head())

            route  flight_time departure airport arrival airport  \
0  {'AAL', 'AMS'}         85.0               AAL             AMS   
1  {'AAQ', 'LED'}        170.0               AAQ             LED   
3  {'GDZ', 'LED'}        180.0               GDZ             LED   
5  {'AMS', 'ABZ'}         85.0               AMS             ABZ   
6  {'ABZ', 'LGW'}         95.0               ABZ             LGW   

  departure city      arrival city  train_distance  train_time  
0        Aalborg         Amsterdam         1014627       725.0  
1   Novorossiysk  Saint Petersburg         2538536      1837.0  
3   Novorossiysk  Saint Petersburg         2538536      1837.0  
5      Amsterdam          Aberdeen         1435877       767.0  
6       Aberdeen            London          841037       459.0  


### Adding country code for each city (arrival & departure) and their geo-coordinates(longitude & latitude)

In [65]:
# Airport reference table. Loaded relative to the notebook, like the other
# datasets, so the cell does not depend on one user's home directory.
# NOTE(review): assumes fixed-airports-codes.csv sits in ./datasets next to the
# other input files - confirm.
iata_codes_country = pd.read_csv('./datasets/fixed-airports-codes.csv')

In [81]:
# Display the airport reference table.
iata_codes_country

Unnamed: 0,id,ident,type,name,latitude_deg,longitude_deg,elevation_ft,continent,iso_country,iso_region,municipality,scheduled_service,gps_code,iata_code,local_code,home_link,wikipedia_link,keywords
0,6523,00A,heliport,Total RF Heliport,40.070985,-74.933689,11.0,,US,US-PA,Bensalem,no,K00A,,00A,https://www.penndot.pa.gov/TravelInPA/airports...,,
1,323361,00AA,small_airport,Aero B Ranch Airport,38.704022,-101.473911,3435.0,,US,US-KS,Leoti,no,00AA,,00AA,,,
2,6524,00AK,small_airport,Lowell Field,59.947733,-151.692524,450.0,,US,US-AK,Anchor Point,no,00AK,,00AK,,,
3,6525,00AL,small_airport,Epps Airpark,34.864799,-86.770302,820.0,,US,US-AL,Harvest,no,00AL,,00AL,,,
4,506791,00AN,small_airport,Katmai Lodge Airport,59.093287,-156.456699,80.0,,US,US-AK,King Salmon,no,00AN,,00AN,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78148,46378,ZZ-0001,heliport,Sealand Helipad,51.894444,1.482500,40.0,EU,GB,GB-ENG,Sealand,no,,,,http://www.sealandgov.org/,https://en.wikipedia.org/wiki/Principality_of_...,Roughs Tower Helipad
78149,307326,ZZ-0002,small_airport,Glorioso Islands Airstrip,-11.584278,47.296389,11.0,AF,TF,TF-U-A,Grande Glorieuse,no,,,,,,
78150,346788,ZZ-0003,small_airport,Fainting Goat Airport,32.110587,-97.356312,690.0,,US,US-TX,Blum,no,87TX,,87TX,,,
78151,342102,ZZZW,closed,Scandium City Heliport,69.355287,-138.939310,4.0,,CA,CA-YT,(Old) Scandium City,no,,,,,,"ZZZW, ZZZW, ZYW, YK96"


In [82]:
def _attach_airport_info(routes, airports, airport_column, prefix):
    """Left-join coordinates and country of one airport column onto `routes`.

    Parameters
    ----------
    routes : pandas.DataFrame
        Route table that contains `airport_column`.
    airports : pandas.DataFrame
        Airport reference table with the columns 'iata_code', 'latitude_deg',
        'longitude_deg' and 'iso_country'.
    airport_column : str
        Column of `routes` holding the IATA code to look up.
    prefix : str
        Prefix for the added columns, e.g. 'dep' gives 'dep_latitude_deg',
        'dep_longitude_deg' and 'dep_iso_country'.

    Returns
    -------
    pandas.DataFrame
        A new frame: `routes` plus the three prefixed airport columns. Rows
        without a matching IATA code are kept with missing values (left join).
    """
    # Only carry the columns that are actually needed, so nothing has to be
    # dropped again except the join key itself.
    airport_info = airports[['iata_code', 'latitude_deg', 'longitude_deg', 'iso_country']].rename(
        columns={
            'latitude_deg': f'{prefix}_latitude_deg',
            'longitude_deg': f'{prefix}_longitude_deg',
            'iso_country': f'{prefix}_iso_country',
        }
    )
    enriched = routes.merge(airport_info, left_on=airport_column, right_on='iata_code', how='left')
    return enriched.drop(columns='iata_code')


# First the departure airport, then the arrival airport.
merged_departure = _attach_airport_info(filtered_data, iata_codes_country, 'departure airport', 'dep')
merged_data = _attach_airport_info(merged_departure, iata_codes_country, 'arrival airport', 'arr')


print(merged_data.head())

            route  flight_time departure airport arrival airport  \
0  {'AAL', 'AMS'}         85.0               AAL             AMS   
1  {'AAQ', 'LED'}        170.0               AAQ             LED   
2  {'GDZ', 'LED'}        180.0               GDZ             LED   
3  {'AMS', 'ABZ'}         85.0               AMS             ABZ   
4  {'ABZ', 'LGW'}         95.0               ABZ             LGW   

  departure city      arrival city  train_distance  train_time  \
0        Aalborg         Amsterdam         1014627       725.0   
1   Novorossiysk  Saint Petersburg         2538536      1837.0   
2   Novorossiysk  Saint Petersburg         2538536      1837.0   
3      Amsterdam          Aberdeen         1435877       767.0   
4       Aberdeen            London          841037       459.0   

   dep_latitude_deg  dep_longitude_deg dep_iso_country  arr_latitude_deg  \
0         57.094763           9.849930              DK         52.308601   
1         45.002102          37.347301    

In [83]:
# Summary statistics for the numeric columns after adding the airport coordinates.
merged_data.describe()

Unnamed: 0,flight_time,train_distance,train_time,dep_latitude_deg,dep_longitude_deg,arr_latitude_deg,arr_longitude_deg
count,1083.0,1083.0,1083.0,1083.0,1083.0,1083.0,1083.0
mean,122.51285,1675739.0,1241.120191,48.532025,12.304472,48.166043,10.753863
std,46.300586,1006887.0,852.592627,5.828496,13.789245,6.690596,12.933604
min,10.0,83338.0,63.0,36.6749,-9.13592,36.151199,-9.35523
25%,85.0,906189.5,544.0,43.810001,2.11278,42.5728,2.07846
50%,117.5,1536602.0,1166.166667,49.012798,10.887303,48.353802,9.160452
75%,152.5,2269107.0,1731.0,52.165699,19.7206,52.308601,17.9186
max,325.0,8877409.0,5027.0,68.7817,92.492437,69.683296,92.492437


# Statistical Analysis

In [84]:
# Column overview (non-null counts and dtypes) of the table used for the analysis.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1083 entries, 0 to 1082
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   route              1083 non-null   object 
 1   flight_time        1083 non-null   float64
 2   departure airport  1083 non-null   object 
 3   arrival airport    1083 non-null   object 
 4   departure city     1083 non-null   object 
 5   arrival city       1083 non-null   object 
 6   train_distance     1083 non-null   int64  
 7   train_time         1083 non-null   float64
 8   dep_latitude_deg   1083 non-null   float64
 9   dep_longitude_deg  1083 non-null   float64
 10  dep_iso_country    1083 non-null   object 
 11  arr_latitude_deg   1083 non-null   float64
 12  arr_longitude_deg  1083 non-null   float64
 13  arr_iso_country    1083 non-null   object 
dtypes: float64(6), int64(1), object(7)
memory usage: 126.9+ KB


In [85]:
# Mean, median and standard deviation of both travel times, reported in
# minutes and in hours.
flight_minutes = merged_data['flight_time']
train_minutes = merged_data['train_time']

mean_flight_time, median_flight_time, std_flight_time = (
    flight_minutes.mean(), flight_minutes.median(), flight_minutes.std()
)
mean_train_time, median_train_time, std_train_time = (
    train_minutes.mean(), train_minutes.median(), train_minutes.std()
)

flight_stats_min = [mean_flight_time, median_flight_time, std_flight_time]
train_stats_min = [mean_train_time, median_train_time, std_train_time]

summary_data = {
    "Statistic": ["Mean", "Median", "Standard Deviation"],
    "Flight Time (hours)": [value / 60 for value in flight_stats_min],
    "Flight Time (min)": flight_stats_min,
    "Train Time (hours)": [value / 60 for value in train_stats_min],
    "Train Time (min)": train_stats_min,
}

summary_df = pd.DataFrame(summary_data)

print(summary_df)

            Statistic  Flight Time (hours)  Flight Time (min)  \
0                Mean             2.041881         122.512850   
1              Median             1.958333         117.500000   
2  Standard Deviation             0.771676          46.300586   

   Train Time (hours)  Train Time (min)  
0           20.685337       1241.120191  
1           19.436111       1166.166667  
2           14.209877        852.592627  


In [138]:
# Columns summarised below. The hour / airport-overhead columns are derived
# here from 'flight_time' and 'train_time' (both in minutes), so this cell does
# not rely on another cell having already added them to merged_data.
stat_columns = ['flight_time', 'train_time', 'train_time_hours', 'flight_time_plus_90', 'flight_time_hours_with_extra']
stats_input = merged_data.assign(
    train_time_hours=merged_data['train_time'] / 60,
    flight_time_plus_90=merged_data['flight_time'] + 90,
    flight_time_hours_with_extra=(merged_data['flight_time'] + 90) / 60,
)[stat_columns]

summary_stats = {
    "Mean": stats_input.mean(),
    "Median": stats_input.median(),
    "Standard Deviation": stats_input.std(),
    "Minimum": stats_input.min(),
    "Maximum": stats_input.max()
}

# Create a DataFrame from the summary statistics (one row per column, one column per statistic)
summary_df = pd.DataFrame(summary_stats)

# Display the summary statistics DataFrame
print(summary_df)



                                     Mean       Median  Standard Deviation  \
flight_time                    122.512850   117.500000           46.300586   
train_time                    1241.120191  1166.166667          852.592627   
train_time_hours                20.685337    19.436111           14.209877   
flight_time_plus_90            212.512850   207.500000           46.300586   
flight_time_hours_with_extra     3.541881     3.458333            0.771676   

                                 Minimum      Maximum  
flight_time                    10.000000   325.000000  
train_time                     63.000000  5027.000000  
train_time_hours                1.050000    83.783333  
flight_time_plus_90           100.000000   415.000000  
flight_time_hours_with_extra    1.666667     6.916667  


### Country Representation in Dataset

In [119]:
# Pool the departure and arrival country codes into one series, so each route
# contributes both of its endpoints.
all_countries = pd.concat(
    [merged_data[column] for column in ('dep_iso_country', 'arr_iso_country')]
)

# Histogram of how often each country code occurs
country_histogram = px.histogram(all_countries, x=all_countries)
histogram_layout = dict(
    title='Country Representation in Dataset',
    xaxis_title='Country',
    yaxis_title='Count',
)
country_histogram.update_layout(**histogram_layout)


country_histogram.show()

### Plotting in Minutes

In [131]:
# One point per route: train time on the x-axis against flight time on the y-axis.
scatter_fig = px.scatter(
    merged_data,
    x='train_time',
    y='flight_time',
    hover_data=['departure city', 'arrival city'],
)
scatter_fig.update_layout(
    title='Comparison of Train Time vs Flight Time in minutes',
    xaxis_title='Train Time (minutes)',
    yaxis_title='Flight Time (minutes)',
)
scatter_fig.show()

In [90]:
# Side-by-side box plots of the two travel-time columns.
box_fig = go.Figure()
for time_column, trace_label in (('flight_time', 'Flight Time'), ('train_time', 'Train Time')):
    box_fig.add_trace(go.Box(y=merged_data[time_column], name=trace_label))
box_fig.update_layout(
    title='Distribution of Travel Times in minutes',
    yaxis_title='Time (minutes)',
)
box_fig.show()

In [104]:
def remove_outliers(data):
    """Return the entries of `data` that lie within the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1/Q3 are the 25% / 75% quantiles of `data` and IQR is ``Q3 - Q1``.
    """
    first_quartile = data.quantile(0.25)
    third_quartile = data.quantile(0.75)
    spread = third_quartile - first_quartile

    above_lower_fence = data >= first_quartile - 1.5 * spread
    below_upper_fence = data <= third_quartile + 1.5 * spread
    return data[above_lower_fence & below_upper_fence]

# Travel times with the IQR outliers of each column removed
flight_time_no_outliers = remove_outliers(merged_data['flight_time'])
train_time_no_outliers = remove_outliers(merged_data['train_time'])

# One box per filtered series, added in the order flight -> train
box_fig = go.Figure()
for filtered_times, trace_label in (
    (flight_time_no_outliers, 'Flight Time'),
    (train_time_no_outliers, 'Train Time'),
):
    box_fig.add_trace(go.Box(y=filtered_times, name=trace_label))

# Title and axis label
outlier_free_layout = dict(
    title='Distribution of Travel Times in Minutes (Without Outliers)',
    yaxis_title='Time (minutes)',
)
box_fig.update_layout(**outlier_free_layout)

# Render the figure
box_fig.show()

## How can estimates of travel time to and from airports be included? 
Why we added 90 minutes to our flight time:

- boarding is closing 30 minutes before flight,
- there is need for passport control sometimes,
- baggage control always, 
- 

+ 90 minutes are added to each flight time, since on average about 90 extra minutes are spent at airports before and after a flight

In [171]:
# Extra time spent at airports before and after a flight, in minutes.
AIRPORT_OVERHEAD_MINUTES = 90

# Train time in hours (train_time is stored in minutes)
merged_data['train_time_hours'] = merged_data['train_time'] / 60

# Add 90 minutes to flight time to account for pre-flight and post-flight activities
merged_data['flight_time_plus_90'] = merged_data['flight_time'] + AIRPORT_OVERHEAD_MINUTES

# Convert flight time (with the added 90 minutes) to hours.
# This must read 'flight_time_plus_90' - the column created just above; no
# 'flight_time_plus_60' column is created anywhere in this notebook.
merged_data['flight_time_hours_with_extra'] = merged_data['flight_time_plus_90'] / 60

print(merged_data[['flight_time', 'train_time', 'train_time_hours', 'flight_time_plus_90', 'flight_time_hours_with_extra']].head())

   flight_time  train_time  train_time_hours  flight_time_plus_60  \
0         85.0       725.0         12.083333                175.0   
1        170.0      1837.0         30.616667                260.0   
2        180.0      1837.0         30.616667                270.0   
3         85.0       767.0         12.783333                175.0   
4         95.0       459.0          7.650000                185.0   

   flight_time_hours_with_extra  
0                      2.916667  
1                      4.333333  
2                      4.500000  
3                      2.916667  
4                      3.083333  


In [172]:
# One point per route: flight time (with airport overhead) against train time, both in hours.
scatter_fig = px.scatter(
    merged_data,
    x='flight_time_hours_with_extra',
    y='train_time_hours',
    hover_data=['arrival city', 'departure city'],
)
scatter_fig.update_layout(
    title='Comparison of Flight Time vs Train Time in Hours',
    xaxis_title='Flight Time (hours, including 90min extra)',
    yaxis_title='Train Time (hours)',
)

# Dashed red diagonal y = x: points on it have equal flight and train time.
# It runs from the origin to the largest value found in either column.
diagonal_end = max(
    merged_data['flight_time_hours_with_extra'].max(),
    merged_data['train_time_hours'].max(),
)
scatter_fig.add_shape(
    type="line",
    line=dict(dash='dash', color='red'),
    xref="x", yref="y",
    x0=0, y0=0,
    x1=diagonal_end,
    y1=diagonal_end,
)

# Render the figure
scatter_fig.show()

If we zoom in on the 

In [173]:
# Box plots of both travel times in hours (flight time includes the airport overhead).
box_fig = go.Figure()
for time_column, trace_label in (
    ('flight_time_hours_with_extra', 'Flight Time'),
    ('train_time_hours', 'Train Time'),
):
    box_fig.add_trace(go.Box(y=merged_data[time_column], name=trace_label))
box_fig.update_layout(
    title='Distribution of Travel Times in hours',
    yaxis_title='Time (hours)',
)
box_fig.show()

In [174]:
# Histogram for All Flight Times
hist_flight_all = px.histogram(merged_data, x='flight_time_hours_with_extra', nbins=50)
hist_flight_all.update_layout(title='Histogram of All Flight Times (hours, including 90min extra)',
                              xaxis_title='Flight Time (hours)',
                              yaxis_title='Count')
hist_flight_all.show()

# Histogram for All Train Times
hist_train_all = px.histogram(merged_data, x='train_time_hours', nbins=50)
hist_train_all.update_layout(title='Histogram of All Train Times (hours)',
                             xaxis_title='Train Time (hours)',
                             yaxis_title='Count')
hist_train_all.show()

# Long-format table for the combined histogram: one row per (route, mode) with
# the mode in 'Type' and the duration in hours in 'Time'. It is built here so
# the cell does not depend on a variable that no visible cell defines.
stacked_times = (
    merged_data
    .rename(columns={
        'flight_time_hours_with_extra': 'Flight Time',
        'train_time_hours': 'Train Time',
    })
    .melt(value_vars=['Flight Time', 'Train Time'], var_name='Type', value_name='Time')
)

# Combined Histogram for Both Flight and Train Times
histogram_fig_combined = px.histogram(stacked_times, x='Time', color='Type', barmode='overlay', nbins=50)
histogram_fig_combined.update_layout(title='Combined Histogram of All Flight and Train Times (Hours)',
                                     xaxis_title='Time (hours)',
                                     yaxis_title='Count',
                                     legend_title_text='Type')
histogram_fig_combined.show()

## Are there routes on which high-speed rail leads to shorter journey times than air travel? 
Answer: Yes, a few of them, once the extra time spent at airports is added to the flight time; if we exclude that extra time, then no.

In [125]:
# Routes whose train time beats the pure flight time (both in minutes).
train_is_faster = merged_data['train_time'] < merged_data['flight_time']
shorter_train_routes = merged_data[train_is_faster]

# Report the matches, or say that there are none
if shorter_train_routes.empty:
    print("There are no routes where train time is shorter than flight time.")
else:
    print("Routes where train time is shorter than flight time:")
    print(shorter_train_routes[['arrival city', 'departure city', 'flight_time', 'train_time']])

There are no routes where train time is shorter than flight time.


In [145]:
# Routes whose train time beats the flight time once the 90 minutes of airport
# overhead are included (both columns are in minutes).
shorter_train_routes = merged_data[merged_data['train_time'] < merged_data['flight_time_plus_90']]

# Check if there are any such routes. The messages name the overhead explicitly
# so this output cannot be confused with the comparison against the pure flight time.
if not shorter_train_routes.empty:
    print("Routes where train time is shorter than flight time plus 90 minutes of airport overhead:")
    print(shorter_train_routes[['arrival city', 'departure city', 'flight_time_plus_90', 'train_time']])
else:
    print("There are no routes where train time is shorter than flight time plus 90 minutes of airport overhead.")

Routes where train time is shorter than flight time:
              arrival city departure city  flight_time_plus_90  train_time
31                  Madrid       Alicante           160.000000  148.000000
109              Vila Real       Bragança           110.000000   80.000000
197               Bordeaux          Paris           175.000000  150.000000
198               Brussels          Paris           160.000000   91.000000
287              Frankfurt        Cologne           140.000000   63.000000
329                Bologna           Rome           150.000000  123.000000
337               Florence           Rome           145.000000   87.000000
378   Freiburg im Breisgau      Frankfurt           145.000000  132.000000
382                Cologne      Frankfurt           140.000000   67.000000
389                Koblenz      Frankfurt           135.000000  130.000000
398              Nuremberg      Frankfurt           135.000000  123.000000
402              Stuttgart      Frankfurt      

In [151]:
shorter_train_routes = shorter_train_routes.reset_index(drop=True)

# Create the long-format DataFrame with flight and train times separately.
# Each half gets a common 'Time' column (minutes) and a 'Type' column that
# tells flight and train apart.
route_columns = ['departure city', 'arrival city']
flight_times = (
    shorter_train_routes[route_columns + ['flight_time_plus_90']]
    .rename(columns={'flight_time_plus_90': 'Time'})
    .assign(Type='Flight Time')
)
train_times = (
    shorter_train_routes[route_columns + ['train_time']]
    .rename(columns={'train_time': 'Time'})
    .assign(Type='Train Time')
)

# Concatenate the flight and train times DataFrames
long_format = pd.concat([flight_times, train_times], ignore_index=True)

# Explicit route label column, so the y-axis has a real column name that
# `labels` can refer to.
long_format['Route'] = long_format['departure city'] + " to " + long_format['arrival city']

# Now create the bar chart. `labels` is keyed by column name ('Time', 'Route');
# a key such as 'x' does not match the 'Time' column and would leave the axis
# title unchanged.
bar_fig = px.bar(long_format,
                 x='Time',
                 y='Route',
                 color='Type',
                 barmode='group',
                 orientation='h',
                 title='Comparison of Flight Time (with extra 90min) and Train Time for Each Route',
                 labels={'Time': 'Time (minutes)', 'Route': 'Route'},
                 color_discrete_map={'Flight Time': 'blue', 'Train Time': 'orange'})

# Show the bar chart
bar_fig.show()

Notes for High Speed trains and their max speed

Rome – Naples	-> max speed: 300 km/h(186 mph)

Florence – Rome -> max speed:	250 km/h (155 mph)

Cologne – Frankfurt -> max speed: 300 km/h (186 mph)

Valencia - Madrid -> max speed: 300 km/h (186 mph)

### Analysing longest routes

In [178]:
flight_hours_column = 'flight_time_hours_with_extra'
train_hours_column = 'train_time_hours'

# The 10 routes with the largest flight time and the 10 with the largest train time
top_flights = merged_data.nlargest(10, flight_hours_column)
top_trains = merged_data.nlargest(10, train_hours_column)

# Union of both selections; a route that made both lists is kept once
top_routes = (
    pd.concat([top_flights, top_trains])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Long format for plotting: 'Type' holds the source column name, 'Time' its value
top_routes_long = top_routes.melt(
    id_vars=['departure city', 'arrival city'],
    value_vars=[flight_hours_column, train_hours_column],
    var_name='Type',
    value_name='Time',
)

# Human-readable route label for the y-axis
top_routes_long['Route'] = (
    top_routes_long['departure city'] + " to " + top_routes_long['arrival city']
)

type_colors = {
    'flight_time_hours_with_extra': 'blue',
    'train_time_hours': 'orange',
}
bar_fig = px.bar(
    top_routes_long,
    x='Time',
    y='Route',
    color='Type',
    barmode='group',
    orientation='h',
    title='Top 10 Longest Routes by Flight and Train Time',
    labels={'Time': 'Time (hours)', 'Route': 'Route'},
    color_discrete_map=type_colors,
)

# Order the route categories by their total so the longest ends up at the top
bar_fig.update_layout(yaxis={'categoryorder': 'total ascending'})


bar_fig.show()

## Which is the most well-connected city in Europe in terms of minimising travel times to other cities? 
Answer: London & Frankfurt

In [None]:
# Write the current merged_data table to disk. This is the same path that the
# "Removing -1" cell wrote to earlier, so that file is replaced.
merged_data.to_csv('datasets/merged_df.csv')