In [2]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns
from matplotlib.animation import FuncAnimation
import plotly.express as px

# Import data: every CSV is read with its 'Country' column as the row index,
# so the frames can later be combined country by country.
(
    passenger_data,
    infrastructure_road_data,
    infrastructure_train_data,
    population_data,
    gdp_data,
    gdp_per_capita_data,
) = (
    pd.read_csv(csv_name, index_col='Country')
    for csv_name in (
        "passenger_new.csv",
        "infrastructure_road_new.csv",
        "infrastructure_train_new.csv",
        "population_new.csv",
        "gdp_new.csv",
        "GDP_per_capita_new.csv",
    )
)

# Cast the investment and GDP tables to float before dividing them
infrastructure_train_data = infrastructure_train_data.astype(float)
gdp_data = gdp_data.astype(float)

# Railway infrastructure investment expressed as a percentage of GDP
# (element-wise division, aligned on country and column labels)
infrastructure_gdp = infrastructure_train_data.div(gdp_data).mul(100)

# Cast the passenger and population tables to float as well
passenger_data = passenger_data.astype(float)
population_data = population_data.astype(float)

# Kilometres travelled per person.
# The passenger data holds total kilometres travelled in millions, hence the
# multiplication by 1,000,000 after dividing by the population.
km_per_passenger = passenger_data.div(population_data).mul(1000000)

# Leave the 2020 investment figures out of the average by dropping the last
# column (assumes 2020 is the final column of the table -- TODO confirm)
infrastructure_gdp_up_to_2019 = infrastructure_gdp.iloc[:, :-1]

# Average each country's values across all remaining year columns
year_average_km_per_passenger = km_per_passenger.mean(axis='columns')
year_average_infrastructure_gdp_up_to_2019 = infrastructure_gdp_up_to_2019.mean(axis='columns')
print(year_average_km_per_passenger)
print(year_average_infrastructure_gdp_up_to_2019)

# Build one table that pairs, per country, the average investment share with
# the average kilometres travelled per person.
df3 = pd.DataFrame(year_average_infrastructure_gdp_up_to_2019)
df4 = pd.DataFrame(year_average_km_per_passenger)

# Combine the two data frames side by side into a separate data frame
df5 = pd.concat([df3, df4], axis=1)

# Set the column names.
# df5 has exactly two columns (one from df3, one from df4), so exactly two
# labels are assigned. The previous version also appended columns of an
# undefined `df`, which raises NameError when the cell is run on its own.
df5.columns = ['Year average infrastructure gdp', 'Year average passenger km per person']

# Include the index (Country) as one of the columns as well, so it can be
# referenced by name when plotting
df5_reset = df5.reset_index()
print(df5_reset)

# Create a scatter plot using Plotly Express: one point per country, with the
# average investment share on the x-axis and the average kilometres travelled
# per person on the y-axis.
fig = px.scatter(
    df5_reset,
    x='Year average infrastructure gdp',
    y='Year average passenger km per person',
    color='Country',
    # The keys of `labels` must be column names of the data frame; the
    # previous keys ('Investments', 'Passengers') matched no column and were
    # therefore ignored.
    labels={
        'Year average infrastructure gdp': 'Average Investments',
        'Year average passenger km per person': 'Average Passengers',
    },
)

# Customize the layout. The figure title and axis titles are set here only;
# a title passed to px.scatter would be overridden by this call anyway.
fig.update_layout(
    title='Investments vs. Passengers average over the years',
    xaxis_title='Average Investments in railway infrastructure corrected by GDP',
    yaxis_title='Kilometres travelled per passenger',
)

# Show the plot
fig.show()


Country
AUS    12913.455031
AZE     1789.267842
CZE     7552.008289
FIN    13157.742081
FRA    12833.204304
DEU    11694.672477
HUN     7136.509977
ITA    13189.816665
MKD     3711.436504
NOR    12884.177900
RUS      970.007579
ESP     8234.177264
SWE    12180.539981
CHE    11602.311270
TUR     3173.803264
dtype: float64
Country
AUS    0.231066
AZE    0.053463
CZE    0.360389
FIN    0.167324
FRA    0.257465
DEU    0.185248
HUN    0.330675
ITA    0.300809
MKD    0.065399
NOR    0.171458
RUS    0.523162
ESP    0.373117
SWE    0.280336
CHE    0.517311
TUR    0.139565
dtype: float64
   Country  Year average infrastructure gdp  \
0      AUS                         0.231066   
1      AZE                         0.053463   
2      CZE                         0.360389   
3      FIN                         0.167324   
4      FRA                         0.257465   
5      DEU                         0.185248   
6      HUN                         0.330675   
7      ITA                         0.3