In [26]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
import seaborn as sns
from matplotlib.animation import FuncAnimation
import plotly.express as px

# Load every source table, indexed by the 'Country' column of its CSV file.
# The target names on the left pair up positionally with the file names below.
(
    passenger_data,
    infrastructure_road_data,
    infrastructure_train_data,
    population_data,
    gdp_data,
    gdp_per_capita_data,
) = [
    pd.read_csv(csv_file, index_col='Country')
    for csv_file in (
        "passenger_new.csv",
        "infrastructure_road_new.csv",
        "infrastructure_train_new.csv",
        "population_new.csv",
        "gdp_new.csv",
        "GDP_per_capita_new.csv",
    )
]

# Cast the tables used below to float before doing element-wise arithmetic
infrastructure_train_data = infrastructure_train_data.astype(float)
gdp_data = gdp_data.astype(float)
passenger_data = passenger_data.astype(float)
population_data = population_data.astype(float)

# Railway infrastructure investment expressed as a percentage of GDP
infrastructure_gdp = infrastructure_train_data.div(gdp_data).mul(100)

# Kilometres travelled divided by population.
# The passenger table holds total kilometres travelled in millions, hence the
# scaling by one million.
km_per_passenger = passenger_data.div(population_data).mul(1_000_000)

# Leave the final year column out of the investment average.
# NOTE(review): this drops the last column by position; it only excludes 2020
# if 2020 is the last column of the investment table - confirm.
infrastructure_gdp_up_to_2019 = infrastructure_gdp.iloc[:, :-1]

# Per-country mean of the investment share across the remaining years
year_average_infrastructure_gdp_up_to_2019 = infrastructure_gdp_up_to_2019.mean(axis='columns')

# Restrict the kilometres-per-passenger table to the columns '2000' through '2020'
km_per_passengers_2000_2020 = km_per_passenger.loc[:, slice('2000', '2020')]

# Percentage change per country between the first and the last selected column
passenger_growth = (
    km_per_passengers_2000_2020.iloc[:, -1]
    .sub(km_per_passengers_2000_2020.iloc[:, 0])
    .div(km_per_passengers_2000_2020.iloc[:, 0])
    .mul(100)
)

# Combine the average investment share and the passenger growth into one
# table with a row per country.

# Average investment share as a one-column frame, still indexed by country
df3 = year_average_infrastructure_gdp_up_to_2019.to_frame(name='Investments average')

# Passenger growth with the country as a regular column (default integer index)
df4 = pd.DataFrame({'Country': passenger_growth.index, 'Passenger growth': passenger_growth.values})

# Move the country index of df3 into a regular "Country" column; naming the
# index explicitly guarantees the column is called "Country" for the merge
df3_reset = df3.rename_axis('Country').reset_index()

# Inner join on "Country": only countries present in both tables are kept.
# The result has exactly the columns Country, Investments average and
# Passenger growth, so no helper column has to be dropped afterwards.
df5 = pd.merge(df3_reset, df4, on='Country')

print(df5)

# Scatter plot of average railway investment against passenger growth, one
# point (and colour) per country.
# The keys of `labels` must be the plotted column names of df5; the mapped
# texts are used for the axis titles and the hover text.
# The x value is the investment share in percent of GDP and the y value is the
# percentage change in kilometres travelled between the 2000 and 2020 columns,
# so both axis titles state the unit explicitly.
fig = px.scatter(
    df5,
    x='Investments average',
    y='Passenger growth',
    color='Country',
    title='Average railway investment vs. growth in kilometres travelled',
    labels={
        'Investments average': 'Average investments in railway infrastructure (% of GDP)',
        'Passenger growth': 'Growth in kilometres travelled per passenger, 2000-2020 (%)',
    },
)

# Show the plot
fig.show()


   Country  Investments average  Passenger growth
0      AUS             0.232915        -10.351159
1      AZE             0.064069        126.925370
2      CZE             0.385187         21.564163
3      FIN             0.177384         10.454374
4      FRA             0.259002          5.825877
5      DEU             0.201057          6.920792
6      HUN             0.340019         38.547422
7      ITA             0.321021         -1.169693
8      JPN             0.195861         -3.609343
9      NOR             0.167108          8.970910
10     POL             0.104046         41.818819
11     RUS             0.561608        -29.223883
12     ESP             0.395886         -1.781088
13     SWE             0.291527        -18.776321
14     CHE             0.543179          9.829690
15     TUR             0.134914         40.462400
16     GBR             0.349940         -0.922153
