In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Build a correlation heatmap between the yearly traffic figures and the
# yearly emission totals of three selected substances.

# Load the datasets
df_emission = pd.read_csv("../notebooks/cache/Schiphol_Emissie_Cleaned.csv")
df_yearly = pd.read_csv("../notebooks/cache/Yearly_Traffic_Data_Cleaned.csv")

# Preprocess the emission dataset: turn ',' into '.' (presumably a decimal
# comma) and convert to float. The .str accessor raises AttributeError on a
# column pandas already parsed as numeric, so the textual replacement is only
# applied when the column is not numeric yet.
emission_kg = df_emission['EMISSIE (kg)']
if not pd.api.types.is_numeric_dtype(emission_kg):
    emission_kg = emission_kg.str.replace(',', '.', regex=False)
df_emission['EMISSIE (kg)'] = emission_kg.astype(float)

# Filter the data for "Koolstofdioxide", "Fijnstof (PM10)", and "Fijnstof (PM2,5)"
emission_substances = ['Koolstofdioxide', 'Fijnstof (PM10)', 'Fijnstof (PM2,5)']
df_filtered = df_emission[df_emission['STOFNAAM'].isin(emission_substances)]

# Aggregate the emissions data by year and substance: after unstack() there is
# one row per EMISSIEJAAR and one column per substance. A substance without
# any record in a given year ends up as NaN in that row.
df_aggregated_emission = (
    df_filtered.groupby(['EMISSIEJAAR', 'STOFNAAM'])['EMISSIE (kg)']
    .sum()
    .unstack()
    .reset_index()
)

# Merge with the yearly traffic data; the inner join keeps only the years
# present in both datasets.
df_combined = pd.merge(
    df_yearly,
    df_aggregated_emission,
    left_on='Year',
    right_on='EMISSIEJAAR',
    how='inner',
)

# Drop the duplicate year key brought in by the merge
df_combined.drop(columns=['EMISSIEJAAR'], inplace=True)

# Select the relevant columns for correlation BEFORE zero-filling.
# DataFrame.corr() already excludes missing values pairwise; replacing them
# with 0 first would inject artificial observations and distort the
# coefficients. List-based column selection returns a copy, so the fillna
# below does not leak into df_relevant.
traffic_columns = ['Air_Transport_Movements_Total', 'Passengers_Total', 'Cargo (tonnes)']
df_relevant = df_combined[traffic_columns + emission_substances]

# Calculate the correlation matrix
correlation_matrix = df_relevant.corr()

# Keep the combined frame free of missing values, as before, for any later
# use of df_combined outside this cell.
df_combined.fillna(0, inplace=True)

# Create the heatmap. zmin/zmax pin the diverging colour scale to the full
# correlation range so the neutral midpoint of 'RdBu_r' always means 0;
# text_auto=".2f" keeps the cell labels short enough to read.
fig = px.imshow(
    correlation_matrix,
    text_auto=".2f",
    aspect="auto",
    labels=dict(color="Correlation"),
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    color_continuous_scale='RdBu_r',
    zmin=-1,
    zmax=1,
)

# Update layout for the plot
fig.update_layout(
    title="Correlation Heatmap",
    xaxis_title="Variables",
    yaxis_title="Variables",
    height=600,
    width=600,
    margin=dict(l=50, r=50, t=50, b=50),
)

# Show the plot
fig.show()
