In [1]:
import os

# Pin OpenMP to a single thread. This is set before the numeric libraries
# are imported in the next cell.
# NOTE(review): presumably this silences the scikit-learn KMeans/MKL
# threading warning — confirm the intent.
os.environ.update(OMP_NUM_THREADS="1")

In [2]:
import wbdata #used to fetch economic and developmeent data from the world bank API
import pandas as pd
import numpy as np

import plotly.express as px
import datetime 
import time


from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

import ipywidgets as widgets
from IPython.display import display 


In [3]:
# ISO-2 codes of the 27 EU member countries (order preserved).
eu_countries = (
    'AT BE BG HR CY CZ DK EE FI '
    'FR DE GR HU IE IT LV LT LU '
    'MT NL PL PT RO SK SI ES SE'
).split()

# World Bank indicator code -> human-readable column label.
indicators = dict([
    ('NY.GDP.PCAP.CD', 'GDP per capita'),                        # GDP per person (US dollars)
    ('SE.XPD.TOTL.GD.ZS', 'Education spending (% GDP)'),         # % of GDP spent on education
    ('SH.XPD.CHEX.PC.CD', 'Health spending per capita (US$)'),   # health spending per person
    ('SP.POP.TOTL', 'Population'),                               # total population
    ('NE.GDI.TOTL.ZS', 'Investment (% GDP)'),                    # gross capital formation
    ('SL.TLF.CACT.ZS', 'Labor force participation (%)'),
])

In [4]:
# Download the selected indicators for the EU countries from the World Bank
# API, then turn the resulting index levels ('country', 'date') into
# ordinary columns in the same expression.
df_raw = wbdata.get_dataframe(
    indicators,
    country=eu_countries,
).reset_index()

In [5]:
# Data cleaning and preprocessing
# Convert the 'date' column to datetime format
df_raw['date'] = pd.to_datetime(df_raw['date'])

# Extract just the year from the datetime object into a new 'year' column
df_raw['year'] = df_raw['date'].dt.year

# Keep rows from the year 2000 onwards and drop rows with missing values
df_raw = df_raw[df_raw['year'] >= 2000].dropna()

# Map country names to ISO-2 codes (for consistency or future mapping).
# The World Bank API labels two members as 'Czechia' and 'Slovak Republic';
# both the World Bank spellings and the common names are listed so that no
# EU country ends up with a missing (NaN) country_code.
country_name_to_iso2 = {
    'Austria': 'AT', 'Belgium': 'BE', 'Bulgaria': 'BG', 'Croatia': 'HR',
    'Cyprus': 'CY', 'Czech Republic': 'CZ', 'Czechia': 'CZ', 'Denmark': 'DK',
    'Estonia': 'EE', 'Finland': 'FI', 'France': 'FR', 'Germany': 'DE',
    'Greece': 'GR', 'Hungary': 'HU', 'Ireland': 'IE', 'Italy': 'IT',
    'Latvia': 'LV', 'Lithuania': 'LT', 'Luxembourg': 'LU', 'Malta': 'MT',
    'Netherlands': 'NL', 'Poland': 'PL', 'Portugal': 'PT', 'Romania': 'RO',
    'Slovakia': 'SK', 'Slovak Republic': 'SK', 'Slovenia': 'SI',
    'Spain': 'ES', 'Sweden': 'SE'
}
df_raw['country_code'] = df_raw['country'].map(country_name_to_iso2)

print('EU dataset shape:', df_raw.shape)
df_raw.head(5)

EU dataset shape: (600, 10)


Unnamed: 0,country,date,GDP per capita,Education spending (% GDP),Health spending per capita (US$),Population,Investment (% GDP),Labor force participation (%),year,country_code
2,Austria,2022-01-01,52176.664914,4.774471,5851.962402,9041851.0,29.22677,60.882,2022,AT
3,Austria,2021-01-01,53648.719074,5.49411,6520.490723,8955797.0,27.673244,60.631,2021,AT
4,Austria,2020-01-01,48716.40989,5.57127,5531.023438,8916864.0,25.653074,60.364,2020,AT
5,Austria,2019-01-01,49885.994736,5.21779,5263.390625,8879920.0,25.509008,60.898,2019,AT
6,Austria,2018-01-01,51194.074984,5.22655,5337.218262,8840521.0,25.898276,60.919,2018,AT


## Country Clustering Dashboard – KMeans Clustering on Socioeconomic Indicators

### Variables Used:
- GDP per capita
- Health spending per capita (US$)
- Education spending (% GDP)
- Investment (% GDP)
- Labor force participation (%)
- Population

### Computing average values per country

In [6]:
# Per-country mean of each clustering variable; countries whose mean is
# missing for any variable are dropped.
average_df = (
    df_raw
    .groupby('country')[[
        'GDP per capita',
        'Health spending per capita (US$)',
        'Education spending (% GDP)',
        'Investment (% GDP)',
        'Labor force participation (%)',
        'Population',
    ]]
    .mean()
    .dropna()
)

In [10]:
# Preview the first five rows of the per-country averages.
average_df.head(5)

Unnamed: 0_level_0,GDP per capita,Health spending per capita (US$),Education spending (% GDP),Investment (% GDP),Labor force participation (%),Population,Cluster
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Austria,43823.456807,4527.115287,5.405545,24.813772,59.687739,8474308.0,1
Belgium,40936.719665,4150.535023,6.173329,23.546246,53.068864,10920790.0,1
Bulgaria,6486.193407,471.888055,3.96809,23.454446,53.229318,7408372.0,2
Croatia,12427.614187,886.08002,4.52541,22.60092,51.996727,4205810.0,2
Cyprus,26551.151811,1737.904538,6.016542,19.777837,63.863045,1135802.0,2


In [7]:
# Standardize the feature columns so each has mean 0 and standard deviation 1.
# 'Cluster' is added to average_df by the clustering cell; it is dropped here
# (if present) so that re-running this cell never feeds the previous cluster
# labels back in as a feature.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(
    average_df.drop(columns=['Cluster'], errors='ignore')
)

In [13]:
# Show the standardized feature matrix.
print("Scaled_data:", scaled_data)

Scaled_data: [[ 0.67323956  1.13248846  0.36667183  0.6003761   0.42015547 -0.36593666]
 [ 0.53166725  0.91481862  1.16285892  0.14711504 -1.04948649 -0.25170433]
 [-1.15786675 -1.21150225 -1.12396051  0.11428781 -1.01385948 -0.41570778]
 [-0.86648564 -0.97209221 -0.54602286 -0.1909289  -1.28754152 -0.56524312]
 [-0.17383448 -0.47972307  1.00027165 -1.20044903  1.34723219 -0.70858919]
 [-0.60373832 -0.73537438 -0.84685675  1.69269352  0.34654723 -0.27481101]
 [ 1.19258669  1.6360046   2.72612917 -0.53389946  1.32586783 -0.50065615]
 [-0.69751884 -0.90664622  0.13114897  2.22771907  0.68147139 -0.69904692]
 [ 0.65042902  0.76133086  1.37161921  0.18517684  0.55935048 -0.51093484]
 [ 0.35239806  0.89932497  0.48418698 -0.39205453 -0.47019941  2.26800095]
 [ 0.54771357  1.12102246 -0.42766739 -0.90667568  0.38051686  3.07563427]
 [-0.44564823 -0.43189186 -1.2232462  -1.66740724 -1.39094059 -0.25246969]
 [-0.85676552 -0.96028253 -0.26965352  0.5711542  -0.81099748 -0.29645506]
 [ 1.3664732

In [8]:
# K-Means setup: 4 clusters, fixed seed, 10 initializations.
number_clusters = 4
kmeans = KMeans(
    n_clusters=number_clusters,
    n_init=10,
    random_state=0,
)

# Fit on the standardized data and record each country's cluster label.
cluster_labels = kmeans.fit_predict(scaled_data)
average_df['Cluster'] = cluster_labels

In [9]:
# Move 'country' from the index back into a regular column for plotting.
visualization_df = average_df.reset_index(drop=False)

In [15]:
# Dropdown for the X-axis variable: every column of average_df except the
# fixed Y-axis variable ('GDP per capita') and the cluster label.
excluded_columns = ('GDP per capita', 'Cluster')
selectable_columns = []
for column_name in average_df.columns:
    if column_name in excluded_columns:
        continue
    selectable_columns.append(column_name)

x_dropdown = widgets.Dropdown(
    options=selectable_columns,
    description='Compare to:',
    layout=widgets.Layout(width='50%'),
)

In [17]:
# Defining the plotting function

def change_plot(x_axis):
    """Show a scatter plot of GDP per capita against the chosen variable.

    Points are the rows of ``visualization_df`` (one per country), colored
    by their K-Means cluster and sized by GDP per capita.

    Parameters
    ----------
    x_axis : str
        Name of the ``visualization_df`` column to place on the x-axis.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Cast the numeric cluster labels to strings so Plotly treats them as
    # categories (discrete colors with a legend) instead of drawing a
    # continuous color bar; the legend title set below then applies.
    plot_df = visualization_df.assign(
        Cluster=visualization_df['Cluster'].astype(str)
    )
    cluster_order = sorted(plot_df['Cluster'].unique(), key=int)

    fig = px.scatter(
        data_frame=plot_df,
        x=x_axis,
        y='GDP per capita',
        color="Cluster",
        category_orders={'Cluster': cluster_order},
        size='GDP per capita',
        hover_name='country',
        # If x_axis is one of the columns listed explicitly below, the later
        # explicit entry takes precedence (duplicate dict key).
        hover_data={
            x_axis: ':.2f',
            'GDP per capita': ':.2f',
            'Education spending (% GDP)': ':.2f',
            'Health spending per capita (US$)': ':.2f',
            'Investment (% GDP)': ':.2f',
            'Labor force participation (%)': ':.2f',
            'Population': ':,'
        },
        title=f"GDP per Capita vs {x_axis}",
        # Fixed typo: 'plotyly_white' is not a registered Plotly template.
        template='plotly_white',
        width=950,
        height=600
    )

    fig.update_traces(
        marker=dict(
            opacity=0.85,
            line=dict(width=0.85, color='gray')
        )
    )

    fig.update_layout(legend_title_text='Cluster Group')
    fig.show()