In [2]:
# Environment and Dependencies:
# - Python 3.8.10
# - pandas 1.4.2
# - numpy 1.22.3
# - scipy 1.7.3
# - scikit-learn 1.0.2
# - matplotlib 3.5.1

import pandas as pd
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.stats import kruskal
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# 1. Read the dataset extracted from the WDI (R6_extracted_from_WDI.csv).
#    The CSV should provide Country, Period, and the eight indicator columns.
df = pd.read_csv('R6_extracted_from_WDI.csv')

# 2. Periods to analyse; each one gets its own clustering run
periods = [
    "2002-2005",
    "2006-2009",
    "2010-2013",
]

# 3. Container for the clustering output, keyed by period label
cluster_results = {}

# 4. Perform hierarchical clustering for each period
def perform_clustering(sub_df):
    """Ward-cluster the rows of ``sub_df`` and choose k by silhouette score.

    The feature values are standardized, a single Ward linkage is built, and
    the tree is cut into k = 2..5 flat clusters (capped at ``n_samples - 1``).
    The k with the highest silhouette score is selected; on ties the smallest
    such k wins because candidates are evaluated in ascending order.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Numeric feature table, one row per observation.

    Returns
    -------
    tuple
        ``(best_k, Z, labels, silhouette_scores)`` where ``Z`` is the Ward
        linkage matrix, ``labels`` are the flat cluster labels for
        ``best_k``, and ``silhouette_scores`` maps each evaluated k to its
        silhouette score.

    Raises
    ------
    ValueError
        If there are fewer than 3 observations, so no k can be scored.
    """
    # Standardize feature values
    X = StandardScaler().fit_transform(sub_df.values)

    # silhouette_score needs 2 <= n_labels <= n_samples - 1, so never ask
    # for more clusters than that on small inputs.
    max_k = min(5, X.shape[0] - 1)
    if max_k < 2:
        raise ValueError(
            "perform_clustering needs at least 3 observations to compare "
            f"silhouette scores; got {X.shape[0]}"
        )

    # The linkage does not depend on k: build it once and only re-cut it.
    Z = linkage(X, method='ward')

    # Find best k by silhouette score
    labels_by_k = {}
    silhouette_scores = {}
    for k in range(2, max_k + 1):
        labels_by_k[k] = fcluster(Z, k, criterion='maxclust')
        silhouette_scores[k] = silhouette_score(X, labels_by_k[k])
    best_k = max(silhouette_scores, key=silhouette_scores.get)

    # Final clustering: reuse the cut already computed for the winning k
    labels = labels_by_k[best_k]
    return best_k, Z, labels, silhouette_scores

for period in periods:
    # 4.1 Keep only this period's rows, indexed by country
    in_period = df['Period'] == period
    sub = df.loc[in_period].set_index('Country')
    # 4.2 Everything except the Period column is treated as a feature
    features = sub.drop(columns=['Period'])
    # 4.3 Cluster the feature table
    best_k, Z, labels, sil_scores = perform_clustering(features)
    # 4.4 Attach the labels to a copy of the features and store the results
    sub = features.copy()
    sub['Cluster'] = labels
    cluster_results[period] = dict(
        data=sub,
        linkage_matrix=Z,
        best_k=best_k,
        silhouette_scores=sil_scores,
    )

# 5. Print every candidate k with its silhouette score, then the chosen k
print("==== Silhouette Scores and Best k for Each Period ====")
for period in cluster_results:
    res = cluster_results[period]
    score_by_k = res['silhouette_scores']
    print(f">>> Period: {period}")
    for k in score_by_k:
        score = score_by_k[k]
        print(f"   k = {k}, silhouette score = {score:.4f}")
    print(f"   -> Best k = {res['best_k']}\n")

# 6. Stack the per-period cluster labels into one long table
#    (one row per country and period)
combined = [
    res['data'][['Cluster']].reset_index().assign(Period=period)
    for period, res in cluster_results.items()
]
combined_df = pd.concat(combined, ignore_index=True)
print("==== Country × Period × Cluster Labels ====")
print(combined_df.to_string(index=False))

# 7. Kruskal–Wallis test for each feature across clusters
print("\n==== Kruskal–Wallis Significance Test Results ====")
for period, res in cluster_results.items():
    data = res['data']
    k = res['best_k']
    print(f"\n--- Period: {period}, k = {k} ---")
    # Group by the labels that actually occur rather than range(1, k + 1):
    # fcluster(..., criterion='maxclust') may return fewer than k clusters,
    # and a missing label would pass an empty sample to kruskal.
    by_cluster = data.groupby('Cluster')
    if by_cluster.ngroups < 2:
        # kruskal needs at least two samples to compare
        print("fewer than two clusters present; test skipped")
        continue
    for feature in data.columns.drop('Cluster'):
        groups = [values.to_numpy() for _, values in by_cluster[feature]]
        flat = np.concatenate(groups)
        # kruskal cannot rank a feature whose values are all identical,
        # so report the degenerate result directly.
        if np.all(flat == flat[0]):
            print(f"{feature}: all values identical → H=0, p=1")
            continue
        stat, p = kruskal(*groups)
        print(f"{feature}: H = {stat:.4f}, p = {p:.4f}")

# 8. Draw one Ward dendrogram per period and write each to a PNG file
for period, res in cluster_results.items():
    country_labels = res['data'].index.tolist()
    fig = plt.figure(figsize=(8, 6))
    dendrogram(
        res['linkage_matrix'],
        labels=country_labels,
        orientation='right',
        color_threshold=0,
    )
    plt.title(f"Dendrogram for {period} (Ward)")
    plt.xlabel("Distance")
    plt.tight_layout()
    plt.savefig(f"dendrogram_{period}.png", dpi=300)
    # Close the figure created above so figures do not accumulate
    plt.close(fig)
    print(f">>> Saved dendrogram: dendrogram_{period}.png")

# End of script


==== Silhouette Scores and Best k for Each Period ====
>>> Period: 2002-2005
   k = 2, silhouette score = 0.2115
   k = 3, silhouette score = 0.1927
   k = 4, silhouette score = 0.2200
   k = 5, silhouette score = 0.1967
   -> Best k = 4

>>> Period: 2006-2009
   k = 2, silhouette score = 0.3084
   k = 3, silhouette score = 0.2446
   k = 4, silhouette score = 0.2270
   k = 5, silhouette score = 0.2160
   -> Best k = 2

>>> Period: 2010-2013
   k = 2, silhouette score = 0.3246
   k = 3, silhouette score = 0.2272
   k = 4, silhouette score = 0.2368
   k = 5, silhouette score = 0.2318
   -> Best k = 2

==== Country × Period × Cluster Labels ====
        Country    Period  Cluster
        Austria 2002-2005        1
        Belgium 2002-2005        1
       Bulgaria 2002-2005        2
        Croatia 2002-2005        3
         Cyprus 2002-2005        1
        Denmark 2002-2005        1
        Estonia 2002-2005        4
        Finland 2002-2005        1
         France 2002-2005        1