# 1. Imports, settings, and constants

In [11]:
# --- Block 1: imports & constants (UPDATED) ---
import os, glob, json, warnings
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score, adjusted_rand_score
from sklearn.metrics import pairwise_distances

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Suppress every warning for the rest of the session. This keeps notebook output
# clean, but it also hides numerical and deprecation warnings issued by the
# libraries imported above.
warnings.filterwarnings("ignore")

# Pick the default Plotly renderer: try "notebook_connected" first, then
# "notebook", and finally "browser". The last assignment is not guarded, so an
# error there propagates. HTML export is handled separately by save_and_show.
try:
    pio.renderers.default = "notebook_connected"
except Exception:
    try:
        pio.renderers.default = "notebook"
    except Exception:
        pio.renderers.default = "browser"

# Single seed reused by KMeans and PCA further down; also seeds NumPy's global RNG.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Indicator columns that must exist in the input CSV; they are the clustering features.
INDICATORS = [
    "Inflation",
    "Unemployment",
    "Trade",
    "Debt",
    "Credit_to_Private_Sector",
    "NPLs",
    "GDP_Growth",
    "GDP"  # replaced by log10(GDP) when the data is loaded
]

# Analysis windows: label -> (first_year, last_year), both bounds inclusive.
# The labels contain an en dash (–), not an ASCII hyphen.
PERIODS = {
    "2000–2009": (2000, 2009),
    "2010–2014": (2010, 2014),
    "2015–2019": (2015, 2019)
}

# Output folder (relative to the working directory) for every exported
# HTML / CSV / JSON file; created here if it does not exist yet.
OUTDIR = "Task1_grouped_data_output"
os.makedirs(OUTDIR, exist_ok=True)

# --- Block 1.1: helper to save + show (NEW) ---
def save_and_show(fig, html_name):
    """Export *fig* as interactive HTML under OUTDIR, then display it.

    Parameters
    ----------
    fig : object exposing ``write_html`` and ``show`` (a Plotly figure).
    html_name : file name (not a full path) of the HTML file to create.

    The HTML file references Plotly.js from the CDN instead of embedding it,
    and the destination path is printed before the figure is shown.
    """
    destination = os.path.join(OUTDIR, html_name)
    fig.write_html(destination, include_plotlyjs="cdn")
    print(f"Saved interactive HTML → {destination}")
    fig.show()



# 2. Robust loader + basic coercions

In [12]:
# --- Block 2: locate and load the cleaned WDI CSV ---
# Paths are tried in this order; the first one that exists wins.
CANDIDATES = [
    "WDI_cleaned_1975_2023.csv",
    "./Preprocessed-Data/WDI_cleaned_1975_2023.csv",
    "../Preprocessed-Data/WDI_cleaned_1975_2023.csv"
]

DATA = None
for p in CANDIDATES:
    if os.path.exists(p):
        DATA = p
        break

if DATA is None:
    # Fallback: bounded-depth glob (one to three directory levels below the
    # working directory) rather than a full recursive walk.
    hits = []
    for pat in ["*", "*/*", "*/*/*"]:
        hits.extend(glob.glob(os.path.join(os.getcwd(), pat, "WDI_cleaned_1975_2023.csv")))
    if hits:
        # Prefer the match with the shortest path string.
        hits = sorted(hits, key=len)
        DATA = hits[0]

if DATA is None:
    raise FileNotFoundError("Place 'WDI_cleaned_1975_2023.csv' in this folder or set DATA to its full path.")

print("Loading:", DATA)
df = pd.read_csv(DATA, low_memory=False, encoding="utf-8")

# Fail early if any identifier or indicator column is absent.
required = ["Country Name","Country Code","Year"] + INDICATORS
missing = [c for c in required if c not in df.columns]
if missing:
    raise ValueError(f"Missing required columns: {missing}")

# Ensure numeric types for indicators and Year; unparseable entries become NaN.
for c in INDICATORS + ["Year"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Replace GDP by log10(GDP) in place to reduce its dominance in PCA / size
# encodings. Non-positive and missing values become NaN.
df["GDP"] = np.where(df["GDP"] > 0, np.log10(df["GDP"]), np.nan)

# Region fallback: only applied when the column is absent altogether.
# NOTE(review): NaN values inside an existing Region column are left as-is;
# aggregate_period groups by Region and pandas groupby drops NaN keys by
# default, so such rows would be excluded — confirm this is intended.
if "Region" not in df.columns:
    df["Region"] = "Unknown"

print("Data shape:", df.shape, "| Years:", int(df["Year"].min()), "-", int(df["Year"].max()))


Loading: ./Preprocessed-Data/WDI_cleaned_1975_2023.csv
Data shape: (11929, 12) | Years: 1975 - 2023


# 3. Helpers: period aggregation, impute+standardize, metrics, confidence

In [13]:
# --- Block 3: helper functions ---

def aggregate_period(df_full, years, indicators, min_non_null=2):
    """Average each indicator per country over an inclusive year window.

    Parameters
    ----------
    df_full : long-format frame with "Country Name", "Country Code",
        "Region", "Year" and the indicator columns.
    years : (first_year, last_year) pair; both bounds are included.
    indicators : list of indicator column names to average.
    min_non_null : minimum number of indicators whose period mean must be
        available for a country to be kept.

    Returns
    -------
    DataFrame with one row per (Country Name, Country Code, Region) group,
    a fresh 0..n-1 index, and the per-period indicator means.
    """
    first_year, last_year = years
    in_window = df_full["Year"].between(first_year, last_year)
    means = (
        df_full.loc[in_window]
        .groupby(["Country Name", "Country Code", "Region"], as_index=False)[indicators]
        .mean()
    )
    # Data-sufficiency filter, applied before any imputation takes place.
    has_enough = means[indicators].count(axis=1) >= min_non_null
    return means[has_enough].reset_index(drop=True)

def impute_and_standardize(agg_df, indicators):
    """Fill gaps with column medians, then z-score the indicator columns.

    Parameters
    ----------
    agg_df : per-country frame for one period (left unmodified).
    indicators : list of indicator column names to impute and scale.

    Returns
    -------
    (imputed_df, standardized_df, scaler) where ``imputed_df`` holds the
    median-filled values on the original scale, ``standardized_df`` is the
    same frame with the indicator columns replaced by their z-scores, and
    ``scaler`` is the fitted StandardScaler.
    """
    column_medians = {col: agg_df[col].median(skipna=True) for col in indicators}
    imputed = agg_df.fillna(value=column_medians)

    # The scaler is fitted on this frame only, i.e. separately for each period.
    scaler = StandardScaler()
    z_values = scaler.fit_transform(imputed[indicators].to_numpy())

    standardized = imputed.copy()
    standardized[indicators] = z_values
    return imputed, standardized, scaler

def kmeans_and_metrics(Z, indicators, k=3):
    """Cluster the standardized indicators with KMeans and score the result.

    Parameters
    ----------
    Z : frame holding the standardized indicator columns.
    indicators : list of column names used as clustering features.
    k : number of clusters requested from KMeans.

    Returns
    -------
    (labels, km, metrics) where ``labels`` is the per-row cluster index,
    ``km`` the fitted KMeans model (50 restarts, seeded with RANDOM_STATE),
    and ``metrics`` a dict with keys "Silhouette", "Davies_Bouldin",
    "Calinski_Harabasz" and "Inertia".

    The three label-based scores are only defined when the number of
    distinct labels lies strictly between 1 and the number of samples;
    outside that range they are reported as NaN instead of raising.
    """
    X = Z[indicators].values
    km = KMeans(n_clusters=k, n_init=50, random_state=RANDOM_STATE)
    labels = km.fit_predict(X)

    # Apply the same validity guard to all three scores: scikit-learn raises
    # ValueError for each of them when the label count is degenerate.
    n_distinct = len(np.unique(labels))
    if 1 < n_distinct < len(X):
        sil = silhouette_score(X, labels)
        db = davies_bouldin_score(X, labels)
        ch = calinski_harabasz_score(X, labels)
    else:
        sil = db = ch = np.nan

    metrics = dict(
        Silhouette=sil,
        Davies_Bouldin=db,
        Calinski_Harabasz=ch,
        Inertia=km.inertia_,
    )
    return labels, km, metrics

def centroid_table(Z, labels, indicators):
    """Build per-cluster centroid z-means plus a readable driver summary.

    Parameters
    ----------
    Z : frame holding the standardized indicator columns.
    labels : cluster label for each row of ``Z`` (same order).
    indicators : list of indicator column names.

    Returns
    -------
    (tab, interpretation)
        ``tab`` has one row per cluster with a "cluster" column followed by
        the mean z-score of every indicator.
        ``interpretation`` has columns "cluster", "top_positive" (the three
        highest centroid values) and "top_negative" (the three lowest), each
        rendered as ``name(+1.23z); name(-0.45z); ...``.
    """
    tab = (
        pd.DataFrame(Z[indicators].values, columns=indicators)
        .assign(cluster=labels)
        .groupby("cluster")[indicators]
        .mean()
        .reset_index()
    )

    def _describe(pairs):
        # The "+" format flag prints the real sign of every value, so a
        # negative entry in the "top positive" list reads "(-0.30z)"
        # rather than the malformed "(+-0.30z)".
        return "; ".join(f"{name}({value:+.2f}z)" for name, value in pairs)

    interprets = []
    for _, row in tab.iterrows():
        scored = [(c, row[c]) for c in indicators]
        highest = sorted(scored, key=lambda x: -x[1])[:3]
        lowest = sorted(scored, key=lambda x: x[1])[:3]
        interprets.append({
            "cluster": int(row["cluster"]),
            "top_positive": _describe(highest),
            "top_negative": _describe(lowest),
        })
    return tab, pd.DataFrame(interprets)

def margin_scores(Z_mat, km):
    """Return, per row, the gap between its two nearest centroid distances.

    Parameters
    ----------
    Z_mat : 2-D array of feature rows.
    km : fitted model exposing ``cluster_centers_``.

    Returns
    -------
    1-D array: (distance to second-closest centroid) minus (distance to the
    closest one). A larger gap means a less ambiguous assignment.
    """
    dist_to_centers = pairwise_distances(Z_mat, km.cluster_centers_)
    # Partitioning at position 1 puts the smallest distance in column 0 and
    # the second smallest in column 1 without fully sorting every row.
    nearest_two = np.partition(dist_to_centers, 1, axis=1)[:, :2]
    return nearest_two[:, 1] - nearest_two[:, 0]


# 4. Build period datasets, standardize, cluster (k=3), evaluate

In [14]:
# --- Block 4: run pipeline per period (k=3) ---
# For each window in PERIODS: aggregate -> impute/standardize -> KMeans ->
# centroid summary, then assemble one plotting frame per period.
period_data = {}      # period label -> dict(agg, imp, Z, labels, km, metrics, centroids, interpretation, plot)
concat_for_global = []  # copies of the per-period plot frames (the PCA cell below rebuilds its input from period_data)

for pname, years in PERIODS.items():
    agg = aggregate_period(df, years, INDICATORS, min_non_null=2)
    imp, Z, scaler = impute_and_standardize(agg, INDICATORS)
    labels, km, metrics = kmeans_and_metrics(Z, INDICATORS, k=3)
    tab, interp = centroid_table(Z, labels, INDICATORS)

    # Plotting/export frame: z-scored indicators plus identifiers, cluster
    # label and assignment margin. Rows stay aligned with `agg` because all
    # frames share the same 0..n-1 index.
    plot_df = pd.DataFrame(Z[INDICATORS], columns=INDICATORS)
    plot_df["Country"] = agg["Country Name"].values
    plot_df["Country Code"] = agg["Country Code"].values
    plot_df["Region"] = agg["Region"].values
    plot_df["Period"] = pname
    plot_df["Cluster"] = labels
    plot_df["Cluster_Margin"] = margin_scores(Z[INDICATORS].values, km)
    # raw_* columns hold the median-imputed, unstandardized period means for
    # hover text (raw_GDP is therefore on the log10 scale set at load time).
    for c in INDICATORS:
        plot_df[f"raw_{c}"] = imp[c].values

    period_data[pname] = dict(
        agg=agg, imp=imp, Z=Z, labels=labels, km=km, metrics=metrics,
        centroids=tab, interpretation=interp, plot=plot_df
    )
    concat_for_global.append(plot_df.copy())

# Quality summary table: one row of clustering metrics per period.
metrics_rows = []
for pname in PERIODS.keys():
    m = period_data[pname]["metrics"]
    metrics_rows.append(dict(
        Period=pname,
        Silhouette=m["Silhouette"],
        Calinski_Harabasz=m["Calinski_Harabasz"],
        Davies_Bouldin=m["Davies_Bouldin"],
        Inertia=m["Inertia"]
    ))
metrics_df = pd.DataFrame(metrics_rows)
# utf-8-sig writes a BOM at the start of the file.
metrics_df.to_csv(os.path.join(OUTDIR, "metrics_k3_per_period.csv"), index=False, encoding="utf-8-sig")
metrics_df


Unnamed: 0,Period,Silhouette,Calinski_Harabasz,Davies_Bouldin,Inertia
0,2000–2009,0.165417,39.674613,1.929406,1581.730202
1,2010–2014,0.183307,44.309683,1.812164,1546.672591
2,2015–2019,0.204246,42.822958,1.417863,1560.094477


# 5. Label stability across periods (Adjusted Rand Index)

In [15]:
# --- Block 5: stability across time (ARI) ---
def ari_between(period_A, period_B):
    """Adjusted Rand Index of the cluster labels of two periods.

    Only countries present in both periods (matched on "Country Name") are
    compared.

    Returns
    -------
    (ari, n_common) where ``n_common`` is the number of shared countries;
    ``ari`` is NaN when fewer than five countries are shared.
    """
    def _labelled(period):
        entry = period_data[period]
        return entry["agg"][["Country Name"]].assign(label=entry["labels"])

    shared = _labelled(period_A).merge(
        _labelled(period_B), on="Country Name", suffixes=("_A", "_B")
    )
    n_shared = len(shared)
    if n_shared < 5:
        return np.nan, n_shared
    return adjusted_rand_score(shared["label_A"], shared["label_B"]), n_shared

# Label agreement between consecutive periods.
ari_00_10, n1 = ari_between("2000–2009","2010–2014")
ari_10_15, n2 = ari_between("2010–2014","2015–2019")

# One row per transition, assembled column-wise.
stability_table = pd.DataFrame({
    "From→To": ["2000–2009 → 2010–2014", "2010–2014 → 2015–2019"],
    "ARI": [ari_00_10, ari_10_15],
    "CommonCountries": [n1, n2],
})
stability_table.to_csv(
    os.path.join(OUTDIR,"ari_stability.csv"),
    index=False,
    encoding="utf-8-sig",
)
stability_table


Unnamed: 0,From→To,ARI,CommonCountries
0,2000–2009 → 2010–2014,0.527848,258
1,2010–2014 → 2015–2019,0.293064,259


# 6. Global PCA for comparable axes (applied to all periods together)

In [16]:
# --- Block 6: one PCA fitted on all periods stacked together ---
# Stacking the per-period plot frames gives every period the same PC axes.
plot_all = pd.concat(
    [period_data[p]["plot"] for p in PERIODS],
    ignore_index=True,
)

# The PCA input is the z-scored indicator columns (each period was
# standardized separately); the raw_* hover columns are not part of it.
Zmat = plot_all[INDICATORS].values
pca = PCA(n_components=2, random_state=RANDOM_STATE)
PC = pca.fit_transform(Zmat)
plot_all["PC1"], plot_all["PC2"] = PC[:, 0], PC[:, 1]

# Persist the share of variance captured by each of the two components.
expl_var = pca.explained_variance_ratio_
pca_var_payload = {"PC1": float(expl_var[0]), "PC2": float(expl_var[1])}
with open(os.path.join(OUTDIR,"pca_var.json"), "w", encoding="utf-8") as f:
    json.dump(pca_var_payload, f, indent=2)

expl_var


array([0.20674855, 0.17201505])

# 7. HD-grade interactive PCA (facet by period, rich hover, confidence)

In [17]:
# --- Block 7: interactive PCA cluster map (one facet per period) ---
# Columns shown in the hover tooltip; reused by the k-selector cell below.
hover_cols = ["Country","Region","Cluster_Margin"] + [f"raw_{c}" for c in INDICATORS]

# Plotly Express maps a numeric colour column to a continuous colour bar,
# which is meaningless for cluster ids and leaves the legend title unused.
# Casting the labels to strings (on a temporary copy, so plot_all keeps its
# integer column) yields one discrete colour and legend entry per cluster.
cluster_order = [str(c) for c in sorted(plot_all["Cluster"].unique())]

fig = px.scatter(
    plot_all.assign(Cluster=plot_all["Cluster"].astype(str)),
    x="PC1", y="PC2",
    color="Cluster",
    size="raw_GDP",  # imputed period mean of log10(GDP)
    facet_col="Period",
    category_orders={"Cluster": cluster_order},
    hover_data=hover_cols,
    title="PCA Clustering by Period (2000–2009, 2010–2014, 2015–2019)"
)
fig.update_layout(title_x=0.5, legend_title_text="Cluster", font=dict(size=12))

# Show inline and save the interactive HTML into OUTDIR.
save_and_show(fig, "pca_clusters_timelineB.html")


Saved interactive HTML → Task1_grouped_data_output\pca_clusters_timelineB.html


# 8. Cluster centroid radar (z-profiles) per period

In [18]:
# --- Block 8: centroid radar per period (z-score profiles) [UPDATED] ---
def radar_from_centroids(cent_df, period_name):
    """Draw one closed radar trace per cluster from its centroid z-scores.

    Parameters
    ----------
    cent_df : centroid table with a "cluster" column and one column per
        indicator.
    period_name : label appended to the figure title.

    Returns
    -------
    The Plotly figure. Seven indicators are plotted; "GDP" is not among them.
    """
    axes = ["Inflation","Unemployment","Trade","Debt","Credit_to_Private_Sector","NPLs","GDP_Growth"]
    # Repeat the first axis at the end so every polygon closes on itself.
    closed_axes = axes + axes[:1]

    fig = go.Figure()
    for _, centroid in cent_df.iterrows():
        trace = go.Scatterpolar(
            r=[centroid[axis] for axis in closed_axes],
            theta=closed_axes,
            name=f"Cluster {int(centroid['cluster'])}",
            mode="lines+markers",
        )
        fig.add_trace(trace)

    fig.update_layout(
        title=f"Cluster Centroid Profiles (z-score) – {period_name}",
        polar=dict(radialaxis=dict(visible=True)),
        showlegend=True,
    )
    return fig

for pname in PERIODS:
    # The en dash of the period label is swapped for a hyphen in the file name.
    radar_html = f"radar_{pname.replace('–','-')}.html"
    f = radar_from_centroids(period_data[pname]["centroids"], pname)
    # Show inline and save the interactive HTML.
    save_and_show(f, radar_html)


Saved interactive HTML → Task1_grouped_data_output\radar_2000-2009.html


Saved interactive HTML → Task1_grouped_data_output\radar_2010-2014.html


Saved interactive HTML → Task1_grouped_data_output\radar_2015-2019.html


# 9. k-selector: compare k ∈ {2,3,4,5}

In [19]:
# --- Block 9: optional k-selector robustness [UPDATED] ---
from sklearn.cluster import KMeans

def k_labels_for_period(Z_sub, ks=(2,3,4,5)):
    """Fit one KMeans model per candidate k and collect the labels.

    Parameters
    ----------
    Z_sub : frame holding the standardized INDICATORS columns of one period.
    ks : iterable of cluster counts to try.

    Returns
    -------
    dict mapping each k to the label array of its fitted model
    (50 restarts, seeded with RANDOM_STATE).
    """
    features = Z_sub[INDICATORS].values
    return {
        k: KMeans(n_clusters=k, n_init=50, random_state=RANDOM_STATE).fit(features).labels_
        for k in ks
    }

# period label -> {k -> labels}, computed on that period's rows of plot_all.
labels_by_k = {}
for pname in PERIODS:
    Z_sub = plot_all.loc[plot_all["Period"] == pname, INDICATORS]
    labels_by_k[pname] = k_labels_for_period(Z_sub)

def make_fig_for_k(k):
    """Return the faceted PCA scatter coloured by the k-cluster labels.

    A copy of plot_all receives a "ClusterK" column filled period by period
    from labels_by_k; plot_all itself is left untouched.
    """
    frame = plot_all.copy()
    for period in PERIODS:
        in_period = frame["Period"] == period
        frame.loc[in_period, "ClusterK"] = labels_by_k[period][k]

    scatter = px.scatter(
        frame,
        x="PC1",
        y="PC2",
        color="ClusterK",
        size="raw_GDP",
        facet_col="Period",
        hover_data=hover_cols,
        title=f"PCA Clustering – k={k}",
    )
    return scatter

# One figure carries the traces of every candidate k; the dropdown only
# switches which group of traces is visible. The "update" method expects a
# dict of trace-attribute updates and a dict of layout updates, so each
# button passes a per-trace visibility mask plus the matching title.
K_CHOICES = [2, 3, 4, 5]
DEFAULT_K = 3

fig_k = make_fig_for_k(DEFAULT_K)
trace_k = [DEFAULT_K] * len(fig_k.data)  # which k each trace belongs to
for k in K_CHOICES:
    if k == DEFAULT_K:
        continue
    other = make_fig_for_k(k)
    other.update_traces(visible=False)  # hidden until selected
    fig_k.add_traces(other.data)
    trace_k.extend([k] * len(other.data))

fig_k.update_layout(
    updatemenus=[dict(
        type="dropdown", x=0.02, y=1.1,
        # Pre-select the entry that matches the traces shown initially.
        active=K_CHOICES.index(DEFAULT_K),
        buttons=[dict(label=f"k={k}", method="update",
                      args=[{"visible": [owner == k for owner in trace_k]},
                            {"title.text": f"PCA Clustering – k={k}"}])
                 for k in K_CHOICES]
    )],
    title_x=0.5
)

# Show inline and save the interactive HTML.
save_and_show(fig_k, "pca_clusters_k_selector.html")


Saved interactive HTML → Task1_grouped_data_output\pca_clusters_k_selector.html


# 10. Export evidence tables for the report

In [20]:
# --- Block 10: write the evidence tables for the report into OUTDIR ---
# Summary tables (the same two files written by the earlier cells).
for csv_name, table in (
    ("metrics_k3_per_period.csv", metrics_df),
    ("ari_stability.csv", stability_table),
):
    table.to_csv(os.path.join(OUTDIR, csv_name), index=False, encoding="utf-8-sig")

# Per-period tables: (key in period_data, file-name prefix).
# File names keep the en dash of the period label.
per_period_exports = (
    ("centroids", "centroids"),
    ("interpretation", "drivers"),
    ("plot", "plotdata"),
)
for pname in PERIODS:
    for key, prefix in per_period_exports:
        target = os.path.join(OUTDIR, f"{prefix}_{pname}.csv")
        period_data[pname][key].to_csv(target, index=False, encoding="utf-8-sig")

print("Saved files in:", OUTDIR)
print(sorted(os.listdir(OUTDIR)))


Saved files in: Task1_grouped_data_output
['ari_stability.csv', 'centroids_2000–2009.csv', 'centroids_2010–2014.csv', 'centroids_2015–2019.csv', 'drivers_2000–2009.csv', 'drivers_2010–2014.csv', 'drivers_2015–2019.csv', 'metrics_k3_per_period.csv', 'pca_clusters_k_selector.html', 'pca_clusters_timelineB.html', 'pca_var.json', 'plotdata_2000–2009.csv', 'plotdata_2010–2014.csv', 'plotdata_2015–2019.csv', 'radar_2000-2009.html', 'radar_2010-2014.html', 'radar_2015-2019.html']
