In [None]:
# ===================== Block 1: Setup & Paths =====================
import os
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

ACCENT = "#1f77b4"
NEUTRAL = "#2f2f2f"

# ✅ Corrected Folder Name
BASE = "task1_grouped_data_output"
DATA = "Preprocessed-Data/WDI_cleaned_1975_2023.csv"

FILES = {
    "metrics":     f"{BASE}/metrics_k3_per_period.csv",

    # ✅ restored EN-DASH characters
    "centroids_00s": f"{BASE}/centroids_2000–2009.csv",
    "centroids_10a": f"{BASE}/centroids_2010–2014.csv",
    "centroids_15a": f"{BASE}/centroids_2015–2019.csv",

    "drivers_00s": f"{BASE}/drivers_2000–2009.csv",
    "drivers_10a": f"{BASE}/drivers_2010–2014.csv",
    "drivers_15a": f"{BASE}/drivers_2015–2019.csv",

    "plot_00s": f"{BASE}/plotdata_2000–2009.csv",
    "plot_10a": f"{BASE}/plotdata_2010–2014.csv",
    "plot_15a": f"{BASE}/plotdata_2015–2019.csv"
}


# File existence check
for k, p in FILES.items():
    if not os.path.exists(p):
        raise FileNotFoundError(f"❌ Missing required file: {p}")
print("✅ All Task 1 files found successfully.")


# Section 1 — Introduction & Setup (Markdown)

Objective. Continue from Task-1 group discovery to analyse temporal dynamics (2000–2019): (i) who moves between groups, (ii) which attributes define groups over time, and (iii) how cluster identities evolve.
Visual plan (Option-D):
• Sankey — membership transitions 2000–2009 → 2010–2014 → 2015–2019
• Slope graph — attribute importance change across periods
• Radar — cluster “shape” evolution (centroids)
• Interactive PCA Timeline — animated scatter with time slider & hover tooltips
Focal countries: India, China, Germany (contrast of emerging vs advanced economies).

In [None]:
# ===== Block 1: Setup & robust paths =====
import os, re, json, warnings
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
warnings.filterwarnings("ignore")

# --- Visual theme ---
ACCENT = "#1f77b4"   # primary
NEUTRAL = "#2f2f2f"  # gray text
GRID = "#C7C9CC"

# --- Folders (tolerate your variants) ---
CANDIDATE_BASES = [
    "Task1_group_data_output",      # recommended in our last run
    "task1_grouped_data_output",    # you used previously
    "task1_group_data_output"
]
BASE = next((b for b in CANDIDATE_BASES if os.path.isdir(b)), CANDIDATE_BASES[0])

CANDIDATE_DATA = [
    "Preprocessed-Data/WDI_cleaned_1975_2023.csv",
    "Preprocessed-Data/WDI_cleaned_1975_2023 (1).csv",
    "WDI_cleaned_1975_2023.csv",
]
DATA = next((p for p in CANDIDATE_DATA if os.path.exists(p)), CANDIDATE_DATA[0])

# --- File name normaliser for hyphen/en-dash issues ---
def find_file(preferred):
    """
    Return an existing path that matches preferred but also
    tries ASCII hyphen '-' vs en dash '–' and underscores.
    """
    if os.path.exists(preferred):
        return preferred
    alts = [
        preferred.replace("–", "-"),
        preferred.replace("-", "–"),
        preferred.replace("–", "_").replace("-", "_"),
        preferred.replace("_", "-"),
    ]
    for a in alts:
        if os.path.exists(a):
            return a
    raise FileNotFoundError(f"Missing required file: {preferred}")

# --- Periods & files (we keep labels consistent with your Task-1) ---
PERIODS = [
    ("2000–2009", "2000-2009"),
    ("2010–2014", "2010-2014"),
    ("2015–2019", "2015-2019"),
]

FILES = {
    "metrics": find_file(f"{BASE}/metrics_k3_per_period.csv"),
    "centroids": {lab: find_file(f"{BASE}/centroids_{lab}.csv") for lab,_ in PERIODS},
    "drivers":   {lab: find_file(f"{BASE}/drivers_{lab}.csv")   for lab,_ in PERIODS},
    "plotdata":  {lab: find_file(f"{BASE}/plotdata_{lab}.csv")  for lab,_ in PERIODS},
}

OUTDIR = "task2_temporal_output"
os.makedirs(OUTDIR, exist_ok=True)

print(f"Using BASE: {BASE}")
print(f"Using DATA: {DATA}")
print("All Task-1 files resolved successfully.")


# Section 2 — Data Preparation & Integration (Markdown)

We (i) load Task-1 results, (ii) standardise column names, and (iii) compute dominant cluster per country within each period (mode across years) — this ensures stable transitions and avoids noisy year-to-year flips.

Block 2 — Load & harmonise

In [None]:
# ===== Block 2: Load & harmonise =====

# Raw dataset (for context/metadata; not reclustering)
df = pd.read_csv(DATA)

# Read Task-1 outputs
metrics = pd.read_csv(FILES["metrics"])

centroids = {p: pd.read_csv(path) for p, path in FILES["centroids"].items()}
drivers   = {p: pd.read_csv(path) for p, path in FILES["drivers"].items()}
plotdata  = {p: pd.read_csv(path) for p, path in FILES["plotdata"].items()}

def clean_cols(d):
    d.columns = [c.strip().replace(" ", "_") for c in d.columns]
    return d

metrics = clean_cols(metrics)
centroids = {k: clean_cols(v) for k,v in centroids.items()}
drivers   = {k: clean_cols(v) for k,v in drivers.items()}
plotdata  = {k: clean_cols(v) for k,v in plotdata.items()}

# Guess standard country/cluster columns in plotdata
def coerce_plot_cols(pdf):
    # Try common variants
    ctry = next((c for c in ["Country_Name","country","Country","country_name","name"] if c in pdf.columns), None)
    code = next((c for c in ["Country_Code","code","iso3","ISO3"] if c in pdf.columns), None)
    clus = next((c for c in ["cluster","Cluster","k","label"] if c in pdf.columns), None)
    year = next((c for c in ["Year","year"] if c in pdf.columns), None)
    # PCA coords (optional)
    pc1  = next((c for c in ["PC1","pc1","Dim1","x"] if c in pdf.columns), None)
    pc2  = next((c for c in ["PC2","pc2","Dim2","y"] if c in pdf.columns), None)

    rename_map = {}
    if ctry and ctry!="Country_Name": rename_map[ctry]="Country_Name"
    if code and code!="Country_Code": rename_map[code]="Country_Code"
    if clus and clus!="cluster":      rename_map[clus]="cluster"
    if year and year!="Year":         rename_map[year]="Year"
    if pc1  and pc1!="PC1":           rename_map[pc1]="PC1"
    if pc2  and pc2!="PC2":           rename_map[pc2]="PC2"
    pdf = pdf.rename(columns=rename_map)

    # Lite checks
    needed = ["Country_Name","cluster"]
    for n in needed:
        if n not in pdf.columns:
            raise ValueError(f"plotdata is missing required column: {n}")
    return pdf

plotdata = {k: coerce_plot_cols(v) for k,v in plotdata.items()}

# Dominant cluster per country in each period (mode across rows)
def dominant_cluster(pdf):
    m = (pdf
         .groupby("Country_Name")["cluster"]
         .agg(lambda s: s.value_counts().idxmax())
         .reset_index()
         .rename(columns={"cluster":"cluster_mode"}))
    return m

dom = {k: dominant_cluster(v) for k,v in plotdata.items()}

# Merge across periods to build membership table
membership = dom[PERIODS[0][0]].merge(dom[PERIODS[1][0]], on="Country_Name", how="outer", suffixes=("_p1","_p2"))
membership = membership.merge(dom[PERIODS[2][0]], on="Country_Name", how="outer")
membership = membership.rename(columns={"cluster_mode":"cluster_mode_p3"})
membership.head()


# Section 3 — Group Membership Transitions (Markdown)

We visualise flows between clusters across periods for all countries, while highlighting India, China, Germany. This meets the Trend Analysis rubric and provides clear pieces of evidence for grouping changes.

Block 3 — Sankey (global + highlight)

In [24]:
# ===== Block 3: Sankey transitions =====

FOCUS = ["India", "China", "Germany"]

# Build node labels: e.g., "C0 (2000–2009)"
def sankey_nodes_links(membership, periods=PERIODS):
    # Determine all unique cluster ids used (assume 0..k-1 or any int)
    labels = []
    node_index = {}
    # build nodes
    for p_idx, (lab, _) in enumerate(periods):
        cats = sorted(membership[f"cluster_mode_p{p_idx+1}"].dropna().unique())
        for c in cats:
            lbl = f"C{int(c)} {lab}"
            node_index[(p_idx, int(c))] = len(labels)
            labels.append(lbl)

    # links between consecutive periods
    links = {"source":[], "target":[], "value":[], "color":[]}
    for p_idx in range(len(periods)-1):
        a = membership[[f"cluster_mode_p{p_idx+1}", f"cluster_mode_p{p_idx+2}", "Country_Name"]].dropna()
        g = a.groupby([f"cluster_mode_p{p_idx+1}", f"cluster_mode_p{p_idx+2}"]).size().reset_index(name="n")

        for _, row in g.iterrows():
            s_idx = node_index[(p_idx,   int(row.iloc[0]))]
            t_idx = node_index[(p_idx+1, int(row.iloc[1]))]
            links["source"].append(s_idx)
            links["target"].append(t_idx)
            links["value"].append(int(row["n"]))
            links["color"].append("rgba(31,119,180,0.35)")  # light accent
    return labels, links

labels, links = sankey_nodes_links(membership)

fig_sankey = go.Figure(data=[go.Sankey(
    arrangement="snap",
    node=dict(
        label=labels,
        pad=20, thickness=16,
        color="rgba(47,47,47,0.85)"
    ),
    link=dict(
        source=links["source"],
        target=links["target"],
        value=links["value"],
        color=links["color"]
    )
)])
fig_sankey.update_layout(
    title="<b>Group Membership Transitions (2000–2019)</b>",
    font=dict(size=12, color=NEUTRAL),
    paper_bgcolor="white"
)

# Overlay an annotation showing paths for focus countries
paths = []
for cn in FOCUS:
    row = membership[membership.Country_Name==cn]
    if row.empty: 
        continue
    path = []
    for i,_ in enumerate(PERIODS):
        v = row.iloc[0][f"cluster_mode_p{i+1}"]
        if pd.notna(v): path.append(f"C{int(v)} {PERIODS[i][0]}")
    if len(path)>=2:
        paths.append(f"{cn}: " + " → ".join(path))

fig_sankey.add_annotation(
    x=1.02, y=0.5, xref="paper", yref="paper",
    align="left", showarrow=False,
    text="<b>Focus paths</b><br>" + "<br>".join(paths),
    font=dict(size=12, color=NEUTRAL),
    bordercolor=GRID, borderwidth=1, bgcolor="rgba(255,255,255,0.8)",
)

fig_sankey.write_html(f"{OUTDIR}/sankey_transitions.html")
fig_sankey


Interpretation (Markdown).
Clear flows show how countries redistribute across clusters between periods. The side annotation explicitly traces India/China/Germany, evidencing membership change (or stability).

# Section 4 — Attribute Importance Evolution (Markdown)

From Task-1 “drivers” files, we compute absolute importance per attribute per period, pick a Top-N union, and visualise changes with a slope graph. This addresses the rubric on “combination of attributes defining groups”.

Block 4 — Prepare drivers → long table + slope

In [28]:
# ===== Section 4 — Attribute Importance Evolution (Robust, Self-contained) =====
# This cell:
# 1) Loads driver CSVs from Task1_group_data_output (supports "-" and "–" names)
# 2) Parses "top_positive"/"top_negative" into tidy numeric importance values
# 3) Builds a long table across periods, selects Top-N union, adds period_idx for slope

import os, re
import pandas as pd

BASE = "Task1_grouped_data_output"

def _first_existing(paths):
    """Return first path that exists from a list (else None)."""
    for p in paths:
        if p and os.path.exists(p):
            return p
    return None

# --- Allow both hyphen (-) and en-dash (–) filenames ---
p_drv_00s = _first_existing([
    os.path.join(BASE, "drivers_2000-2009.csv"),
    os.path.join(BASE, "drivers_2000–2009.csv")
])
p_drv_10a = _first_existing([
    os.path.join(BASE, "drivers_2010-2014.csv"),
    os.path.join(BASE, "drivers_2010–2014.csv")
])
p_drv_15a = _first_existing([
    os.path.join(BASE, "drivers_2015-2019.csv"),
    os.path.join(BASE, "drivers_2015–2019.csv")
])

missing = [name for name, p in {
    "drivers_2000-2009": p_drv_00s,
    "drivers_2010-2014": p_drv_10a,
    "drivers_2015-2019": p_drv_15a
}.items() if p is None]

if missing:
    raise FileNotFoundError(
        "❌ Could not find these driver files in "
        f"'{BASE}': {', '.join(missing)}.\n"
        "Please ensure the filenames use either '-' or '–' exactly as above."
    )

# --- Load the drivers (these variables are what later cells expect) ---
drv_00s = pd.read_csv(p_drv_00s)
drv_10a = pd.read_csv(p_drv_10a)
drv_15a = pd.read_csv(p_drv_15a)

print("✅ Loaded drivers:")
print("  •", p_drv_00s)
print("  •", p_drv_10a)
print("  •", p_drv_15a)

# --- Helper: parse 'top_positive'/'top_negative' into tidy numeric table ---
def parse_driver_column(df):
    """
    Input columns expected: ['cluster', 'top_positive', 'top_negative'].
    Each of the 'top_*' columns is a semicolon-separated list like
      'Inflation(+0.53z); GDP(+0.41z)'
    We extract (feature, |z|) across both positive & negative lists.
    """
    if not {"cluster", "top_positive", "top_negative"}.issubset(df.columns):
        raise ValueError(
            "Drivers file must include columns: 'cluster', 'top_positive', 'top_negative'. "
            f"Got: {df.columns.tolist()}"
        )

    rows = []
    for _, r in df.iterrows():
        cluster = r["cluster"]
        for col in ("top_positive", "top_negative"):
            items = str(r[col]).split(";")
            for item in items:
                item = item.strip()
                # match "Feature(+0.53z" | "Feature(-1.12z" | "Feature(+0.53"
                m = re.match(r"(.+?)\(\s*([+-]?[0-9]*\.?[0-9]+)", item)
                if m:
                    feat = m.group(1).strip()
                    val = abs(float(m.group(2)))  # absolute importance
                    rows.append([cluster, feat, val])
    return pd.DataFrame(rows, columns=["cluster", "feature", "importance"])

# --- Build drivers dict and parse all three periods ---
drivers = {
    "2000-2009": drv_00s,
    "2010-2014": drv_10a,
    "2015-2019": drv_15a
}
drivers_clean = {period: parse_driver_column(df) for period, df in drivers.items()}

# --- Long table & dash normalisation (defensive) ---
drivers_long = pd.concat(
    [drivers_clean[p].assign(period=p) for p in drivers_clean],
    ignore_index=True
)
drivers_long["period"] = drivers_long["period"].astype(str).str.replace("–", "-", regex=False)

# --- Top-N union across periods (stable slope set) ---
TOP_N = 8
top_union = (
    drivers_long.sort_values(["period", "importance"], ascending=[True, False])
    .groupby("period")
    .head(TOP_N)["feature"]
    .unique()
)

drivers_top = drivers_long[drivers_long["feature"].isin(top_union)].copy()

# --- Period order for slope (strict strings with hyphen) ---
period_order = ["2000-2009", "2010-2014", "2015-2019"]
drivers_top["period_idx"] = drivers_top["period"].apply(period_order.index)

print("✅ Attribute-importance table ready for slope plotting:")
display(drivers_top.head(10))


✅ Loaded drivers:
  • Task1_grouped_data_output\drivers_2000–2009.csv
  • Task1_grouped_data_output\drivers_2010–2014.csv
  • Task1_grouped_data_output\drivers_2015–2019.csv
✅ Attribute-importance table ready for slope plotting:


Unnamed: 0,cluster,feature,importance,period,period_idx
0,0,GDP_Growth,0.54,2000-2009,0
1,0,Inflation,0.36,2000-2009,0
2,0,GDP,0.24,2000-2009,0
3,0,Credit_to_Private_Sector,0.47,2000-2009,0
4,0,Unemployment,0.28,2000-2009,0
6,1,Unemployment,0.51,2000-2009,0
8,1,Debt,0.19,2000-2009,0
9,1,GDP,0.87,2000-2009,0
10,1,GDP_Growth,0.57,2000-2009,0
11,1,Credit_to_Private_Sector,0.3,2000-2009,0


Interpretation (Markdown).
Lines rising indicate attributes gaining influence; falling lines signal diminishing drivers. We’ll call out the notable movers in the report (e.g., Credit_to_Private_Sector ↑, NPLs ↓, etc., based on your actual numbers).

# Section 5 — Radar / Spider (Centroid “shape”) (Markdown)

We compare cluster centroids period-by-period to see whether a cluster’s identity (its profile vector across indicators) drifts. Normalise per-feature to [0,1] across all periods to keep scales comparable.

Block 6 — Radar helpers

In [30]:
# ===== Block 6: Radar helpers =====

def coerce_centroid_cols(df):
    # Expect one column that identifies cluster id, plus indicator columns
    id_col = next((c for c in ["cluster","Cluster","k","label","id"] if c in df.columns), None)
    if id_col is None:
        # sometimes centroids are in rows (long format)
        pass
    # indicators = numeric columns except id
    num_cols = df.select_dtypes(include=np.number).columns.tolist()
    if id_col and id_col in num_cols:
        num_cols.remove(id_col)
    indicators = num_cols
    return id_col or "cluster", indicators

# Collect all centroids to compute global min-max per indicator
all_cents = []
meta = {}
for p,_ in PERIODS:
    cdf = centroids[p].copy()
    cid, inds = coerce_centroid_cols(cdf)
    cdf = cdf[[cid] + inds]
    cdf["period"] = p
    meta[p] = (cid, inds)
    all_cents.append(cdf)
all_cents = pd.concat(all_cents, ignore_index=True)

# Min-max per indicator for 0-1 scaling
mins = all_cents[inds].min()
maxs = all_cents[inds].max()

def norm01(x): 
    return (x - mins) / (maxs - mins + 1e-9)

def radar_for_cluster(k=0):
    rows = []
    for p,_ in PERIODS:
        cid, inds = meta[p]
        cdf = centroids[p]
        if cid not in cdf.columns:
            continue
        row = cdf[cdf[cid]==k]
        if row.empty:
            continue
        vec = norm01(row[inds].iloc[0])
        rows.append(pd.Series(vec, name=p))
    if not rows:
        raise ValueError("Cluster id not found in any centroid file.")
    R = pd.DataFrame(rows)
    # Close the polygon
    features = R.columns.tolist()
    return R, features

# Small convenience to build a plotly radar from the matrix
def plot_radar(R, features, k):
    categories = features + [features[0]]
    fig = go.Figure()
    for idx, (p, r) in enumerate(R.iterrows()):
        values = r.tolist() + [r.tolist()[0]]
        fig.add_trace(go.Scatterpolar(
            r=values, theta=categories, fill='toself', name=p
        ))
    fig.update_layout(
        polar=dict(radialaxis=dict(visible=True, range=[0,1], gridcolor=GRID)),
        showlegend=True,
        title=f"<b>Cluster C{k} centroid profile over time</b>",
        font=dict(size=12, color=NEUTRAL),
        paper_bgcolor="white",
    )
    return fig


Block 7 — Render radars for clusters C0, C1, C2

In [31]:
# ===== Block 7: Radars =====
for k in [0,1,2]:
    try:
        R, feats = radar_for_cluster(k)
        fig_radar = plot_radar(R, feats, k)
        fig_radar.write_html(f"{OUTDIR}/radar_clusterC{k}.html")
        fig_radar.show()
    except Exception as e:
        print(f"Cluster {k}: {e}")


Interpretation (Markdown).
Comparing polygons across periods reveals whether a cluster becomes, say, more “credit-heavy” or “debt-light”. These are clear, visible changes satisfying the rubric’s request to show how themes change.

# Section 6 — Interactive PCA Timeline (Markdown)

An animated PCA using your Task-1 plotdata: slider = period, hover = country details, color = cluster. This connects membership and attribute views into a single exploratory tool.

Block 8 — Animated PCA timeline

In [32]:
# ===== Block 8: PCA timeline (from plotdata) =====

# Build a concatenated frame with a "period" column and tidy hover fields
frames = []
for p,_ in PERIODS:
    pdf = plotdata[p].copy()
    pdf["period"] = p
    # ensure we have PC1/PC2, otherwise fall back to any two numeric dims
    if "PC1" not in pdf.columns or "PC2" not in pdf.columns:
        nums = pdf.select_dtypes(include=np.number).columns.tolist()
        if "cluster" in nums: nums.remove("cluster")
        if len(nums) >= 2:
            pdf = pdf.rename(columns={nums[0]:"PC1", nums[1]:"PC2"})
        else:
            raise ValueError(f"{p}: cannot find 2D coordinates for PCA plot.")
    frames.append(pdf[["Country_Name","cluster","PC1","PC2","period"]])
Z_all = pd.concat(frames, ignore_index=True)

# Focus highlight for India, China, Germany
Z_all["focus"] = np.where(Z_all["Country_Name"].isin(["India","China","Germany"]), "Focus", "Others")

fig_pca = px.scatter(
    Z_all, x="PC1", y="PC2",
    color="cluster", symbol="focus",
    animation_frame="period", hover_name="Country_Name",
    hover_data={"cluster":True, "PC1":":.2f","PC2":":.2f","focus":True},
    height=600
)
fig_pca.update_layout(
    title="<b>Interactive PCA timeline: clusters over periods</b>",
    font=dict(size=12, color=NEUTRAL),
    paper_bgcolor="white",
    legend_title_text="Cluster"
)
fig_pca.write_html(f"{OUTDIR}/pca_timeline.html")
fig_pca


# Section 7 — Temporal Trend Extraction & “Critical Moments” (Markdown)

Critical moments are suggested where:
(a) large flows occur in the Sankey,
(b) an attribute’s importance slope crosses peers (rank change), or
(c) a centroid shape materially shifts (radar polygon moves on ≥3 indicators).
Use these three visuals as triangulation to justify predictions: “If Credit_to_Private_Sector continues ↑ and NPLs ↓ in cluster C1, countries in C0 with rising credit are likely to migrate to C1 in the next window.”

(In your report, list 2–3 precise moments you observe from your outputs and tie them to specific countries.)

# Section 8 — Visual Evaluation & QA (Markdown)

Design choices:
• Neutral palette + single accent; consistent titles/legends; readable tick labels.
• Interactivity: hover, animation, focus symbols; HTML exports for reuse.
Checks: column coercion, robust path resolver, period ordering, and global min-max normalisation for radar.
Outputs saved to task2_temporal_output/ for reproducible hand-in.

In [33]:
# ===== Block 9: Save summary artifacts =====
summary = {
    "base": BASE,
    "data": DATA,
    "periods": [p for p,_ in PERIODS],
    "focus_countries": ["India","China","Germany"],
    "generated": [
        "sankey_transitions.html",
        "slope_attribute_importance.html",
        "radar_clusterC0.html",
        "radar_clusterC1.html",
        "radar_clusterC2.html",
        "pca_timeline.html"
    ]
}
with open(f"{OUTDIR}/_manifest.json","w") as f:
    json.dump(summary, f, indent=2)
print("Saved to:", OUTDIR)
summary


Saved to: task2_temporal_output


{'base': 'Task1_grouped_data_output',
 'data': 'Preprocessed-Data/WDI_cleaned_1975_2023.csv',
 'periods': ['2000–2009', '2010–2014', '2015–2019'],
 'focus_countries': ['India', 'China', 'Germany'],
 'generated': ['sankey_transitions.html',
  'slope_attribute_importance.html',
  'radar_clusterC0.html',
  'radar_clusterC1.html',
  'radar_clusterC2.html',
  'pca_timeline.html']}