# In [6]:  (Jupyter notebook cell prompt, kept as a comment so the file parses as Python)
# WGI corruption/governance “risk index” builder (CSV -> clean table -> risk score)
# Assumes your WGI CSV export is at the location set in WGI_PATH (see CONFIG below)
# Output: wgi_risk_index_<YEAR>.csv

import pandas as pd
import numpy as np

# =========================
# CONFIG
# =========================
WGI_PATH = "C:/Users/pnlna/Downloads/WGI_CSV/WGICSV.csv"
YEAR = "2023"  # must equal a year column header in WGICSV, e.g. "2022" or "2021"

# Core WGI estimate indicator codes, keyed by a short dimension name:
#   CC = Control of Corruption
#   RL = Rule of Law
#   GE = Government Effectiveness
#   PV = Political Stability / Absence of Violence
# Each code is the short name followed by ".EST" (the point estimate).
INDICATORS_EST = {dim: f"{dim}.EST" for dim in ("CC", "RL", "GE", "PV")}

# Confidence / data-quality indicator codes for the same dimensions:
# "<DIM>.NO.SRC" and "<DIM>.STD.ERR", keyed as "<DIM>_NO_SRC" / "<DIM>_STD_ERR".
# They are only used by the optional confidence-score step.
INDICATORS_META = {
    f"{dim}_{suffix.replace('.', '_')}": f"{dim}.{suffix}"
    for suffix in ("NO.SRC", "STD.ERR")
    for dim in INDICATORS_EST
}

# Risk weights per dimension (ideally these sum to 1.0).
WEIGHTS = {
    "CC": 0.4,
    "RL": 0.2,
    "GE": 0.2,
    "PV": 0.2,
}

# How estimates are rescaled to 0-1: "minmax" or "zscore"
# (minmax is the easier of the two to explain).
NORMALIZE = "minmax"

# =========================
# HELPERS
# =========================
def safe_minmax(series: pd.Series) -> pd.Series:
    """Min-max normalize a series to [0, 1], ignoring NaNs.

    Args:
        series: Values to rescale; they are cast to float first.

    Returns:
        A float series on the same index as the input. The minimum maps to
        0.0 and the maximum to 1.0; NaN inputs stay NaN. When no usable range
        exists (empty input, all-NaN input, constant input, or an infinite
        bound) every entry of the result is NaN.
    """
    s = series.astype(float)
    # Series.min/max skip NaNs and simply return NaN for empty or all-NaN
    # input, whereas np.nanmin/np.nanmax raise ValueError on empty input and
    # emit a RuntimeWarning on all-NaN input.
    mn, mx = s.min(), s.max()
    if np.isfinite(mn) and np.isfinite(mx) and mx > mn:
        return (s - mn) / (mx - mn)
    # Explicit float dtype keeps the fallback numeric even when it is empty.
    return pd.Series(np.nan, index=s.index, dtype=float)

def safe_zscore_to_unit(series: pd.Series) -> pd.Series:
    """Map values onto an approximate 0-1 scale: z-score first, then sigmoid.

    Args:
        series: Values to rescale; they are cast to float first.

    Returns:
        A series on the same index holding ``1 / (1 + exp(-z))`` where ``z``
        is each value's z-score (mean and standard deviation are computed
        with NaNs ignored). If the standard deviation is not a finite,
        positive number, an all-NaN series of the same length is returned.
    """
    values = series.astype(float)
    center = np.nanmean(values)
    spread = np.nanstd(values)

    # Guard: without a positive, finite spread the z-score is undefined.
    if not (np.isfinite(spread) and spread > 0):
        return pd.Series([np.nan] * len(values), index=values.index)

    standardized = (values - center) / spread
    return 1 / (1 + np.exp(-standardized))  # logistic sigmoid

# =========================
# LOAD + CLEAN
# =========================
# Read the raw WGI export from the configured path.
df = pd.read_csv(WGI_PATH)

# Fail fast if the identifying columns used below are absent.
required_cols = {"Country Name", "Country Code", "Indicator Code"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"WGICSV missing required columns: {missing}")

if YEAR not in df.columns:
    # show a hint of available year-like columns
    year_cols = [c for c in df.columns if c.isdigit()]
    raise ValueError(f"Year column '{YEAR}' not found. Example year columns: {year_cols[:10]} ...")

# Convert the year column to numeric; unparseable entries (WGI sometimes
# uses '..' for missing) become NaN instead of raising.
df[YEAR] = pd.to_numeric(df[YEAR], errors="coerce")

# Filter to only the indicator rows we need (estimates + meta indicators).
wanted_codes = set(INDICATORS_EST.values()) | set(INDICATORS_META.values())
df_small = df[df["Indicator Code"].isin(wanted_codes)].copy()

# Keep only essential columns + the chosen year
df_small = df_small[["Country Name", "Country Code", "Indicator Code", YEAR]]

# Pivot so each country is one row with one column per indicator code.
# aggfunc="first" keeps the first value per (country, indicator) pair.
wide = df_small.pivot_table(
    index=["Country Code", "Country Name"],
    columns="Indicator Code",
    values=YEAR,
    aggfunc="first",
).reset_index()

# pivot_table names the column axis "Indicator Code"; clear that name so it
# does not show up as a stray header when frames derived from `wide` are
# printed.
wide.columns.name = None

# =========================
# BUILD QUALITY + RISK COMPONENTS
# =========================
# Step 1: rescale every governance estimate to a 0-1 "quality" score
# (higher = better); the matching risk component is its complement.
for short_name, code in INDICATORS_EST.items():
    # An indicator that did not survive the pivot becomes an all-NaN column.
    if code not in wide.columns:
        wide[code] = np.nan

    if NORMALIZE not in ("minmax", "zscore"):
        raise ValueError("NORMALIZE must be 'minmax' or 'zscore'.")
    normalizer = safe_minmax if NORMALIZE == "minmax" else safe_zscore_to_unit

    quality_col = f"{short_name}_quality"
    wide[quality_col] = normalizer(wide[code])
    wide[f"{short_name}_risk_component"] = 1 - wide[quality_col]  # higher = higher risk

# Step 2: weighted risk index, accumulated from 0.0 in WEIGHTS order.
# A NaN risk component makes that row's risk_index NaN as well.
wide["risk_index"] = sum(
    (
        weight * wide[f"{dim}_risk_component"]
        for dim, weight in WEIGHTS.items()
    ),
    0.0,
)

# =========================
# OPTIONAL: ADD CONFIDENCE METRICS
# =========================
# Copy each meta indicator (NO.SRC, STD.ERR) that is present in the table to a
# lowercase alias column; these aliases feed the "confidence" heuristic below.
for meta_name, meta_code in INDICATORS_META.items():
    if meta_code not in wide.columns:
        continue
    wide[meta_name.lower()] = wide[meta_code]

# Rough, optional confidence heuristic: more sources and a smaller standard
# error both raise the score. Remove this section if it is not wanted.
source_count_cols = [col for col in wide.columns if col.endswith("_no_src")]
std_error_cols = [col for col in wide.columns if col.endswith("_std_err")]
if source_count_cols and std_error_cols:
    # Row-wise averages across the available dimensions (NaNs are skipped).
    mean_sources = wide[source_count_cols].mean(axis=1)
    mean_std_error = wide[std_error_cols].mean(axis=1)

    source_score = safe_minmax(mean_sources)            # 0..1, higher = more sources
    precision_score = 1 - safe_minmax(mean_std_error)   # inverted: lower error => higher score

    wide["confidence_score"] = 0.6 * source_score + 0.4 * precision_score

# =========================
# FINAL OUTPUT (clean columns)
# =========================
# Derive the exported columns from INDICATORS_EST instead of a hand-typed
# list, so the output stays in step with the configured indicators. For the
# default CC/RL/GE/PV configuration this yields the same columns in the same
# order: identifiers, risk_index, all risk components, then all qualities.
out_cols = ["Country Code", "Country Name", "risk_index"]
out_cols += [f"{short_name}_risk_component" for short_name in INDICATORS_EST]
out_cols += [f"{short_name}_quality" for short_name in INDICATORS_EST]
# confidence_score only exists when the optional confidence step produced it.
if "confidence_score" in wide.columns:
    out_cols.append("confidence_score")

out = wide[out_cols].copy()
out["year"] = int(YEAR)

# Sort: highest risk first
out = out.sort_values("risk_index", ascending=False)

# Write the full table next to the script / notebook working directory.
out_path = f"wgi_risk_index_{YEAR}.csv"
out.to_csv(out_path, index=False)

# Print only risk_index and country name for the 40 highest-risk rows.
output = out[["Country Name", "risk_index"]].copy()
print(output.head(40))

Indicator Code               Country Name  risk_index
175                           South Sudan    0.971355
183                  Syrian Arab Republic    0.948112
173                               Somalia    0.946646
209                           Yemen, Rep.    0.942027
166                                 Sudan    0.903505
109                                 Libya    0.882205
1                             Afghanistan    0.871569
203                         Venezuela, RB    0.870526
39                       Congo, Dem. Rep.    0.870178
32               Central African Republic    0.867878
90                                   Iraq    0.860063
128                               Myanmar    0.849624
84                                  Haiti    0.848041
184                                  Chad    0.824573
58                                Eritrea    0.822866
14                                Burundi    0.797177
156             Korea, Dem. People's Rep.    0.794904
107                         