# In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from pathlib import Path

# Widen pandas' console rendering so previews of wide frames stay readable.
for _opt_name, _opt_value in (("display.width", 120), ("display.max_columns", 50)):
    pd.set_option(_opt_name, _opt_value)

# -----------------------------------------
# HELPERS
# -----------------------------------------
def print_section(title):
    """Print ``title`` as a banner.

    The output is a blank line, a rule of ``=`` characters as long as the
    title, the title itself, and the same rule again.
    """
    rule = "=" * len(title)
    print(f"\n{rule}\n{title}\n{rule}")

def show_df(name, df, n=5):
    """Print a labelled preview of ``df``.

    Args:
        name: Label shown in square brackets on the header line.
        df: Object exposing ``.shape`` and ``.head(n)`` (a pandas DataFrame
            in this script).
        n: Number of leading rows to print.
    """
    print("\n[{}] shape={}".format(name, df.shape))
    preview = df.head(n)
    print(preview)

def nz(x):
    """Return ``x`` unchanged, except that a value equal to zero becomes 1.0.

    The script uses it as a divisor guard when standardizing series, so a
    zero standard deviation does not trigger a division by zero.
    """
    if x == 0:
        return 1.0
    return x

# -----------------------------------------
# STEP 0 — Load data
# -----------------------------------------
base = Path(".")  # change if needed

print_section("STEP 0 — Load data")
m_master = pd.read_csv(base / "vn_monthly_master_new.csv", parse_dates=["Date"])
m_shocks = pd.read_csv(base / "vn_monthly_with_shocks_new.csv", parse_dates=["Date"])

# Robust read for Google Trends (encoding can vary): try pandas' default
# decoding first and fall back to Latin-1 only when decoding fails.
gtr_path = base / "google_trends_combined_2013_2024.csv"
try:
    gtr = pd.read_csv(gtr_path)
except UnicodeDecodeError:
    gtr = pd.read_csv(gtr_path, encoding="ISO-8859-1")

show_df("Monthly master (no shocks)", m_master)
show_df("Monthly with shocks", m_shocks)
show_df("Google Trends (raw)", gtr)

# Normalize name and parse.
# The first column is taken to be the date column whatever its header is
# ("Month", "date", "DATE", ...). The comparison is against the exact target
# name on purpose: a case-insensitive test would leave a header such as
# "date" unrenamed, and the gtr["Date"] lookup below would raise KeyError.
first_col = gtr.columns[0]
if first_col != "Date":
    gtr = gtr.rename(columns={first_col: "Date"})
# Unparseable dates become NaT; those rows are dropped.
gtr["Date"] = pd.to_datetime(gtr["Date"], errors="coerce")
gtr = gtr.dropna(subset=["Date"]).copy()

print("\n[Google Trends] columns:", list(gtr.columns))
print("[Google Trends] date range:", gtr["Date"].min().date(), "→", gtr["Date"].max().date())

# -----------------------------------------
# STEP 1 — Clean & standardize Google Trends, build AI indices
# -----------------------------------------
print_section("STEP 1 — Clean & standardize Google Trends; build AI indices")

# Every column other than Date is treated as a search-interest series.
gt_numeric_cols = [c for c in gtr.columns if c != "Date"]
if not gt_numeric_cols:
    # Without at least one series neither index below can be built.
    raise ValueError("Google Trends data has no series columns besides 'Date'")
for c in gt_numeric_cols:
    # Non-numeric cells become NaN and are filled further down.
    gtr[c] = pd.to_numeric(gtr[c], errors="coerce")

# Restrict to 2013–2024 and order chronologically, so the forward/backward
# fill below borrows values from neighbouring months rather than from
# whatever row order the file happened to have.
gtr = gtr[(gtr["Date"].dt.year >= 2013) & (gtr["Date"].dt.year <= 2024)].sort_values("Date").copy()

# Fill small gaps (ffill()/bfill() replace the deprecated fillna(method=...)).
gtr[gt_numeric_cols] = gtr[gt_numeric_cols].ffill().bfill()

show_df("Google Trends (coerced & trimmed)", gtr)

# Z-score each series (population std; nz() guards a constant series).
gtr_z = gtr.copy()
for c in gt_numeric_cols:
    s = gtr_z[c].astype(float)
    std = s.std(ddof=0)
    gtr_z[c] = (s - s.mean()) / nz(std)

# Equal-weighted z-average index
gtr_z["AI_index_zavg"] = gtr_z[gt_numeric_cols].mean(axis=1)

# PCA(1) via SVD of the z-scored (hence already centred) series
X = gtr_z[gt_numeric_cols].to_numpy()
if X.shape[1] == 1:
    # A single series is its own first component.
    pc1 = (X[:, 0] - X[:, 0].mean()) / nz(X[:, 0].std(ddof=0))
    var_ratio = np.array([1.0])
else:
    U, S, Vt = np.linalg.svd(X, full_matrices=False)
    pc1 = U[:, 0] * S[0]
    # SVD fixes singular vectors only up to sign. Orient PC1 so its loadings
    # sum to a non-negative number, i.e. it co-moves with the z-average
    # index; otherwise the sign of any coefficient on it is arbitrary.
    if Vt[0].sum() < 0:
        pc1 = -pc1
    pc1 = (pc1 - pc1.mean()) / nz(pc1.std(ddof=0))
    var_ratio = (S**2) / (S**2).sum()

print(f"\n[PCA] share of variance explained by PC1: {var_ratio[0]:.3f}")

gtr_z["AI_index_pc1"] = pc1
gtr_idx = gtr_z[["Date", "AI_index_zavg", "AI_index_pc1"]].copy()
show_df("AI Indices (for merge)", gtr_idx)

# -----------------------------------------
# STEP 2 — Build analysis panel (merge monthly shocks + AI indices)
# -----------------------------------------
print_section("STEP 2 — Merge AI indices into monthly with shocks + build Age65_z_m")

# Left merge keeps every month of the shock file; months without a Google
# Trends row get NaN indices.
panel = m_shocks.merge(gtr_idx, on="Date", how="left").sort_values("Date").reset_index(drop=True)

# Trim to the analysis window BEFORE standardizing, so Age65_z_m has mean 0
# and unit variance on the sample actually used (the Google Trends series
# are likewise z-scored after trimming).
panel = panel[(panel["Date"].dt.year >= 2013) & (panel["Date"].dt.year <= 2024)].copy()

if "Age65_percent" in panel.columns:
    s = panel["Age65_percent"].astype(float)
    panel["Age65_z_m"] = (s - s.mean()) / nz(s.std(ddof=0))
else:
    # Keep the column so downstream column selection stays uniform.
    panel["Age65_z_m"] = np.nan

# The merge is an exact match on Date: if the two files stamp months
# differently (e.g. first vs. last day of month) nothing matches and the
# indices are silently all NaN. Surface that instead of failing later.
if panel["AI_index_pc1"].notna().sum() == 0:
    print("\n[WARNING] No Google Trends dates matched the monthly panel; "
          "AI indices are all NaN (check that both files use the same date convention).")

show_df("Analysis panel (merged)", panel)

# Columns exported for the regressions; any that are absent are skipped.
analysis_cols = [
    "Date",
    "IIP_yoy_pct","IIP_mom_pct","IIP_level",
    "CPI_yoy_pct","CPI_mom_pct","CPI_level",
    "Credit_yoy_pct","Credit_stock_VNDbn",
    "ON_avg_monthly_interbank_rate",
    "MP_shock_monthly",
    "Age65_percent","Age65_z_m",
    "AI_index_zavg","AI_index_pc1"
]
panel_out = panel[[c for c in analysis_cols if c in panel.columns]].copy()
panel_out.to_csv(base / "vn_monthly_analysis_panel.csv", index=False)
print("\nSaved: vn_monthly_analysis_panel.csv  | shape:", panel_out.shape)

# -----------------------------------------
# STEP 3A — Baseline Local Projections (short horizons only, no interactions)
# -----------------------------------------
# Announce the baseline local-projection stage in the console output.
print_section("STEP 3A — Baseline Local Projections (short horizons)")

def baseline_lp(df, dep_col, shock_col="MP_shock_monthly",
                credit_col="Credit_yoy_pct", horizons=(1,3,6)):
    """Estimate baseline local projections of ``dep_col`` on the shock.

    For every horizon ``h`` the change ``y[t+h] - y[t-1]`` is regressed by
    OLS on a constant, the shock at ``t``, credit growth at ``t`` (only when
    ``credit_col`` exists in ``df``) and ``y[t-1]``. Standard errors are HAC
    with ``maxlags=h``.

    Leads and lags are built with positional ``shift``; this assumes ``df``
    is sorted by date at a regular frequency, which is not checked here.

    Args:
        df: Panel holding ``dep_col``, ``shock_col`` and optionally
            ``credit_col``.
        dep_col: Name of the outcome column.
        shock_col: Name of the shock column.
        credit_col: Name of the credit-growth control. If the column is
            absent the control is left out of the regression.
        horizons: Iterable of integer horizons.

    Returns:
        DataFrame with one row per horizon and columns ``h``, ``nobs``,
        ``R2``, ``beta_MP``, ``se_beta_MP``. A horizon without enough
        complete observations to fit the model gets NaN statistics instead
        of raising.
    """
    lag_y = df[dep_col].shift(1)

    # The regressor matrix does not depend on h, so build it once. An absent
    # control is omitted: filling it with NaN would invalidate every row.
    regressors = {"shock": df[shock_col].astype(float)}
    if credit_col in df.columns:
        regressors["credit_growth"] = df[credit_col].astype(float)
    regressors["lag_y"] = lag_y
    X = pd.DataFrame(regressors)
    x_complete = X.notna().all(axis=1)
    n_params = X.shape[1] + 1  # regressors + constant

    out_rows = []
    for h in horizons:
        y_lead = df[dep_col].shift(-h) - lag_y

        valid = y_lead.notna() & x_complete
        n_obs = int(valid.sum())
        if n_obs <= n_params:
            # Too few rows to estimate; report the horizon as missing.
            print(f"dep={dep_col} | h={h} | skipped: only {n_obs} usable observations")
            out_rows.append({"h": h, "nobs": n_obs, "R2": np.nan,
                             "beta_MP": np.nan, "se_beta_MP": np.nan})
            continue

        Y = y_lead[valid]
        Xv = sm.add_constant(X[valid])

        mod = sm.OLS(Y, Xv).fit(cov_type="HAC", cov_kwds={"maxlags": h})
        out_rows.append({
            "h": h,
            "nobs": int(mod.nobs),
            "R2": float(mod.rsquared),
            "beta_MP": mod.params.get("shock", np.nan),
            "se_beta_MP": mod.bse.get("shock", np.nan)
        })
        print(f"dep={dep_col} | h={h} | β={mod.params.get('shock', np.nan):.3f} (SE={mod.bse.get('shock', np.nan):.3f})")
    return pd.DataFrame(out_rows)

# Run the baseline LP for each outcome that exists in the exported panel;
# keep each result table in memory and write it to its own CSV.
baseline_results = {}
for dep in ("IIP_yoy_pct", "CPI_yoy_pct"):
    if dep not in panel_out.columns:
        continue
    res = baseline_lp(panel_out, dep_col=dep)
    baseline_results[dep] = res
    res.to_csv(base / "baseline_lp_{}.csv".format(dep), index=False)

# -----------------------------------------
# STEP 3B — Extended LP with Age/AI interactions (short horizons only)
# -----------------------------------------
# Announce the extended (interaction) local-projection stage in the console output.
print_section("STEP 3B — Extended LP with ageing & AI interactions")

def local_projection_irf(df, dep_col, shock_col="MP_shock_monthly", age_col="Age65_z_m",
                         ai_col="AI_index_pc1", credit_col="Credit_yoy_pct",
                         nlags=6, horizons=(1,3,6)):
    """Estimate local projections with shock×ageing and shock×AI interactions.

    For every horizon ``h`` the change ``y[t+h] - y[t-1]`` is regressed by
    OLS on a constant, the shock at ``t``, the shock times ``age_col``, the
    shock times ``ai_col``, credit growth at ``t`` and ``y[t-1]``. Standard
    errors are HAC with ``maxlags=h``.

    Optional columns (``age_col``, ``ai_col``, ``credit_col``) that are
    absent from ``df`` or contain no data are left out of the regression,
    and their coefficients are reported as NaN.

    Leads and lags are built with positional ``shift``; this assumes ``df``
    is sorted by date at a regular frequency, which is not checked here.

    Args:
        df: Panel holding ``dep_col``, ``shock_col`` and the optional columns.
        dep_col: Name of the outcome column.
        shock_col: Name of the shock column.
        age_col: Name of the ageing moderator.
        ai_col: Name of the AI-index moderator.
        credit_col: Name of the credit-growth control.
        nlags: Accepted for interface compatibility; currently not used by
            the estimation.
        horizons: Iterable of integer horizons.

    Returns:
        DataFrame with one row per horizon and columns ``h``, ``nobs``,
        ``R2``, ``beta_MP``, ``se_beta_MP``, ``gamma_MPxAge``,
        ``se_gamma_MPxAge``, ``delta_MPxAI``, ``se_delta_MPxAI``. A horizon
        without enough complete observations gets NaN statistics instead of
        raising.
    """
    def _usable(col):
        # A column helps only if it exists and has at least one value; an
        # all-NaN regressor would invalidate every row of the sample.
        return col in df.columns and df[col].notna().any()

    stat_keys = ("beta_MP", "se_beta_MP", "gamma_MPxAge", "se_gamma_MPxAge",
                 "delta_MPxAI", "se_delta_MPxAI")

    shock = df[shock_col].astype(float)
    lag_y = df[dep_col].shift(1)

    # The regressor matrix does not depend on h, so build it once.
    # NOTE(review): the moderators enter only through their interaction with
    # the shock, without their own level terms; confirm this is intended.
    regressors = {"shock": shock}
    if _usable(age_col):
        regressors["shock_x_age"] = shock * df[age_col].astype(float)
    if _usable(ai_col):
        regressors["shock_x_ai"] = shock * df[ai_col].astype(float)
    if _usable(credit_col):
        regressors["credit_growth"] = df[credit_col].astype(float)
    regressors["lag_y"] = lag_y
    X = pd.DataFrame(regressors)
    x_complete = X.notna().all(axis=1)
    n_params = X.shape[1] + 1  # regressors + constant

    out_rows = []
    for h in horizons:
        y_lead = df[dep_col].shift(-h) - lag_y

        valid = y_lead.notna() & x_complete
        n_obs = int(valid.sum())
        if n_obs <= n_params:
            # Too few rows to estimate; report the horizon as missing.
            print(f"dep={dep_col} | h={h} | skipped: only {n_obs} usable observations")
            row = {"h": h, "nobs": n_obs, "R2": np.nan}
            row.update({key: np.nan for key in stat_keys})
            out_rows.append(row)
            continue

        Y = y_lead[valid]
        Xv = sm.add_constant(X[valid])

        mod = sm.OLS(Y, Xv).fit(cov_type="HAC", cov_kwds={"maxlags": h})
        out_rows.append({
            "h": h, "nobs": int(mod.nobs), "R2": float(mod.rsquared),
            "beta_MP": mod.params.get("shock", np.nan), "se_beta_MP": mod.bse.get("shock", np.nan),
            "gamma_MPxAge": mod.params.get("shock_x_age", np.nan), "se_gamma_MPxAge": mod.bse.get("shock_x_age", np.nan),
            "delta_MPxAI": mod.params.get("shock_x_ai", np.nan), "se_delta_MPxAI": mod.bse.get("shock_x_ai", np.nan)
        })
        print(f"dep={dep_col} | h={h} | β={mod.params.get('shock', np.nan):.3f} "
              f"(SE={mod.bse.get('shock', np.nan):.3f}) γ={mod.params.get('shock_x_age', np.nan):.3f} "
              f"δ={mod.params.get('shock_x_ai', np.nan):.3f}")
    return pd.DataFrame(out_rows)

# Run the extended LP for each outcome that exists in the exported panel;
# keep each result table in memory and write it to its own CSV.
extended_results = {}
for dep in ("IIP_yoy_pct", "CPI_yoy_pct"):
    if dep not in panel_out.columns:
        continue
    res = local_projection_irf(panel_out, dep_col=dep)
    extended_results[dep] = res
    res.to_csv(base / "extended_lp_{}.csv".format(dep), index=False)

print("\nDONE.")


# Recorded notebook output (appears to come from an earlier version of the cell):
# SyntaxError: '(' was never closed (1676598164.py, line 170)