In [1]:
# STEP 0 — Imports (run as is)
from pathlib import Path
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor


In [5]:
# HOTFIX: fija/detecta la carpeta de datos para que encuentre el target
from pathlib import Path

# Opción fija :
DATA_DIR = Path("udacity-wb-credit-forecast/data/raw")

# Si prefieres detección automática, descomenta el bloque de abajo:
# for p in [Path("../data/raw"),
#           Path("udacity-wb-credit-forecast/data/raw"),
#           Path("./udacity-wb-credit-forecast/data/raw"),
#           Path("./data/raw")]:
#     if (p / "credit_private_growth.csv").exists():
#         DATA_DIR = p
#         break

print("DATA_DIR →", DATA_DIR.resolve())
assert (DATA_DIR / "credit_private_growth.csv").exists(), \
    "No se encontró credit_private_growth.csv en DATA_DIR."

DATA_DIR → /workspace/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/data/raw


AssertionError: No se encontró credit_private_growth.csv en DATA_DIR.

In [6]:
# STEP 1 — Project params (CAMBIA AQUI si quieres otro pais ISO3)
COUNTRY = "COL"  # CAMBIA AQUI: ej "MEX", "PER", "CHL"
DATA_DIR = Path("../data/raw")
from pathlib import Path
print("CWD:", Path.cwd())
for p in [Path("../data/raw"),
          Path("udacity-wb-credit-forecast/data/raw"),
          Path("./data/raw")]:
    print(p.resolve(), "→", (p/"credit_private_growth.csv").exists())

FIG_DIR = Path("../figures"); FIG_DIR.mkdir(parents=True, exist_ok=True)

FILES = {
    "credit_growth": "credit_private_growth.csv",   # TARGET obligatorio
    "inflation":     "inflation_cpi.csv",
    "lending_rate":  "lending_rate.csv",
    "real_interest": "real_interest.csv",
    "broad_money":   "broad_money.csv",
    "unemployment":  "unemployment.csv",
    "population":    "population_growth.csv",       # opcional
}
print("Using data from:", DATA_DIR.resolve())


CWD: /workspace/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/notebooks
/workspace/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/data/raw → False
/workspace/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/data/raw → False
/workspace/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/notebooks/data/raw → False
Using data from: /workspace/udacity-wb-credit-forecast/notebooks/udacity-wb-credit-forecast/data/raw


In [7]:
# STEP 2 — Load CSVs 
def load_wide(path, value_name):
    df = pd.read_csv(path)
    year_cols = [c for c in df.columns if str(c).isdigit()]
    for c in year_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    tidy = df[["Country Code"] + year_cols].melt(id_vars=["Country Code"], value_vars=year_cols,
                                                 var_name="Year", value_name=value_name)
    tidy["Year"] = tidy["Year"].astype(int)
    return tidy

available = {k:v for k,v in FILES.items() if (DATA_DIR / v).exists()}
assert "credit_growth" in available, "Falta credit_private_growth.csv en ../data/raw/"
print("Available files:", available)

from functools import reduce
dfs = [load_wide(DATA_DIR/v, k) for k,v in available.items()]
merged = reduce(lambda L,R: pd.merge(L,R, on=["Country Code","Year"], how="outer"), dfs)

# filtro robusto (sin .query por el espacio en 'Country Code')
df = merged.loc[merged["Country Code"] == COUNTRY].sort_values("Year").reset_index(drop=True)
df.tail()


AssertionError: Falta credit_private_growth.csv en ../data/raw/

In [None]:
# STEP 3 — Quick EDA (time series)
num_cols = [c for c in df.columns if c not in ["Country Code","Year"]]
eda = df[["Year"] + num_cols].dropna()

for col in num_cols:
    plt.figure(figsize=(7,3))
    plt.plot(eda["Year"], eda[col])
    plt.title(f"{col} over time ({COUNTRY})")
    plt.xlabel("Year"); plt.ylabel(col)
    plt.tight_layout(); plt.savefig(FIG_DIR / f"ts_{col}.png"); plt.show()

if len(num_cols) >= 2:
    display(eda[num_cols].corr().round(2))


In [None]:
# STEP 4 — Feature engineering (t -> predict t+1)
feature_cols = [c for c in df.columns if c not in ["Country Code","Year","credit_growth"]]
work = df[["Year","credit_growth"] + feature_cols].copy()
work["target_t1"] = work["credit_growth"].shift(-1)

data = work.dropna(subset=feature_cols + ["target_t1"]).copy()
X = data[feature_cols]; y = data["target_t1"]; years = data["Year"].values
print(f"Samples: {len(data)} | Features: {feature_cols}")


In [None]:
# STEP 5 — Split time-aware + models
n = len(data); split = max(int(n*0.8), n-5)
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]
years_test = years[split:]

pre = ColumnTransformer([("num", StandardScaler(), feature_cols)], remainder="drop")

candidates = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "Lasso": Lasso(alpha=0.05, max_iter=10000),
    "RandomForest": RandomForestRegressor(n_estimators=400, random_state=42)
}

results, pipes = [], {}
for name, est in candidates.items():
    pipe = Pipeline([("prep", pre), ("model", est)])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    mae = mean_absolute_error(y_test, pred)
    rmse = mean_squared_error(y_test, pred, squared=False)
    mape = float(np.mean(np.abs((y_test - pred)/(np.abs(y_test)+1e-6))) * 100)
    results.append((name, mae, rmse, mape))
    pipes[name] = (pipe, pred)

res = pd.DataFrame(results, columns=["model","MAE","RMSE","MAPE"]).sort_values("RMSE")
display(res)
best = res.iloc[0]["model"]
best_pipe, best_pred = pipes[best]


In [None]:
# STEP 6 — Plots + importance
plt.figure(figsize=(8,3))
plt.plot(years_test, y_test.values, marker="o", label="Actual")
plt.plot(years_test, best_pred, marker="o", label="Predicted")
plt.title(f"Next-year credit growth — Actual vs Predicted ({COUNTRY}) [{best}]")
plt.xlabel("Year"); plt.ylabel("Credit growth (t+1, %)")
plt.legend(); plt.tight_layout(); plt.savefig(FIG_DIR / "pred_vs_actual.png"); plt.show()

if best in ["LinearRegression","Ridge","Lasso"]:
    importances = pd.Series(best_pipe.named_steps["model"].coef_, index=feature_cols).sort_values()
else:
    importances = pd.Series(best_pipe.named_steps["model"].feature_importances_, index=feature_cols).sort_values()

display(importances.sort_values(ascending=False))
plt.figure(figsize=(6,3))
importances.sort_values().plot(kind="barh")
plt.title(f"Feature importance ({best})")
plt.tight_layout(); plt.savefig(FIG_DIR / "feature_importance.png"); plt.show()


In [None]:
# STEP 7 — Scenario + save metrics
latest_X = X.iloc[[-1]]
latest_year = int(data.iloc[-1]["Year"])
scenario = float(best_pipe.predict(latest_X)[0])

metrics_tidy = res.copy()
metrics_tidy.to_csv(FIG_DIR / "metrics_table.csv", index=False)
res.set_index("model").to_csv(FIG_DIR / "summary_metrics.csv")

print(f"Scenario: Using year {latest_year} macro data for {COUNTRY}, predicted next-year credit growth is {scenario:.2f}%.")


## Copia para el README y el Blog (en inglés)
Best model: **PEGA AQUI**
RMSE / MAE / MAPE: **PEGA AQUI**
Scenario (frase anterior): **PEGA AQUI**
Las imagenes estan en ../figures/ (pred_vs_actual.png, feature_importance.png).
