In [None]:
import pandas as pd
from pathlib import Path

# Locate the project root: the nearest directory, starting at the notebook's
# working directory and walking upward, that contains a "data" folder.
HERE = Path.cwd()

p = next((c for c in (HERE, *HERE.parents) if (c / "data").exists()), None)
if p is None:
    # Previously the search fell through to the filesystem root and the
    # failure only surfaced later as a confusing path inside read_csv.
    raise FileNotFoundError(
        f"No 'data' directory found in {HERE} or any of its parent directories."
    )

CSV_PATH = p / "data" / "interim" / "IMF_WEO_A_panel.csv"

print("Notebook cwd:", HERE)
print("Detected project root:", p)
print("Trying to load:", CSV_PATH)

# Fail early with an actionable message if the interim panel is missing.
if not CSV_PATH.is_file():
    raise FileNotFoundError(f"Expected interim panel not found: {CSV_PATH}")

df = pd.read_csv(CSV_PATH)

print("Loaded:", df.shape)
df.head()

Loaded: (775, 11)


Unnamed: 0,REF_AREA,TIME_PERIOD,IMF_WEO_NGDP_RPCH,IMF_WEO_LUR,IMF_WEO_LP,IMF_WEO_PCPIPCH,IMF_WEO_TM_RPCH,IMF_WEO_TX_RPCH,IMF_WEO_BCA_NGDPD,IMF_WEO_NGAP_NPGDP,IMF_WEO_NGSD_NGDP
0,AUT,1980,2.314,1.6,7.54,6.325,6.17,5.227,-4.776,0.785,25.919
1,AUT,1981,-0.099,2.2,7.556,6.807,-0.771,5.063,-4.338,-1.579,24.598
2,AUT,1982,1.908,3.1,7.565,5.44,-4.667,1.591,1.003,-1.475,23.647
3,AUT,1983,2.804,3.7,7.543,3.335,5.691,3.638,0.389,-0.502,21.978
4,AUT,1984,0.332,3.8,7.544,5.665,10.05,6.334,-0.266,-2.322,23.015


In [None]:
1) Build target

In [8]:
# Add the t+1 target columns to the panel.
# NOTE(review): build_target is not defined in any cell shown in this notebook;
# presumably it is imported or defined in a cell that was removed — confirm the
# notebook still passes Restart Kernel -> Run All.
target_kwargs = {
    "country_col": "REF_AREA",
    "year_col": "TIME_PERIOD",
    "gdp_growth_col": "IMF_WEO_NGDP_RPCH",
}
df = build_target(df, **target_kwargs)

# Preview the identifiers, current growth and the three derived target columns.
preview_cols = [
    "REF_AREA",
    "TIME_PERIOD",
    "IMF_WEO_NGDP_RPCH",
    "gdp_growth_tplus1",
    "GDP_accel_tplus1",
    "Decel_flag_tplus1",
]
print(df[preview_cols].head(10))

# Mean of the flag column.
print("Class balance:", df["Decel_flag_tplus1"].mean())

  REF_AREA  TIME_PERIOD  IMF_WEO_NGDP_RPCH  gdp_growth_tplus1  \
0      AUT         1980              2.314             -0.099   
1      AUT         1981             -0.099              1.908   
2      AUT         1982              1.908              2.804   
3      AUT         1983              2.804              0.332   
4      AUT         1984              0.332              2.243   
5      AUT         1985              2.243              2.341   
6      AUT         1986              2.341              1.681   
7      AUT         1987              1.681              0.961   
8      AUT         1988              0.961              3.887   
9      AUT         1989              3.887              4.346   

   GDP_accel_tplus1  Decel_flag_tplus1  
0            -2.413                  1  
1             2.007                  0  
2             0.896                  0  
3            -2.472                  1  
4             1.911                  0  
5             0.098                  0

2) Define features (6 of the 9 WEO indicators in the panel)

In [9]:
# Model inputs: six of the WEO indicator columns loaded from the panel.
FEATURES = [
    "IMF_WEO_LUR",        # unemployment rate
    "IMF_WEO_LP",         # population (WEO subject code LP)
    "IMF_WEO_PCPIPCH",    # inflation
    "IMF_WEO_TM_RPCH",    # imports growth
    "IMF_WEO_TX_RPCH",    # exports growth
    "IMF_WEO_NGDP_RPCH"   # GDP growth
]

# Binary label used by every model below.
TARGET = "Decel_flag_tplus1"

# Fail fast if a feature column is absent: the printed check alone only lists
# the columns that ARE present and would silently hide a missing one.
missing_features = [c for c in FEATURES if c not in df.columns]
if missing_features:
    raise KeyError(f"Feature columns missing from df: {missing_features}")

print("Features available check:", [c for c in FEATURES if c in df.columns])

Features available check: ['IMF_WEO_LUR', 'IMF_WEO_LP', 'IMF_WEO_PCPIPCH', 'IMF_WEO_TM_RPCH', 'IMF_WEO_TX_RPCH', 'IMF_WEO_NGDP_RPCH']


Train Logistic Regression (time-based split) + Metrics

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, recall_score, confusion_matrix, classification_report

# Chronological hold-out: fit on every year up to and including `split_year`,
# evaluate on all later years.
split_year = 2010

years = df["TIME_PERIOD"]
train_df = df.loc[years <= split_year].copy()
test_df = df.loc[years > split_year].copy()

print("Train:", train_df.shape, " Test:", test_df.shape)
print("Train years:", train_df["TIME_PERIOD"].min(), "→", train_df["TIME_PERIOD"].max())
print("Test years :", test_df["TIME_PERIOD"].min(), "→", test_df["TIME_PERIOD"].max())

X_train, y_train = train_df[FEATURES], train_df[TARGET].astype(int)
X_test, y_test = test_df[FEATURES], test_df[TARGET].astype(int)

# Standardise the features, then fit a logistic regression on them.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("logit", LogisticRegression(max_iter=2000)),
])
model.fit(X_train, y_train)

# Second column of predict_proba, turned into 0/1 labels at a 0.50 cut-off.
proba = model.predict_proba(X_test)[:, 1]
pred = (proba >= 0.50).astype(int)

auc_value = roc_auc_score(y_test, proba)
f1_value = f1_score(y_test, pred)
recall_value = recall_score(y_test, pred)

print("\nAUC   :", auc_value)
print("F1    :", f1_value)
print("Recall:", recall_value)

print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred))
print("\nClassification Report:\n", classification_report(y_test, pred, digits=3))

Train: (529, 14)  Test: (227, 14)
Train years: 1980 → 2010
Test years : 2011 → 2022

AUC   : 0.7954935287696865
F1    : 0.6262626262626263
Recall: 0.512396694214876

Confusion Matrix:
 [[91 15]
 [59 62]]

Classification Report:
               precision    recall  f1-score   support

           0      0.607     0.858     0.711       106
           1      0.805     0.512     0.626       121

    accuracy                          0.674       227
   macro avg      0.706     0.685     0.669       227
weighted avg      0.712     0.674     0.666       227



Show top coefficients (interpretation)

In [11]:
# Pull the fitted coefficients out of the pipeline's "logit" step and list
# them per feature, largest first.
logit_step = model.named_steps["logit"]
coef = logit_step.coef_.ravel()

coef_df = pd.DataFrame({"feature": FEATURES, "coef": coef})
coef_df = coef_df.sort_values("coef", ascending=False)
coef_df

Unnamed: 0,feature,coef
5,IMF_WEO_NGDP_RPCH,1.072426
1,IMF_WEO_LP,0.399092
2,IMF_WEO_PCPIPCH,0.075754
4,IMF_WEO_TX_RPCH,-0.110589
3,IMF_WEO_TM_RPCH,-0.167776
0,IMF_WEO_LUR,-0.296622


In [None]:
Expanding Window Evaluation

In [12]:
# Expanding-window evaluation: for each test year, train on all earlier years
# and score that single year.
results = []

for test_year in range(2000, 2022):

    train_df = df[df["TIME_PERIOD"] <= test_year - 1]
    test_df = df[df["TIME_PERIOD"] == test_year]

    # Nothing to score for a year with no rows.
    if test_df.empty:
        continue

    X_train, y_train = train_df[FEATURES], train_df[TARGET].astype(int)
    X_test, y_test = test_df[FEATURES], test_df[TARGET].astype(int)

    model = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("logit", LogisticRegression(max_iter=2000)),
    ])
    model.fit(X_train, y_train)

    proba = model.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)

    # AUC is only computed when both classes occur in the test year.
    auc = roc_auc_score(y_test, proba) if y_test.nunique() == 2 else None

    year_row = dict(
        year=test_year,
        auc=auc,
        f1=f1_score(y_test, pred, zero_division=0),
        recall=recall_score(y_test, pred, zero_division=0),
        actual_rate=y_test.mean(),
        pred_rate=pred.mean(),
    )
    results.append(year_row)

cv_results = pd.DataFrame(results)

cv_results.head()

Unnamed: 0,year,auc,f1,recall,actual_rate,pred_rate
0,2000,0.942857,0.8,0.714286,0.736842,0.578947
1,2001,0.577778,0.555556,0.555556,0.473684,0.473684
2,2002,0.329545,0.315789,0.272727,0.578947,0.421053
3,2003,0.884615,0.769231,0.833333,0.315789,0.368421
4,2004,0.488095,0.545455,0.5,0.631579,0.526316


In [None]:
Average Performance

In [13]:
# Mean of each per-year metric across the expanding-window folds.
for metric_label, metric_col in (("AUC", "auc"), ("F1", "f1"), ("Recall", "recall")):
    print(f"Average {metric_label}:", cv_results[metric_col].mean())

Average AUC: 0.7382360630425646
Average F1: 0.4773907936826598
Average Recall: 0.45528201666676205


Average AUC ≈ 0.74 → Strong for annual macro panel

F1 ≈ 0.48

Recall ≈ 0.46

Interpretation:

The model ranks deceleration risk well (AUC strong).

But threshold 0.5 is too conservative → recall is low.

This is normal in early warning systems.

In [None]:
Find Best Threshold (target: recall ≥ 0.65)

In [14]:
import numpy as np

# Candidate probability cut-offs.
thresholds = np.linspace(0.1, 0.9, 17)

# Step 1 - fit once per evaluation year. The fitted model and its predicted
# probabilities do not depend on the threshold, so refitting inside the
# threshold loop (as before) repeated identical work 17 times.
yearly_scores = []
for year in cv_results["year"]:
    train_df = df[df["TIME_PERIOD"] <= year - 1]
    test_df = df[df["TIME_PERIOD"] == year]

    model = Pipeline([
        ("scaler", StandardScaler()),
        ("logit", LogisticRegression(max_iter=2000))
    ])
    model.fit(train_df[FEATURES], train_df[TARGET].astype(int))

    yearly_scores.append((
        test_df[TARGET].astype(int),
        model.predict_proba(test_df[FEATURES])[:, 1],
    ))

# Step 2 - sweep the thresholds over the cached per-year probabilities and
# average recall / F1 across years (same year order as before).
best = []
for t in thresholds:
    recalls = []
    f1s = []

    for y_true_year, proba_year in yearly_scores:
        pred_year = (proba_year >= t).astype(int)
        recalls.append(recall_score(y_true_year, pred_year, zero_division=0))
        f1s.append(f1_score(y_true_year, pred_year, zero_division=0))

    best.append({
        "threshold": t,
        "recall": np.mean(recalls),
        "f1": np.mean(f1s)
    })

threshold_df = pd.DataFrame(best)
threshold_df.sort_values("recall", ascending=False).head()

Unnamed: 0,threshold,recall,f1
0,0.1,0.943872,0.632469
1,0.15,0.921091,0.631631
2,0.2,0.864324,0.622515
3,0.25,0.848881,0.624141
4,0.3,0.815993,0.625728


In [None]:
Explicit Train/Test Definition (Clean Version)

In [20]:


# Inclusive year windows for the single out-of-sample evaluation.
train_start, train_end = 1980, 2010
test_start, test_end = 2011, 2021

# Series.between is inclusive on both ends by default.
in_train_window = df["TIME_PERIOD"].between(train_start, train_end)
in_test_window = df["TIME_PERIOD"].between(test_start, test_end)

train_df = df[in_train_window].copy()
test_df = df[in_test_window].copy()

print("Train shape:", train_df.shape)
print("Test shape :", test_df.shape)
print("Train years:", train_df["TIME_PERIOD"].min(), "→", train_df["TIME_PERIOD"].max())
print("Test years :", test_df["TIME_PERIOD"].min(), "→", test_df["TIME_PERIOD"].max())

Train shape: (529, 14)
Test shape : (209, 14)
Train years: 1980 → 2010
Test years : 2011 → 2021


In [None]:
Prepare X / y

In [21]:
# Feature matrices and integer-coded labels for the train and test windows.
X_train, X_test = train_df[FEATURES], test_df[FEATURES]
y_train, y_test = train_df[TARGET].astype(int), test_df[TARGET].astype(int)

In [None]:
Train Final Model (Single Out-of-Sample Evaluation)

In [22]:
# Scale-then-logit pipeline fitted on the training window only.
model = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("logit", LogisticRegression(max_iter=2000)),
])
model.fit(X_train, y_train)

# Score the hold-out window and classify at a 0.5 cut-off.
proba_test = model.predict_proba(X_test)[:, 1]
pred_test = (proba_test >= 0.5).astype(int)

test_auc = roc_auc_score(y_test, proba_test)
test_f1 = f1_score(y_test, pred_test)
test_recall = recall_score(y_test, pred_test)

print("Test AUC:", test_auc)
print("Test F1:", test_f1)
print("Test Recall:", test_recall)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred_test))

Test AUC: 0.7879648287232094
Test F1: 0.5952380952380952
Test Recall: 0.4854368932038835

Confusion Matrix:
 [[91 15]
 [53 50]]


In [None]:
Add precision + classification report

In [23]:
from sklearn.metrics import precision_score, classification_report

# Precision at the 0.5 cut-off, followed by the per-class report.
test_precision = precision_score(y_test, pred_test)
test_report = classification_report(y_test, pred_test, digits=3)

print("Test Precision:", test_precision)
print("\nClassification Report:\n", test_report)

Test Precision: 0.7692307692307693

Classification Report:
               precision    recall  f1-score   support

           0      0.632     0.858     0.728       106
           1      0.769     0.485     0.595       103

    accuracy                          0.675       209
   macro avg      0.701     0.672     0.662       209
weighted avg      0.700     0.675     0.663       209



In [None]:
Threshold Tuning on TEST (maximize Recall / best F1)

In [24]:
import numpy as np
from sklearn.metrics import f1_score, recall_score, precision_score

# Precision / recall / F1 / predicted-positive rate for each cut-off.
# NOTE: the cut-offs are compared on the test labels themselves, so the best
# row here is an optimistic estimate of out-of-sample performance.
ths = np.linspace(0.10, 0.90, 17)

rows = []
for t in ths:
    pred_t = (proba_test >= t).astype(int)
    metrics_at_t = dict(
        threshold=t,
        precision=precision_score(y_test, pred_t, zero_division=0),
        recall=recall_score(y_test, pred_t, zero_division=0),
        f1=f1_score(y_test, pred_t, zero_division=0),
        pred_rate=pred_t.mean(),
    )
    rows.append(metrics_at_t)

thr_df = pd.DataFrame(rows)
thr_df = thr_df.sort_values("f1", ascending=False)
thr_df.head(10)

Unnamed: 0,threshold,precision,recall,f1,pred_rate
5,0.35,0.674242,0.864078,0.757447,0.631579
6,0.4,0.698276,0.786408,0.739726,0.555024
4,0.3,0.615894,0.902913,0.732283,0.722488
3,0.25,0.588957,0.932039,0.721805,0.779904
7,0.45,0.752688,0.679612,0.714286,0.444976
2,0.2,0.566474,0.951456,0.710145,0.827751
1,0.15,0.540984,0.961165,0.692308,0.875598
0,0.1,0.520833,0.970874,0.677966,0.91866
8,0.5,0.769231,0.485437,0.595238,0.311005
9,0.55,0.803922,0.398058,0.532468,0.244019


In [None]:
Calibration

In [25]:
from sklearn.calibration import calibration_curve

# Calibration curve of the test probabilities, requested with 10 bins.
prob_true, prob_pred = calibration_curve(y_test, proba_test, n_bins=10)

# One row per returned bin: mean predicted probability next to observed frequency.
calib_df = pd.DataFrame({"mean_pred_prob": prob_pred}).assign(actual_freq=prob_true)
calib_df

Unnamed: 0,mean_pred_prob,actual_freq
0,0.058922,0.176471
1,0.150445,0.105263
2,0.252156,0.227273
3,0.348961,0.342857
4,0.4541,0.607843
5,0.547679,0.791667
6,0.648404,0.625
7,0.753156,0.733333
8,0.837694,1.0
9,0.957522,1.0


In [None]:
Save Risk Scores for Reporting (Country-Year EWS table)

In [26]:
# Example tuned cut-off reported alongside the default 0.5 one.
tuned_t = 0.35

# Country-year table: predicted probability plus the 0/1 call at each cut-off.
test_out = test_df[["REF_AREA", "TIME_PERIOD"]].assign(
    proba_decel=proba_test,
    pred_05=pred_test,
    **{f"pred_{tuned_t:.2f}": (proba_test >= tuned_t).astype(int)},
)

test_out.sort_values(["REF_AREA", "TIME_PERIOD"]).head(20)

Unnamed: 0,REF_AREA,TIME_PERIOD,proba_decel,pred_05,pred_0.35
31,AUT,2011,0.497636,0,1
32,AUT,2012,0.359314,0,1
33,AUT,2013,0.315165,0,0
34,AUT,2014,0.331033,0,0
35,AUT,2015,0.34981,0,0
36,AUT,2016,0.416627,0,1
37,AUT,2017,0.435005,0,1
38,AUT,2018,0.458363,0,1
39,AUT,2019,0.40763,0,1
40,AUT,2020,0.076477,0,0


In [27]:
# Single source of truth for the output file, so the file written and the
# confirmation message cannot drift apart (same pattern as the baseline cell).
out_path = "test_predictions_2011_2021.csv"
test_out.to_csv(out_path, index=False)
print("Saved:", out_path)

Saved: test_predictions_2011_2021.csv


Metric calculated (R2 OR Precision/Recall)

In [None]:
Metrics code (AUC / Precision / Recall / F1)

In [28]:
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

# Score the hold-out set with the fitted model and classify at a 0.50 cut-off.
proba_test = model.predict_proba(X_test)[:, 1]
pred_test = (proba_test >= 0.50).astype(int)

# AUC is computed from the probabilities; the other metrics from the 0/1 labels.
print("AUC      :", roc_auc_score(y_test, proba_test))
label_metrics = (
    ("Precision:", precision_score),
    ("Recall   :", recall_score),
    ("F1       :", f1_score),
)
for metric_label, metric_fn in label_metrics:
    print(metric_label, metric_fn(y_test, pred_test, zero_division=0))

print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred_test))
print("\nClassification Report:\n", classification_report(y_test, pred_test, digits=3))

AUC      : 0.7879648287232094
Precision: 0.7692307692307693
Recall   : 0.4854368932038835
F1       : 0.5952380952380952

Confusion Matrix:
 [[91 15]
 [53 50]]

Classification Report:
               precision    recall  f1-score   support

           0      0.632     0.858     0.728       106
           1      0.769     0.485     0.595       103

    accuracy                          0.675       209
   macro avg      0.701     0.672     0.662       209
weighted avg      0.700     0.675     0.663       209



Different threshold (Early Warning = higher recall)

In [29]:
# Cut-off to inspect; lowering it flags more country-years as positive.
t = 0.35
pred_t = (proba_test >= t).astype(int)

precision_t = precision_score(y_test, pred_t, zero_division=0)
recall_t = recall_score(y_test, pred_t, zero_division=0)
f1_t = f1_score(y_test, pred_t, zero_division=0)

print("Threshold:", t)
print("Precision:", precision_t)
print("Recall   :", recall_t)
print("F1       :", f1_t)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, pred_t))

Threshold: 0.35
Precision: 0.6742424242424242
Recall   : 0.8640776699029126
F1       : 0.7574468085106383

Confusion Matrix:
 [[63 43]
 [14 89]]


Automatic threshold table (best F1 / best Recall)

In [30]:
import numpy as np
import pandas as pd

# Same threshold sweep as a table, ranked by F1.
scorers = (
    ("precision", precision_score),
    ("recall", recall_score),
    ("f1", f1_score),
)

rows = []
for t in np.linspace(0.10, 0.90, 17):
    pred = (proba_test >= t).astype(int)
    scores = {name: fn(y_test, pred, zero_division=0) for name, fn in scorers}
    rows.append({"threshold": t, **scores, "pred_rate": pred.mean()})

thr_df = pd.DataFrame(rows).sort_values("f1", ascending=False)
thr_df.head(10)

Unnamed: 0,threshold,precision,recall,f1,pred_rate
5,0.35,0.674242,0.864078,0.757447,0.631579
6,0.4,0.698276,0.786408,0.739726,0.555024
4,0.3,0.615894,0.902913,0.732283,0.722488
3,0.25,0.588957,0.932039,0.721805,0.779904
7,0.45,0.752688,0.679612,0.714286,0.444976
2,0.2,0.566474,0.951456,0.710145,0.827751
1,0.15,0.540984,0.961165,0.692308,0.875598
0,0.1,0.520833,0.970874,0.677966,0.91866
8,0.5,0.769231,0.485437,0.595238,0.311005
9,0.55,0.803922,0.398058,0.532468,0.244019


In [32]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score

# Re-score the hold-out set and report the four headline metrics at 0.5.
proba_test = model.predict_proba(X_test)[:, 1]
pred_test = (proba_test >= 0.5).astype(int)

headline = [
    ("AUC:", roc_auc_score(y_test, proba_test)),
    ("Precision:", precision_score(y_test, pred_test, zero_division=0)),
    ("Recall:", recall_score(y_test, pred_test, zero_division=0)),
    ("F1:", f1_score(y_test, pred_test, zero_division=0)),
]
for metric_label, metric_value in headline:
    print(metric_label, metric_value)

AUC: 0.7879648287232094
Precision: 0.7692307692307693
Recall: 0.4854368932038835
F1: 0.5952380952380952


In [33]:
from sklearn.metrics import r2_score

# R2 between the true 0/1 labels and the 0/1 predictions at a 0.5 cut-off.
# r2_score is a regression metric; it is reported here only because the
# section heading asks for "R2 OR Precision/Recall".
proba_test = model.predict_proba(X_test)[:, 1]
pred_test = (proba_test >= 0.5).astype(int)

r2_from_labels = r2_score(y_test, pred_test)
print("R2 (using class predictions 0/1):", r2_from_labels)

R2 (using class predictions 0/1): -0.3017036087195457


Improve Recall by tuning threshold

In [35]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Threshold sweep on the hold-out probabilities, shown ranked by recall.
proba_test = model.predict_proba(X_test)[:, 1]

rows = []
for t in np.linspace(0.10, 0.90, 17):
    pred = (proba_test >= t).astype(int)
    row = {"threshold": t}
    row["precision"] = precision_score(y_test, pred, zero_division=0)
    row["recall"] = recall_score(y_test, pred, zero_division=0)
    row["f1"] = f1_score(y_test, pred, zero_division=0)
    row["pred_rate"] = pred.mean()
    rows.append(row)

thr_df = pd.DataFrame(rows)
thr_df.sort_values(by="recall", ascending=False).head(10)

Unnamed: 0,threshold,precision,recall,f1,pred_rate
0,0.1,0.520833,0.970874,0.677966,0.91866
1,0.15,0.540984,0.961165,0.692308,0.875598
2,0.2,0.566474,0.951456,0.710145,0.827751
3,0.25,0.588957,0.932039,0.721805,0.779904
4,0.3,0.615894,0.902913,0.732283,0.722488
5,0.35,0.674242,0.864078,0.757447,0.631579
6,0.4,0.698276,0.786408,0.739726,0.555024
7,0.45,0.752688,0.679612,0.714286,0.444976
8,0.5,0.769231,0.485437,0.595238,0.311005
9,0.55,0.803922,0.398058,0.532468,0.244019


In [36]:
# Five thresholds with the highest F1.
thr_df.sort_values(by="f1", ascending=False).iloc[:5]

Unnamed: 0,threshold,precision,recall,f1,pred_rate
5,0.35,0.674242,0.864078,0.757447,0.631579
6,0.4,0.698276,0.786408,0.739726,0.555024
4,0.3,0.615894,0.902913,0.732283,0.722488
3,0.25,0.588957,0.932039,0.721805,0.779904
7,0.45,0.752688,0.679612,0.714286,0.444976


In [37]:
from sklearn.metrics import r2_score
# R2 between the true 0/1 labels and the predicted probabilities.
r2_from_proba = r2_score(y_test, proba_test)
print("R2 (using probabilities):", r2_from_proba)

R2 (using probabilities): 0.20976220591451933


Baseline performance documented

In [40]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report
)

# ============================================================
# 0) SETUP: columns (adjust only if your names differ)
# ============================================================
COUNTRY_COL = "REF_AREA"
YEAR_COL    = "TIME_PERIOD"
GDP_COL     = "IMF_WEO_NGDP_RPCH"          # GDP growth (%)

FEATURES = [
    "IMF_WEO_LUR",        # unemployment rate
    "IMF_WEO_LP",         # population (WEO code LP is population, not labor productivity)
    "IMF_WEO_PCPIPCH",    # inflation
    "IMF_WEO_TM_RPCH",    # imports growth
    "IMF_WEO_TX_RPCH",    # exports growth
    "IMF_WEO_NGDP_RPCH"   # GDP growth
]

TARGET = "Decel_flag_tplus1"

# ============================================================
# 1) BUILD TARGET (Decel_flag_tplus1) - NO LEAKAGE
# ============================================================
# Build the labelled panel in its own frame and leave `df` untouched, so this
# cell is idempotent. Overwriting `df` meant every re-run shifted and dropped
# on an already truncated frame and lost one more final year per country.
# NOTE(review): if `df` has already had its last year removed by an earlier
# target-building step, this recomputation still drops one further year per
# country; rebuild from the raw panel if that is not intended.
baseline_df = df.sort_values([COUNTRY_COL, YEAR_COL]).copy()

by_country  = baseline_df.groupby(COUNTRY_COL)
next_growth = by_country[GDP_COL].shift(-1)
next_year   = by_country[YEAR_COL].shift(-1)

# Only accept the next row as "t+1" when it really is the following calendar
# year; with a gap in a country's series a plain shift(-1) would pair year t
# with a later year and mislabel the target.
baseline_df["gdp_growth_tplus1"] = next_growth.where(next_year == baseline_df[YEAR_COL] + 1)
baseline_df["GDP_accel_tplus1"]  = baseline_df["gdp_growth_tplus1"] - baseline_df[GDP_COL]
baseline_df[TARGET]              = (baseline_df["GDP_accel_tplus1"] < 0).astype(int)

# drop rows with no valid t+1 observation (last year per country, or a gap)
baseline_df = baseline_df.dropna(subset=["gdp_growth_tplus1", "GDP_accel_tplus1"]).copy()

print("Data after target:", baseline_df.shape)
print("Class balance (mean of target):", baseline_df[TARGET].mean())

# ============================================================
# 2) TRAIN/TEST SPLIT (TIME-BASED)
# ============================================================
train_start, train_end = 1980, 2010
test_start,  test_end  = 2011, 2021

train_df = baseline_df[(baseline_df[YEAR_COL] >= train_start) & (baseline_df[YEAR_COL] <= train_end)].copy()
test_df  = baseline_df[(baseline_df[YEAR_COL] >= test_start)  & (baseline_df[YEAR_COL] <= test_end)].copy()

print("\nTrain shape:", train_df.shape, "Years:", train_df[YEAR_COL].min(), "→", train_df[YEAR_COL].max())
print("Test  shape:", test_df.shape,  "Years:", test_df[YEAR_COL].min(),  "→", test_df[YEAR_COL].max())

X_train = train_df[FEATURES]
y_train = train_df[TARGET].astype(int)

X_test  = test_df[FEATURES]
y_test  = test_df[TARGET].astype(int)

# ============================================================
# 3) BASELINE MODEL: Logistic Regression (with Scaling)
# ============================================================
baseline_model = Pipeline([
    ("scaler", StandardScaler()),
    ("logit", LogisticRegression(max_iter=2000))
])

baseline_model.fit(X_train, y_train)

# predicted probability of class 1 (deceleration)
proba_test = baseline_model.predict_proba(X_test)[:, 1]

# default threshold = 0.50
threshold = 0.50
pred_test = (proba_test >= threshold).astype(int)

# ============================================================
# 4) BASELINE PERFORMANCE (DOCUMENTED)
# ============================================================
auc   = roc_auc_score(y_test, proba_test)
prec  = precision_score(y_test, pred_test, zero_division=0)
rec   = recall_score(y_test, pred_test, zero_division=0)
f1    = f1_score(y_test, pred_test, zero_division=0)
cm    = confusion_matrix(y_test, pred_test)

print("\n================ BASELINE RESULTS ================")
print(f"Threshold: {threshold:.2f}")
print(f"AUC      : {auc:.3f}")
print(f"Precision: {prec:.3f}")
print(f"Recall   : {rec:.3f}")
print(f"F1       : {f1:.3f}")
print("\nConfusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, pred_test, digits=3))

# ============================================================
# 5) SAVE BASELINE PREDICTIONS (for report/plots)
# ============================================================
baseline_pred = test_df[[COUNTRY_COL, YEAR_COL]].copy()
baseline_pred["proba_decel"] = proba_test
baseline_pred["pred_050"]    = pred_test

out_path = "baseline_predictions_2011_2021.csv"
baseline_pred.to_csv(out_path, index=False)
print("\nSaved:", out_path)

Data after target: (737, 14)
Class balance (mean of target): 0.4816824966078697

Train shape: (529, 14) Years: 1980 → 2010
Test  shape: (208, 14) Years: 2011 → 2021

Threshold: 0.50
AUC      : 0.786
Precision: 0.766
Recall   : 0.480
F1       : 0.590

Confusion Matrix:
 [[91 15]
 [53 49]]

Classification Report:
               precision    recall  f1-score   support

           0      0.632     0.858     0.728       106
           1      0.766     0.480     0.590       102

    accuracy                          0.673       208
   macro avg      0.699     0.669     0.659       208
weighted avg      0.697     0.673     0.661       208


Saved: baseline_predictions_2011_2021.csv


SABINA COMMENT: I think I would use 5 years rather than 10 years in my test set. Time series are a little bit different in terms of validation... also I'm afraid that not having very recent years in the training data might also be why your scores are low.

Increase Recall (Policy-Relevant Model)

In [41]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Threshold sweep for the baseline model on the hold-out window, ranked by recall.
proba_test = baseline_model.predict_proba(X_test)[:, 1]

candidate_thresholds = np.linspace(0.10, 0.90, 17)

rows = []
for t in candidate_thresholds:
    pred = (proba_test >= t).astype(int)
    precision_at_t = precision_score(y_test, pred, zero_division=0)
    recall_at_t = recall_score(y_test, pred, zero_division=0)
    f1_at_t = f1_score(y_test, pred, zero_division=0)
    rows.append(dict(
        threshold=t,
        precision=precision_at_t,
        recall=recall_at_t,
        f1=f1_at_t,
        pred_rate=pred.mean(),
    ))

thr_df = pd.DataFrame(rows)
thr_df.sort_values("recall", ascending=False).head(10)

Unnamed: 0,threshold,precision,recall,f1,pred_rate
0,0.1,0.518325,0.970588,0.675768,0.918269
1,0.15,0.538462,0.960784,0.690141,0.875
2,0.2,0.563953,0.95098,0.708029,0.826923
3,0.25,0.58642,0.931373,0.719697,0.778846
4,0.3,0.613333,0.901961,0.730159,0.721154
5,0.35,0.671756,0.862745,0.755365,0.629808
6,0.4,0.695652,0.784314,0.737327,0.552885
7,0.45,0.75,0.676471,0.71134,0.442308
8,0.5,0.765625,0.480392,0.590361,0.307692
9,0.55,0.8,0.392157,0.526316,0.240385


In [42]:
# Probability cut-off chosen for the final model; the next cell assigns the
# same value again before using it.
final_threshold = 0.35

In [43]:
# Probability cut-off used for the final, recall-oriented classification.
final_threshold = 0.35

final_pred = (proba_test >= final_threshold).astype(int)

# The header is derived from final_threshold so the label cannot go stale when
# the cut-off is changed; zero_division=0 matches the other metric cells.
print(f"FINAL MODEL (Threshold = {final_threshold:.2f})")
print("Precision:", precision_score(y_test, final_pred, zero_division=0))
print("Recall:", recall_score(y_test, final_pred, zero_division=0))
print("F1:", f1_score(y_test, final_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, final_pred))

FINAL MODEL (Threshold = 0.35)
Precision: 0.6717557251908397
Recall: 0.8627450980392157
F1: 0.7553648068669528
Confusion Matrix:
 [[63 43]
 [14 88]]


SABINA COMMENT: Not bad! Well done :) 