# Week 10 â€” Day 4: Threshold Tuning (Random Forest)

### Imports and Loads

In [1]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, average_precision_score

In [2]:
ARTIFACTS_DIR = Path("..") / "models"
REPORTS_DIR = Path("..") / "reports"
REPORTS_DIR.mkdir(exist_ok=True)

X_train, X_test, y_train, y_test = joblib.load(ARTIFACTS_DIR / "split_v1.joblib")
rf = joblib.load(ARTIFACTS_DIR / "rf_fe_v1.joblib")

print("Loaded split + Random Forest.")
print("Test shape:", X_test.shape, y_test.shape)

Loaded split + Random Forest.
Test shape: (56962, 30) (56962,)


In [3]:
# feature engineering function
def add_features(df):
    df = df.copy()
    df["log_amount"] = np.log1p(df["Amount"])
    df["hour"] = (df["Time"] // 3600).astype(int)
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)
    return df

In [5]:
X_test_fe = add_features(X_test)

### Probability Scores

In [6]:
y_prob = rf.predict_proba(X_test_fe)[:, 1]  # fraud probability
y_prob[:10]

array([0.        , 0.        , 0.00333333, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])

**Helper function for Threshold**

In [7]:
def evaluate_at_threshold(y_true, y_prob, threshold):
    y_pred = (y_prob >= threshold).astype(int)

    cm = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = cm.ravel()

    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return {
        "threshold": threshold,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tn": tn, "fp": fp, "fn": fn, "tp": tp
    }

In [8]:
# evaluate at 0.5
before = evaluate_at_threshold(y_test, y_prob, 0.5)
before

{'threshold': 0.5,
 'precision': 0.9605263157894737,
 'recall': 0.7448979591836735,
 'f1': 0.8390804597701149,
 'tn': np.int64(56861),
 'fp': np.int64(3),
 'fn': np.int64(25),
 'tp': np.int64(73)}

### Search thresholds to hit target recall

In [9]:
TARGET_RECALL = 0.90

In [10]:
# trying different thresholds
thresholds = np.linspace(0.01, 0.99, 99)
rows = [evaluate_at_threshold(y_test, y_prob, t) for t in thresholds]
thr_df = pd.DataFrame(rows)

thr_df.head()

Unnamed: 0,threshold,precision,recall,f1,tn,fp,fn,tp
0,0.01,0.181263,0.908163,0.302207,56462,402,9,89
1,0.02,0.385965,0.897959,0.539877,56724,140,10,88
2,0.03,0.51462,0.897959,0.654275,56781,83,10,88
3,0.04,0.60274,0.897959,0.721311,56806,58,10,88
4,0.05,0.642336,0.897959,0.748936,56815,49,10,88


In [11]:
candidates = thr_df[thr_df["recall"] >= TARGET_RECALL].sort_values(
    by=["precision", "f1"], ascending=False
)

candidates.head(10)

Unnamed: 0,threshold,precision,recall,f1,tn,fp,fn,tp
0,0.01,0.181263,0.908163,0.302207,56462,402,9,89


In [12]:
best = candidates.iloc[0].to_dict()
best

{'threshold': 0.01,
 'precision': 0.18126272912423624,
 'recall': 0.9081632653061225,
 'f1': 0.30220713073005095,
 'tn': 56462.0,
 'fp': 402.0,
 'fn': 9.0,
 'tp': 89.0}

### Comparison of before and after threshold tuning

In [13]:
after = evaluate_at_threshold(y_test, y_prob, best["threshold"])

compare = pd.DataFrame([before, after], index=["Before (0.5)", "After (tuned)"])
compare

Unnamed: 0,threshold,precision,recall,f1,tn,fp,fn,tp
Before (0.5),0.5,0.960526,0.744898,0.83908,56861,3,25,73
After (tuned),0.01,0.181263,0.908163,0.302207,56462,402,9,89


In [14]:
print("BEFORE threshold = 0.5")
print("tn fp fn tp:", before["tn"], before["fp"], before["fn"], before["tp"])

print("\nAFTER tuned threshold =", after["threshold"])
print("tn fp fn tp:", after["tn"], after["fp"], after["fn"], after["tp"])

BEFORE threshold = 0.5
tn fp fn tp: 56861 3 25 73

AFTER tuned threshold = 0.01
tn fp fn tp: 56462 402 9 89


In [15]:
# save tuned threshold and results
# Save chosen threshold
joblib.dump(best["threshold"], ARTIFACTS_DIR / "rf_threshold_v1.joblib")

# Save full threshold sweep + before/after
thr_df.to_csv(REPORTS_DIR / "week10_day4_threshold_sweep.csv", index=False)
compare.to_csv(REPORTS_DIR / "week10_day4_before_after_threshold.csv")

print("Saved threshold + sweep results.")

Saved threshold + sweep results.
