# Week 10 — Day 2: Random Forest

### Imports + load split

In [2]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import (confusion_matrix, precision_score, recall_score, f1_score, average_precision_score)

In [3]:
# Project folders are relative paths, so they resolve against the notebook's
# current working directory.
PARENT_DIR = Path("..")
ARTIFACTS_DIR = PARENT_DIR / "models"
REPORTS_DIR = PARENT_DIR / "reports"
REPORTS_DIR.mkdir(exist_ok=True)  # no-op when the folder is already there

# joblib.load unpickles the file, so only point it at artifacts you trust.
# The stored object must unpack into exactly these four pieces, in this order.
split_path = ARTIFACTS_DIR / "split_v1.joblib"
(X_train, X_test, y_train, y_test) = joblib.load(split_path)

# Quick shape check of what was loaded.
print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (227845, 30) (227845,)
Test: (56962, 30) (56962,)


### Reusing Feature Engineering Function

In [4]:
def add_features(df):
    """Return a copy of ``df`` extended with amount- and time-derived columns.

    The input frame is not modified. Four columns are appended, in this order:

    * ``log_amount`` -- ``log1p`` of the ``Amount`` column.
    * ``hour``       -- ``Time`` floor-divided by 3600 and cast to ``int``
      (``Time`` is presumably in seconds -- confirm against the data source).
    * ``hour_sin``, ``hour_cos`` -- sine and cosine of ``2 * pi * hour / 24``.

    NOTE(review): ``hour`` is not reduced modulo 24, so it exceeds 23 whenever
    ``Time`` is 86400 or more. The sin/cos pair is unaffected because it has a
    period of 24. Confirm that the unwrapped ``hour`` column is intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Amount`` and ``Time`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns followed by the four new ones.
    """
    whole_hours = (df["Time"] // 3600).astype(int)
    # Angle on a 24-unit circle; shared by the sine and cosine columns.
    day_angle = 2 * np.pi * whole_hours / 24
    return df.assign(
        log_amount=np.log1p(df["Amount"]),
        hour=whole_hours,
        hour_sin=np.sin(day_angle),
        hour_cos=np.cos(day_angle),
    )

In [5]:
# applying features
# Both splits go through the same add_features call so they end up with the
# same engineered columns.
X_train_fe, X_test_fe = (add_features(frame) for frame in (X_train, X_test))

# Show how many columns the feature step added to the training frame.
shape_before, shape_after = X_train.shape, X_train_fe.shape
print("Before:", shape_before, "After:", shape_after)

Before: (227845, 30) After: (227845, 34)


### Random Forest Model

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Forest settings gathered in one mapping so they are easy to read and tweak.
rf_params = {
    "n_estimators": 300,
    "class_weight": "balanced_subsample",
    "n_jobs": -1,
    "random_state": 42,  # fixed seed so re-runs use the same value
}

rf = RandomForestClassifier(**rf_params)
rf.fit(X_train_fe, y_train)
print("Random Forest trained.")

Random Forest trained.


### Evaluate

In [7]:
# predict and scores
# y_prob is the second column of predict_proba; y_pred holds the hard labels
# returned by predict.
class_probabilities = rf.predict_proba(X_test_fe)
y_prob = class_probabilities[:, 1]
y_pred = rf.predict(X_test_fe)

In [8]:
# evaluation metrics
# Pin the label order so the matrix is always 2x2 and unpacks as
# (tn, fp, fn, tp). Without `labels`, confusion_matrix returns a 1x1 matrix
# when only one class appears in y_test / y_pred and the unpacking below fails.
# Assumes the target is encoded 0 = negative, 1 = positive, which matches the
# default pos_label=1 used by the scorers below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
tn, fp, fn, tp = cm.ravel()

# Precision, recall and F1 are computed from the hard labels in y_pred;
# average precision (PR-AUC) is computed from the scores in y_prob.
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
pr_auc = average_precision_score(y_test, y_prob)

print("Confusion matrix (tn, fp, fn, tp):", tn, fp, fn, tp)
print(f"Precision: {precision:.6f}")
print(f"Recall:    {recall:.6f}")
print(f"F1:        {f1:.6f}")
print(f"PR-AUC:    {pr_auc:.6f}")

Confusion matrix (tn, fp, fn, tp): 56861 3 25 73
Precision: 0.960526
Recall:    0.744898
F1:        0.839080
PR-AUC:    0.865561


In [9]:
# comparison table
# The three LogReg rows are hard-coded numbers.
# NOTE(review): presumably copied from the earlier weeks' results -- verify
# against those reports before relying on this table.
week9_baseline = dict(
    model="LogReg Baseline",
    precision=0.826667,
    recall=0.632653,
    f1=0.716763,
    pr_auc=0.741382,
)
week9_weighted = dict(
    model="LogReg + Class Weights",
    precision=0.060976,
    recall=0.918367,
    f1=0.114358,
    pr_auc=0.718971,
)
week10_fe_weighted = dict(
    model="LogReg + FE + Class Weights",
    precision=0.059840,
    recall=0.918367,
    f1=0.112360,
    pr_auc=0.720672,
)

# The Random Forest row uses the metrics computed earlier in this notebook.
week10_rf = dict(
    model="Random Forest + FE",
    precision=precision,
    recall=recall,
    f1=f1,
    pr_auc=pr_auc,
)

comparison_rows = [week9_baseline, week9_weighted, week10_fe_weighted, week10_rf]
compare_df = pd.DataFrame(comparison_rows)
compare_df

Unnamed: 0,model,precision,recall,f1,pr_auc
0,LogReg Baseline,0.826667,0.632653,0.716763,0.741382
1,LogReg + Class Weights,0.060976,0.918367,0.114358,0.718971
2,LogReg + FE + Class Weights,0.05984,0.918367,0.11236,0.720672
3,Random Forest + FE,0.960526,0.744898,0.83908,0.865561


### Save Table and Random Forest

In [10]:
# Write the comparison table to CSV (row index omitted) and echo the path.
results_csv = REPORTS_DIR / "week10_day2_random_forest_results.csv"
compare_df.to_csv(results_csv, index=False)
print("Saved:", results_csv)

Saved: ..\reports\week10_day2_random_forest_results.csv


In [11]:
# Serialize the fitted forest into the models folder and echo the path.
model_path = ARTIFACTS_DIR / "rf_fe_v1.joblib"
joblib.dump(rf, model_path)
print("Saved:", model_path)

Saved: ..\models\rf_fe_v1.joblib
