In [3]:
from pathlib import Path
import pandas as pd

# -----------------------------
# Resolve project root safely
# -----------------------------
# If the kernel was started inside a folder named "notebooks", step up one
# level; otherwise treat the current working directory as the project root.
BASE_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

# -----------------------------
# Define paths
# -----------------------------
RAW_PATH = BASE_DIR / "data" / "raw" / "creditcard.csv"
IFOREST_PATH = BASE_DIR / "data" / "processed" / "iforest_scores.csv"
AE_PATH = BASE_DIR / "data" / "processed" / "ae_scores.csv"

# -----------------------------
# Fail fast on missing inputs
# -----------------------------
assert RAW_PATH.exists(), f"Missing raw data file: {RAW_PATH}"
assert IFOREST_PATH.exists(), f"Missing iForest scores file: {IFOREST_PATH}"
assert AE_PATH.exists(), f"Missing autoencoder scores file: {AE_PATH}"

# -----------------------------
# Load data
# -----------------------------
# Original fraud labels (only the label column is read from the raw file)
raw = pd.read_csv(RAW_PATH, usecols=["is_fraud"])
labels = raw["is_fraud"]

# Model scores
if_df = pd.read_csv(IFOREST_PATH)
ae_df = pd.read_csv(AE_PATH)

# -----------------------------
# Validate that scores and labels can be joined
# -----------------------------
# Labels and scores are matched purely by row position. pandas aligns Series on
# their index when building a DataFrame, so a score file with a different number
# of rows would silently produce NaN entries instead of raising an error.
assert "anomaly_score" in if_df.columns, (
    f"Column 'anomaly_score' not found in {IFOREST_PATH}"
)
assert "ae_score" in ae_df.columns, f"Column 'ae_score' not found in {AE_PATH}"
assert len(if_df) == len(raw), (
    f"Row count mismatch: {len(if_df)} iForest scores vs {len(raw)} labels"
)
assert len(ae_df) == len(raw), (
    f"Row count mismatch: {len(ae_df)} autoencoder scores vs {len(raw)} labels"
)
# NOTE(review): equal row counts do not prove equal row ORDER. If the score
# files were written after shuffling, splitting or filtering the raw data, the
# scores will be paired with the wrong labels — confirm against the scoring
# notebooks.

# -----------------------------
# Combine scores and labels for comparison
# -----------------------------
scores = pd.DataFrame({
    "if_score": if_df["anomaly_score"],
    "ae_score": ae_df["ae_score"],
    "label": labels
})


In [5]:
# Evaluation table: one row per record, holding the ground-truth label next to
# the score each model assigned to that row.
eval_columns = {
    "label": raw["is_fraud"],
    "if_score": if_df["anomaly_score"],
    "ae_score": ae_df["ae_score"],
}
eval_df = pd.DataFrame(eval_columns)


In [6]:
# Mean of the label column; for 0/1 labels this is the share of positive rows,
# i.e. the precision a purely random ranking would be expected to achieve.
label_base_rate = eval_df["label"].mean()
label_base_rate


np.float64(0.005788651743883394)

In [7]:
import numpy as np

# Alert budgets to evaluate. Each value is passed as the `rate` argument of the
# precision/recall helpers, i.e. the fraction of highest-scoring rows to flag
# (0.5%, 1% and 2% of all rows).
alert_rates = [0.005, 0.01, 0.02]


In [8]:
def precision_at_k(scores, labels, rate):
    """Precision among the ``rate`` fraction of rows with the highest scores.

    Exactly ``k = round(rate * n)`` rows are flagged (at least one, at most
    ``n``). Selecting by rank instead of by a percentile threshold keeps the
    alert budget fixed even when many rows share the cut-off score; a
    ``scores >= threshold`` rule would flag every tied row and could exceed
    the budget.

    Parameters
    ----------
    scores : array-like of shape (n,)
        Anomaly scores; higher values are ranked first.
    labels : array-like of shape (n,)
        Ground-truth labels aligned with ``scores`` by position.
    rate : float
        Fraction of rows to flag, in ``[0, 1]``.

    Returns
    -------
    float
        Mean label of the flagged rows, or NaN when the input is empty.

    Raises
    ------
    ValueError
        If ``scores`` and ``labels`` differ in shape or ``rate`` is outside
        ``[0, 1]``.
    """
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels)
    if scores.shape != labels.shape:
        raise ValueError(
            f"scores and labels must have the same shape, "
            f"got {scores.shape} and {labels.shape}"
        )
    if not 0 <= rate <= 1:
        raise ValueError(f"rate must be within [0, 1], got {rate!r}")

    n = scores.size
    if n == 0:
        return np.nan

    # Always flag at least one row so the mean below is well defined.
    k = min(n, max(1, int(round(rate * n))))
    # Stable sort on the negated scores: descending order, ties resolved by
    # original row position, NaN scores ranked last.
    top = np.argsort(-scores, kind="stable")[:k]
    return labels[top].mean()


In [9]:
# Precision at each alert budget, for every model.
# Maps the display name of a model to the eval_df column holding its scores.
score_columns = {
    "IsolationForest": "if_score",
    "Autoencoder": "ae_score",
}
y_true = eval_df["label"].values

# One record per (alert rate, model); rates vary slowest so both models for a
# given rate sit next to each other in the resulting table.
results = [
    {
        "alert_rate": rate,
        "model": model_name,
        "precision_at_k": precision_at_k(eval_df[score_col].values, y_true, rate),
    }
    for rate in alert_rates
    for model_name, score_col in score_columns.items()
]

precision_df = pd.DataFrame(results)
precision_df


Unnamed: 0,alert_rate,model,precision_at_k
0,0.005,IsolationForest,0.006169
1,0.005,Autoencoder,0.005861
2,0.01,IsolationForest,0.006401
3,0.01,Autoencoder,0.006247
4,0.02,IsolationForest,0.006285
5,0.02,Autoencoder,0.00563


In [10]:
def recall_at_k(scores, labels, rate):
    """Recall achieved by flagging the ``rate`` fraction of highest scores.

    Exactly ``k = round(rate * n)`` rows are flagged (at least one, at most
    ``n``). Selecting by rank instead of by a percentile threshold keeps the
    alert budget fixed even when many rows share the cut-off score; a
    ``scores >= threshold`` rule would flag every tied row and could exceed
    the budget.

    Parameters
    ----------
    scores : array-like of shape (n,)
        Anomaly scores; higher values are ranked first.
    labels : array-like of shape (n,)
        Ground-truth labels aligned with ``scores`` by position.
    rate : float
        Fraction of rows to flag, in ``[0, 1]``.

    Returns
    -------
    float
        Sum of the labels of the flagged rows divided by the sum of all
        labels. NaN when the input is empty or the labels sum to zero, since
        recall is undefined without positives.

    Raises
    ------
    ValueError
        If ``scores`` and ``labels`` differ in shape or ``rate`` is outside
        ``[0, 1]``.
    """
    scores = np.asarray(scores, dtype=float)
    labels = np.asarray(labels)
    if scores.shape != labels.shape:
        raise ValueError(
            f"scores and labels must have the same shape, "
            f"got {scores.shape} and {labels.shape}"
        )
    if not 0 <= rate <= 1:
        raise ValueError(f"rate must be within [0, 1], got {rate!r}")

    n = scores.size
    if n == 0:
        return np.nan

    total_positives = labels.sum()
    if total_positives == 0:
        # Avoid a 0/0 division (and its runtime warning) when nothing is positive.
        return np.nan

    k = min(n, max(1, int(round(rate * n))))
    # Stable sort on the negated scores: descending order, ties resolved by
    # original row position, NaN scores ranked last.
    top = np.argsort(-scores, kind="stable")[:k]
    return labels[top].sum() / total_positives


In [11]:
# Recall at each alert budget, for every model.
# Maps the display name of a model to the eval_df column holding its scores.
score_columns = {
    "IsolationForest": "if_score",
    "Autoencoder": "ae_score",
}
y_true = eval_df["label"].values

# One record per (alert rate, model); rates vary slowest so both models for a
# given rate sit next to each other in the resulting table.
results = [
    {
        "alert_rate": rate,
        "model": model_name,
        "recall_at_k": recall_at_k(eval_df[score_col].values, y_true, rate),
    }
    for rate in alert_rates
    for model_name, score_col in score_columns.items()
]

recall_df = pd.DataFrame(results)
recall_df


Unnamed: 0,alert_rate,model,recall_at_k
0,0.005,IsolationForest,0.005329
1,0.005,Autoencoder,0.005063
2,0.01,IsolationForest,0.011058
3,0.01,Autoencoder,0.010791
4,0.02,IsolationForest,0.021716
5,0.02,Autoencoder,0.019451


In [12]:
from sklearn.metrics import roc_auc_score

# Threshold-free ranking quality per model: ROC AUC of each score column
# against the labels.
# NOTE(review): a value close to 0.5 means the ranking is no better than
# chance; if that is unexpected, confirm that the score rows line up with the
# label rows.
roc_results = {
    model_name: roc_auc_score(eval_df["label"], eval_df[score_col])
    for model_name, score_col in (
        ("IsolationForest", "if_score"),
        ("Autoencoder", "ae_score"),
    )
}

roc_results


{'IsolationForest': 0.5006937096322033, 'Autoencoder': 0.4954634274174488}

In [13]:
# Put precision and recall side by side: one row per (model, alert_rate) pair
# present in both tables.
join_keys = ["model", "alert_rate"]
final_metrics = pd.merge(precision_df, recall_df, on=join_keys)

final_metrics


Unnamed: 0,alert_rate,model,precision_at_k,recall_at_k
0,0.005,IsolationForest,0.006169,0.005329
1,0.005,Autoencoder,0.005861,0.005063
2,0.01,IsolationForest,0.006401,0.011058
3,0.01,Autoencoder,0.006247,0.010791
4,0.02,IsolationForest,0.006285,0.021716
5,0.02,Autoencoder,0.00563,0.019451
