In [1]:
from pathlib import Path
import pandas as pd

# Work out the project root: when the kernel runs from a "notebooks" folder
# the root is one level up; otherwise it is the working directory itself.
cwd = Path.cwd()
if cwd.name == "notebooks":
    BASE_DIR = cwd.parent
else:
    BASE_DIR = cwd

# Location of the scores file underneath the project root
SCORES_PATH = BASE_DIR.joinpath("data", "processed", "iforest_scores.csv")

# Stop early with a readable message if the scores file is not there
assert SCORES_PATH.exists(), f"Missing scores file: {SCORES_PATH}"

# Read the CSV and preview its first rows
df = pd.read_csv(SCORES_PATH)

df.head()


Unnamed: 0,anomaly_score,is_anomaly
0,0.586762,True
1,0.616149,True
2,0.626528,True
3,0.592172,True
4,0.596531,True


In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

# Resolve the project root: when the kernel runs from a "notebooks" directory
# the root is its parent, otherwise the working directory is used as-is.
BASE_DIR = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()

# Location of the scores CSV relative to the project root
SCORES_PATH = BASE_DIR / "data" / "processed" / "iforest_scores.csv"

# Fail fast with a readable message if the file is missing
assert SCORES_PATH.exists(), f"Missing scores file: {SCORES_PATH}"

# Load scores
scores_df = pd.read_csv(SCORES_PATH)

# Validate before any thresholding. np.percentile propagates NaN, so a single
# missing score would make every threshold NaN and every `>= threshold`
# comparison False without raising anything.
assert "anomaly_score" in scores_df.columns, "Scores file has no 'anomaly_score' column"
assert len(scores_df) > 0, "Scores file contains no rows"
assert scores_df["anomaly_score"].notna().all(), "anomaly_score contains missing values"

# to_numpy() is the accessor pandas recommends over .values for getting a
# NumPy array out of a Series.
scores = scores_df["anomaly_score"].to_numpy()


In [3]:
# Fractions of rows to flag, one entry per alerting tier.
alert_rates = [
    0.005,  # 0.5%
    0.01,   # 1%
    0.02,   # 2%
]

In [4]:
# Map each alert rate to the score cut-off at the matching upper percentile
# of `scores` (a rate of 0.01 uses the 99th percentile, and so on).
thresholds = {
    rate: np.percentile(scores, 100 * (1 - rate))
    for rate in alert_rates
}

thresholds


{0.005: np.float64(0.6074460799082331),
 0.01: np.float64(0.5855393791151755),
 0.02: np.float64(0.5541245405571872)}

In [5]:
# Add one boolean column per alert rate: True where the row's anomaly score is
# at or above that rate's threshold.
# NOTE(review): the suffix is rate * 1000 (0.005 -> "5bp"), which is per-mille;
# a basis point is 1/10,000, so 0.005 would be 50 bp. The existing names are
# kept so anything keyed on them keeps working -- confirm before renaming.
for rate, threshold in thresholds.items():
    # Round before converting to int: rate * 1000 can land just below a whole
    # number in binary floating point, and int() alone would truncate it to
    # the wrong label (possibly colliding with another rate's column).
    col_name = f"alert_{int(round(rate * 1000))}bp"
    scores_df[col_name] = scores_df["anomaly_score"] >= threshold


In [6]:
# Preview the first five rows of the columns whose name contains "alert_".
(
    scores_df
    .filter(like="alert_")
    .head(n=5)
)

Unnamed: 0,alert_5bp,alert_10bp,alert_20bp
0,False,True,True
1,True,True,True
2,True,True,True
3,False,True,True
4,False,True,True


In [7]:
# Print the realised alert fraction for every "alert_" column; the mean of a
# boolean column is the share of rows that are True.
alert_columns = [name for name in scores_df.columns if name.startswith("alert_")]
for name in alert_columns:
    print(name, scores_df[name].mean())


alert_5bp 0.005000482002043689
alert_10bp 0.010000192800817476
alert_20bp 0.020000385601634953


In [10]:
# Show every column currently on the scores frame as a plain Python list.
list(scores_df.columns)

['anomaly_score', 'is_anomaly', 'alert_5bp', 'alert_10bp', 'alert_20bp']