In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.metrics import average_precision_score, roc_auc_score
import math, time

# ----------------- CONFIG (simpler) -----------------
CSV_PATH = '/content/processed_features_scenario1p3n23f.csv'
RANDOM_STATE = 42

MIN_ROWS_FOR_SPLIT = 3

# If True use per-user 0.99 percentile threshold; else global percentile
PER_USER_THRESHOLD = True
PCT_THRESHOLD = 99

# features to treat as already-normalized / flags (do NOT per-user normalize these)
NO_NORMALIZE_COLUMNS = [
    # flags / categorical / label
    "is_weekend","wikileaks_flag","offhour_logon_flag","offhour_usb_flag","offhour_http_flag",
    "is_first_time_usb_user","rare_usb_user","scenario","is_malicious",'user','day'
    # z-scores (already normalized)
    "usb_connect_disconnect_zscore","http_offhour_requests_zscore",
    "session_duration_hours_zscore","total_http_requests_zscore",
    # rolling percentiles & stds (already relative)
    "logon_online_duration_rollpct_7d","logon_online_duration_rollpct_30d",
    "logon_offhour_count_rollpct_7d","logon_offhour_count_rollpct_30d",
    "logon_distinct_pcs_rollpct_7d","logon_distinct_pcs_rollpct_30d",
    "http_total_requests_rollpct_7d","http_total_requests_rollpct_30d",
    "http_unique_domains_rollpct_7d","http_unique_domains_rollpct_30d",
    "device_usb_count_rollpct_7d","device_usb_count_rollpct_30d",
    "device_total_usb_duration_rollpct_7d","device_total_usb_duration_rollpct_30d",
    "device_afterhours_usb_rollpct_7d","device_afterhours_usb_rollpct_30d",
    "logon_online_duration_rollstd_7d","logon_online_duration_rollstd_30d",
    "http_total_requests_rollstd_7d","http_total_requests_rollstd_30d",
    "device_usb_count_rollstd_7d","device_usb_count_rollstd_30d",
]

RULE_FLAG_NAMES = ["is_weekend","wikileaks_flag","offhour_logon_flag","offhour_usb_flag","offhour_http_flag"]
RULE_WEIGHTS = {"wikileaks_flag":1.0, "offhour_usb_flag":0.7, "offhour_http_flag":0.3, "offhour_logon_flag":0.3}

# ----------------- LOAD -----------------
df = pd.read_csv(CSV_PATH)
assert 'user' in df.columns and 'day' in df.columns and 'is_malicious' in df.columns
df['day'] = pd.to_datetime(df['day'])

In [2]:
# ----------------- SPLIT per-user time-respecting (train/val/test) -----------------
def per_user_time_split_train_test(df, train_frac=0.5, min_rows=3):
    """
    Per-user time-respecting split into train/test.
    Users with fewer than min_rows go entirely into train (so we have baseline stats).
    Ensures at least 1 row in train and 1 row in test when possible.
    """
    train_parts, test_parts = [], []
    for user, g in df.groupby('user'):
        g = g.sort_values('day')
        n = len(g)
        if n < min_rows:
            train_parts.append(g)
            continue
        # allocate at least one row to each side when possible
        i1 = int(math.floor(train_frac * n))
        if i1 < 1:
            i1 = 1
        if i1 >= n:
            i1 = n - 1
        train_parts.append(g.iloc[:i1])
        test_parts.append(g.iloc[i1:])
    train_df = pd.concat(train_parts).reset_index(drop=True)
    test_df  = pd.concat(test_parts).reset_index(drop=True) if test_parts else pd.DataFrame(columns=df.columns)
    return train_df, test_df


# Replace the 3-way split call:
train_df, test_df = per_user_time_split_train_test(df, train_frac=0.5, min_rows=MIN_ROWS_FOR_SPLIT)

#train_df, val_df, test_df = per_user_time_split(df)
print(f"Split sizes: train={len(train_df)}, test={len(test_df)}")
print("Unique users: train/test:", train_df['user'].nunique(), test_df['user'].nunique())

Split sizes: train=160738, test=160861
Unique users: train/test: 960 960


In [3]:
# ----------------- Determine features: raw-like vs derived -----------------
EXPLICIT_DROP = ['user','day','is_malicious','scenario']
all_features = [c for c in df.columns if c not in EXPLICIT_DROP]
RAW_LIKE = [c for c in all_features if c not in NO_NORMALIZE_COLUMNS]
DERIVED_KEEP = [c for c in all_features if c in NO_NORMALIZE_COLUMNS and c not in ('is_malicious','scenario','user','day',"is_weekend","wikileaks_flag","offhour_logon_flag","offhour_usb_flag","offhour_http_flag"'')]
print(f"Raw-like features (to normalize): {len(RAW_LIKE)}")
print(f"Derived/flags kept as-is: {len(DERIVED_KEEP)}")

print(DERIVED_KEEP)
print(RAW_LIKE)

# ----------------- Train-benign subset for baseline stats and IF training -----------------
train_ben_df = train_df[train_df['is_malicious']==0].reset_index(drop=True)
if train_ben_df.empty:
    raise ValueError("No benign rows in training partition — cannot fit IF")

# simple impute medians for missing raw-like values before computing stats
medians = train_ben_df[RAW_LIKE].median()

Raw-like features (to normalize): 23
Derived/flags kept as-is: 27
['is_first_time_usb_user', 'rare_usb_user', 'http_offhour_requests_zscore', 'session_duration_hours_zscore', 'total_http_requests_zscore', 'logon_online_duration_rollpct_7d', 'logon_online_duration_rollstd_7d', 'logon_online_duration_rollpct_30d', 'logon_online_duration_rollstd_30d', 'logon_offhour_count_rollpct_7d', 'logon_offhour_count_rollpct_30d', 'logon_distinct_pcs_rollpct_7d', 'logon_distinct_pcs_rollpct_30d', 'http_total_requests_rollpct_7d', 'http_total_requests_rollstd_7d', 'http_total_requests_rollpct_30d', 'http_total_requests_rollstd_30d', 'http_unique_domains_rollpct_7d', 'http_unique_domains_rollpct_30d', 'device_usb_count_rollpct_7d', 'device_usb_count_rollstd_7d', 'device_usb_count_rollpct_30d', 'device_usb_count_rollstd_30d', 'device_total_usb_duration_rollpct_7d', 'device_total_usb_duration_rollpct_30d', 'device_afterhours_usb_rollpct_7d', 'device_afterhours_usb_rollpct_30d']
['logon_count', 'logoff_co

In [4]:
# ----------------- Simple per-user med/std (no shrink) -----------------
user_med = train_ben_df.groupby('user')[RAW_LIKE].median()
user_std = train_ben_df.groupby('user')[RAW_LIKE].std().replace(0, 1.0)
global_med = train_ben_df[RAW_LIKE].median()
global_std = train_ben_df[RAW_LIKE].std().replace(0, 1.0)

def norm_df(input_df, features=RAW_LIKE):
    out = pd.DataFrame(index=input_df.index, columns=features, dtype=float)
    for feat in features:
        med_map = user_med[feat]
        std_map = user_std[feat]
        user_means = input_df['user'].map(med_map).fillna(global_med[feat])
        user_stds  = input_df['user'].map(std_map).fillna(global_std[feat])
        vals = input_df[feat].fillna(medians[feat])
        out[feat] = (vals - user_means) / user_stds
    return out.fillna(0.0)

X_train_scaled = norm_df(train_ben_df)
X_test_scaled  = norm_df(test_df)


In [5]:
def assemble_features(df_rows, raw_scaled_df):
    derived = df_rows[DERIVED_KEEP].reset_index(drop=True)
    assembled = pd.concat([raw_scaled_df.reset_index(drop=True), derived.reset_index(drop=True)], axis=1)
    assembled = assembled.apply(pd.to_numeric, errors='coerce').fillna(0.0)
    return assembled

X_train_if = assemble_features(train_ben_df, X_train_scaled)   # fit IF on benign
X_test = assemble_features(test_df, X_test_scaled)

print("X_train_if shape:", X_train_if.shape, "X_test shape:", X_test.shape)

X_train_if shape: (160738, 50) X_test shape: (160861, 50)


In [6]:
# ----------------- Fit IsolationForest on benign training rows -----------------
clf = IsolationForest(n_estimators=200, max_samples= 'auto', contamination='auto', random_state=RANDOM_STATE)
clf.fit(X_train_if)

train_scores = -clf.decision_function(X_train_if)  # higher = more anomalous
test_scores = -clf.decision_function(X_test)

# Attach scores
train_ben_df = train_ben_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)
train_ben_df['anomaly_score'] = train_scores
test_df['anomaly_score'] = test_scores

In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

def evaluate_thresholded_preds(y_true, y_score, thresholds_array):
    # thresholds_array: array-like of threshold values of same length as y_score OR a scalar per-row array
    preds = (y_score >= thresholds_array).astype(int)
    return {
        'acc': accuracy_score(y_true, preds),
        'prec': precision_score(y_true, preds, zero_division=0),
        'rec': recall_score(y_true, preds, zero_division=0),
        'f1': f1_score(y_true, preds, zero_division=0),
        'n_alerts': int(preds.sum())
    }

def tune_percentile(train_scores, test_df, pct_list=range(90,100), per_user=True):
    rows = []
    total_tp = int(test_df['is_malicious'].sum())
    y_true = test_df['is_malicious'].astype(int).values
    for p in pct_list:
        global_thr = np.percentile(train_scores, p)
        if per_user:
            pupct = p/100.0
            per_user_thresh = train_ben_df.groupby('user')['anomaly_score'].quantile(pupct)
            thr_map = test_df['user'].map(per_user_thresh).fillna(global_thr).values
        else:
            thr_map = np.full(len(test_df), global_thr)
        stats = evaluate_thresholded_preds(y_true, test_df['anomaly_score'].values, thr_map)
        # compute percent of all TPs found and precision@k for a small top-k budget (optional)
        # percent of all TPs covered:
        preds = (test_df['anomaly_score'].values >= thr_map).astype(int)
        tp_in_budget = int(((test_df['is_malicious'].astype(int).values) * preds).sum())
        pct_of_all_tp = (tp_in_budget / total_tp*100.0) if total_tp>0 else float('nan')
        rows.append({
            'pct': p, 'n_alerts': stats['n_alerts'], 'prec': stats['prec'],
            'rec': stats['rec'], 'f1': stats['f1'], 'pct_of_all_tp': pct_of_all_tp
        })
    res = pd.DataFrame(rows)
    display(res.sort_values('pct', ascending=False).reset_index(drop=True))
    # show best by F1
    best = res.loc[res['f1'].idxmax()]
    print(f"\nBest by F1 -> percentile {int(best['pct'])}, F1={best['f1']:.4f}, prec={best['prec']:.4f}, rec={best['rec']:.4f}, alerts={int(best['n_alerts'])}")
    return res

# Example usage:
res = tune_percentile(train_scores, test_df, pct_list=range(90,100), per_user=PER_USER_THRESHOLD)


Unnamed: 0,pct,n_alerts,prec,rec,f1,pct_of_all_tp
0,99,2149,0.037227,0.941176,0.07162,94.117647
1,98,4014,0.020429,0.964706,0.04001,96.470588
2,97,5658,0.014493,0.964706,0.028557,96.470588
3,96,7285,0.011531,0.988235,0.022795,98.823529
4,95,8890,0.009449,0.988235,0.018719,98.823529
5,94,10430,0.008054,0.988235,0.015977,98.823529
6,93,11952,0.007028,0.988235,0.013957,98.823529
7,92,13498,0.006223,0.988235,0.012368,98.823529
8,91,15002,0.005599,0.988235,0.011135,98.823529
9,90,16502,0.00509,0.988235,0.010128,98.823529



Best by F1 -> percentile 99, F1=0.0716, prec=0.0372, rec=0.9412, alerts=2149


In [8]:
# ----------------- thresholds: global and per-user -----------------
global_threshold = np.percentile(train_scores, PCT_THRESHOLD)
if PER_USER_THRESHOLD:
    pupct = PCT_THRESHOLD/100.0
    per_user_thresh = train_ben_df.groupby('user')['anomaly_score'].quantile(pupct)
    # fallback for users not present in train benign -> use global
    test_df['anomaly_threshold'] = test_df['user'].map(per_user_thresh).fillna(global_threshold)
else:
    test_df['anomaly_threshold'] = global_threshold

# mark model-detected anomalies (anomaly-only detection)
test_df['is_model_anomaly'] = (test_df['anomaly_score'] >= test_df['anomaly_threshold']).astype(int)

In [9]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# ensure labels/preds exist
if 'is_model_anomaly' not in test_df.columns:
    raise RuntimeError("Column 'is_model_anomaly' not found in test_df")

y_true = test_df['is_malicious'].astype(int).values
y_pred = test_df['is_model_anomaly'].astype(int).values

if len(y_true) == 0:
    print("No rows in test set to evaluate.")
else:
    acc   = accuracy_score(y_true, y_pred)
    prec  = precision_score(y_true, y_pred, zero_division=0)
    rec   = recall_score(y_true, y_pred, zero_division=0)
    f1    = f1_score(y_true, y_pred, zero_division=0)
    cm    = confusion_matrix(y_true, y_pred)  # rows: true, cols: pred

    print("Percentile: ", PCT_THRESHOLD)
    print("=== Detection metrics (anomaly-only thresholding) ===")
    print(f"Count (test rows) : {len(y_true)}")
    print(f"Accuracy : {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall   : {rec:.4f}")
    print(f"F1 score : {f1:.4f}\n")

    print("Confusion matrix (rows=true [0,1], cols=pred [0,1]):")
    print(cm)
    print("\nClassification report:")
    print(classification_report(y_true, y_pred, zero_division=0))


Percentile:  99
=== Detection metrics (anomaly-only thresholding) ===
Count (test rows) : 160861
Accuracy : 0.9871
Precision: 0.0372
Recall   : 0.9412
F1 score : 0.0716

Confusion matrix (rows=true [0,1], cols=pred [0,1]):
[[158707   2069]
 [     5     80]]

Classification report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99    160776
           1       0.04      0.94      0.07        85

    accuracy                           0.99    160861
   macro avg       0.52      0.96      0.53    160861
weighted avg       1.00      0.99      0.99    160861



In [10]:
# ----------------- Investigation budgets for model-score re-rank -----------------
import numpy as np

# sanity checks
if 'test_df' not in globals():
    raise RuntimeError("test_df not found. Run previous blocks first.")
if 'is_model_anomaly' not in test_df.columns:
    raise RuntimeError("'is_model_anomaly' missing in test_df. Run detection thresholding first.")
if 'anomaly_score' not in test_df.columns:
    raise RuntimeError("'anomaly_score' missing in test_df. Run IF scoring first.")

# prepare ranked list of model-detected anomalies by model score (descending)
model_anoms = test_df[test_df['is_model_anomaly'] == 1].copy()
model_anoms = model_anoms.sort_values('anomaly_score', ascending=False).reset_index(drop=True)

# prepare matching keys for mapping back to full test (robust matching)
test_keys = list(zip(test_df['user'], pd.to_datetime(test_df['day'])))
model_keys = list(zip(model_anoms['user'], pd.to_datetime(model_anoms['day'])))

total_tp = int(test_df['is_malicious'].sum())
total_model_alerts = len(model_anoms)

print(f"\nTotal true positives in test set: {total_tp}")
print(f"Total model-detected anomalies: {total_model_alerts}\n")

if total_model_alerts == 0:
    print("No model-detected anomalies to evaluate.")
else:

    # Relative budgets (Top X% of the model-detected alerts)
    percent_budgets = [0.01, 0.02, 0.05, 0.10, 0.15]  # 0.5%,1%,2%,5%,10%
    print("\nModel-score re-rank (Top X%):")
    print(f"{'Top-%':>8} | {'Top-N':>6} | {'% TP in budget':>15} | {'% of all TP covered':>21} | {'TP_in_budget':>12}")
    print("-"*88)
    for p in percent_budgets:
        k = max(1, int(np.floor(p * total_model_alerts)))
        head_keys = set(model_keys[:k])
        in_topk = np.array([1 if key in head_keys else 0 for key in test_keys], dtype=int)
        n_alerts = int(in_topk.sum())
        tp_in_budget = int(((test_df['is_malicious'].astype(int).values) * in_topk).sum())
        precision_pct = (tp_in_budget / n_alerts * 100.0) if n_alerts > 0 else 0.0
        coverage_pct = (tp_in_budget / total_tp * 100.0) if total_tp > 0 else float('nan')
        print(f"{p*100:7.2f}% | {k:6d} | {precision_pct:15.2f} | {coverage_pct:21.2f} | {tp_in_budget:12d}")

print("\nNote: '% TP in budget' = precision within that budget;")
print("      '% of all TP covered' = fraction of total test-set positives found inside the budget.\n")



Total true positives in test set: 85
Total model-detected anomalies: 2149


Model-score re-rank (Top X%):
   Top-% |  Top-N |  % TP in budget |   % of all TP covered | TP_in_budget
----------------------------------------------------------------------------------------
   1.00% |     21 |            4.76 |                  1.18 |            1
   2.00% |     42 |            9.52 |                  4.71 |            4
   5.00% |    107 |           12.15 |                 15.29 |           13
  10.00% |    214 |           15.42 |                 38.82 |           33
  15.00% |    322 |           15.22 |                 57.65 |           49

Note: '% TP in budget' = precision within that budget;
      '% of all TP covered' = fraction of total test-set positives found inside the budget.



In [11]:
 #----------------- Rule scoring function (deterministic) -----------------
def compute_rule_score_row(row):
    score = 0.0
    if row.get("wikileaks_flag",0) == 1:
        score += RULE_WEIGHTS["wikileaks_flag"]
    trio = {"offhour_usb_flag": row.get("offhour_usb_flag",0),
            "offhour_http_flag": row.get("offhour_http_flag",0),
            "offhour_logon_flag": row.get("offhour_logon_flag",0)}
    if trio["offhour_usb_flag"] == 1 or sum(trio.values()) >= 1:
        score += sum(RULE_WEIGHTS[k] for k,v in trio.items() if v==1)
    return score

# precompute rule_score on train/val/test (for tuning if needed)
train_ben_df['rule_score'] = train_ben_df.apply(compute_rule_score_row, axis=1)
test_df['rule_score'] = test_df.apply(compute_rule_score_row, axis=1)


In [12]:
# standardize rule scores using train benign stats (z)
rmean = train_ben_df['rule_score'].mean()
rstd  = train_ben_df['rule_score'].std() if train_ben_df['rule_score'].std() > 0 else 1.0
test_df['rule_z'] = (test_df['rule_score'] - rmean) / rstd
rule_z_std = test_df['rule_z'].replace([np.inf,-np.inf], np.nan).dropna().std()
anomaly_std = train_ben_df['anomaly_score'].std()
scale = anomaly_std / (rule_z_std if rule_z_std>0 else 1.0)
test_df['rule_z_scaled'] = test_df['rule_z'] * scale
# ----------------- Re-rank only the model-detected anomalies using hybrid score -----------------
# Note: we DO NOT use hybrid for detection; detection is anomaly-only thresholding above.
anomalies_all = test_df[test_df['is_model_anomaly'] == 1].copy()
HYBRID_ALPHA = 0.5   # try 0.2, 0.5, 0.7, 1.0 in grid-search
anomalies_all['hybrid_score'] = anomalies_all['anomaly_score'] + HYBRID_ALPHA * anomalies_all['rule_z_scaled']

# final re-ranked alerts (descending hybrid_score)
anomalies_all = anomalies_all.sort_values('hybrid_score', ascending=False).reset_index(drop=True)

In [13]:
import numpy as np

# anomalies_all must be the re-ranked DataFrame (descending hybrid_score)
# test_df must contain 'is_malicious' for total TPs

# safety checks
if 'anomalies_all' not in globals():
    raise RuntimeError("anomalies_all not found. Run the re-ranking block first.")
if 'test_df' not in globals():
    raise RuntimeError("test_df not found.")

total_tp = int(test_df['is_malicious'].sum())
total_alerts = len(anomalies_all)

print(f"Total re-ranked model-detected anomalies: {total_alerts}")
print(f"Total true positives in test set: {total_tp}\n")

if total_alerts == 0:
    print("No model-detected anomalies to evaluate.")
else:

    # define relative budgets (percent of re-ranked alerts)
    percent_budgets = [0.01, 0.02, 0.05, 0.10, 0.15]  # 0.5%, 1%, 2%, 5%, 10%
    percent_budgets = [p for p in percent_budgets if p > 0]
    percent_budgets = sorted(list(set(percent_budgets)))

    print("\nRelative budgets (Top X% of re-ranked alerts):")
    print(f"{'Top-%':>8} | {'Top-N':>6} | {'TP in budget (%)':>17} | {'TPs in budget / All TPs (%)':>28} | {'#TP_in_budget':>14}")
    print("-"*100)
    for p in percent_budgets:
        k = max(1, int(np.floor(p * total_alerts)))
        head = anomalies_all.head(k)
        n_alerts = len(head)
        tp_in_budget = int(head['is_malicious'].sum())
        precision_pct = (tp_in_budget / n_alerts * 100.0) if n_alerts>0 else 0.0
        coverage_pct = (tp_in_budget / total_tp * 100.0) if total_tp>0 else float('nan')
        print(f"{p*100:7.2f}% | {k:6d} | {precision_pct:17.2f} | {coverage_pct:28.2f} | {tp_in_budget:14d}")


Total re-ranked model-detected anomalies: 2149
Total true positives in test set: 85


Relative budgets (Top X% of re-ranked alerts):
   Top-% |  Top-N |  TP in budget (%) |  TPs in budget / All TPs (%) |  #TP_in_budget
----------------------------------------------------------------------------------------------------
   1.00% |     21 |            100.00 |                        24.71 |             21
   2.00% |     42 |            100.00 |                        49.41 |             42
   5.00% |    107 |             61.68 |                        77.65 |             66
  10.00% |    214 |             31.78 |                        80.00 |             68
  15.00% |    322 |             21.12 |                        80.00 |             68


In [31]:
# Top 15 users with highest hybrid scores
top15 = (
    anomalies_all
    .sort_values("hybrid_score", ascending=False)
    .head(15)[["user", "hybrid_score", "rule_z", "anomaly_score", "is_malicious"]]
    .reset_index(drop=True)
)

print("\n=== Top 15 Users with Highest Hybrid Scores ===")
print(top15.to_string(index=False))



=== Top 15 Users with Highest Hybrid Scores ===
   user  hybrid_score   rule_z  anomaly_score  is_malicious
AJR0932      0.403674 6.455615       0.219785             1
FTM0406      0.398085 6.455615       0.214196             1
HJB0742      0.387873 6.455615       0.203985             1
KLH0596      0.382510 6.455615       0.198621             1
RKD0604      0.377433 6.455615       0.193544             1
TAP0551      0.368518 6.455615       0.184629             1
AAM0658      0.361651 6.455615       0.177762             1
MYD0978      0.360372 6.455615       0.176483             1
EHD0584      0.360336 6.455615       0.176447             1
TAP0551      0.359626 6.455615       0.175737             1
LJR0523      0.350979 6.455615       0.167090             1
AJR0932      0.347512 6.455615       0.163623             1
BDV0168      0.347026 6.455615       0.163138             1
MCF0600      0.346965 6.455615       0.163076             1
EHB0824      0.342222 6.455615       0.158333      

In [21]:
# Ensure anomalies_all exists; if not, compute it from test_df
if 'anomalies_all' not in globals():
    # compute rule_z if missing
    if 'rule_z' not in test_df.columns:
        # compute train stats fallback if you have train_ben_df in scope
        rmean = train_ben_df['rule_score'].mean() if 'train_ben_df' in globals() else test_df['rule_score'].mean()
        rstd  = train_ben_df['rule_score'].std() if ('train_ben_df' in globals() and train_ben_df['rule_score'].std()>0) else (test_df['rule_score'].std() or 1.0)
        test_df['rule_z'] = (test_df['rule_score'] - rmean) / (rstd if rstd>0 else 1.0)
    anomalies_all = test_df[test_df.get('is_model_anomaly', 0) == 1].copy()
    if 'hybrid_score' not in anomalies_all.columns:
        anomalies_all['hybrid_score'] = anomalies_all['anomaly_score'] + (HYBRID_ALPHA if 'HYBRID_ALPHA' in globals() else 1.0) * anomalies_all['rule_z']
    anomalies_all = anomalies_all.sort_values('hybrid_score', ascending=False).reset_index(drop=True)

# determine which rule flags to inspect (fall back to names in environment)
rule_flags = RULE_FLAG_NAMES if 'RULE_FLAG_NAMES' in globals() else ['wikileaks_flag','offhour_usb_flag','offhour_http_flag','offhour_logon_flag','is_weekend']

def list_triggered_rules(row, flags=rule_flags):
    triggered = [f for f in flags if int(row.get(f, 0)) == 1]
    return ",".join(triggered) if triggered else "none"

# Add triggered_rules column (row-level)
anomalies_all['triggered_rules'] = anomalies_all.apply(lambda r: list_triggered_rules(r), axis=1)

# Top 10 rows
top10_rows = anomalies_all.head(10)[['user','day','hybrid_score','anomaly_score','rule_score','triggered_rules','is_malicious']].reset_index(drop=True)
print("Top 10 anomaly rows by hybrid_score (row-level):")
display(top10_rows)   # in Jupyter; use print(top10_rows.to_string(index=False)) if display not available

# Save to CSV if you want
top10_rows.to_csv('/content/top10_hybrid_rows.csv', index=False)
print("Saved /content/top10_hybrid_rows.csv")


Top 10 anomaly rows by hybrid_score (row-level):


Unnamed: 0,user,day,hybrid_score,anomaly_score,rule_score,triggered_rules,is_malicious
0,AJR0932,2010-09-18,0.403674,0.219785,2.3,"is_weekend,wikileaks_flag,offhour_logon_flag,o...",1
1,FTM0406,2010-11-25,0.398085,0.214196,2.3,"wikileaks_flag,offhour_logon_flag,offhour_usb_...",1
2,HJB0742,2010-11-25,0.387873,0.203985,2.3,"wikileaks_flag,offhour_logon_flag,offhour_usb_...",1
3,KLH0596,2011-02-12,0.38251,0.198621,2.3,"is_weekend,wikileaks_flag,offhour_logon_flag,o...",1
4,RKD0604,2010-07-13,0.377433,0.193544,2.3,"wikileaks_flag,offhour_logon_flag,offhour_usb_...",1
5,TAP0551,2010-10-23,0.368518,0.184629,2.3,"is_weekend,wikileaks_flag,offhour_logon_flag,o...",1
6,AAM0658,2010-10-23,0.361651,0.177762,2.3,"is_weekend,wikileaks_flag,offhour_logon_flag,o...",1
7,MYD0978,2010-12-18,0.360372,0.176483,2.3,"is_weekend,wikileaks_flag,offhour_logon_flag,o...",1
8,EHD0584,2010-10-02,0.360336,0.176447,2.3,"is_weekend,wikileaks_flag,offhour_logon_flag,o...",1
9,TAP0551,2010-10-29,0.359626,0.175737,2.3,"wikileaks_flag,offhour_logon_flag,offhour_usb_...",1


Saved /content/top10_hybrid_rows.csv


In [30]:
# ---------------------------
# Show one malicious user whose rank improved after hybrid re-rank
# Paste this AFTER anomalies_all is available
# ---------------------------

import numpy as np

if 'anomalies_all' not in globals():
    # compute anomalies_all same as notebook fallback
    if 'test_df' not in globals():
        raise RuntimeError("test_df not found; run detection first.")
    if 'rule_z' not in test_df.columns:
        rmean = train_ben_df['rule_score'].mean() if 'train_ben_df' in globals() else test_df['rule_score'].mean()
        rstd  = train_ben_df['rule_score'].std() if ('train_ben_df' in globals() and train_ben_df['rule_score'].std()>0) else (test_df['rule_score'].std() or 1.0)
        test_df['rule_z'] = (test_df['rule_score'] - rmean) / (rstd if rstd>0 else 1.0)
    anomalies_all = test_df[test_df.get('is_model_anomaly', 0) == 1].copy()
    alpha = HYBRID_ALPHA if 'HYBRID_ALPHA' in globals() else 0.5
    anomalies_all['hybrid_score'] = anomalies_all['anomaly_score'] + alpha * anomalies_all.get('rule_z', 0.0)
    anomalies_all = anomalies_all.sort_values('hybrid_score', ascending=False).reset_index(drop=True)

anoms = anomalies_all.copy()

# ensure hybrid_score exists
if 'hybrid_score' not in anoms.columns:
    alpha = HYBRID_ALPHA if 'HYBRID_ALPHA' in globals() else 0.5
    anoms['hybrid_score'] = anoms['anomaly_score'] + alpha * anoms.get('rule_z_scaled', 0.0)

# ranks within model-detected alerts (1 = highest anomaly_score)
anoms['model_rank']  = anoms['anomaly_score'].rank(method='min', ascending=False).astype(int)
anoms['hybrid_rank'] = anoms['hybrid_score'].rank(method='min', ascending=False).astype(int)
anoms['rank_change'] = anoms['model_rank'] - anoms['hybrid_rank']

# rule flags to inspect
rule_flags = RULE_FLAG_NAMES if 'RULE_FLAG_NAMES' in globals() else ['wikileaks_flag','offhour_usb_flag','offhour_http_flag','offhour_logon_flag','is_weekend']

def _triggered_list(row):
    trig = [f for f in rule_flags if int(row.get(f, 0)) == 1]
    return ",".join(trig) if trig else "none"

anoms['triggered_rules'] = anoms.apply(_triggered_list, axis=1)

# find one malicious row that improved (rank_change > 0)
cand = anoms[(anoms.get('is_malicious',0)==1) & (anoms['rank_change'] > 0)]

if cand.empty:
    print("No malicious model-detected alert improved its rank after hybrid re-rank.")
    # optional: show the top improved non-malicious if you want a concrete example
    top_nonmal = anoms[anoms['rank_change'] > 0].sort_values('rank_change', ascending=False)
    if len(top_nonmal) > 0:
        row = top_nonmal.iloc[0]
        print("\nShowing top-improved non-malicious example instead:")
        print(f"User: {row['user']}, day: {row.get('day')}")
        print(f"Anomaly score: {row['anomaly_score']:.6f}")
        print(f"hybrid score: {row['hybrid_score']:.6f}")
        print(f"Model rank -> Hybrid rank: {row['model_rank']} -> {row['hybrid_rank']} (change {int(row['rank_change'])})")
        print(f"Rule score: {row.get('rrule_z_scaled', np.nan)}; Triggered rules: {row['triggered_rules']}")
    else:
        print("No rows moved up after hybrid re-rank at all.")
else:
    # pick the malicious user with largest improvement
    row = cand.sort_values('rank_change', ascending=False).iloc[0]
    print("Malicious user whose rank increased after hybrid re-rank (best improvement):")
    print(f"User: {row['user']}")
    print(f"Day: {row.get('day')}")
    print(f"Anomaly score: {row['anomaly_score']:.6f}")
    print(f"Rank in model-detected alerts (model_rank): {int(row['model_rank'])}")
    print(f"Hybrid score: {row['hybrid_score']:.6f}")
    print(f"Rank after hybrid re-rank (hybrid_rank): {int(row['hybrid_rank'])}")
    print(f"Rank change (model_rank - hybrid_rank): {int(row['rank_change'])}  (positive => moved up)")
    print(f"Rule score: {row.get('rule_z_scaled', np.nan)}")
    print(f"Triggered rules that helped: {row['triggered_rules']}")


Malicious user whose rank increased after hybrid re-rank (best improvement):
User: RGG0064
Day: 2010-10-27 00:00:00
Anomaly score: 0.046183
Rank in model-detected alerts (model_rank): 690
Hybrid score: 0.230072
Rank after hybrid re-rank (hybrid_rank): 113
Rank change (model_rank - hybrid_rank): 577  (positive => moved up)
Rule score: 0.36777772947624304
Triggered rules that helped: wikileaks_flag,offhour_logon_flag,offhour_usb_flag,offhour_http_flag
