In [1]:
import sys
from pathlib import Path

# let notebook access code in src/
project_root = Path().resolve().parent  # .../FYP2
sys.path.append(str(project_root / "src"))

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix
)

# for Static model candidates
from sklearn.ensemble import RandomForestClassifier
import joblib
try:
    from xgboost import XGBClassifier
except ImportError:
    print("If xgboost import fails, install it in your venv: pip install xgboost")

# for Behavioral model
from catboost import CatBoostClassifier, Pool

# for dealing with imbalance
from sklearn.utils.class_weight import compute_class_weight


In [2]:
# ---------- Load STATIC data (file-based features like PE headers) ----------

df_static = pd.read_parquet(str(project_root / "data_processed/static_baseline.parquet"))

# Ground truth label: 1 = malicious (WannaCry), 0 = benign
y_static = (df_static["Benign"] == 0).astype(int)

# Drop non-feature columns that shouldn't go into training
# Keep only numeric PE-related features
# We'll guess for now that Feature columns = all columns except ['FileName','md5Hash','Benign']
static_drop_cols = ["FileName", "md5Hash", "Benign"]
X_static = df_static.drop(columns=static_drop_cols, errors="ignore")

print("STATIC SHAPE:", X_static.shape, y_static.shape)
print("STATIC class distribution:", np.bincount(y_static))


# ---------- Load BEHAVIORAL data (network / process-like features) ----------

df_behav = pd.read_parquet(str(project_root / "data_processed/behav_baseline.parquet"))

# Ground truth label: from 'Prediction' where 'A' means WannaCry
y_behav = df_behav["Prediction"].apply(lambda x: 1 if x == "A" else 0).astype(int)

# For behavioral features:
# We will reuse the raw columns and generate engineered features similar to build_behav_features().
# Since build_behav_features lives in src/utils.py, let's import and call it.

from utils import build_behav_features
X_behav_full = build_behav_features(df_behav)

print("BEHAV SHAPE (before col filtering):", X_behav_full.shape, y_behav.shape)
print("BEHAV class distribution:", np.bincount(y_behav))

# Now: we will drop columns that are obviously IDs / text addresses if they make the model confused.
# Optionally keep numeric / engineered columns only.
# We'll do a simple filter: keep numeric columns.
X_behav = X_behav_full.select_dtypes(include=[np.number]).copy()

print("BEHAV SHAPE (numeric only):", X_behav.shape)


STATIC SHAPE: (62485, 15) (62485,)
STATIC class distribution: [27118 35367]
BEHAV SHAPE (before col filtering): (149043, 19) (149043,)
BEHAV class distribution: [106482  42561]
BEHAV SHAPE (numeric only): (149043, 10)


In [3]:
# ---------- Train/Test split for BEHAVIORAL ----------
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_behav, y_behav,
    test_size=0.2,
    random_state=42,
    stratify=y_behav
)

print("Behavioral train size:", Xb_train.shape, "test size:", Xb_test.shape)

# Compute class weights so model pays more attention to WannaCry class (1)
classes = np.unique(yb_train)
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=yb_train
)
class_weights_dict = {cls: w for cls, w in zip(classes, class_weights)}
print("Behavioral class weights:", class_weights_dict)

# Prepare CatBoost Pools (this is CatBoost's native data wrapper)
train_pool = Pool(Xb_train, label=yb_train)
test_pool  = Pool(Xb_test,  label=yb_test)

# Train a tuned CatBoost
cat_model = CatBoostClassifier(
    iterations=800,
    depth=8,
    learning_rate=0.05,
    l2_leaf_reg=3,
    loss_function="Logloss",
    eval_metric="AUC",
    verbose=False,
    class_weights=[class_weights_dict.get(0,1.0), class_weights_dict.get(1,1.0)],
    random_seed=42
)

cat_model.fit(train_pool, eval_set=test_pool)

# Predict probabilities and classes
proba_b = cat_model.predict_proba(Xb_test)[:, 1]

# Instead of a hardcoded threshold, let's search best F1 threshold
thresholds = np.linspace(0.1, 0.9, 81)
best_f1 = -1
best_thr = 0.5
for t in thresholds:
    preds_t = (proba_b >= t).astype(int)
    f1_t = f1_score(yb_test, preds_t, zero_division=0)
    if f1_t > best_f1:
        best_f1 = f1_t
        best_thr = t

preds_b = (proba_b >= best_thr).astype(int)

# Evaluate tuned behavioral model
acc_b  = accuracy_score(yb_test, preds_b)
prec_b = precision_score(yb_test, preds_b, zero_division=0)
rec_b  = recall_score(yb_test, preds_b, zero_division=0)
f1_b   = f1_score(yb_test, preds_b, zero_division=0)
auc_b  = roc_auc_score(yb_test, proba_b)
tn_b, fp_b, fn_b, tp_b = confusion_matrix(yb_test, preds_b).ravel()
fpr_b = fp_b / (fp_b + tn_b) if (fp_b + tn_b) > 0 else 0.0

print("=== Tuned Behavioral CatBoost ===")
print(f"Best threshold  : {best_thr:.3f}")
print(f"Accuracy        : {acc_b:.4f}")
print(f"Precision       : {prec_b:.4f}")
print(f"Recall (TPR)    : {rec_b:.4f}")
print(f"F1-score        : {f1_b:.4f}")
print(f"ROC AUC         : {auc_b:.4f}")
print(f"False Pos Rate  : {fpr_b:.4f}")
print("Confusion Matrix [tn fp; fn tp]:")
print([[tn_b, fp_b],[fn_b, tp_b]])


Behavioral train size: (119234, 10) test size: (29809, 10)
Behavioral class weights: {np.int64(0): np.float64(0.6998532605505664), np.int64(1): np.float64(1.7509177949425827)}
=== Tuned Behavioral CatBoost ===
Best threshold  : 0.620
Accuracy        : 0.9831
Precision       : 0.9791
Recall (TPR)    : 0.9612
F1-score        : 0.9701
ROC AUC         : 0.9974
False Pos Rate  : 0.0082
Confusion Matrix [tn fp; fn tp]:
[[np.int64(21122), np.int64(175)], [np.int64(330), np.int64(8182)]]


In [4]:
# ---------- Train/Test split for STATIC ----------
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    X_static, y_static,
    test_size=0.2,
    random_state=42,
    stratify=y_static
)

print("Static train size:", Xs_train.shape, "test size:", Xs_test.shape)
print("Static class distribution train:", np.bincount(ys_train))

# CLASS WEIGHTS for imbalance
classes_s = np.unique(ys_train)
class_weights_s = compute_class_weight(
    class_weight="balanced",
    classes=classes_s,
    y=ys_train
)
cw_s = {cls: w for cls, w in zip(classes_s, class_weights_s)}
print("Static class weights:", cw_s)

# --- 1. RandomForest baseline (retrained properly) ---
rf_model = RandomForestClassifier(
    n_estimators=400,
    max_depth=12,
    class_weight=cw_s,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(Xs_train, ys_train)

rf_proba = rf_model.predict_proba(Xs_test)[:,1]

# tune threshold for RF
thresholds = np.linspace(0.1, 0.9, 81)
best_f1_rf = -1
best_thr_rf = 0.5
for t in thresholds:
    preds_t = (rf_proba >= t).astype(int)
    f1_t = f1_score(ys_test, preds_t, zero_division=0)
    if f1_t > best_f1_rf:
        best_f1_rf = f1_t
        best_thr_rf = t

rf_preds = (rf_proba >= best_thr_rf).astype(int)

acc_rf  = accuracy_score(ys_test, rf_preds)
prec_rf = precision_score(ys_test, rf_preds, zero_division=0)
rec_rf  = recall_score(ys_test, rf_preds, zero_division=0)
f1_rf   = f1_score(ys_test, rf_preds, zero_division=0)
auc_rf  = roc_auc_score(ys_test, rf_proba)
tn_rf, fp_rf, fn_rf, tp_rf = confusion_matrix(ys_test, rf_preds).ravel()
fpr_rf = fp_rf / (fp_rf + tn_rf) if (fp_rf + tn_rf) > 0 else 0.0


print("\n=== Retrained Static RandomForest ===")
print(f"Best threshold  : {best_thr_rf:.3f}")
print(f"Accuracy        : {acc_rf:.4f}")
print(f"Precision       : {prec_rf:.4f}")
print(f"Recall (TPR)    : {rec_rf:.4f}")
print(f"F1-score        : {f1_rf:.4f}")
print(f"ROC AUC         : {auc_rf:.4f}")
print(f"False Pos Rate  : {fpr_rf:.4f}")
print("Confusion Matrix [tn fp; fn tp]:")
print([[tn_rf, fp_rf],[fn_rf, tp_rf]])


# --- 2. XGBoost candidate for static ---
try:
    xgb_model = XGBClassifier(
        n_estimators=500,
        max_depth=8,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric='logloss',
        scale_pos_weight= cw_s.get(1,1.0) / cw_s.get(0,1.0),  # weight malware class
        random_state=42,
        n_jobs=-1
    )

    xgb_model.fit(Xs_train, ys_train)

    xgb_proba = xgb_model.predict_proba(Xs_test)[:,1]

    # tune threshold for XGB
    thresholds = np.linspace(0.1, 0.9, 81)
    best_f1_xgb = -1
    best_thr_xgb = 0.5
    for t in thresholds:
        preds_t = (xgb_proba >= t).astype(int)
        f1_t = f1_score(ys_test, preds_t, zero_division=0)
        if f1_t > best_f1_xgb:
            best_f1_xgb = f1_t
            best_thr_xgb = t

    xgb_preds = (xgb_proba >= best_thr_xgb).astype(int)

    acc_xgb  = accuracy_score(ys_test, xgb_preds)
    prec_xgb = precision_score(ys_test, xgb_preds, zero_division=0)
    rec_xgb  = recall_score(ys_test, xgb_preds, zero_division=0)
    f1_xgb   = f1_score(ys_test, xgb_preds, zero_division=0)
    auc_xgb  = roc_auc_score(ys_test, xgb_proba)
    tn_xgb, fp_xgb, fn_xgb, tp_xgb = confusion_matrix(ys_test, xgb_preds).ravel()
    fpr_xgb = fp_xgb / (fp_xgb + tn_xgb) if (fp_xgb + tn_xgb) > 0 else 0.0

    print("\n=== Static XGBoost Candidate ===")
    print(f"Best threshold  : {best_thr_xgb:.3f}")
    print(f"Accuracy        : {acc_xgb:.4f}")
    print(f"Precision       : {prec_xgb:.4f}")
    print(f"Recall (TPR)    : {rec_xgb:.4f}")
    print(f"F1-score        : {f1_xgb:.4f}")
    print(f"ROC AUC         : {auc_xgb:.4f}")
    print(f"False Pos Rate  : {fpr_xgb:.4f}")
    print("Confusion Matrix [tn fp; fn tp]:")
    print([[tn_xgb, fp_xgb],[fn_xgb, tp_xgb]])

except Exception as e:
    print("Could not train XGBoost:", e)
    xgb_model = None


Static train size: (49988, 15) test size: (12497, 15)
Static class distribution train: [21694 28294]
Static class weights: {np.int64(0): np.float64(1.1521157923849912), np.int64(1): np.float64(0.8833674984095568)}

=== Retrained Static RandomForest ===
Best threshold  : 0.560
Accuracy        : 0.9961
Precision       : 0.9958
Recall (TPR)    : 0.9973
F1-score        : 0.9965
ROC AUC         : 0.9996
False Pos Rate  : 0.0055
Confusion Matrix [tn fp; fn tp]:
[[np.int64(5394), np.int64(30)], [np.int64(19), np.int64(7054)]]

=== Static XGBoost Candidate ===
Best threshold  : 0.780
Accuracy        : 0.9966
Precision       : 0.9980
Recall (TPR)    : 0.9959
F1-score        : 0.9970
ROC AUC         : 0.9998
False Pos Rate  : 0.0026
Confusion Matrix [tn fp; fn tp]:
[[np.int64(5410), np.int64(14)], [np.int64(29), np.int64(7044)]]


In [5]:
optimized_dir = project_root / "models" / "optimized"
optimized_dir.mkdir(parents=True, exist_ok=True)

# Save tuned behavioral CatBoost
catboost_path = optimized_dir / "behav_catboost_tuned.cbm"
cat_model.save_model(catboost_path)

with open(optimized_dir / "behav_threshold.json", "w") as f:
    import json
    json.dump({"best_threshold": float(best_thr), "note": "tuned F1 threshold"}, f)

# Save static models
joblib.dump(rf_model, optimized_dir / "static_rf_tuned.joblib")
joblib.dump({"best_threshold": float(best_thr_rf)}, optimized_dir / "static_rf_threshold.json")
joblib.dump(list(X_static.columns), optimized_dir / "static_rf_feature_names.joblib")

if 'xgb_model' in locals() and xgb_model is not None:
    joblib.dump(xgb_model, optimized_dir / "static_xgb_tuned.joblib")
    joblib.dump({"best_threshold": float(best_thr_xgb)}, optimized_dir / "static_xgb_threshold.json")
    joblib.dump(list(X_static.columns), optimized_dir / "static_xgb_feature_names.joblib")

print("Saved optimized models in", optimized_dir)


Saved optimized models in C:\Users\richa\OneDrive\Documents\FYP2\models\optimized


In [1]:
import pandas as pd
from pathlib import Path

# pick a few rows from each processed dataset
behav = pd.read_parquet("data_processed/behav_baseline.parquet").head(5)
static = pd.read_parquet("data_processed/static_baseline.parquet").head(5)

# save them to CSVs
behav.to_csv("demo_behav_sample.csv", index=False)
static.to_csv("demo_static_sample.csv", index=False)


FileNotFoundError: [Errno 2] No such file or directory: 'data_processed/behav_baseline.parquet'

In [1]:
import pandas as pd
from pathlib import Path

project_root = Path().resolve().parent  # this assumes you're in FYP2/notebooks right now

behav_path = project_root / "data_processed" / "behav_baseline.parquet"
static_path = project_root / "data_processed" / "static_baseline.parquet"

print("behav_path:", behav_path)
print("static_path:", static_path)

behav = pd.read_parquet(behav_path).head(5)
static = pd.read_parquet(static_path).head(5)

behav.to_csv(project_root / "demo_behav_sample.csv", index=False)
static.to_csv(project_root / "demo_static_sample.csv", index=False)

print("Saved demo_behav_sample.csv and demo_static_sample.csv in project root ✅")


behav_path: C:\Users\richa\OneDrive\Documents\FYP2\data_processed\behav_baseline.parquet
static_path: C:\Users\richa\OneDrive\Documents\FYP2\data_processed\static_baseline.parquet
Saved demo_behav_sample.csv and demo_static_sample.csv in project root ✅


In [1]:
from pathlib import Path
import sys, json, numpy as np, pandas as pd
project_root = Path().resolve().parent
sys.path.append(str(project_root / "src"))

from utils import build_behav_features  # uses your utils.py

# load raw parquet
df_behav_raw = pd.read_parquet(project_root / "data_processed" / "behav_baseline.parquet")

# engineer features exactly like inference does
df_b = build_behav_features(df_behav_raw)

# label (same as before)
y_cb = df_b["is_wannacry"].astype(int) if "is_wannacry" in df_b.columns else \
       df_behav_raw["Prediction"].apply(lambda x: 1 if x == "A" else 0).astype(int)

# define columns used by model (order matters!)
cat_cols = ["Protocol","Flag","Threats","ProtoFlag","PortBucket","has_BTC","has_USD"]
num_cols = ["Time","Clusters","Netflow_Bytes","BTC","USD"]  # <— order fixed
use_cols = num_cols + cat_cols

# assemble X in that exact order
X_cb = df_b.reindex(columns=use_cols, fill_value=0).copy()


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.utils.class_weight import compute_class_weight

Xtr_cb, Xte_cb, ytr_cb, yte_cb = train_test_split(
    X_cb, y_cb, test_size=0.20, stratify=y_cb, random_state=42
)

# cat feature indices (positions in use_cols)
cat_idx = [X_cb.columns.get_loc(c) for c in cat_cols]

# class weighting
neg = int((ytr_cb == 0).sum()); pos = int((ytr_cb == 1).sum())
spw = neg / max(1, pos)

train_pool = Pool(Xtr_cb, ytr_cb, cat_features=cat_idx)
val_pool   = Pool(Xte_cb, yte_cb, cat_features=cat_idx)

cb = CatBoostClassifier(
    iterations=1000,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="PRAUC",
    scale_pos_weight=spw * 1.2,
    random_seed=42,
    verbose=False,
    od_type="Iter",
    od_wait=50
)
cb.fit(train_pool, eval_set=val_pool, use_best_model=True)

proba_cb = cb.predict_proba(val_pool)[:, 1]
prec, rec, thr = precision_recall_curve(yte_cb, proba_cb)
f1 = 2*(prec*rec)/(prec+rec+1e-12)
best_idx = int(np.nanargmax(f1))
best_thr = float(thr[best_idx]) if best_idx < len(thr) else 0.5

print("Chosen threshold:", best_thr)


Chosen threshold: 0.6741265375003658


In [3]:
optimized_dir = project_root / "models" / "optimized"
optimized_dir.mkdir(parents=True, exist_ok=True)

# Save CatBoost model
cb.save_model(optimized_dir / "behav_catboost_tuned.cbm")

# Save threshold
with open(optimized_dir / "behav_threshold.json", "w", encoding="utf-8") as f:
    json.dump({"best_threshold": float(best_thr)}, f, ensure_ascii=False, indent=2)

# Save the exact feature order and types
feat_meta = {
    "use_cols": list(X_cb.columns),
    "cat_cols": cat_cols,
    "num_cols": num_cols
}
with open(optimized_dir / "behav_feature_names.json", "w", encoding="utf-8") as f:
    json.dump(feat_meta, f, ensure_ascii=False, indent=2)

print("✅ Behavioral model + feature names saved correctly.")


✅ Behavioral model + feature names saved correctly.


In [1]:
with open(optimized_dir / "static_xgb_threshold.json", "w", encoding="utf-8") as f:
    json.dump({"best_threshold": float(best_thr_xgb)}, f, ensure_ascii=False, indent=2)


NameError: name 'optimized_dir' is not defined

In [3]:
from pathlib import Path

project_root = Path().resolve().parent   # <-- same as before
optimized_dir = project_root / "models" / "optimized"
optimized_dir.mkdir(parents=True, exist_ok=True)

print("optimized_dir =", optimized_dir)


optimized_dir = C:\Users\richa\OneDrive\Documents\FYP2\models\optimized


In [6]:
# tune threshold for XGB
thresholds = np.linspace(0.1, 0.9, 81)
best_f1_xgb = -1
best_thr_xgb = 0.5
for t in thresholds:
    preds_t = (xgb_proba >= t).astype(int)
    f1_t = f1_score(ys_test, preds_t, zero_division=0)
    if f1_t > best_f1_xgb:
        best_f1_xgb = f1_t
        best_thr_xgb = t

print("best_thr_xgb =", best_thr_xgb)


NameError: name 'np' is not defined