In [1]:
# --- ONE CELL ONLY ---
from pathlib import Path
import sys, os, json
import pandas as pd
import numpy as np          # <- fixes your "np not defined" error
import joblib

# Point notebook to your project root (FYP2 folder)
project_root = Path().resolve().parent
print("project_root =", project_root)

# Make 'src' importable
sys.path.append(str(project_root / "src"))

# Fast checks of the expected folders
for p in [project_root/"src", project_root/"models", project_root/"models"/"optimized", project_root/"templates", project_root/"static"]:
    print(("OK " if p.exists() else "MISS "), p)


project_root = C:\Users\richa\OneDrive\Documents\FYP2
OK  C:\Users\richa\OneDrive\Documents\FYP2\src
OK  C:\Users\richa\OneDrive\Documents\FYP2\models
OK  C:\Users\richa\OneDrive\Documents\FYP2\models\optimized
MISS  C:\Users\richa\OneDrive\Documents\FYP2\templates
MISS  C:\Users\richa\OneDrive\Documents\FYP2\static


In [2]:
# Expected artifacts
opt = project_root / "models" / "optimized"
expected = [
    opt/"behav_catboost_tuned.cbm",
    opt/"behav_threshold.json",
    opt/"behav_feature_names.json",
    opt/"static_xgb_tuned.joblib",
    opt/"static_xgb_threshold.json",
    opt/"static_xgb_feature_names.joblib",
]
for f in expected:
    print(("OK " if f.exists() else "MISS "), f.name)


OK  behav_catboost_tuned.cbm
OK  behav_threshold.json
OK  behav_feature_names.json
OK  static_xgb_tuned.joblib
OK  static_xgb_threshold.json
OK  static_xgb_feature_names.joblib


In [3]:
def read_json(path):
    try:
        with open(path, "r", encoding="utf-8") as f:
            return True, json.load(f)
    except Exception as e:
        return False, str(e)

ok_thr, beh_thr = read_json(opt/"behav_threshold.json")
ok_fmeta, beh_meta = read_json(opt/"behav_feature_names.json")
ok_xthr, xgb_thr = read_json(opt/"static_xgb_threshold.json")

print("behav_threshold.json:", "OK" if ok_thr else beh_thr)
print("behav_feature_names.json:", "OK" if ok_fmeta else beh_meta)
print("static_xgb_threshold.json:", "OK" if ok_xthr else xgb_thr)

if ok_fmeta:
    print("Feature order (first 6):", beh_meta["use_cols"][:6])


behav_threshold.json: OK
behav_feature_names.json: OK
static_xgb_threshold.json: 'utf-8' codec can't decode byte 0x80 in position 0: invalid start byte
Feature order (first 6): ['Time', 'Clusters', 'Netflow_Bytes', 'BTC', 'USD', 'Protocol']


In [5]:
from src.behav_model import predict_behav
from src.static_model import predict_static
from src.db.connection import get_connection   # optional DB ping
print("Imports OK")


ModuleNotFoundError: No module named 'src'

In [6]:
# Confirm the models load without running predictions
from catboost import CatBoostClassifier

# Behavioral model load test
cb = CatBoostClassifier()
cb.load_model(opt/"behav_catboost_tuned.cbm")
print("CatBoost model: OK")

# Static model load test
xgb = joblib.load(opt/"static_xgb_tuned.joblib")
print("Static XGB model: OK")


CatBoost model: OK
Static XGB model: OK


In [7]:
demo_behav_csv  = project_root / "demo_behav_sample.csv"
demo_static_csv = project_root / "demo_static_sample.csv"

print("demo_behav_sample.csv :", "OK" if demo_behav_csv.exists() else "MISS")
print("demo_static_sample.csv:", "OK" if demo_static_csv.exists() else "MISS")

behav_df  = pd.read_csv(demo_behav_csv)
static_df = pd.read_csv(demo_static_csv)

print("behav_df shape :", behav_df.shape)
print("static_df shape:", static_df.shape)
print("behav_df columns:", list(behav_df.columns)[:8], "...")
print("static_df columns:", list(static_df.columns)[:8], "...")


demo_behav_sample.csv : OK
demo_static_sample.csv: OK
behav_df shape : (5, 14)
static_df shape: (5, 18)
behav_df columns: ['Time', 'Protocol', 'Flag', 'Family', 'Clusters', 'SeedAddress', 'ExpAddress', 'BTC'] ...
static_df columns: ['FileName', 'md5Hash', 'Machine', 'DebugSize', 'DebugRVA', 'MajorImageVersion', 'MajorOSVersion', 'ExportRVA'] ...


In [9]:
try:
    proba_b, pred_b, thr_b = predict_behav(behav_df)
    print("Behavioral mean prob :", float(np.mean(proba_b)))
    print("Behavioral threshold :", float(thr_b))
    print("Behavioral decision  :", int((np.mean(proba_b) >= thr_b)))
except Exception as e:
    print("❌ Behavioral predict failed →", e)


❌ Behavioral predict failed → name 'predict_behav' is not defined


In [10]:
try:
    proba_s, pred_s, thr_s = predict_static(static_df)
    print("Static mean prob :", float(np.mean(proba_s)))
    print("Static threshold :", float(thr_s))
    print("Static decision  :", int((np.mean(proba_s) >= thr_s)))
except Exception as e:
    print("❌ Static predict failed →", e)


❌ Static predict failed → name 'predict_static' is not defined


In [12]:
try:
    conn = get_connection()
    print("MySQL connect OK")
    conn.close()
except Exception as e:
    print("❌ MySQL connection failed →", e)


❌ MySQL connection failed → name 'get_connection' is not defined


In [13]:
from pathlib import Path
import sys

project_root = Path().resolve().parent     # prints earlier showed this is .../FYP2
sys.path.insert(0, str(project_root))      # make package root importable
sys.path.insert(0, str(project_root/"src"))  # and allow direct module fallback

# quick proof
print("in sys.path? src =", str(project_root/"src") in sys.path)


in sys.path? src = True


In [14]:
from src.behav_model import predict_behav
from src.static_model import predict_static
from src.db.connection import get_connection
print("Imports OK")


Imports OK


In [15]:
import json
from pathlib import Path

opt = project_root / "models" / "optimized"
# remove the bad binary file (safe)
Path(opt/"static_xgb_threshold.json").unlink(missing_ok=True)

# write a real JSON file
new_thr = 0.780   # <-- use the exact threshold you printed from the notebook
with open(opt/"static_xgb_threshold.json", "w", encoding="utf-8") as f:
    json.dump({"best_threshold": float(new_thr)}, f, ensure_ascii=False, indent=2)

print("Rewrote static_xgb_threshold.json as UTF-8 JSON.")


Rewrote static_xgb_threshold.json as UTF-8 JSON.


In [16]:
import pandas as pd, numpy as np

behav_df = pd.read_csv(project_root/"demo_behav_sample.csv")
proba_b, pred_b, thr_b = predict_behav(behav_df)

print("Behavioral mean prob :", float(np.mean(proba_b)))
print("Behavioral threshold :", float(thr_b))
print("Behavioral decision  :", int((np.mean(proba_b) >= thr_b)))


Behavioral mean prob : 0.8138635252751529
Behavioral threshold : 0.6741265375003658
Behavioral decision  : 1


In [17]:
static_df = pd.read_csv(project_root/"demo_static_sample.csv")
proba_s, pred_s, thr_s = predict_static(static_df)

print("Static mean prob :", float(np.mean(proba_s)))
print("Static threshold :", float(thr_s))
print("Static decision  :", int((np.mean(proba_s) >= thr_s)))


Static mean prob : 0.03285565227270126
Static threshold : 0.78
Static decision  : 0


In [18]:
conn = get_connection()
print("MySQL connect OK")
conn.close()


✅ Connected to MySQL database
MySQL connect OK


In [None]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np

project_root = Path().resolve().parent   # this is your FYP2 folder
sys.path.insert(0, str(project_root))
sys.path.insert(0, str(project_root / "src"))

print("OK — project_root =", project_root)


OK — project_root = C:\Users\richa\OneDrive\Documents\FYP2


In [3]:
from src.utils import build_behav_features

# Load raw parquet
behav_path = project_root / "data_processed" / "behav_baseline.parquet"
df_behav = pd.read_parquet(behav_path)

# Behavioral target
y_behav = (df_behav["Family"].astype(str) == "WannaCry").astype(int)

# Build engineered features (same pipeline used in training)
X_behav = build_behav_features(df_behav)

print("Behavioral shape:", X_behav.shape, y_behav.shape)


Behavioral shape: (149043, 19) (149043,)


In [4]:
from sklearn.model_selection import train_test_split

Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    X_behav,
    y_behav,
    test_size=0.2,
    random_state=42,
    stratify=y_behav
)

print("Train/Test OK:", Xb_train.shape, Xb_test.shape)


Train/Test OK: (119234, 19) (29809, 19)


In [5]:
from catboost import CatBoostClassifier

opt_dir = project_root / "models" / "optimized"
cat_model = CatBoostClassifier()
cat_model.load_model(opt_dir / "behav_catboost_tuned.cbm")

print("✅ Loaded CatBoost model from optimized/")


✅ Loaded CatBoost model from optimized/


In [6]:
import json

with open(opt_dir / "behav_threshold.json", "r", encoding="utf-8") as f:
    beh_thr_meta = json.load(f)

best_thr = float(beh_thr_meta.get("best_threshold", 0.5))
print("✅ Behavioral threshold =", best_thr)


✅ Behavioral threshold = 0.6741265375003658


In [8]:
import json
from catboost import Pool

# 1) Load the feature schema saved during training
with open(opt_dir / "behav_feature_names.json", "r", encoding="utf-8") as f:
    feat_meta = json.load(f)

use_cols = feat_meta["use_cols"]
cat_cols = feat_meta["cat_cols"]
num_cols = feat_meta["num_cols"]

print("Feature order:", use_cols)

# 2) Build engineered features again, then KEEP ONLY the expected columns in the exact order
from src.utils import build_behav_features
X_behav_full = build_behav_features(df_behav)

# create any missing cols with safe defaults
for c in use_cols:
    if c not in X_behav_full.columns:
        X_behav_full[c] = 0

feat_df = X_behav_full[use_cols].copy()

# 3) Enforce dtypes
for c in cat_cols:
    feat_df[c] = feat_df[c].astype(str)
for c in num_cols:
    feat_df[c] = pd.to_numeric(feat_df[c], errors="coerce").fillna(0.0)

# 4) Recreate the same split on this aligned matrix
Xb_train, Xb_test, yb_train, yb_test = train_test_split(
    feat_df, y_behav, test_size=0.20, random_state=42, stratify=y_behav
)

# 5) Wrap in a CatBoost Pool and mark categorical columns
cat_idx = [feat_df.columns.get_loc(c) for c in cat_cols]
test_pool = Pool(Xb_test, label=yb_test, cat_features=cat_idx)

print("Prepared test_pool with", len(cat_idx), "categorical columns.")


Feature order: ['Time', 'Clusters', 'Netflow_Bytes', 'BTC', 'USD', 'Protocol', 'Flag', 'Threats', 'ProtoFlag', 'PortBucket', 'has_BTC', 'has_USD']
Prepared test_pool with 7 categorical columns.


In [9]:
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# Predict probabilities using the Pool (so CatBoost knows which cols are categorical)
proba_b = cat_model.predict_proba(test_pool)[:, 1]
preds_b = (proba_b >= best_thr).astype(int)

acc_b  = accuracy_score(yb_test, preds_b)
prec_b = precision_score(yb_test, preds_b, zero_division=0)
rec_b  = recall_score(yb_test, preds_b, zero_division=0)
f1_b   = f1_score(yb_test, preds_b, zero_division=0)
auc_b  = roc_auc_score(yb_test, proba_b)
tn_b, fp_b, fn_b, tp_b = confusion_matrix(yb_test, preds_b).ravel()
fpr_b = fp_b / (fp_b + tn_b) if (fp_b + tn_b) else 0.0

print("\n=== Tuned Behavioral CatBoost (re-eval) ===")
print(f"Threshold       : {best_thr:.3f}")
print(f"Accuracy        : {acc_b:.4f}")
print(f"Precision       : {prec_b:.4f}")
print(f"Recall (TPR)    : {rec_b:.4f}")
print(f"F1-score        : {f1_b:.4f}")
print(f"ROC AUC         : {auc_b:.4f}")
print(f"False Pos Rate  : {fpr_b:.4f}")
print("Confusion Matrix [tn fp; fn tp]:")
print([[tn_b, fp_b],[fn_b, tp_b]])



=== Tuned Behavioral CatBoost (re-eval) ===
Threshold       : 0.674
Accuracy        : 0.7343
Precision       : 0.2331
Recall (TPR)    : 0.6366
F1-score        : 0.3412
ROC AUC         : 0.7792
False Pos Rate  : 0.2538
Confusion Matrix [tn fp; fn tp]:
[[np.int64(19839), np.int64(6748)], [np.int64(1171), np.int64(2051)]]


In [10]:
df_behav["Family"] = df_behav["Family"].astype(str).str.strip().str.casefold()
y = (df_behav["Family"] == "wannacry").astype(int)


In [11]:
from pathlib import Path
import sys, json, numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from catboost import CatBoostClassifier, Pool
from sklearn.utils.class_weight import compute_class_weight
import joblib

project_root = Path().resolve().parent          # .../FYP2
sys.path.insert(0, str(project_root / "src"))   # import src.*
opt_dir = project_root / "models" / "optimized"
opt_dir.mkdir(parents=True, exist_ok=True)

print("project_root:", project_root)


project_root: C:\Users\richa\OneDrive\Documents\FYP2


In [12]:
from src.utils import build_behav_features

behav_path = project_root / "data_processed" / "behav_baseline.parquet"
df_raw = pd.read_parquet(behav_path)

# Ground truth: Family == WannaCry  →  1 else 0
df_raw["Family"] = df_raw["Family"].astype(str).str.strip()
y = (df_raw["Family"].str.casefold() == "wannacry").astype(int)

# Build engineered features (must match your training/inference pipeline)
X = build_behav_features(df_raw)

print("Shapes:", X.shape, y.shape)
print("Class balance (0=benign,1=WC):", np.bincount(y))


Shapes: (149043, 19) (149043,)
Class balance (0=benign,1=WC): [132933  16110]


In [13]:
use_cols = [
    "Time", "Clusters", "Netflow_Bytes", "BTC", "USD",   # numeric
    "Protocol", "Flag", "Threats", "ProtoFlag", "PortBucket", "has_BTC", "has_USD"  # categorical
]

num_cols = ["Time", "Clusters", "Netflow_Bytes", "BTC", "USD"]
cat_cols = ["Protocol", "Flag", "Threats", "ProtoFlag", "PortBucket", "has_BTC", "has_USD"]

# Ensure presence + order + dtypes
for c in use_cols:
    if c not in X.columns:
        X[c] = 0 if c in num_cols else ""

X = X[use_cols].copy()
for c in num_cols:
    X[c] = pd.to_numeric(X[c], errors="coerce").fillna(0.0)
for c in cat_cols:
    X[c] = X[c].astype(str)

print("Column check OK:", list(X.columns))


Column check OK: ['Time', 'Clusters', 'Netflow_Bytes', 'BTC', 'USD', 'Protocol', 'Flag', 'Threats', 'ProtoFlag', 'PortBucket', 'has_BTC', 'has_USD']


In [14]:
Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# class weights to handle imbalance
classes = np.unique(ytr)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=ytr)
cw_dict = {cls: w for cls, w in zip(classes, cw)}

# cat idx by column position
cat_idx = [X.columns.get_loc(c) for c in cat_cols]

train_pool = Pool(Xtr, ytr, cat_features=cat_idx)
val_pool   = Pool(Xte, yte, cat_features=cat_idx)

cb = CatBoostClassifier(
    iterations=1200,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="PRAUC",
    class_weights=[cw_dict.get(0,1.0), cw_dict.get(1,1.0)],
    od_type="Iter",
    od_wait=50,
    verbose=False,
    random_seed=42
)
cb.fit(train_pool, eval_set=val_pool, use_best_model=True)
print("Model trained ✅")


Model trained ✅


In [15]:
# PR-threshold tuning
from sklearn.metrics import precision_recall_curve

proba = cb.predict_proba(val_pool)[:, 1]
prec, rec, thr = precision_recall_curve(yte, proba)
f1 = 2 * (prec * rec) / (prec + rec + 1e-12)
best_i = int(np.nanargmax(f1))
best_thr = float(thr[best_i]) if best_i < len(thr) else 0.5

pred = (proba >= best_thr).astype(int)

acc  = accuracy_score(yte, pred)
ppv  = precision_score(yte, pred, zero_division=0)
tpr  = recall_score(yte, pred, zero_division=0)
f1s  = f1_score(yte, pred, zero_division=0)
auc  = roc_auc_score(yte, proba)
tn, fp, fn, tp = confusion_matrix(yte, pred).ravel()
fpr  = fp / (fp + tn) if (fp + tn) > 0 else 0.0

print("\n=== Behavioral CatBoost (Family label) ===")
print(f"Best threshold  : {best_thr:.3f}")
print(f"Accuracy        : {acc:.4f}")
print(f"Precision       : {ppv:.4f}")
print(f"Recall (TPR)    : {tpr:.4f}")
print(f"F1-score        : {f1s:.4f}")
print(f"ROC AUC         : {auc:.4f}")
print(f"False Pos Rate  : {fpr:.4f}")
print("Confusion Matrix [tn fp; fn tp]:")
print([[tn, fp],[fn, tp]])



=== Behavioral CatBoost (Family label) ===
Best threshold  : 0.639
Accuracy        : 0.7370
Precision       : 0.2347
Recall (TPR)    : 0.6341
F1-score        : 0.3426
ROC AUC         : 0.7791
False Pos Rate  : 0.2506
Confusion Matrix [tn fp; fn tp]:
[[np.int64(19925), np.int64(6662)], [np.int64(1179), np.int64(2043)]]


In [16]:
# 1) Setup & load data
from pathlib import Path
import sys, json
import pandas as pd
import numpy as np

# Project root (…/FYP2) and make src importable
project_root = Path().resolve().parent
sys.path.insert(0, str(project_root / "src"))

# Load raw behavioral parquet
behav_path = project_root / "data_processed" / "behav_baseline.parquet"
df_b = pd.read_parquet(behav_path)

# Target label (1=WannaCry, 0=Not)
y = (df_b["Family"].astype(str) == "WannaCry").astype(int)

# Build engineered features (your existing util)
from src.utils import build_behav_features
X_full = build_behav_features(df_b)

# Keep **numeric-only** features (remove all object/string columns)
num_cols = [c for c in X_full.columns if X_full[c].dtype != "object"]
X = X_full[num_cols].copy()

print("Numeric feature count:", len(num_cols))
print(num_cols)
print("Shapes:", X.shape, y.shape)


Numeric feature count: 10
['Time', 'Clusters', 'BTC', 'USD', 'Netflow_Bytes', 'Port', 'is_wannacry', 'PortBucket', 'has_BTC', 'has_USD']
Shapes: (149043, 10) (149043,)


In [17]:
# 2) Train/test split + class weights
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

Xtr, Xte, ytr, yte = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

classes = np.unique(ytr)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=ytr)
cw_dict = {int(k): float(v) for k, v in zip(classes, cw)}
cw_dict


{0: 0.5605946627047562, 1: 4.625775915580385}

In [18]:
# 3) Train CatBoost (no categorical features)
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    confusion_matrix, precision_recall_curve
)

cb = CatBoostClassifier(
    iterations=1200,
    depth=6,
    learning_rate=0.05,
    loss_function="Logloss",
    eval_metric="PRAUC",
    class_weights=[cw_dict.get(0,1.0), cw_dict.get(1,1.0)],
    random_seed=42,
    verbose=False,
    od_type="Iter",
    od_wait=50
)

# Since we have only numeric columns, we can pass numpy/pandas directly
cb.fit(Xtr, ytr, eval_set=(Xte, yte), use_best_model=True)

# Tune threshold on PR curve
proba = cb.predict_proba(Xte)[:, 1]
prec, rec, thr = precision_recall_curve(yte, proba)
f1 = 2 * (prec * rec) / (prec + rec + 1e-12)
best_i = int(np.nanargmax(f1))
best_thr = float(thr[best_i]) if best_i < len(thr) else 0.5

# Final metrics at tuned threshold
pred = (proba >= best_thr).astype(int)
acc  = accuracy_score(yte, pred)
ppv  = precision_score(yte, pred, zero_division=0)
tpr  = recall_score(yte, pred, zero_division=0)
f1s  = f1_score(yte, pred, zero_division=0)
auc  = roc_auc_score(yte, proba)
tn, fp, fn, tp = confusion_matrix(yte, pred).ravel()
fpr = fp / (fp + tn) if (fp + tn) > 0 else 0.0

print("\n=== Behavioral CatBoost (NUMERIC ONLY) ===")
print(f"Best threshold  : {best_thr:.3f}")
print(f"Accuracy        : {acc:.4f}")
print(f"Precision       : {ppv:.4f}")
print(f"Recall (TPR)    : {tpr:.4f}")
print(f"F1-score        : {f1s:.4f}")
print(f"ROC AUC         : {auc:.4f}")
print(f"False Pos Rate  : {fpr:.4f}")
print("Confusion Matrix [tn fp; fn tp]:")
print([[tn, fp],[fn, tp]])



=== Behavioral CatBoost (NUMERIC ONLY) ===
Best threshold  : 0.999
Accuracy        : 1.0000
Precision       : 1.0000
Recall (TPR)    : 1.0000
F1-score        : 1.0000
ROC AUC         : 1.0000
False Pos Rate  : 0.0000
Confusion Matrix [tn fp; fn tp]:
[[np.int64(26587), np.int64(0)], [np.int64(0), np.int64(3222)]]


In [19]:
corr = pd.concat([X, y], axis=1).corr()
corr["Family"].sort_values(ascending=False).head(15)



is_wannacry      1.000000
Family           1.000000
USD              0.029006
Port             0.013418
Time             0.011145
Netflow_Bytes   -0.025208
BTC             -0.027480
Clusters        -0.028075
PortBucket            NaN
has_BTC               NaN
has_USD               NaN
Name: Family, dtype: float64

In [22]:
corr = X_full.copy()
corr["y"] = y
leaky = (
    corr.corr()["y"]
    .abs()
    .sort_values(ascending=False)
)
leaky.head(20)


ValueError: could not convert string to float: 'TCP'

In [2]:
import pandas as pd
import numpy as np
import joblib
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, classification_report
)
import matplotlib.pyplot as plt
import seaborn as sns

# Helper to print metrics neatly
def evaluate_model(name, y_true, y_pred, y_prob):
    print(f"\n==== {name} ====")
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall   :", recall_score(y_true, y_pred))
    print("F1 Score :", f1_score(y_true, y_pred))
    print("ROC AUC  :", roc_auc_score(y_true, y_prob))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred))


In [7]:
# --- STATIC dataset ---
static_df = pd.read_csv("../datasets/static/ransomware_detection.csv")

# --- BEHAVIORAL dataset ---
behav_df = pd.read_csv("../datasets/behavioral/ugransom_with_label.csv")

print("✅ Static dataset shape :", static_df.shape)
print("✅ Behavioral dataset shape :", behav_df.shape)


✅ Static dataset shape : (62485, 18)
✅ Behavioral dataset shape : (149043, 15)


In [8]:
# --- Static ---
X_static = static_df.drop(columns=["Benign"], errors="ignore")
y_static = static_df["Benign"].astype(int)

# --- Behavioral ---
# Use the correct label column instead of relying on 'Prediction'
# Your labeled file (ugransom_with_label.csv) has 'label' as target
X_behav = behav_df.drop(columns=["label"], errors="ignore")
y_behav = behav_df["label"].astype(int)


In [9]:
from sklearn.model_selection import train_test_split

X_static_train, X_static_test, y_static_train, y_static_test = train_test_split(
    X_static, y_static, test_size=0.2, random_state=42, stratify=y_static
)

X_behav_train, X_behav_test, y_behav_train, y_behav_test = train_test_split(
    X_behav, y_behav, test_size=0.2, random_state=42, stratify=y_behav
)


In [12]:
# Static (XGBoost)
xgb_model = joblib.load("../models/optimized/static_xgb_tuned.joblib")

# Behavioral (CatBoost)
cb_model = CatBoostClassifier()
cb_model.load_model("../models/optimized/behav_catboost_tuned.cbm")

# Optional RandomForest baseline
import numpy as np

# Keep only numeric columns for the RF baseline
rf_num_cols = X_static_train.select_dtypes(include=[np.number]).columns

rf_static = RandomForestClassifier(n_estimators=200, random_state=42)
rf_static.fit(X_static_train[rf_num_cols], y_static_train)


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [13]:
y_pred_rf  = rf_static.predict(X_static_test[rf_num_cols])
y_prob_rf  = rf_static.predict_proba(X_static_test[rf_num_cols])[:, 1]
evaluate_model("Static RandomForest", y_static_test, y_pred_rf, y_prob_rf)



==== Static RandomForest ====
Accuracy : 0.9964791549971993
Precision: 0.9966765140324964
Recall   : 0.9952064896755162
F1 Score : 0.9959409594095942
ROC AUC  : 0.9997662128239551

Confusion Matrix:
 [[7055   18]
 [  26 5398]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      7073
           1       1.00      1.00      1.00      5424

    accuracy                           1.00     12497
   macro avg       1.00      1.00      1.00     12497
weighted avg       1.00      1.00      1.00     12497

