In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)

import lightgbm as lgb

ROOT = Path("..").resolve()
DATA = ROOT / "datasets" / "behavioral" / "ugransom_with_label.csv"
MODELS = ROOT / "models" / "optimized"
SCHEMA_PATH = MODELS / "behav_feature_names.json"


In [8]:
df = pd.read_csv(DATA)

# 1 = malicious, 0 = benign
df["y_true"] = df["Prediction"].map({"A": 0, "S": 1, "SS": 1}).astype(int)

y = df["y_true"].values
print("Label distribution:\n", pd.Series(y).value_counts())


Label distribution:
 1    106482
0     42561
Name: count, dtype: int64


In [3]:
def _sanitize_columns(df):
    rename_map = {
        "Protcol": "Protocol",
        "SeddAddress": "SeedAddress",
        "Netflow Bytes": "Netflow_Bytes",
    }
    return df.rename(columns={c: rename_map.get(c, c) for c in df.columns})

def _engineer_features(df):
    X = df.copy()
    # coerce numerics safely
    for col in ["BTC","USD","Port","Netflow_Bytes","Time","Clusters"]:
        if col in X.columns:
            X[col] = pd.to_numeric(X[col], errors="coerce").fillna(0)

    X["ProtoFlag"] = X.get("Protocol", "").astype(str) + "_" + X.get("Flag", "").astype(str)

    if "Port" in X.columns:
        X["PortBucket"] = pd.cut(
            X["Port"].astype(float),
            bins=[-1, 1023, 49151, 65535],
            labels=["well_known","registered","dynamic"]
        ).astype(str)
    else:
        X["PortBucket"] = "unknown"

    X["has_BTC"] = (X.get("BTC", 0).astype(float) > 0).astype(int)
    X["has_USD"] = (X.get("USD", 0).astype(float) > 0).astype(int)

    return X


In [4]:
dfx = _sanitize_columns(df)
dfx = _engineer_features(dfx)


In [5]:
with open(SCHEMA_PATH, "r") as f:
    schema = json.load(f)

use_cols = schema["use_cols"]
cat_cols = schema.get("cat_cols", [])

X = dfx[use_cols].copy()

# IMPORTANT: convert categoricals to 'category' dtype for LightGBM
for col in cat_cols:
    X[col] = X[col].astype("category")

X.dtypes


Time                int64
Clusters            int64
Netflow_Bytes       int64
BTC                 int64
USD                 int64
Protocol         category
Flag             category
Threats          category
ProtoFlag        category
PortBucket       category
has_BTC          category
has_USD          category
dtype: object

In [6]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)
print("Malicious ratio (train):", y_train.mean())


Train shape: (119234, 12)
Val shape  : (29809, 12)
Malicious ratio (train): 0.7144354798128051


In [9]:
lgbm = lgb.LGBMClassifier(
    objective="binary",
    boosting_type="gbdt",
    n_estimators=500,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

lgbm.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="auc",
     callbacks=[lgb.log_evaluation(50)],
)


[LightGBM] [Info] Number of positive: 85185, number of negative: 34049
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000517 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 896
[LightGBM] [Info] Number of data points in the train set: 119234, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.714435 -> initscore=0.917025
[LightGBM] [Info] Start training from score 0.917025
[50]	valid_0's auc: 0.997248	valid_0's binary_logloss: 0.0926811
[100]	valid_0's auc: 0.998385	valid_0's binary_logloss: 0.052202
[150]	valid_0's auc: 0.998872	valid_0's binary_logloss: 0.0422892
[200]	valid_0's auc: 0.999068	valid_0's binary_logloss: 0.037932
[250]	valid_0's auc: 0.999155	valid_0's binary_logloss: 0.0354104
[300]	valid_0's auc: 0.999219	valid_0's binary_logloss: 0.033517
[350]	valid_0's auc: 0.999266	valid_0's binary_loglos

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,'binary'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [10]:
# Predicted probability for class 1 (malicious)
y_val_prob = lgbm.predict_proba(X_val)[:, 1]
y_val_pred_05 = (y_val_prob >= 0.5).astype(int)

print("=== LGBM Validation Performance (thr=0.5) ===")
print("Accuracy :", accuracy_score(y_val, y_val_pred_05))
print("Precision:", precision_score(y_val, y_val_pred_05))
print("Recall   :", recall_score(y_val, y_val_pred_05))
print("F1 Score :", f1_score(y_val, y_val_pred_05))
print("ROC AUC  :", roc_auc_score(y_val, y_val_prob))

print("\nClassification report:\n",
      classification_report(y_val, y_val_pred_05, target_names=["Benign","Malicious"]))


=== LGBM Validation Performance (thr=0.5) ===
Accuracy : 0.9852393572411017
Precision: 0.9879743577745543
Recall   : 0.9914072404564023
F1 Score : 0.9896878222555545
ROC AUC  : 0.9993145109436854

Classification report:
               precision    recall  f1-score   support

      Benign       0.98      0.97      0.97      8512
   Malicious       0.99      0.99      0.99     21297

    accuracy                           0.99     29809
   macro avg       0.98      0.98      0.98     29809
weighted avg       0.99      0.99      0.99     29809



In [11]:
import numpy as np

thresholds = np.linspace(0.1, 0.9, 17)  # 0.10, 0.15, ..., 0.90

best_thr = 0.5
best_f1 = -1
rows = []

for thr in thresholds:
    y_pred = (y_val_prob >= thr).astype(int)
    f1 = f1_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    rows.append((thr, prec, rec, f1))
    if f1 > best_f1:
        best_f1 = f1
        best_thr = thr

print("Top thresholds by F1:")
for thr, prec, rec, f1 in sorted(rows, key=lambda t: t[3], reverse=True)[:5]:
    print(f"thr={thr:.3f}  P={prec:.3f}  R={rec:.3f}  F1={f1:.3f}")

print(f"\nðŸŽ¯ Chosen threshold (max F1): {best_thr:.3f} with F1={best_f1:.3f}")


Top thresholds by F1:
thr=0.350  P=0.985  R=0.997  F1=0.991
thr=0.300  P=0.984  R=0.997  F1=0.991
thr=0.400  P=0.985  R=0.996  F1=0.991
thr=0.650  P=0.993  R=0.987  F1=0.990
thr=0.450  P=0.986  R=0.994  F1=0.990

ðŸŽ¯ Chosen threshold (max F1): 0.350 with F1=0.991


In [1]:
import pandas as pd
from pathlib import Path

project_root = Path(r"C:\Users\richa\OneDrive\Documents\FYP2")

behav_val_df = X_val.copy()
behav_val_df["label"] = y_val  # 1 = malicious, 0 = benign

out_path = project_root / "data_processed" / "behav_val.parquet"
behav_val_df.to_parquet(out_path, index=False)

print("Saved validation dataset to:", out_path)


NameError: name 'X_val' is not defined

In [12]:
import joblib
model_path = MODELS / "behav_lgbm.joblib"

joblib.dump(lgbm, model_path)
print("âœ… Saved LightGBM model to:", model_path)


âœ… Saved LightGBM model to: C:\Users\richa\OneDrive\Documents\FYP2\models\optimized\behav_lgbm.joblib


In [13]:
import json

thr_json_path = MODELS / "behav_threshold.json"

thr_payload = {
    "model_type": "lightgbm",
    "best_threshold_malicious": 0.350,
    "label_mapping": {"A": 0, "S": 1, "SS": 1},
    "note": "Threshold tuned on validation set for F1 (class 1 = malicious)"
}

with open(thr_json_path, "w") as f:
    json.dump(thr_payload, f, indent=2)

print("âœ… Saved new threshold:", thr_json_path)


âœ… Saved new threshold: C:\Users\richa\OneDrive\Documents\FYP2\models\optimized\behav_threshold.json


In [2]:
import pandas as pd
from pathlib import Path

project_root = Path(r"C:\Users\richa\OneDrive\Documents\FYP2")

# Build dataframe for validation set
behav_val_df = X_val.copy()
behav_val_df["label"] = y_val.astype(int)

out_path = project_root / "data_processed" / "behav_val.parquet"
behav_val_df.to_parquet(out_path, index=False)

print("Saved:", out_path)
print(behav_val_df.shape)
print(behav_val_df.head())


NameError: name 'X_val' is not defined

In [3]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)
print("Malicious ratio (train):", y_train.mean())


NameError: name 'train_test_split' is not defined

In [4]:
import pandas as pd
from pathlib import Path

# Make sure these exist in this notebook:
# X_val, y_val  (your validation split)
print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

project_root = Path(r"C:\Users\richa\OneDrive\Documents\FYP2")

behav_val_df = X_val.copy()
behav_val_df["label"] = y_val.astype(int)

out_path = project_root / "data_processed" / "behav_val.parquet"
behav_val_df.to_parquet(out_path, index=False)

print("âœ… Saved validation dataset to:")
print(out_path)

print("\nColumns:", behav_val_df.columns.tolist())
print(behav_val_df.head())


NameError: name 'X_val' is not defined

In [5]:
from pathlib import Path

project_root = Path(r"C:\Users\richa\OneDrive\Documents\FYP2")
behav_path = project_root / "data_processed" / "behav_val.parquet"

print("Exists?", behav_path.exists())
print("Full path:", behav_path)



Exists? False
Full path: C:\Users\richa\OneDrive\Documents\FYP2\data_processed\behav_val.parquet


In [6]:
import pandas as pd
from pathlib import Path

project_root = Path(r"C:\Users\richa\OneDrive\Documents\FYP2")

print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

behav_val_df = X_val.copy()
behav_val_df["label"] = y_val.astype(int)

out_path = project_root / "data_processed" / "behav_val.parquet"
behav_val_df.to_parquet(out_path, index=False)

print("âœ… Saved validation dataset to:")
print(out_path)
print("\nColumns:", behav_val_df.columns.tolist())
print(behav_val_df.head())


NameError: name 'X_val' is not defined

In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, classification_report
)

import lightgbm as lgb

ROOT = Path("..").resolve()
DATA = ROOT / "datasets" / "behavioral" / "ugransom_with_label.csv"
MODELS = ROOT / "models" / "optimized"
SCHEMA_PATH = MODELS / "behav_feature_names.json"


In [9]:
df = pd.read_csv(DATA)

# 1 = malicious, 0 = benign
df["y_true"] = df["Prediction"].map({"A": 0, "S": 1, "SS": 1}).astype(int)

y = df["y_true"].values
print("Label distribution:\n", pd.Series(y).value_counts())


Label distribution:
 1    106482
0     42561
Name: count, dtype: int64


In [10]:
def _sanitize_columns(df):
    rename_map = {
        "Protcol": "Protocol",
        "SeddAddress": "SeedAddress",
        "Netflow Bytes": "Netflow_Bytes",
    }
    return df.rename(columns={c: rename_map.get(c, c) for c in df.columns})

def _engineer_features(df):
    X = df.copy()
    # coerce numerics safely
    for col in ["BTC","USD","Port","Netflow_Bytes","Time","Clusters"]:
        if col in X.columns:
            X[col] = pd.to_numeric(X[col], errors="coerce").fillna(0)

    X["ProtoFlag"] = X.get("Protocol", "").astype(str) + "_" + X.get("Flag", "").astype(str)

    if "Port" in X.columns:
        X["PortBucket"] = pd.cut(
            X["Port"].astype(float),
            bins=[-1, 1023, 49151, 65535],
            labels=["well_known","registered","dynamic"]
        ).astype(str)
    else:
        X["PortBucket"] = "unknown"

    X["has_BTC"] = (X.get("BTC", 0).astype(float) > 0).astype(int)
    X["has_USD"] = (X.get("USD", 0).astype(float) > 0).astype(int)

    return X


In [11]:
dfx = _sanitize_columns(df)
dfx = _engineer_features(dfx)


In [12]:
with open(SCHEMA_PATH, "r") as f:
    schema = json.load(f)

use_cols = schema["use_cols"]
cat_cols = schema.get("cat_cols", [])

X = dfx[use_cols].copy()

# IMPORTANT: convert categoricals to 'category' dtype for LightGBM
for col in cat_cols:
    X[col] = X[col].astype("category")

X.dtypes


Time                int64
Clusters            int64
Netflow_Bytes       int64
BTC                 int64
USD                 int64
Protocol         category
Flag             category
Threats          category
ProtoFlag        category
PortBucket       category
has_BTC          category
has_USD          category
dtype: object

In [13]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)
print("Malicious ratio (train):", y_train.mean())


Train shape: (119234, 12)
Val shape  : (29809, 12)
Malicious ratio (train): 0.7144354798128051


In [14]:
import pandas as pd
from pathlib import Path

project_root = Path(r"C:\Users\richa\OneDrive\Documents\FYP2")

print("X_val shape:", X_val.shape)
print("y_val shape:", y_val.shape)

behav_val_df = X_val.copy()
behav_val_df["label"] = y_val.astype(int)

out_path = project_root / "data_processed" / "behav_val.parquet"
behav_val_df.to_parquet(out_path, index=False)

print("âœ… Saved validation dataset to:")
print(out_path)
print("\nColumns:", behav_val_df.columns.tolist())
print(behav_val_df.head())


X_val shape: (29809, 12)
y_val shape: (29809,)
âœ… Saved validation dataset to:
C:\Users\richa\OneDrive\Documents\FYP2\data_processed\behav_val.parquet

Columns: ['Time', 'Clusters', 'Netflow_Bytes', 'BTC', 'USD', 'Protocol', 'Flag', 'Threats', 'ProtoFlag', 'PortBucket', 'has_BTC', 'has_USD', 'label']
        Time  Clusters  Netflow_Bytes  BTC    USD Protocol  Flag    Threats  \
112014     8         1           6506    9      1      TCP    AF  Blacklist   
35358     55         1           1052    3   1336      TCP  APSF        SSH   
23328     16         1            178   15  50923      TCP    AF   UDP Scan   
74334     20         2            381  159    246      UDP   APS   UDP Scan   
5298      30         2            134   12  84374      TCP  APSF        DoS   

       ProtoFlag  PortBucket has_BTC has_USD  label  
112014    TCP_AF  registered       1       1      1  
35358   TCP_APSF  registered       1       1      1  
23328     TCP_AF  registered       1       1      1  
74334 