In [1]:
# Install CatBoost into the running kernel's environment; -q asks pip for quiet output.
%pip install catboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import average_precision_score, roc_auc_score
import json

In [3]:
# Seeded NumPy generator so any random draws made through `rng` are repeatable.
rng = np.random.default_rng(seed=42)

### Data preparation

In [43]:
# Load the port-scanning feature CSV straight from the project's GitHub repository.
# The URL tracks the `main` branch, so the contents may change between runs.
DATA_URL = "https://raw.githubusercontent.com/Transmittance/KeenEye-NetMonitor/refs/heads/main/port_scanning/csv/df_win_5sec_src.csv"
df = pd.read_csv(DATA_URL)

In [44]:
# Preview the first five rows to check that the columns parsed as expected.
df.head(n=5)

Unnamed: 0,pcap,window_start,src,flows_total,pkts_out,pkts_in,bytes_out,bytes_in,tcp_flows,udp_flows,...,mean_flow_duration,p95_flow_duration,mean_packets_per_flow,p95_packets_per_flow,syn_count,rst_count,entropy_dst_port,entropy_dst_ip,rst_per_syn,label
0,SYN_scan.pcap,1769364000.0,176.196.120.41,1,1,1,106,142,0,1,...,0.039478,0.039478,2.0,2.0,0,0,-0.0,-0.0,0.0,0
1,SYN_scan.pcap,1769364000.0,192.168.1.1,2,47,82,12530,11816,2,0,...,1.215944,2.252237,64.5,115.35,0,0,1.0,-0.0,0.0,0
2,SYN_scan.pcap,1769364000.0,192.168.1.130,870,874,869,50676,52140,870,0,...,0.000197,0.000341,2.003448,2.0,874,869,9.764872,-0.0,0.993143,1
3,SYN_scan.pcap,1769364000.0,192.168.1.135,2,2,1,184,138,0,2,...,0.728288,1.383746,1.5,1.95,0,0,1.0,1.0,0.0,0
4,SYN_scan.pcap,1769364000.0,192.168.1.130,132,133,133,7773,8032,132,0,...,0.000474,0.000247,2.015152,2.0,131,131,7.044394,0.064255,0.992424,1


In [45]:
# Count rows per (pcap, label) pair, then pivot the labels into columns;
# pairs that never occur are shown as 0.
counts_by_pcap = df.groupby("pcap")["label"].value_counts()
counts_by_pcap.unstack(fill_value=0)

label,0,1
pcap,Unnamed: 1_level_1,Unnamed: 2_level_1
SYN_scan.pcap,4,2
benign.pcap,305,0
connect_scan.pcap,123,44
i_SYN_scan.pcap,7,2
i_benign.pcap,760,0
i_connect_scan.pcap,9,1
i_sweep_scan.pcap,7,1
sweep_scan.pcap,4,3


In [46]:
def get_pcap_type(p):
    """Classify a capture file name by the traffic-type keyword it contains.

    Keywords are tried in a fixed priority order (benign, SYN, connect, sweep)
    using case-sensitive substring matching; the first match wins.

    Returns the lowercase type string, or None when no keyword is present.
    """
    keyword_to_type = (
        ("benign", "benign"),
        ("SYN", "syn"),
        ("connect", "connect"),
        ("sweep", "sweep"),
    )
    for keyword, pcap_type in keyword_to_type:
        if keyword in p:
            return pcap_type
    # No known keyword in the name.
    return None

# Tag every row with the traffic type derived from its capture file name.
pcap_types = df["pcap"].map(get_pcap_type)
df["pcap_type"] = pcap_types

In [47]:
# Number of rows for each capture file, listed under its traffic type.
type_and_file = ["pcap_type", "pcap"]
df.groupby(type_and_file).size()

Unnamed: 0_level_0,Unnamed: 1_level_0,0
pcap_type,pcap,Unnamed: 2_level_1
benign,benign.pcap,305
benign,i_benign.pcap,760
connect,connect_scan.pcap,167
connect,i_connect_scan.pcap,10
sweep,i_sweep_scan.pcap,8
sweep,sweep_scan.pcap,7
syn,SYN_scan.pcap,6
syn,i_SYN_scan.pcap,9


### 2-fold evaluation

In [9]:
# Model input columns, in the order they are passed to CatBoost.
FEATURES = [
    # volume
    "flows_total",
    "pkts_out",
    "pkts_in",
    "bytes_out",
    "bytes_in",
    # protocol mix
    "tcp_flows",
    "udp_flows",
    # destination fan-out
    "unique_dst_ports",
    "unique_dst_ips",
    # per-flow shape
    "mean_flow_duration",
    "p95_flow_duration",
    "mean_packets_per_flow",
    "p95_packets_per_flow",
    # TCP flag counters
    "syn_count",
    "rst_count",
    # destination entropy
    "entropy_dst_port",
    "entropy_dst_ip",
    # derived ratio
    "rst_per_syn",
]

In [29]:
def make_2fold(df):
    """Build two train/test splits keyed on the ``i_`` capture-name prefix.

    Rows whose ``pcap`` value starts with ``"i_"`` form one group and all
    remaining rows form the other. Each group is used once for training and
    once for testing.

    Returns a list of two ``(train_df, test_df, description)`` tuples; every
    frame is an independent copy of the selected rows.
    """
    from_i_capture = df["pcap"].str.startswith("i_")
    i_rows = df.loc[from_i_capture]
    other_rows = df.loc[~from_i_capture]

    return [
        (i_rows.copy(), other_rows.copy(), "train=i_; test=non-i_"),
        (other_rows.copy(), i_rows.copy(), "train=non-i_; test=i_"),
    ]

In [48]:
folds = make_2fold(df)

# Sanity check: list which captures and how many of each label land on each side of a fold.
for train_df, test_df, name in folds:
    print(name)
    splits = (("train", train_df), ("test ", test_df))
    for split_name, split_df in splits:
        print(f"  {split_name} pcaps:", sorted(split_df["pcap"].unique()))
    for split_name, split_df in splits:
        print(f"  {split_name} label counts:\n", split_df["label"].value_counts())
    print()

train=i_; test=non-i_
  train pcaps: ['i_SYN_scan.pcap', 'i_benign.pcap', 'i_connect_scan.pcap', 'i_sweep_scan.pcap']
  test  pcaps: ['SYN_scan.pcap', 'benign.pcap', 'connect_scan.pcap', 'sweep_scan.pcap']
  train label counts:
 label
0    783
1      4
Name: count, dtype: int64
  test  label counts:
 label
0    436
1     49
Name: count, dtype: int64

train=non-i_; test=i_
  train pcaps: ['SYN_scan.pcap', 'benign.pcap', 'connect_scan.pcap', 'sweep_scan.pcap']
  test  pcaps: ['i_SYN_scan.pcap', 'i_benign.pcap', 'i_connect_scan.pcap', 'i_sweep_scan.pcap']
  train label counts:
 label
0    436
1     49
Name: count, dtype: int64
  test  label counts:
 label
0    783
1      4
Name: count, dtype: int64



In [34]:
def eval_fold(train_df, test_df):
    """Fit a CatBoost classifier on ``train_df`` and score it on ``test_df``.

    Both frames must contain the FEATURES columns and a ``label`` column.

    Returns ``(pr_auc, roc_auc)`` computed from the predicted probability of
    class 1. Both values are NaN when the test labels contain a single class,
    because neither metric is computed in that case.
    """
    X_tr = train_df[FEATURES]
    X_te = test_df[FEATURES]
    y_tr = train_df["label"].astype(int).values
    y_te = test_df["label"].astype(int).values

    params = {
        "loss_function": "Logloss",
        "learning_rate": 0.05,
        "depth": 8,
        "auto_class_weights": "Balanced",
        "random_seed": 42,
        "verbose": False,
    }
    model = CatBoostClassifier(**params)
    model.fit(X_tr, y_tr)

    # Column 1 of predict_proba is the probability of the positive class.
    p = model.predict_proba(X_te)[:, 1]

    if len(np.unique(y_te)) <= 1:
        return np.nan, np.nan
    return average_precision_score(y_te, p), roc_auc_score(y_te, p)

In [49]:
# Train and score once per fold direction, reporting both metrics.
for train_df, test_df, name in folds:
    scores = eval_fold(train_df, test_df)
    pr, roc = scores
    print(f"{name}: PR-AUC={pr:.3f} ROC-AUC={roc:.3f}")

train=i_; test=non-i_: PR-AUC=0.977 ROC-AUC=0.991
train=non-i_; test=i_: PR-AUC=1.000 ROC-AUC=1.000


### Final model training

In [50]:
# Fit the final model on every available row (no hold-out split).
X = df[FEATURES]
y = df["label"].astype(int).values

# Same hyperparameters as the per-fold evaluation model.
final_params = {
    "loss_function": "Logloss",
    "learning_rate": 0.05,
    "depth": 8,
    "auto_class_weights": "Balanced",
    "random_seed": 42,
    "verbose": False,
}
final_model = CatBoostClassifier(**final_params)

final_model.fit(X, y)

<catboost.core.CatBoostClassifier at 0x7d96cdea49e0>

In [51]:
# Score the final model on X, y — the same rows it was fitted on — so these are in-sample metrics.
p = final_model.predict_proba(X)[:, 1]

if len(np.unique(y)) > 1:
    pr = average_precision_score(y, p)
    roc = roc_auc_score(y, p)
else:
    # Neither metric is computed when only one class is present.
    pr = roc = np.nan

In [52]:
# NOTE: pr/roc were computed on X, y, the same rows final_model was fitted on,
# so these are in-sample scores rather than an estimate of generalisation.
print(f"Final model: PR-AUC={pr:.3f} ROC-AUC={roc:.3f}")

Final model: PR-AUC=1.000 ROC-AUC=1.000


In [53]:
# Persist the fitted model to the current working directory.
MODEL_PATH = "portscan_detection_cb.cbm"
final_model.save_model(MODEL_PATH)

### Median features

In [None]:
# Per-feature medians over the full training matrix, as a plain {column: value} dict.
feature_medians = X.median(numeric_only=True)
fill_median = feature_medians.to_dict()

In [None]:
# Show the per-feature medians that get written to fill_median.json below.
print(fill_median)

In [None]:
# Write the medians to disk as indented JSON.
with open("fill_median.json", "w") as out_file:
    out_file.write(json.dumps(fill_median, indent=2))