In [1]:
import nest_asyncio
# Applied before anything else in the notebook runs.
# NOTE(review): presumably needed because code imported later drives an asyncio loop
# from inside the notebook's already-running loop — confirm against `main`.
nest_asyncio.apply()

In [2]:
import sys
import os

In [3]:
# Work from the directory one level above the one the kernel started in.
# NOTE(review): presumably the project root, so that `main` is importable and the
# relative ./port_scanning/pcaps paths used below resolve — confirm.
#
# A bare os.chdir('..') climbs one more level every time this cell is re-run.
# Remember the starting directory on first execution and always change to *its*
# parent, so re-running the cell is idempotent within a kernel session.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_START_DIR, os.pardir))

In [4]:
import pandas as pd
import numpy as np
# Render floats with zero decimals in displayed output.
# NOTE: this also rounds fractional columns (entropy_*, rst_per_syn, mean_*) to whole
# numbers wherever a frame is displayed in the notebook.
pd.set_option("display.float_format", "{:.0f}".format)
# Project-local helpers used by load_all_flows below.
# NOTE(review): presumably resolved via the working directory set by os.chdir('..') — confirm.
from main import pcap_to_flows, build_dns_cache

In [5]:
# Capture files to load, in processing order.
PCAPS = (
    # benign.pcap was annotated: 192.168.1.130 -> 192.168.1.135
    ["benign.pcap", "SYN_scan.pcap", "connect_scan.pcap", "sweep_scan.pcap"]
    # i_benign.pcap was annotated: 192.168.1.135 -> 192.168.1.64
    + ["i_benign.pcap", "i_SYN_scan.pcap", "i_connect_scan.pcap", "i_sweep_scan.pcap"]
    + ["UDP_scan.pcap"]
)

def load_all_flows(pcaps, pcap_dir="./port_scanning/pcaps"):
    """Parse each capture into flows and stack the results into one DataFrame.

    Parameters
    ----------
    pcaps : iterable of str
        Capture file names, resolved relative to ``pcap_dir``.
    pcap_dir : str, optional
        Directory that holds the captures. The default is the location this
        notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The flows of every capture with a fresh RangeIndex, plus a ``pcap``
        column holding the file name each row came from. When ``pcaps`` is
        empty, an empty frame with only the ``pcap`` column is returned.
    """
    frames = []
    for name in pcaps:
        # Build the path once; both helpers read the same file.
        path = f"{pcap_dir}/{name}"
        ip2domain = build_dns_cache(path)
        flows = pcap_to_flows(path, ip2domain=ip2domain)
        frame = pd.DataFrame(flows)
        frame["pcap"] = name
        frames.append(frame)

    if not frames:
        # pd.concat raises ValueError on an empty list; hand back an empty table instead.
        return pd.DataFrame(columns=["pcap"])
    return pd.concat(frames, ignore_index=True)

# Combined flow table for every capture in PCAPS; the `pcap` column names the source file.
df_flows = load_all_flows(PCAPS)

In [6]:
# Addresses used by the labelling rules below.
SCANNER_1 = "192.168.1.130"
SCANNER_2 = "192.168.1.135"
TARGET = "192.168.1.64"

# Capture groups, by file name.
SCAN_PCPS_TWO_WAY = {
    "SYN_scan.pcap",
    "connect_scan.pcap",
    "i_SYN_scan.pcap",
    "i_connect_scan.pcap",
}
SCAN_PCPS_SINGLE_WAY = {
    "sweep_scan.pcap",
    "i_sweep_scan.pcap",
}
BENIGN_PCPS = {
    "benign.pcap",
    "i_benign.pcap",
}

UDP_PCPS = {"UDP_scan.pcap"}

# Label scheme: 0 is the default, -1 marks every flow from a UDP_PCPS capture
# (the windowing cells filter those rows out), 1 marks scan flows.
df_flows["label"] = 0
df_flows.loc[df_flows["pcap"].isin(UDP_PCPS), "label"] = -1

# Two-way scan captures: a flow counts as a scan only for the address pairs
# SCANNER_1 -> SCANNER_2 or SCANNER_2 -> TARGET.
mask_scan_two_way_flow = df_flows["pcap"].isin(SCAN_PCPS_TWO_WAY) & (
    (df_flows["src"].eq(SCANNER_1) & df_flows["dst"].eq(SCANNER_2))
    | (df_flows["src"].eq(SCANNER_2) & df_flows["dst"].eq(TARGET))
)
# Single-way (sweep) captures: any flow sourced from either scanner address counts.
# NOTE(review): neither mask ties a scanner address to a particular capture, so a flow
# sourced from SCANNER_2 inside sweep_scan.pcap is labelled 1 as well — confirm intended.
mask_scan_single_way_flow = df_flows["pcap"].isin(SCAN_PCPS_SINGLE_WAY) & df_flows["src"].isin(
    [SCANNER_1, SCANNER_2]
)

condition = mask_scan_two_way_flow | mask_scan_single_way_flow
df_flows.loc[condition, "label"] = 1

In [7]:
# Sanity check: which capture names are present in the combined flow table.
df_flows['pcap'].unique()

<StringArray>
[        'benign.pcap',       'SYN_scan.pcap',   'connect_scan.pcap',
     'sweep_scan.pcap',       'i_benign.pcap',     'i_SYN_scan.pcap',
 'i_connect_scan.pcap',   'i_sweep_scan.pcap',       'UDP_scan.pcap']
Length: 9, dtype: str

In [8]:
# Width of one aggregation window, in the same unit as t_start / t_end
# (the exported file names call it seconds).
WINDOW = 3

# Flow duration, floored at zero so a record whose end precedes its start cannot go negative.
df_flows["duration"] = df_flows["t_end"].sub(df_flows["t_start"]).clip(lower=0.0)
# Left edge of the WINDOW-wide bucket that contains each flow's start time.
df_flows["window_start"] = df_flows["t_start"].floordiv(WINDOW).mul(WINDOW)

In [9]:
def entropy(series):
    """Shannon entropy, in bits, of the empirical value distribution of ``series``.

    Missing values are ignored (``value_counts`` drops them by default).

    Parameters
    ----------
    series : pandas.Series
        Observed values (ports, addresses, ...).

    Returns
    -------
    float
        Non-negative entropy. Exactly ``0.0`` for an empty, all-missing or
        constant series (never ``-0.0``).
    """
    counts = series.value_counts()
    # Categorical data reports unused categories with a count of 0; drop them so
    # log2(0) is never evaluated.
    counts = counts[counts > 0]
    if counts.empty:
        return 0.0
    p = counts / counts.sum()
    h = float(-(p * np.log2(p)).sum())
    # Negating a zero sum yields -0.0, which would render as "-0" / "-0.0"; normalise it.
    return h if h > 0.0 else 0.0

def p95(series):
    """Return the 0.95 quantile of ``series`` as a float, or 0.0 when it has no rows."""
    if len(series) == 0:
        return 0.0
    return float(series.quantile(0.95))

# Per-source window features: one row per (pcap, window_start, src).
# Flows labelled -1 are left out of every aggregate in this cell.
flows_labeled = df_flows.loc[df_flows["label"].ne(-1)]
src_window_keys = ["pcap", "window_start", "src"]

g = flows_labeled.groupby(src_window_keys, as_index=False)

df_win_src = g.agg(
    # Volume
    flows_total=pd.NamedAgg(column="proto", aggfunc="size"),
    pkts_out=pd.NamedAgg(column="fwd_packets", aggfunc="sum"),
    pkts_in=pd.NamedAgg(column="rev_packets", aggfunc="sum"),
    bytes_out=pd.NamedAgg(column="fwd_bytes", aggfunc="sum"),
    bytes_in=pd.NamedAgg(column="rev_bytes", aggfunc="sum"),
    # Protocol mix
    tcp_flows=pd.NamedAgg(column="proto", aggfunc=lambda protos: int(protos.eq("TCP").sum())),
    udp_flows=pd.NamedAgg(column="proto", aggfunc=lambda protos: int(protos.eq("UDP").sum())),
    # Fan-out
    unique_dst_ports=pd.NamedAgg(column="dport", aggfunc=pd.Series.nunique),
    unique_dst_ips=pd.NamedAgg(column="dst", aggfunc=pd.Series.nunique),
    # Flow shape
    mean_flow_duration=pd.NamedAgg(column="duration", aggfunc="mean"),
    p95_flow_duration=pd.NamedAgg(column="duration", aggfunc=p95),
    mean_packets_per_flow=pd.NamedAgg(column="packets", aggfunc="mean"),
    p95_packets_per_flow=pd.NamedAgg(column="packets", aggfunc=p95),
    # TCP flag totals
    syn_count=pd.NamedAgg(column="tcp_syn", aggfunc="sum"),
    rst_count=pd.NamedAgg(column="tcp_rst", aggfunc="sum"),
)

# Entropy of destination ports / addresses per group, joined back on the group keys.
ent_port = (
    flows_labeled.groupby(src_window_keys)["dport"]
    .apply(entropy)
    .reset_index(name="entropy_dst_port")
)
ent_ip = (
    flows_labeled.groupby(src_window_keys)["dst"]
    .apply(entropy)
    .reset_index(name="entropy_dst_ip")
)
df_win_src = df_win_src.merge(ent_port, on=src_window_keys, how="left")
df_win_src = df_win_src.merge(ent_ip, on=src_window_keys, how="left")

# RST-to-SYN ratio; the +1.0 keeps the denominator non-zero when a window has no SYNs.
df_win_src["rst_per_syn"] = df_win_src["rst_count"].div(df_win_src["syn_count"].add(1.0))

# A window is labelled with the maximum flow label it contains.
win_label = flows_labeled.groupby(src_window_keys)["label"].max().reset_index(name="label")
df_win_src = df_win_src.merge(win_label, on=src_window_keys, how="left")
df_win_src = df_win_src.fillna({"label": 0})
df_win_src["label"] = df_win_src["label"].astype(int)

In [10]:
# Per-pair window table: one row per (pcap, window_start, src, dst),
# again leaving out flows labelled -1.
g2 = df_flows.loc[df_flows["label"].ne(-1)].groupby(
    by=["pcap", "window_start", "src", "dst"],
    as_index=False,
)

df_win_pair = g2.agg(
    flows_total=pd.NamedAgg(column="proto", aggfunc="size"),
    unique_dst_ports=pd.NamedAgg(column="dport", aggfunc=pd.Series.nunique),
    syn_count=pd.NamedAgg(column="tcp_syn", aggfunc="sum"),
    rst_count=pd.NamedAgg(column="tcp_rst", aggfunc="sum"),
)

def top_targets(pcap, window_start, src, topn=5):
    """Return the ``topn`` rows of ``df_win_pair`` for one (pcap, window_start, src) group.

    Rows are ranked in descending order by ``unique_dst_ports``, then ``syn_count``,
    then ``flows_total``. Reads the module-level ``df_win_pair`` table.
    """
    in_group = (
        df_win_pair["pcap"].eq(pcap)
        & df_win_pair["window_start"].eq(window_start)
        & df_win_pair["src"].eq(src)
    )
    ranked = df_win_pair.loc[in_group].sort_values(
        by=["unique_dst_ports", "syn_count", "flows_total"],
        ascending=False,
    )
    return ranked.head(topn)

In [11]:
# Inspect the column dtypes of the per-(pcap, window_start, src) feature table.
df_win_src.dtypes

pcap                         str
window_start             float64
src                          str
flows_total                int64
pkts_out                   int64
pkts_in                    int64
bytes_out                  int64
bytes_in                   int64
tcp_flows                  int64
udp_flows                  int64
unique_dst_ports           int64
unique_dst_ips             int64
mean_flow_duration       float64
p95_flow_duration        float64
mean_packets_per_flow    float64
p95_packets_per_flow     float64
syn_count                  int64
rst_count                  int64
entropy_dst_port         float64
entropy_dst_ip           float64
rst_per_syn              float64
label                      int64
dtype: object

In [12]:
# Persist both feature tables to the current working directory.
# The window width in the file name is taken from WINDOW so the name cannot drift from
# the data it describes; with WINDOW = 3 the names are unchanged (df_win_3sec_*.csv).
df_win_src.to_csv(f"df_win_{WINDOW}sec_src.csv", index=False, encoding="utf-8")
df_win_pair.to_csv(f"df_win_{WINDOW}sec_pair.csv", index=False, encoding="utf-8")