In [316]:
import joblib
import numpy as np

In [317]:
svm_model = joblib.load("svm_model.pk1")
decision_tree_model = joblib.load("decision_tree_model.pk1")
random_forest_model = joblib.load("random_forest_model.pk1")

In [318]:
from scapy.all import sniff,IP,TCP,UDP
import time
import pandas as pd
from collections import defaultdict
from scipy.stats import mode

In [319]:
flow_data = defaultdict(list)
flow_start_time = {}

In [320]:
def extract_features(packet):
    if IP in packet:
        ip = packet[IP]
        proto = "tcp" if TCP in packet else "udp" if UDP in packet else "other"
        src,dst = ip.src, ip.dst
        sport = packet.sport if hasattr(packet, "sport") else 0
        dport = packet.dport if hasattr(packet,"dport") else 0
        flow_id = f"{src}-{dst}-{proto}-{sport}-{dport}"

        if flow_id not in flow_start_time:
            flow_start_time[flow_id] = packet.time

        duration = packet.time - flow_start_time[flow_id]

        flags = packet[TCP].flags if TCP in packet else 0
        flag_str = str(flags)

        flow_data[flow_id].append({
            "timestamp": packet.time,
            "duration": duration,
            "protocol_type": proto,
            "src": src,
            "dst": dst,
            "sport": sport,
            "dport": dport,
            "flag": flag_str,
            "wrong_fragment": 1 if (ip.frag != 0 and ip.flags == 1) else 0,
        })

In [321]:
def is_syn_error(pkt):
    return pkt["flag"] in ["0x4", "0x14", "0x5"]

def is_rerror(pkt):
    return pkt["flag"] in ["0x4", "0x11", "0x14"]

In [322]:
def tcp_flag_to_kdd_flag(normalized):
    
    if normalized == 'S':
        return 'S0'
    elif normalized == 'SA':
        return 'S1'
    elif normalized in ['PA', 'A', 'FPA', 'FA']:
        return 'SF'
    elif normalized == 'RA':
        return 'REJ'
    elif normalized == 'R':
        return 'RSTR'
    elif normalized == 'R':  # RST without ACK
        return 'RSTO'
    elif normalized == 'RS':  # RST + SYN (rare)
        return 'RSTOS0'
    elif normalized == 'FS':  # SYN + FIN (usually malicious)
        return 'SH'
    elif normalized == 'S2':  # Custom label, not from Scapy
        return 'S2'
    elif normalized == 'S3':  # Custom label, not from Scapy
        return 'S3'
    else:
        return 'OTH'
    

In [323]:
def compute_flow_metrics():

    global df
    protocols = ['tcp', 'udp']
    flags = ['REJ', 'RSTO', 'RSTOS0', 'RSTR', 'S0', 'S1', 'S2', 'S3', 'SF', 'SH']
    for flow_id, packets in flow_data.items():
        if not packets:
            continue

        pkt_count = len(packets)
        duration = packets[-1]["timestamp"] - packets[0]["timestamp"]

        # Map protocol type
        raw_proto = packets[0]["protocol_type"]
        proto = raw_proto if raw_proto in protocols else 'icmp'
        proto_onehot = {f'protocol_type_{p}': int(p == proto) for p in protocols}

        last_flag = packets[-1]["flag"]
        kdd_flag = tcp_flag_to_kdd_flag(last_flag)

        # One-hot encoding for flags
        flag_onehot = {f'flag_{f}': int(f == kdd_flag) for f in flags}

        dst_ip = packets[0]["dst"]
        dport = packets[0]["dport"]

        serror_rate = sum(is_syn_error(p) for p in packets) / pkt_count if pkt_count else 0
        rerror_rate = sum(is_rerror(p) for p in packets) / pkt_count if pkt_count else 0

        host_flows = [
            p for fid, pkts in flow_data.items()
            if fid.split("-")[1] == dst_ip
            for p in pkts
        ]
        host_pkt_count = len(host_flows)
        host_serror_rate = sum(is_syn_error(p) for p in host_flows) / host_pkt_count if host_pkt_count else 0
        host_rerror_rate = sum(is_rerror(p) for p in host_flows) / host_pkt_count if host_pkt_count else 0

        dst_ports = [p["dport"] for p in packets if "dport" in p]
        most_common_port = max(set(dst_ports), key=dst_ports.count) if dst_ports else 0
        same_srv_count = dst_ports.count(most_common_port)
        same_srv_rate = same_srv_count / pkt_count if pkt_count else 0

        dst_host_count = sum(1 for fid in flow_data if fid.split("-")[1] == dst_ip)
        dst_host_srv_count = sum(
            1 for fid in flow_data
            if fid.split("-")[1] == dst_ip and fid.split("-")[4] == str(dport)
        )
        dst_host_same_srv_rate = dst_host_srv_count / dst_host_count if dst_host_count else 0
        dst_host_diff_srv_rate = 1.0 - dst_host_same_srv_rate

        row = {
            "duration": duration,
            "wrong_fragment": sum(p["wrong_fragment"] for p in packets),
            "num_compromised": 0,
            "count": pkt_count,
            "serror_rate": round(serror_rate, 2),
            "srv_serror_rate": round(serror_rate, 2),
            "rerror_rate": round(rerror_rate, 2),
            "srv_rerror_rate": round(rerror_rate, 2),
            "same_srv_rate": round(same_srv_rate, 2),
            "srv_diff_host_rate": 0.0,
            "dst_host_count": dst_host_count,
            "dst_host_srv_count": dst_host_srv_count,
            "dst_host_same_srv_rate": round(dst_host_same_srv_rate, 2),
            "dst_host_diff_srv_rate": round(dst_host_diff_srv_rate, 2),
            "dst_host_serror_rate": round(host_serror_rate, 2),
            "dst_host_srv_serror_rate": round(host_serror_rate, 2),
        }
        row.update(proto_onehot)
        row.update(flag_onehot)
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)

In [324]:
def sniff_packets(timeout=10):
    print("Sniffing packets...")
    sniff(prn=extract_features, timeout=timeout)
sniff_packets()

Sniffing packets...


In [325]:
from collections import Counter

In [326]:
def custom_majority_vote(pred_lists):
    n_samples = len(pred_lists[0])
    n_models = len(pred_lists)
    majority_votes = []
    for i in range(n_samples):
        votes = [pred_lists[m][i] for m in range(n_models)]
        vote_counts = Counter(votes)
        majority_class = vote_counts.most_common(1)[0][0]
        majority_votes.append(majority_class)
    return np.array(majority_votes)

In [327]:
def ensemble_predict(X):
    preds1 = svm_model.predict(X).astype(str)
    preds2 = decision_tree_model.predict(X).astype(str)
    preds3 = random_forest_model.predict(X).astype(str)
    print(pd.isnull(preds1).any(), pd.isnull(preds2).any(), pd.isnull(preds3).any())
    print(preds1.dtype, preds2.dtype, preds3.dtype)

    # Majority voting
    preds = np.vstack((preds1, preds2, preds3)).T
    majority_vote = custom_majority_vote([preds1, preds2, preds3])
    for i, pred in enumerate(majority_vote):
        print(f"Sample {i + 1}: Predicted class = {pred}")

In [328]:
columns = [
    'duration', 'wrong_fragment', 'num_compromised', 'count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'protocol_type_tcp', 'protocol_type_udp',
    'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0',
    'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH'
]

df = pd.DataFrame(columns=columns)

In [329]:
columns_to_normalize = ['count','duration','dst_host_count','dst_host_srv_count']
compute_flow_metrics()
scalar = joblib.load('minMaxScalar.pk1')
df[columns_to_normalize] = scalar.transform(df[columns_to_normalize])
ensemble_predict(df.values)

  df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)


False False False
<U6 <U7 <U7
Sample 1: Predicted class = normal
Sample 2: Predicted class = normal
Sample 3: Predicted class = normal
Sample 4: Predicted class = normal
Sample 5: Predicted class = normal
Sample 6: Predicted class = normal
Sample 7: Predicted class = normal
Sample 8: Predicted class = normal
Sample 9: Predicted class = normal
Sample 10: Predicted class = normal
Sample 11: Predicted class = normal
Sample 12: Predicted class = normal
Sample 13: Predicted class = normal


In [330]:
print(df.columns.tolist())

['duration', 'wrong_fragment', 'num_compromised', 'count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'protocol_type_tcp', 'protocol_type_udp', 'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_S2', 'flag_S3', 'flag_SF', 'flag_SH']


In [331]:
df.describe()

Unnamed: 0,duration,count,serror_rate,srv_serror_rate,rerror_rate,srv_rerror_rate,same_srv_rate,srv_diff_host_rate,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_serror_rate,dst_host_srv_serror_rate
count,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0
mean,1e-05,0.008899,0.0,0.0,0.0,0.0,1.0,0.0,0.014781,0.006335,0.641538,0.358462,0.0,0.0
std,1.7e-05,0.017694,0.0,0.0,0.0,0.0,0.0,0.0,0.008936,0.003411,0.406506,0.406506,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.003922,0.003922,0.17,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.003922,0.003922,0.17,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.011765,0.003922,1.0,0.0,0.0,0.0
75%,2e-05,0.009804,0.0,0.0,0.0,0.0,1.0,0.0,0.023529,0.007843,1.0,0.83,0.0,0.0
max,4.3e-05,0.054902,0.0,0.0,0.0,0.0,1.0,0.0,0.023529,0.011765,1.0,0.83,0.0,0.0
