In [2]:
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

RAW_DIR = "../data/raw/cicids_simplified"
df = pd.read_csv(f"{RAW_DIR}/basic_data_4.csv")
df.columns = df.columns.str.strip().str.lower()

# Label mapping 5-class + binary
labmap = pd.read_csv(f"{RAW_DIR}/label_category_map.csv")
labmap.columns = labmap.columns.str.strip().str.lower()
raw_col = labmap.columns[0]; cat_col = labmap.columns[1]
df["label"] = df["label"].astype(str).str.strip().str.lower()
df = df.merge(labmap[[raw_col, cat_col]], left_on="label", right_on=raw_col, how="left")
df[cat_col] = np.where((df["label"]=="normal") & (df[cat_col].isna()), "normal", df[cat_col])
df["label_binary"] = (df[cat_col]!="normal").astype(int)


In [3]:
def engineer_features(d):
    d = d.copy()
    # bytes
    d["bytes_total"] = d["src_bytes"].clip(lower=0) + d["dst_bytes"].clip(lower=0)
    d["bytes_ratio_src"] = d["src_bytes"].clip(lower=0) / (d["bytes_total"] + 1)
    d["log_bytes_total"] = np.log1p(d["bytes_total"])
    # traffic intensity
    d["count_per_srv"] = d["count"] / (d["srv_count"] + 1)
    d["duration_per_conn"] = d["duration"] / (d["count"] + 1)
    # service grouping 
    web = {"http","http_443","http_8001","www"}
    mail = {"smtp","imap4","pop_2","pop_3"}
    ftp  = {"ftp","ftp_data"}
    dns  = {"domain","domain_u"}
    d["service_group"] = np.where(d["service"].isin(web), "web",
                          np.where(d["service"].isin(mail), "mail",
                          np.where(d["service"].isin(ftp),  "ftp",
                          np.where(d["service"].isin(dns),  "dns", "other"))))
    return d

df_fe = engineer_features(df)
df_fe.sample(3)


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,count,srv_count,serror_rate,label,category,label_binary,bytes_total,bytes_ratio_src,log_bytes_total,count_per_srv,duration_per_conn,service_group
19373,0.0,udp,domain_u,SF,17.0,73.0,1.0,3.0,0.0,normal,Normal,1,90.0,0.186813,4.51086,0.25,0.0,dns
10693,26.0,tcp,ftp,SF,361.0,1181.0,1.0,1.0,0.0,normal,Normal,1,1542.0,0.23396,7.341484,0.5,13.0,ftp
2615,0.0,tcp,http,SF,198.0,2656.0,8.0,9.0,0.0,normal,Normal,1,2854.0,0.069352,7.956827,0.8,0.0,web


In [4]:
# Near-constant
num_cols = ["duration","src_bytes","dst_bytes","count","srv_count","serror_rate",
            "bytes_total","bytes_ratio_src","log_bytes_total","count_per_srv","duration_per_conn"]
low_var = [c for c in num_cols if df_fe[c].nunique()/len(df_fe) < 0.01]
low_var


['serror_rate']

In [5]:
corr = df_fe[num_cols].corr().abs()
high_pairs = np.column_stack(np.where((corr.values > 0.95) & (corr.values < 0.9999)))
[(num_cols[i], num_cols[j], corr.values[i,j]) for i,j in high_pairs[:10]]

[('duration', 'duration_per_conn', np.float64(0.9845993926816698)),
 ('src_bytes', 'bytes_total', np.float64(0.9993220329680155)),
 ('bytes_total', 'src_bytes', np.float64(0.9993220329680155)),
 ('duration_per_conn', 'duration', np.float64(0.9845993926816698))]

In [6]:
keep_num = [c for c in num_cols if c not in low_var]
cat_cols = ["protocol_type","service_group","flag"]

X = df_fe[keep_num + cat_cols]
y = df_fe["label_binary"].values

# Simple split
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

num_pipe = Pipeline([("imp", SimpleImputer(strategy="median")),
                     ("sc", StandardScaler())])
cat_pipe = Pipeline([("imp", SimpleImputer(strategy="most_frequent")),
                     ("ohe", OneHotEncoder(handle_unknown="ignore"))])

pre = ColumnTransformer([("num", num_pipe, keep_num), ("cat", cat_pipe, cat_cols)])
clf = RandomForestClassifier(n_estimators=300, n_jobs=-1, class_weight="balanced")

from sklearn.pipeline import make_pipeline
pipe = make_pipeline(pre, clf).fit(Xtr, ytr)
pred = pipe.predict(Xte)
print("RF F1:", f1_score(yte, pred))


RF F1: 1.0
