In [1]:
## Feature Engineering – SIEM Event Classification

Objective:
Transform normalized SIEM security events into ML-ready features
while avoiding data leakage and preserving real SOC semantics.


SyntaxError: invalid syntax (3157502663.py, line 3)

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# Load the raw security-events CSV; the path is relative to the notebook's
# working directory.
df = pd.read_csv("data/raw/cyfercode_security_events_v2.csv")

# Report (rows, columns), then preview the first 15 events as the cell output.
print("Dataset shape:", df.shape)
df.head(15)


Dataset shape: (28220, 11)


Unnamed: 0,event_type,src_ip_type,dst_port,protocol,alert_severity,alert_signature,failed_attempts,session_duration,bytes_sent,is_cloud_asset,label
0,firewall,internal,21,tcp,low,normal,2,159,11684,0,0
1,cloud,internal,53,tcp,low,normal,1,413,748,1,0
2,auth,internal,25,tcp,high,bruteforce,23,516,750,0,1
3,firewall,internal,3389,tcp,low,normal,2,64,358,0,0
4,ids,internal,443,tcp,high,malware,3,728,1620,0,1
5,ids,external,80,tcp,medium,port_scan,2,9,4436,0,1
6,ids,internal,8080,tcp,low,normal,3,140,883,0,0
7,cloud,external,21,tcp,low,normal,1,1521,2569,0,1
8,auth,internal,25,tcp,low,normal,4,37,6094,0,0
9,auth,internal,22,udp,medium,port_scan,4,62,3312,0,1


In [3]:
# List the column labels of the loaded frame.
df.columns


Index(['event_type', 'src_ip_type', 'dst_port', 'protocol', 'alert_severity',
       'alert_signature', 'failed_attempts', 'session_duration', 'bytes_sent',
       'is_cloud_asset', 'label'],
      dtype='object')

In [4]:
# Name of the column used as the prediction target (selected as `y` below).
target = "label"


In [5]:
# Columns routed through the numeric (StandardScaler) branch of the preprocessor.
# Order matters: it fixes the order of the leading output columns.
# NOTE(review): dst_port is a port identifier rather than a magnitude — confirm
# that standardizing it as a number is intended.
num_features = ["dst_port", "failed_attempts", "session_duration", "bytes_sent"]


In [6]:
# Columns routed through the one-hot branch of the preprocessor.
# Order matters: it fixes the order of the encoded output columns.
# is_cloud_asset is a 0/1 flag that is nevertheless one-hot encoded here.
# NOTE(review): alert_signature / alert_severity may largely encode the label —
# confirm they are legitimately available at prediction time (leakage check).
cat_features = [
    "event_type", "src_ip_type", "protocol",
    "alert_signature", "alert_severity", "is_cloud_asset",
]


In [7]:
# Feature matrix: numeric columns first, then categorical ones; target vector
# is the single label column.
X = df[[*num_features, *cat_features]]
y = df[target]

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# balance the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)


In [8]:

# Numeric branch: a single standardization step applied to the numeric columns.
numeric_pipeline = Pipeline([("scaler", StandardScaler())])


In [9]:
# Categorical branch: one-hot encode each column. handle_unknown="ignore" means
# a category not seen during fit does not raise at transform time.
categorical_pipeline = Pipeline(
    [("onehot", OneHotEncoder(handle_unknown="ignore"))]
)


In [10]:
# Route each column group to its own branch; the "num" / "cat" labels become
# the prefixes of the generated output feature names.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_pipeline, num_features),
        ("cat", categorical_pipeline, cat_features),
    ]
)


In [11]:
# Fit the preprocessor and transform in one step, using the training split only
# (X_test is not passed here, so it does not influence the fitted scaler/encoder).
X_train_transformed = preprocessor.fit_transform(X_train)

# (rows, output feature columns) of the transformed training matrix.
X_train_transformed.shape


(22576, 21)

In [12]:
# Output column names produced by the fitted preprocessor, in output order.
feature_names = preprocessor.get_feature_names_out()


In [13]:
# Wrap the transformed training matrix in a labelled DataFrame for inspection.
# pandas is already imported in the notebook's import cell, so it is not
# re-imported here.
#
# - The transformer output may be sparse; densify it when it exposes `toarray`.
# - Reuse X_train's index so every transformed row keeps the same label as its
#   raw row and as y_train. Without it the frame gets a fresh 0..n-1 index and
#   label-based joins/lookups against X_train or y_train would silently misalign.
X_train_transformed_df = pd.DataFrame(
    X_train_transformed.toarray() if hasattr(X_train_transformed, "toarray") else X_train_transformed,
    columns=feature_names,
    index=X_train.index,
)


In [14]:
# List the generated feature names carried by the transformed frame.
X_train_transformed_df.columns


Index(['num__dst_port', 'num__failed_attempts', 'num__session_duration',
       'num__bytes_sent', 'cat__event_type_auth', 'cat__event_type_cloud',
       'cat__event_type_firewall', 'cat__event_type_ids',
       'cat__src_ip_type_external', 'cat__src_ip_type_internal',
       'cat__protocol_tcp', 'cat__protocol_udp',
       'cat__alert_signature_bruteforce', 'cat__alert_signature_malware',
       'cat__alert_signature_normal', 'cat__alert_signature_port_scan',
       'cat__alert_severity_high', 'cat__alert_severity_low',
       'cat__alert_severity_medium', 'cat__is_cloud_asset_0',
       'cat__is_cloud_asset_1'],
      dtype='object')

In [15]:
# Show the first raw (untransformed) training row, selected by position.
X_train.iloc[0]


dst_port                  22
failed_attempts            1
session_duration         314
bytes_sent              7123
event_type               ids
src_ip_type         internal
protocol                 tcp
alert_signature       normal
alert_severity          high
is_cloud_asset             0
Name: 16762, dtype: object