In [3]:
# Imports and file paths
import sys
import os
print('Python version:', sys.version)

# Standard data science libraries
import numpy as np
import pandas as pd
from pathlib import Path

# sklearn & joblib will be used
import sklearn
import joblib
print('pandas', pd.__version__, 'sklearn', sklearn.__version__)

# Paths (these files were uploaded in this session)
TRAIN_CSV = "/content/SENG_4610_Training_Data.csv"
FEATURES_CSV = "/content/Feature_Description.csv"

# Check files
print('Training CSV exists:', Path(TRAIN_CSV).exists())
print('Feature description exists:', Path(FEATURES_CSV).exists())

Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
pandas 2.2.2 sklearn 1.6.1
Training CSV exists: True
Feature description exists: True


In [4]:
# Load CSVs
df = pd.read_csv(TRAIN_CSV, encoding = 'latin-1')
feat = pd.read_csv(FEATURES_CSV,  encoding = 'latin-1')

print('Training data shape:', df.shape)
display(df.head(10))
print('\n---\nFeature description preview:')
display(feat.head(20))




Training data shape: (175341, 45)


Unnamed: 0,ï»¿id,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,1,0.121478,tcp,-,FIN,6,4,258,172,74.08749,...,1,1,0,0,0,1,1,0,Normal,0
1,2,0.649902,tcp,-,FIN,14,38,734,42014,78.473372,...,1,2,0,0,0,1,6,0,Normal,0
2,3,1.623129,tcp,-,FIN,8,16,364,13186,14.170161,...,1,3,0,0,0,2,6,0,Normal,0
3,4,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,...,1,3,1,1,0,2,1,0,Normal,0
4,5,0.449454,tcp,-,FIN,10,6,534,268,33.373826,...,1,40,0,0,0,2,39,0,Normal,0
5,6,0.380537,tcp,-,FIN,10,6,534,268,39.41798,...,1,40,0,0,0,2,39,0,Normal,0
6,7,0.637109,tcp,-,FIN,10,8,534,354,26.683033,...,1,40,0,0,0,1,39,0,Normal,0
7,8,0.521584,tcp,-,FIN,10,8,534,354,32.593026,...,1,40,0,0,0,3,39,0,Normal,0
8,9,0.542905,tcp,-,FIN,10,8,534,354,31.313031,...,1,40,0,0,0,3,39,0,Normal,0
9,10,0.258687,tcp,-,FIN,10,6,534,268,57.985135,...,1,40,0,0,0,3,39,0,Normal,0



---
Feature description preview:


Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value


In [5]:
# Quick EDA
print('Columns:', df.columns.tolist())
print('\nDtypes:\n', df.dtypes.value_counts())

# Missing values summary
missing = df.isna().sum().sort_values(ascending=False)
display(missing.head(30))

Columns: ['ï»¿id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']

Dtypes:
 int64      30
float64    11
object      4
Name: count, dtype: int64


Unnamed: 0,0
ï»¿id,0
dur,0
proto,0
service,0
state,0
spkts,0
dpkts,0
sbytes,0
dbytes,0
rate,0


In [6]:
# Confirm target column names (adjust if different)
possible_label_cols = ['label', 'Label', 'attack', 'is_attack']
possible_attack_cat = ['attack_cat', 'attack_catagory', 'attack_category', 'attack_type']

print('Columns present that match common names:')
print([c for c in df.columns if c in possible_label_cols])
print([c for c in df.columns if c in possible_attack_cat])

# If your dataset uses different names, update these variables:
BINARY_TARGET = 'label'           # 0 normal, 1 attack
MULTI_TARGET = 'attack_cat'       # multiclass attack category
ID_COL = 'id'

# Drop ID if present
if ID_COL in df.columns:
    df = df.drop(columns=[ID_COL])
    print('Dropped id column')

Columns present that match common names:
['label']
['attack_cat']


In [7]:
# Clean service & other textual columns
for col in ['service', 'proto', 'state']:
    if col in df.columns:
        df[col] = df[col].astype(str)  # convert to string first to avoid unexpected dtypes
        df[col] = df[col].replace(['-', '?', 'None', 'nan', ''], 'unknown')
        df[col] = df[col].fillna('unknown')

# Show unique counts
for col in ['service', 'proto', 'state']:
    if col in df.columns:
        print(f"-> {col}: unique count =", df[col].nunique(), 'example values =', df[col].unique()[:10])


-> service: unique count = 13 example values = ['unknown' 'ftp' 'smtp' 'snmp' 'http' 'ftp-data' 'dns' 'ssh' 'radius'
 'pop3']
-> proto: unique count = 133 example values = ['tcp' 'udp' 'arp' 'ospf' 'icmp' 'igmp' 'rtp' 'ddp' 'ipv6-frag' 'cftp']
-> state: unique count = 9 example values = ['FIN' 'INT' 'CON' 'ECO' 'REQ' 'RST' 'PAR' 'URN' 'no']


In [8]:
from sklearn.preprocessing import LabelEncoder

# Ensure binary target is numeric 0/1
if df[BINARY_TARGET].dtype == object:
    # if the label is text like 'Normal'/'Attack' or 'normal'/'anomaly'
    df['is_attack'] = df[BINARY_TARGET].str.lower().isin(['1','attack','anomaly','attack','attack!','attack?']).astype(int)
else:
    df['is_attack'] = df[BINARY_TARGET].astype(int)

print('is_attack distribution:')
print(df['is_attack'].value_counts())

# Encode multiclass only when present
if MULTI_TARGET in df.columns:
    le_attack = LabelEncoder()
    df['attack_cat_encoded'] = df[MULTI_TARGET].fillna('unknown').astype(str)
    df['attack_cat_encoded'] = le_attack.fit_transform(df['attack_cat_encoded'])
    print('\nMulticlass mapping (first 20):')
    mapping = dict(zip(le_attack.classes_, le_attack.transform(le_attack.classes_)))
    print(mapping)
else:
    print('No multiclass attack column found; will skip softmax training unless you update MULTI_TARGET.')

print(df['attack_cat'].value_counts())

is_attack distribution:
is_attack
1    119341
0     56000
Name: count, dtype: int64

Multiclass mapping (first 20):
{'Analysis': np.int64(0), 'Backdoor': np.int64(1), 'DoS': np.int64(2), 'Exploits': np.int64(3), 'Fuzzers': np.int64(4), 'Generic': np.int64(5), 'Normal': np.int64(6), 'Reconnaissance': np.int64(7), 'Shellcode': np.int64(8), 'Worms': np.int64(9)}
attack_cat
Normal            56000
Generic           40000
Exploits          33393
Fuzzers           18184
DoS               12264
Reconnaissance    10491
Analysis           2000
Backdoor           1746
Shellcode          1133
Worms               130
Name: count, dtype: int64


In [9]:
# By default drop IPs and timestamp-like fields to avoid leakage and high-cardinality
drop_cols = []
for cand in ['srcip', 'dstip', 'Stime', 'Ltime', 'stime', 'ltime', 'stime', 'stime_ms']:
    if cand in df.columns:
        drop_cols.append(cand)

print('Initial drop_cols:', drop_cols)

# Numeric columns detection
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# exclude target columns
exclude = ['is_attack', 'attack_cat_encoded', BINARY_TARGET]
numeric_cols = [c for c in numeric_cols if c not in exclude]
print('\nNumeric candidate columns (sample):', numeric_cols[:20])

# Categorical columns to encode
cat_cols = [c for c in ['service', 'proto', 'state'] if c in df.columns]
print('\nCategorical columns to encode:', cat_cols)

# Final features list — for pipeline we'll handle encoding/scaling
# Keep numeric_cols and cat_cols; remove drop_cols if present
features = [c for c in numeric_cols + cat_cols if c not in drop_cols]
print('\nFinal features to be used (sample):', features[:40])


Initial drop_cols: []

Numeric candidate columns (sample): ['ï»¿id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb']

Categorical columns to encode: ['service', 'proto', 'state']

Final features to be used (sample): ['ï»¿id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports']


In [10]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

# Define preprocessing steps
num_features = [c for c in features if c in df.columns and np.issubdtype(df[c].dtype, np.number)]
cat_features = [c for c in features if c in df.columns and c not in num_features]

num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
], remainder='drop')

print('num_features count:', len(num_features))
print('cat_features count:', len(cat_features))

# Prepare data for binary classification
X = df[features]
y = df['is_attack']

# Stratified split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


num_features count: 40
cat_features count: 3


In [11]:
from sklearn.neural_network import MLPClassifier

# Neural network pipeline (same preprocessing)
mlp_model = Pipeline(steps=[
('preproc', preprocessor),
('clf', MLPClassifier(
hidden_layer_sizes=(64, 32), # example: 2 layers, 64 and 32 neurons
activation='relu', # common choice
solver='adam', # adaptive optimizer
max_iter=200, # can increase if not converging
random_state=42,
early_stopping=True, # stop if validation score doesn't improve
verbose=True # shows progress
))
])
print('Training MLPClassifier...')
mlp_model.fit(X_train, y_train)
print('Done.')


Training MLPClassifier...
Iteration 1, loss = 0.14329366
Validation score: 0.948318
Iteration 2, loss = 0.10062198
Validation score: 0.955660
Iteration 3, loss = 0.09325353
Validation score: 0.956658
Iteration 4, loss = 0.08567800
Validation score: 0.964357
Iteration 5, loss = 0.07821420
Validation score: 0.967707
Iteration 6, loss = 0.07155414
Validation score: 0.967137
Iteration 7, loss = 0.06597820
Validation score: 0.969276
Iteration 8, loss = 0.06168312
Validation score: 0.972840
Iteration 9, loss = 0.05893828
Validation score: 0.972555
Iteration 10, loss = 0.05705680
Validation score: 0.972840
Iteration 11, loss = 0.05520030
Validation score: 0.972198
Iteration 12, loss = 0.05371209
Validation score: 0.976618
Iteration 13, loss = 0.05292465
Validation score: 0.973339
Iteration 14, loss = 0.05143733
Validation score: 0.972626
Iteration 15, loss = 0.05097852
Validation score: 0.975335
Iteration 16, loss = 0.04912985
Validation score: 0.974907
Iteration 17, loss = 0.04894657
Validat

In [12]:
y_pred_nn = mlp_model.predict(X_test)
from sklearn.metrics import classification_report, confusion_matrix
print('\nClassification report (MLP binary):')
print(classification_report(y_test, y_pred_nn, digits=4))
print('\nConfusion matrix:')
print(confusion_matrix(y_test, y_pred_nn))


# hyperparameter to note of
# hidden_layer_sizes: start small (64,32) or (100,) to avoid overfitting.
# activation: 'relu' works well; 'tanh' or 'logistic' are alternatives.
# solver: 'adam' is usually robust; 'sgd' can be slower.
# early_stopping=True avoids wasting time on overfitting.
# max_iter: 200–500 is typical for small to medium datasets.



Classification report (MLP binary):
              precision    recall  f1-score   support

           0     0.9785    0.9685    0.9735     11200
           1     0.9853    0.9900    0.9876     23869

    accuracy                         0.9831     35069
   macro avg     0.9819    0.9793    0.9806     35069
weighted avg     0.9831    0.9831    0.9831     35069


Confusion matrix:
[[10847   353]
 [  238 23631]]
