
# **DNN Model**

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
# Load merged dataset
df=pd.read_csv("../road_dataset/preprocessed/merged/attack_data_without_masquerade.csv")

X = df.drop(columns=['Flag'], errors='ignore')
y = df['Flag']

# Stratified split Train/Test (70/30)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42, stratify=y
)
# Clean column names 
X_train = X_train.copy()
X_test = X_test.copy()
X_train.columns = [c.replace("[", "_").replace("]", "").replace("<", "_") for c in X_train.columns]
X_test.columns = [c.replace("[", "_").replace("]", "").replace("<", "_") for c in X_test.columns]

# Confirm sizes
print("Train samples:", len(X_train))
print("Test samples:", len(X_test))
results = []

Train samples: 1078308
Test samples: 462133


In [13]:
print(X_train.head())

         CAN ID  DLC  DATA_0  DATA_1  DATA_2  DATA_3  DATA_4  DATA_5  DATA_6  \
1537365      60    8       0       0       4       0      41       0       0   
140415      293    8     144       0      64     223      64      63      21   
786236      560    8     253       0       0       2     236       0       4   
607507      403    8       0       8       8       3     232       8       0   
864219      293    8     144       0      65     159      63     224       3   

         DATA_7  
1537365       0  
140415       96  
786236        0  
607507        0  
864219       96  


In [14]:
import math
from sklearn.metrics import matthews_corrcoef



def dnn_model_run(X_train = X_train, X_test = X_test, y_train = y_train, y_test = y_test, 
                  bs = 32, split = .1, epo = 5, vs = .1, vb = 1, pt= 3, pred=.5, pred_bs=1024):
    
    model = Sequential([   
    Input(shape=(X_train.shape[1],)),  # Input layer (should be 10 features)
        Dense(16, activation='relu'),#Dropout(0.3),
        Dense(16, activation='relu'),#Dropout(0.3),
        Dense(16, activation='relu'),#Dropout(0.3),
        Dense(16, activation='relu'),#Dropout(0.3),
        Dense(1, activation='sigmoid')  # Output layer: 1 neurons
    ])  # Rebuild model from scratch
    
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    
    # Early stopping callback
    early_stop = EarlyStopping(monitor='loss', patience=pt, restore_best_weights=True)
    
    history = model.fit(
        X_train, y_train, 
        validation_split= vs, 
        epochs=epo, 
        batch_size=bs, 
        verbose=vb,
        callbacks = [early_stop]
    )
    
    # ----- Predict -----
    y_pred_prob = model.predict(X_test, batch_size= pred_bs)
    y_pred = (y_pred_prob > pred).astype(int)
    
    # ----- Evaluation -----
    # Convert multiclass to binary: 0 = normal, 1 = any attack
    # y_test = (y_test != 0).astype(int)
    # y_pred = (y_pred != 0).astype(int)


    return model, y_pred

def score_calculator(model_name, y_test, y_pred):
    f1 = f1_score(y_test, y_pred, zero_division=0)
    acc = accuracy_score(y_test, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    mcc = matthews_corrcoef(y_test, y_pred)
    
    result = [model_name, tn, tp, f"{f1*100:.1f}%",  fn, fp, mcc]
    return result

In [15]:
"""----------DNN----------"""
model_name = "DNN"; bs = 32; split = .1; epo = 50; vs = .1; vb = 1; pt=3

dnn_model, y_pred = dnn_model_run(X_train, X_test, y_train, y_test, bs, split, epo, vs, vb, pt, pred=.5, pred_bs=1024)
result = score_calculator(model_name, y_test, y_pred)

results.append(result)
results_df = pd.DataFrame(results, columns=["Model", "Benign Samples", "Malicious Samples", "F1 score", "FN", "FP", "MCC"])

display(results_df)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50


Unnamed: 0,Model,Benign Samples,Malicious Samples,F1 score,FN,FP,MCC
0,DNN,447067,9750,78.6%,5149,167,0.797228


In [16]:



from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
import joblib
def evaluate_model(name, model, X_train, y_train, X_test, y_test, save_path=None):
    """
    Train a model, predict multiclass, collapse to binary (0=normal, 1=attack),
    and return evaluation metrics.
    """
    # ----- Train -----
    model.fit(X_train, y_train)

    # ----- Predict (multiclass) -----
    y_pred = model.predict(X_test)

    # ----- Collapse to binary -----
    y_test = (y_test != 0).astype(int)
    y_pred = (y_pred != 0).astype(int)

    # ----- Compute metrics -----
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    f1 = f1_score(y_test, y_pred, zero_division=0)
    acc = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)

    benign_count = int((y_test == 0).sum())     # = tn + fp
    malicious_count = int((y_test == 1).sum())  # = tp + fn

        # Save the trained model if path provided
    if save_path:
        joblib.dump(model, f"{save_path}/{name.lower()}_model.pkl")

    return [
        name,
        benign_count,
        malicious_count,
        f"{f1*100:.1f}%",
        fn,
        fp,
        mcc
    ]


In [17]:
import joblib
# Models dict
models = {
    "DT": DecisionTreeClassifier(random_state=42),
    "RF": RandomForestClassifier(n_estimators=100, random_state=42),
    "ET": ExtraTreesClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
}

#results = []
for name, model in models.items():
    result = evaluate_model(name, model, X_train, y_train, X_test, y_test, save_path="../models/FN")
    results.append(result)

# Put into DataFrame
results_df = pd.DataFrame(results, columns=[
    "Model", "Benign Samples", "Malicious Samples", "F1 Score", "FN", "FP", "MCC"
])
print(results_df)
dnn_model.save("road/models/FN/dnn_model.h5")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


     Model  Benign Samples  Malicious Samples F1 Score    FN   FP       MCC
0      DNN          447067               9750    78.6%  5149  167  0.797228
1       DT          447234              14899    91.0%  1695  914  0.907526
2       RF          447234              14899    91.0%  1686  917  0.907765
3       ET          447234              14899    91.0%  1703  897  0.907792
4  XGBoost          447234              14899    91.0%  1653  960  0.907579
