# Import libraries and load dataset

In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical


base_path = "/content/drive/MyDrive/CIC-IDS/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV/MachineLearningCVE"

csv_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
]

print("Loading CSVs...")
dfs = [pd.read_csv(os.path.join(base_path, f)) for f in csv_files]
df = pd.concat(dfs, ignore_index=True)
df.columns = df.columns.str.strip()

print("Dataset shape:", df.shape)

Loading CSVs...
Dataset shape: (2830743, 79)


# Drop columns

In [2]:
drop_cols = [
    'Timestamp', 'Flow ID', 'Src IP', 'Dst IP',
    'Src Port', 'Dst Port', 'Protocol'
]

for col in drop_cols:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

# remove rows without a Label
df = df.dropna(subset=["Label"])

# Features and labels

In [3]:
X = df.drop(columns=["Label"])
X = X.select_dtypes(include=[np.number])  # keep only numeric
y = df["Label"]

print("Before fixing inf/nan:", X.shape, y.shape)

# fix infinity and NaNs
X.replace([np.inf, -np.inf], np.nan, inplace=True)

X = X.fillna(X.median(numeric_only=True))

print("After fixing inf/nan:", X.shape, y.shape)
print("Any inf left? ", np.isinf(X.to_numpy()).any())
print("Any NaN left? ", np.isnan(X.to_numpy()).any())

Before fixing inf/nan: (2830743, 78) (2830743,)
After fixing inf/nan: (2830743, 78) (2830743,)
Any inf left?  False
Any NaN left?  False


# Encode labels

In [4]:
le = LabelEncoder()
y_enc = le.fit_transform(y)
num_classes = len(le.classes_)

print("Number of classes:", num_classes)
print("Class mapping:")
for i, cls in enumerate(le.classes_):
    print(f"  {i}: {cls}")

Number of classes: 15
Class mapping:
  0: BENIGN
  1: Bot
  2: DDoS
  3: DoS GoldenEye
  4: DoS Hulk
  5: DoS Slowhttptest
  6: DoS slowloris
  7: FTP-Patator
  8: Heartbleed
  9: Infiltration
  10: PortScan
  11: SSH-Patator
  12: Web Attack � Brute Force
  13: Web Attack � Sql Injection
  14: Web Attack � XSS


# Train/test split and scaling

In [5]:
X_train, X_test, y_train, y_test = train_test_split(
    X.values,
    y_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_enc
)

print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# one-hot encode labels for multi-class softmax
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

Train shape: (2264594, 78)  Test shape: (566149, 78)


# Defining BPNN

In [6]:
input_dim = X_train_scaled.shape[1]
print("Input dimension (features):", input_dim)

from tensorflow.keras.layers import Input # Added for missing Input layer
model = Sequential()
model.add(Input(shape=(input_dim,)))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(num_classes, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

Input dimension (features): 78


# Training the model

In [7]:
history = model.fit(
    X_train_scaled,
    y_train_cat,
    epochs=20,           # adjusted
    batch_size=1024,
    validation_split=0.2,
    verbose=1
)

Epoch 1/20
[1m1770/1770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 11ms/step - accuracy: 0.9135 - loss: 0.3404 - val_accuracy: 0.9704 - val_loss: 0.0697
Epoch 2/20
[1m1770/1770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.9692 - loss: 0.0788 - val_accuracy: 0.9728 - val_loss: 0.0581
Epoch 3/20
[1m1770/1770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 11ms/step - accuracy: 0.9731 - loss: 0.0647 - val_accuracy: 0.9776 - val_loss: 0.0507
Epoch 4/20
[1m1770/1770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.9753 - loss: 0.0585 - val_accuracy: 0.9778 - val_loss: 0.0487
Epoch 5/20
[1m1770/1770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 11ms/step - accuracy: 0.9765 - loss: 0.0549 - val_accuracy: 0.9763 - val_loss: 0.0486
Epoch 6/20
[1m1770/1770[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.9774 - loss: 0.0525 - val_accuracy: 0.9801 - val_loss: 0.0425
Epoc

# Evaluation

In [8]:
print("\nEvaluating on test set...")
test_loss, test_acc = model.evaluate(X_test_scaled, y_test_cat, verbose=0)
print(f"Test loss: {test_loss:.4f}")
print(f"Test accuracy: {test_acc:.4f}")

# Predicted class indices
y_pred_proba = model.predict(X_test_scaled, verbose=0)
y_pred = np.argmax(y_pred_proba, axis=1)

print("\nClassification report (multi-class):")
print(classification_report(y_test, y_pred, target_names=le.classes_))

print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

overall_acc = accuracy_score(y_test, y_pred)
print(f"Overall accuracy (from sklearn): {overall_acc:.4f}")


Evaluating on test set...
Test loss: 0.0247
Test accuracy: 0.9926

Classification report (multi-class):
                            precision    recall  f1-score   support

                    BENIGN       0.99      1.00      1.00    454620
                       Bot       0.97      0.36      0.53       393
                      DDoS       1.00      1.00      1.00     25606
             DoS GoldenEye       0.99      0.98      0.98      2059
                  DoS Hulk       0.99      0.96      0.97     46215
          DoS Slowhttptest       0.88      0.98      0.93      1100
             DoS slowloris       0.98      0.96      0.97      1159
               FTP-Patator       0.99      0.98      0.99      1588
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       0.67      0.29      0.40         7
                  PortScan       0.98      1.00      0.99     31786
               SSH-Patator       0.95      0.97      0.96      1179
  Web Atta

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
