# Import libraries and load dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [28]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical


base_path = "/content/drive/MyDrive/CIC-IDS/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV/MachineLearningCVE"

csv_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
]

dfs = []
for f in csv_files:
    df_part = pd.read_csv(os.path.join(base_path, f))
    df_part["SourceFile"] = f
    dfs.append(df_part)

df_all = pd.concat(dfs, ignore_index=True)
df_all.columns = df_all.columns.str.strip()

print("Full dataset shape:", df_all.shape)

Full dataset shape: (2830743, 80)


# Drop columns

In [29]:
# drop id columns
drop_cols = [
    'Timestamp', 'Flow ID', 'Src IP', 'Dst IP',
    'Src Port', 'Dst Port', 'Protocol'
]
for col in drop_cols:
    if col in df_all.columns:
        df_all.drop(columns=[col], inplace=True)

# remove rows without label
df_all = df_all.dropna(subset=["Label"])

mask_mon_thu = df_all["SourceFile"].str.contains("Monday|Tuesday|Wednesday|Thursday", case=False, regex=True)
mask_fri     = df_all["SourceFile"].str.contains("Friday", case=False, regex=True)

df_mon_thu = df_all[mask_mon_thu].copy()
df_fri     = df_all[mask_fri].copy()

print("Mon–Thu raw shape:", df_mon_thu.shape)
print("Friday raw shape:", df_fri.shape)

Mon–Thu raw shape: (2127498, 80)
Friday raw shape: (703245, 80)


# Sampling

In [30]:
from sklearn.preprocessing import LabelEncoder

# encode labels
le = LabelEncoder()
df_all["LabelEnc"] = le.fit_transform(df_all["Label"])
print("Number of classes total:", len(le.classes_))

# attach encoded labels to subsets
df_mon_thu["LabelEnc"] = df_all.loc[df_mon_thu.index, "LabelEnc"]
df_fri["LabelEnc"]     = df_all.loc[df_fri.index, "LabelEnc"]

# balanced sampling
def balanced_sample(df, label_col, n_per_class=100000, random_state=42):
    groups = []
    for lab, g in df.groupby(label_col):
        take = min(len(g), n_per_class)
        groups.append(g.sample(n=take, random_state=random_state))
    return pd.concat(groups, ignore_index=True)

df_mon_thu_bal = balanced_sample(df_mon_thu, "LabelEnc", n_per_class=100000)
df_fri_bal     = balanced_sample(df_fri, "LabelEnc", n_per_class=100000)

print("Mon–Thu balanced shape:", df_mon_thu_bal.shape)
print("Friday balanced shape:", df_fri_bal.shape)
print("Mon–Thu class counts:\n", df_mon_thu_bal["LabelEnc"].value_counts())
print("Friday class counts:\n", df_fri_bal["LabelEnc"].value_counts())

Number of classes total: 15
Mon–Thu balanced shape: (237650, 81)
Friday balanced shape: (301966, 81)
Mon–Thu class counts:
 LabelEnc
0     100000
4     100000
3      10293
7       7938
11      5897
6       5796
5       5499
12      1507
14       652
9         36
13        21
8         11
Name: count, dtype: int64
Friday class counts:
 LabelEnc
0     100000
2     100000
10    100000
1       1966
Name: count, dtype: int64


In [31]:
num_cols = df_mon_thu_bal.select_dtypes(include=[np.number]).columns

def prepare_X_y(df, num_cols, label_col="LabelEnc"):
    X = df[num_cols].copy()

    if label_col in X.columns:
        X = X.drop(columns=[label_col])

    X.replace([np.inf, -np.inf], np.nan, inplace=True)
    X = X.fillna(X.median(numeric_only=True))

    X = X.astype("float32")
    y = df[label_col].to_numpy()
    return X, y

X_mon_thu, y_mon_thu = prepare_X_y(df_mon_thu_bal, num_cols)
X_fri,     y_fri     = prepare_X_y(df_fri_bal, num_cols)

print("Mon–Thu X shape:", X_mon_thu.shape)
print("Friday X shape:", X_fri.shape)

import numpy as np
print("Any NaN Mon–Thu:", np.isnan(X_mon_thu.to_numpy()).any())
print("Any NaN Friday :", np.isnan(X_fri.to_numpy()).any())
print("Any inf Mon–Thu:", np.isinf(X_mon_thu.to_numpy()).any())
print("Any inf Friday :", np.isinf(X_fri.to_numpy()).any())

Mon–Thu X shape: (237650, 78)
Friday X shape: (301966, 78)
Any NaN Mon–Thu: False
Any NaN Friday : False
Any inf Mon–Thu: False
Any inf Friday : False


define phase 1
train on Monday–Thursday, test on all Friday

In [15]:
# # Masks based on SourceFile
# mask_mon_thu = df_all["SourceFile"].str.contains("Monday|Tuesday|Wednesday|Thursday", case=False, regex=True)
# mask_fri     = df_all["SourceFile"].str.contains("Friday", case=False, regex=True)

# X_mon_thu = X_all[mask_mon_thu].to_numpy()
# y_mon_thu = y_all_enc[mask_mon_thu]

# X_fri_all = X_all[mask_fri].to_numpy()
# y_fri_all = y_all_enc[mask_fri]

# print("Mon–Thu shape:", X_mon_thu.shape, " Friday shape:", X_fri_all.shape)

# Train/test split

In [32]:
from sklearn.model_selection import train_test_split

X_fri_train, X_fri_test, y_fri_train, y_fri_test = train_test_split(
    X_fri,
    y_fri,
    test_size=0.3,
    random_state=42,
    stratify=y_fri
)

print("Friday train:", X_fri_train.shape, " Friday test:", X_fri_test.shape)

Friday train: (211376, 78)  Friday test: (90590, 78)


# Scaling

In [33]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_mon_thu)

X_mon_thu_scaled   = scaler.transform(X_mon_thu)
X_fri_scaled       = scaler.transform(X_fri)
X_fri_train_scaled = scaler.transform(X_fri_train)
X_fri_test_scaled  = scaler.transform(X_fri_test)

# Defining BPNN

In [34]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical

num_classes = len(le.classes_)
input_dim = X_mon_thu_scaled.shape[1]
print("Input dim:", input_dim, "Num classes:", num_classes)

def build_bpnn_stronger(input_dim, num_classes):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.4))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model


Input dim: 78 Num classes: 15


# Phase 1

In [35]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

# Phase 1 training
y_mon_thu_cat = to_categorical(y_mon_thu, num_classes=num_classes)

model_p1 = build_bpnn_stronger(input_dim, num_classes)

history_p1 = model_p1.fit(
    X_mon_thu_scaled,
    y_mon_thu_cat,
    epochs=15,             # moderate training
    batch_size=512,        # smaller batch to reduce RAM spikes
    validation_split=0.2,
    verbose=1
)

# Phase 1 evaluation on all Friday
y_fri_pred_proba_p1 = model_p1.predict(X_fri_scaled, verbose=0)
y_fri_pred_p1 = np.argmax(y_fri_pred_proba_p1, axis=1)

print("\n[Phase 1] Friday evaluation (balanced subset):")
print("Accuracy:", accuracy_score(y_fri, y_fri_pred_p1))
print("F1-macro   :", f1_score(y_fri, y_fri_pred_p1, average="macro"))
print("F1-weighted:", f1_score(y_fri, y_fri_pred_p1, average="weighted"))

labels_fri = np.unique(y_fri)
target_names_fri = le.inverse_transform(labels_fri)

print("\n[Phase 1] Classification report (Friday):")
print(classification_report(
    y_fri,
    y_fri_pred_p1,
    labels=labels_fri,
    target_names=target_names_fri,
    zero_division=0
))


Epoch 1/15
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8849 - loss: 0.4032 - val_accuracy: 0.4232 - val_loss: 14.9921
Epoch 2/15
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.9824 - loss: 0.0594 - val_accuracy: 0.4238 - val_loss: 20.1936
Epoch 3/15
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.9856 - loss: 0.0441 - val_accuracy: 0.4239 - val_loss: 25.5033
Epoch 4/15
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 12ms/step - accuracy: 0.9863 - loss: 0.0397 - val_accuracy: 0.4236 - val_loss: 29.5741
Epoch 5/15
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9876 - loss: 0.0363 - val_accuracy: 0.4238 - val_loss: 32.9380
Epoch 6/15
[1m372/372[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 14ms/step - accuracy: 0.9877 - loss: 0.0352 - val_accuracy: 0.4238 - val_loss: 38.6255
Epoch 7/15
[1m3

# Phase 2

In [36]:
print("\n=== Phase 2: Adaptation with Mon–Thu + Friday-train ===")

# combine Mon–Thu and Friday-train for adaptation
X_phase2_train = np.vstack([X_mon_thu_scaled, X_fri_train_scaled])
y_phase2_train = np.concatenate([y_mon_thu, y_fri_train])
y_phase2_cat   = to_categorical(y_phase2_train, num_classes=num_classes)

model_p2 = build_bpnn_stronger(input_dim, num_classes)

history_p2 = model_p2.fit(
    X_phase2_train,
    y_phase2_cat,
    epochs=15,
    batch_size=512,
    validation_split=0.2,
    verbose=1
)

# evaluate on Friday-test
y_fri_test_pred_proba = model_p2.predict(X_fri_test_scaled, verbose=0)
y_fri_test_pred = np.argmax(y_fri_test_pred_proba, axis=1)

print("\n[Phase 2] Friday-test evaluation (after adaptation):")
print("Accuracy:", accuracy_score(y_fri_test, y_fri_test_pred))
print("F1-macro   :", f1_score(y_fri_test, y_fri_test_pred, average="macro"))
print("F1-weighted:", f1_score(y_fri_test, y_fri_test_pred, average="weighted"))

labels_fri_test = np.unique(y_fri_test)
target_names_fri_test = le.inverse_transform(labels_fri_test)

print("\n[Phase 2] Classification report (Friday-test):")
print(classification_report(
    y_fri_test,
    y_fri_test_pred,
    labels=labels_fri_test,
    target_names=target_names_fri_test,
    zero_division=0
))



=== Phase 2: Adaptation with Mon–Thu + Friday-train ===
Epoch 1/15
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.8040 - loss: 0.6566 - val_accuracy: 0.9749 - val_loss: 0.0837
Epoch 2/15
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9586 - loss: 0.1428 - val_accuracy: 0.9759 - val_loss: 0.0787
Epoch 3/15
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9665 - loss: 0.1139 - val_accuracy: 0.9843 - val_loss: 0.0558
Epoch 4/15
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.9706 - loss: 0.0977 - val_accuracy: 0.9827 - val_loss: 0.0544
Epoch 5/15
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9734 - loss: 0.0878 - val_accuracy: 0.9868 - val_loss: 0.0587
Epoch 6/15
[1m702/702[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9744 - loss: 0.0839 - 