# Import libraries and load dataset

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical


base_path = "/content/drive/MyDrive/CIC-IDS/CIC-IDS-2017/Dataset/CIC-IDS-2017/CSVs/MachineLearningCSV/MachineLearningCVE"

csv_files = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Tuesday-WorkingHours.pcap_ISCX.csv",
    "Wednesday-workingHours.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
    "Friday-WorkingHours-Morning.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
]

dfs = []
for f in csv_files:
    df_part = pd.read_csv(os.path.join(base_path, f))
    df_part["SourceFile"] = f
    dfs.append(df_part)

df_all = pd.concat(dfs, ignore_index=True)
df_all.columns = df_all.columns.str.strip()

print("Full dataset shape:", df_all.shape)

Full dataset shape: (2830743, 80)


# Drop columns

In [11]:
# drop id columns
drop_cols = [
    'Timestamp', 'Flow ID', 'Src IP', 'Dst IP',
    'Src Port', 'Dst Port', 'Protocol'
]
for col in drop_cols:
    if col in df_all.columns:
        df_all.drop(columns=[col], inplace=True)

df_all = df_all.dropna(subset=["Label"])

# select only numeric features
num_cols = df_all.select_dtypes(include=[np.number]).columns
X_num = df_all[num_cols].copy()  # copy to avoid SettingWithCopy issues

X_num.replace([np.inf, -np.inf], np.nan, inplace=True)

X_num = X_num.fillna(X_num.median(numeric_only=True))

# convert to float32 to save memory
X_num = X_num.astype("float32")

y_all = df_all["Label"]
src = df_all["SourceFile"]

print("Numeric shape:", X_num.shape)


Numeric shape: (2830743, 78)


In [12]:
mask_mon_thu = src.str.contains("Monday|Tuesday|Wednesday|Thursday", case=False, regex=True)
mask_fri     = src.str.contains("Friday", case=False, regex=True)

# Encode labels

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# encode labels once
le = LabelEncoder()
y_all_enc = le.fit_transform(y_all)
num_classes = len(le.classes_)
print("Classes:", num_classes)

# indices
mon_thu_idx = np.where(mask_mon_thu)[0]
fri_idx     = np.where(mask_fri)[0]

# stratified sampling for Mon–Thu
mon_thu_idx_sub, _ = train_test_split(
    mon_thu_idx,
    train_size=300_000,
    stratify=y_all_enc[mon_thu_idx],
    random_state=42
)

# stratified sampling for Friday
fri_idx_sub, _ = train_test_split(
    fri_idx,
    train_size=100_000,
    stratify=y_all_enc[fri_idx],
    random_state=42
)

print("Mon–Thu subset size:", len(mon_thu_idx_sub))
print("Friday subset size:", len(fri_idx_sub))

Classes: 15
Mon–Thu subset size: 300000
Friday subset size: 100000


In [14]:
X_mon_thu = X_num.iloc[mon_thu_idx_sub].to_numpy(dtype="float32")
y_mon_thu = y_all_enc[mon_thu_idx_sub]

X_fri_all = X_num.iloc[fri_idx_sub].to_numpy(dtype="float32")
y_fri_all = y_all_enc[fri_idx_sub]

print("Mon–Thu:", X_mon_thu.shape, " Friday:", X_fri_all.shape)

Mon–Thu: (300000, 78)  Friday: (100000, 78)


define phase 1
train on Monday–Thursday, test on all Friday

In [5]:
# # Masks based on SourceFile
# mask_mon_thu = df_all["SourceFile"].str.contains("Monday|Tuesday|Wednesday|Thursday", case=False, regex=True)
# mask_fri     = df_all["SourceFile"].str.contains("Friday", case=False, regex=True)

# X_mon_thu = X_all[mask_mon_thu].to_numpy()
# y_mon_thu = y_all_enc[mask_mon_thu]

# X_fri_all = X_all[mask_fri].to_numpy()
# y_fri_all = y_all_enc[mask_fri]

# print("Mon–Thu shape:", X_mon_thu.shape, " Friday shape:", X_fri_all.shape)

Mon–Thu shape: (2127498, 78)  Friday shape: (703245, 78)


define phase 2

In [15]:
X_fri_train, X_fri_test, y_fri_train, y_fri_test = train_test_split(
    X_fri_all,
    y_fri_all,
    test_size=0.3,
    random_state=42,
    stratify=y_fri_all
)

print("Friday train:", X_fri_train.shape, " Friday test:", X_fri_test.shape)

Friday train: (70000, 78)  Friday test: (30000, 78)


# Scaling

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_mon_thu)

X_mon_thu_scaled   = scaler.transform(X_mon_thu)
X_fri_all_scaled   = scaler.transform(X_fri_all)
X_fri_train_scaled = scaler.transform(X_fri_train)
X_fri_test_scaled  = scaler.transform(X_fri_test)

# Defining BPNN

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras import Input
from tensorflow.keras.utils import to_categorical

def build_bpnn(input_dim, num_classes):
    model = Sequential()
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

input_dim = X_mon_thu_scaled.shape[1]
print("Input dim:", input_dim)

Input dim: 78


# Phase 1

In [19]:
from sklearn.metrics import classification_report, accuracy_score, f1_score

print("\n=== Phase 1: Train on Mon–Thu, test on Friday (no adaptation) ===")

y_mon_thu_cat = to_categorical(y_mon_thu, num_classes=num_classes)

model_p1 = build_bpnn(input_dim, num_classes)

history_p1 = model_p1.fit(
    X_mon_thu_scaled,
    y_mon_thu_cat,
    epochs=10,
    batch_size=1024,
    validation_split=0.2,
    verbose=1
)

# evaluate on Friday
y_fri_pred_proba_p1 = model_p1.predict(X_fri_all_scaled, verbose=0)
y_fri_pred_p1 = np.argmax(y_fri_pred_proba_p1, axis=1)

print("\n[Phase 1] Friday-only evaluation:")
print("Accuracy:", accuracy_score(y_fri_all, y_fri_pred_p1))
print("F1-macro   :", f1_score(y_fri_all, y_fri_pred_p1, average="macro"))
print("F1-weighted:", f1_score(y_fri_all, y_fri_pred_p1, average="weighted"))

labels_fri = np.unique(y_fri_all)
target_names_fri = le.inverse_transform(labels_fri)
print("Classes present in Friday subset:")
for i, name in zip(labels_fri, target_names_fri):
    print(i, "→", name)

print("\n[Phase 1] Classification report (Friday):")
print(classification_report(
    y_fri_all,
    y_fri_pred_p1,
    labels=labels_fri,
    target_names=target_names_fri,
    zero_division=0
))


=== Phase 1: Train on Mon–Thu, test on Friday (no adaptation) ===
Epoch 1/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - accuracy: 0.8233 - loss: 0.8109 - val_accuracy: 0.9721 - val_loss: 0.0998
Epoch 2/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9653 - loss: 0.1180 - val_accuracy: 0.9749 - val_loss: 0.0714
Epoch 3/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9702 - loss: 0.0896 - val_accuracy: 0.9765 - val_loss: 0.0584
Epoch 4/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9727 - loss: 0.0742 - val_accuracy: 0.9789 - val_loss: 0.0503
Epoch 5/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9753 - loss: 0.0636 - val_accuracy: 0.9804 - val_loss: 0.0444
Epoch 6/10
[1m235/235[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9770 - loss: 0.0564 - 

# Phase 2

In [20]:
print("\n=== Phase 2: Adaptation – retrain with Mon–Thu + part of Friday ===")

# combine Mon–Thu + Friday-train for adaptation
X_phase2_train = np.vstack([X_mon_thu_scaled, X_fri_train_scaled])
y_phase2_train = np.concatenate([y_mon_thu, y_fri_train])
y_phase2_cat   = to_categorical(y_phase2_train, num_classes=num_classes)

model_p2 = build_bpnn(input_dim, num_classes)

history_p2 = model_p2.fit(
    X_phase2_train,
    y_phase2_cat,
    epochs=10,
    batch_size=1024,
    validation_split=0.2,
    verbose=1
)

# evaluate on held-out Friday test subset
y_fri_pred_proba_p2 = model_p2.predict(X_fri_test_scaled, verbose=0)
y_fri_pred_p2 = np.argmax(y_fri_pred_proba_p2, axis=1)

print("\n[Phase 2] Friday-test evaluation (after adaptation):")
print("Accuracy:", accuracy_score(y_fri_test, y_fri_pred_p2))
print("F1-macro   :", f1_score(y_fri_test, y_fri_pred_p2, average="macro"))
print("F1-weighted:", f1_score(y_fri_test, y_fri_pred_p2, average="weighted"))

labels_fri_test = np.unique(y_fri_test)
target_names_fri_test = le.inverse_transform(labels_fri_test)

print("\n[Phase 2] Classification report (Friday test):")
print(classification_report(
    y_fri_test,
    y_fri_pred_p2,
    labels=labels_fri_test,
    target_names=target_names_fri_test,
    zero_division=0
))


=== Phase 2: Adaptation – retrain with Mon–Thu + part of Friday ===
Epoch 1/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.8546 - loss: 0.7195 - val_accuracy: 0.6069 - val_loss: 4.7842
Epoch 2/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9671 - loss: 0.1084 - val_accuracy: 0.6066 - val_loss: 5.6947
Epoch 3/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9716 - loss: 0.0833 - val_accuracy: 0.6093 - val_loss: 6.6891
Epoch 4/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9743 - loss: 0.0674 - val_accuracy: 0.6097 - val_loss: 7.1686
Epoch 5/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9761 - loss: 0.0586 - val_accuracy: 0.6084 - val_loss: 7.6294
Epoch 6/10
[1m290/290[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9788 - loss: 0.0515 