In [36]:
import numpy as np
import pandas as pd
from glob import glob
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [37]:
files = glob(r"P:\SSM project\Datasets\System_intrusion\*.csv")
df_list = []

for file in files:
    print(f"Reading {file}")
    df = pd.read_csv(file, low_memory=False)

    if df.columns.equals(df.iloc[0]):  
        df = df[1:]  

    df_list.append(df)

data = pd.concat(df_list, ignore_index=True)


Reading P:\SSM project\Datasets\System_intrusion\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv
Reading P:\SSM project\Datasets\System_intrusion\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv
Reading P:\SSM project\Datasets\System_intrusion\Friday-WorkingHours-Morning.pcap_ISCX.csv
Reading P:\SSM project\Datasets\System_intrusion\Monday-WorkingHours.pcap_ISCX.csv
Reading P:\SSM project\Datasets\System_intrusion\Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv
Reading P:\SSM project\Datasets\System_intrusion\Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv
Reading P:\SSM project\Datasets\System_intrusion\Tuesday-WorkingHours.pcap_ISCX.csv
Reading P:\SSM project\Datasets\System_intrusion\Wednesday-workingHours.pcap_ISCX.csv


In [38]:
print(data.head(5))
print(data.dtypes)
print(data)


    Destination Port   Flow Duration   Total Fwd Packets  \
0              54865               3                   2   
1              55054             109                   1   
2              55055              52                   1   
3              46236              34                   1   
4              54863               3                   2   

    Total Backward Packets  Total Length of Fwd Packets  \
0                        0                           12   
1                        1                            6   
2                        1                            6   
3                        1                            6   
4                        0                           12   

    Total Length of Bwd Packets   Fwd Packet Length Max  \
0                             0                       6   
1                             6                       6   
2                             6                       6   
3                             6                 

In [39]:
data.columns = data.columns.str.strip()

benign_keep = 100000  
benign_df = data[data["Label"] == "BENIGN"]
attack_df = data[data["Label"] != "BENIGN"]

benign_downsampled = benign_df.sample(n=benign_keep, random_state=42)
balanced_data = pd.concat([benign_downsampled, attack_df], ignore_index=True)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_data["Label"].value_counts())
print(balanced_data.shape)


Label
DoS Hulk                      231073
PortScan                      158930
DDoS                          128027
BENIGN                        100000
DoS GoldenEye                  10293
FTP-Patator                     7938
SSH-Patator                     5897
DoS slowloris                   5796
DoS Slowhttptest                5499
Bot                             1966
Web Attack � Brute Force        1507
Web Attack � XSS                 652
Web Attack � Sql Injection        21
Heartbleed                        11
Name: count, dtype: int64
(657613, 79)


In [40]:
import pandas as pd

data.columns = data.columns.str.strip()

max_per_class = 10000  

balanced_list = []

for label, group in data.groupby("Label"):
    if len(group) > max_per_class:
        balanced_list.append(group.sample(n=max_per_class, random_state=42))
    else:
        balanced_list.append(group)

balanced_data = pd.concat(balanced_list, ignore_index=True)
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

print(balanced_data["Label"].value_counts())
print(balanced_data.shape)

Label
DDoS                          10000
PortScan                      10000
BENIGN                        10000
DoS GoldenEye                 10000
DoS Hulk                      10000
FTP-Patator                    7938
SSH-Patator                    5897
DoS slowloris                  5796
DoS Slowhttptest               5499
Bot                            1966
Web Attack � Brute Force       1507
Web Attack � XSS                652
Web Attack � Sql Injection       21
Heartbleed                       11
Name: count, dtype: int64
(79287, 79)


In [41]:
df.columns = df.columns.str.strip()
print(df.columns.tolist())

['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Fwd Packet Length Max', 'Fwd Packet Length Min', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Bwd Packet Length Min', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length', 'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s', 'Min Packet Length', 'Max Packet Length', 'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count', 'SYN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count', 'CWE Flag Count', 'ECE Flag Count

In [42]:
from sklearn.preprocessing import LabelEncoder
original_labels=df['Label'].copy()
label_encoder = LabelEncoder()
df['Label'] = label_encoder.fit_transform(df['Label'])

for i, class_name in enumerate(label_encoder.classes_):
    print(f"{i}->{class_name}")

0->BENIGN
1->DoS GoldenEye
2->DoS Hulk
3->DoS Slowhttptest
4->DoS slowloris
5->Heartbleed


In [43]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)

In [44]:
X = df.drop('Label', axis=1)
y = df['Label']

In [45]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)


In [47]:
pip install imblearn

Note: you may need to restart the kernel to use updated packages.




In [48]:
model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [49]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

Accuracy: 0.9997035044329703

Confusion Matrix:
 [[87928     0     7     2     0     0]
 [    1  2052     5     1     0     0]
 [    0     5 46019     0     1     0]
 [    2     0     0  1091     7     0]
 [    4     0     1     5  1149     0]
 [    0     0     0     0     0     2]]

Classification Report:
                   precision    recall  f1-score   support

          BENIGN       1.00      1.00      1.00     87937
   DoS GoldenEye       1.00      1.00      1.00      2059
        DoS Hulk       1.00      1.00      1.00     46025
DoS Slowhttptest       0.99      0.99      0.99      1100
   DoS slowloris       0.99      0.99      0.99      1159
      Heartbleed       1.00      1.00      1.00         2

        accuracy                           1.00    138282
       macro avg       1.00      1.00      1.00    138282
    weighted avg       1.00      1.00      1.00    138282



In [50]:
import joblib
joblib.dump(model, 'system_intrusion.pkl')

['system_intrusion.pkl']

In [51]:
joblib.dump(scaler, "intrusion_scaler.pkl")

['intrusion_scaler.pkl']