In [2]:
import pandas as pd

# Load the dataset to inspect its structure
file_path = 'ISCX_5class_each_normalized_cuttedfloefeature.csv'
data = pd.read_csv(file_path)

# Display the first few rows and the summary info to understand the structure
data.head()


Unnamed: 0,label,Protocol,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,Fwd Pkt Len Mean,Fwd Pkt Len Std,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,0,0.352941,0.000694,0.002068,0.000184,5.1e-05,0.02858,0.05391,0.050995,0.0,...,0.002086,0,0.000489,8e-06,0.000492,0.000487,0.444513,3.5e-05,0.444527,0.444499
1,0,0.352941,0.000694,0.001929,0.000237,6.6e-05,0.036807,0.069429,0.065675,0.0,...,0.002086,0,0.000503,8e-06,0.000506,0.0005,0.465009,5e-06,0.465011,0.465007
2,0,0.352941,0.000694,0.002151,0.000326,9.1e-05,0.050685,0.095607,0.090437,0.0,...,0.002086,0,0.000534,7e-06,0.000536,0.000531,0.432067,0.166036,0.499569,0.364566
3,0,0.352941,0.022567,0.039318,0.039066,0.015104,0.689494,0.0,0.261592,0.462599,...,0.048838,0,0.011745,0.036071,0.043437,0.001408,0.15647,0.174647,0.303488,0.059005
4,0,0.352941,0.004185,0.011055,0.001784,0.002152,0.189117,0.0,0.08209,0.116242,...,0.007519,0,0.015958,0.028481,0.036435,0.00561,0.162346,0.185729,0.324083,0.068399


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 67 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   label              100000 non-null  int64  
 1   Protocol           100000 non-null  float64
 2   Tot Fwd Pkts       100000 non-null  float64
 3   Tot Bwd Pkts       100000 non-null  float64
 4   TotLen Fwd Pkts    100000 non-null  float64
 5   TotLen Bwd Pkts    100000 non-null  float64
 6   Fwd Pkt Len Max    100000 non-null  float64
 7   Fwd Pkt Len Min    100000 non-null  float64
 8   Fwd Pkt Len Mean   100000 non-null  float64
 9   Fwd Pkt Len Std    100000 non-null  float64
 10  Bwd Pkt Len Max    100000 non-null  float64
 11  Bwd Pkt Len Min    100000 non-null  float64
 12  Bwd Pkt Len Mean   100000 non-null  float64
 13  Bwd Pkt Len Std    100000 non-null  float64
 14  Fwd IAT Tot        100000 non-null  float64
 15  Fwd IAT Mean       100000 non-null  float64
 16  Fwd

In [13]:
unique_labels = data['label'].unique()

print("\nUnique labels in the dataset:")
print(unique_labels)


Unique labels in the dataset:
[0 1 2 3 4]


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the features and target
X = data.drop(columns=['label'])
y = data['label']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Check the shape of the training and testing sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((80000, 66), (20000, 66), (80000,), (20000,))

In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train the XGBoost model
xgb_model = XGBClassifier(objective='multi:softmax', eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
xgb_predictions = xgb_model.predict(X_test)

# Evaluate the model
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
xgb_report = classification_report(y_test, xgb_predictions)

print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost Classification Report:\n", xgb_report)


XGBoost Accuracy: 0.99995
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4000
           1       1.00      1.00      1.00      4000
           2       1.00      1.00      1.00      4000
           3       1.00      1.00      1.00      4000
           4       1.00      1.00      1.00      4000

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [16]:
from sklearn.model_selection import cross_val_score

# Perform 5-fold cross-validation
scores = cross_val_score(xgb_model, X, y, cv=5)

# Average accuracy across the folds
print(f"Cross-Validated Accuracy: {scores.mean():.5f}")

Cross-Validated Accuracy: 0.99240


In [18]:
from sklearn.metrics import confusion_matrix
y_pred = xgb_model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Confusion Matrix:
[[4000    0    0    0    0]
 [   0 4000    0    0    0]
 [   0    0 4000    0    0]
 [   0    0    0 4000    0]
 [   0    1    0    0 3999]]


In [6]:
from sklearn.svm import SVC

# Train the SVM model
svm_model = SVC(kernel='rbf', random_state=42)
svm_model.fit(X_train, y_train)

# Make predictions
svm_predictions = svm_model.predict(X_test)

# Evaluate the model
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_report = classification_report(y_test, svm_predictions)

print("SVM Accuracy:", svm_accuracy)
print("SVM Classification Report:\n", svm_report)


SVM Accuracy: 0.9837
SVM Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98      4000
           1       0.99      0.99      0.99      4000
           2       0.99      0.98      0.98      4000
           3       0.96      0.99      0.98      4000
           4       0.99      0.99      0.99      4000

    accuracy                           0.98     20000
   macro avg       0.98      0.98      0.98     20000
weighted avg       0.98      0.98      0.98     20000



In [7]:
from sklearn.neighbors import KNeighborsClassifier

# Train the KNN model
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Make predictions
knn_predictions = knn_model.predict(X_test)

# Evaluate the model
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_report = classification_report(y_test, knn_predictions)

print("KNN Accuracy:", knn_accuracy)
print("KNN Classification Report:\n", knn_report)


KNN Accuracy: 0.9888
KNN Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      4000
           1       0.99      0.99      0.99      4000
           2       0.99      0.98      0.99      4000
           3       0.98      0.99      0.98      4000
           4       0.99      0.99      0.99      4000

    accuracy                           0.99     20000
   macro avg       0.99      0.99      0.99     20000
weighted avg       0.99      0.99      0.99     20000



In [8]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_report = classification_report(y_test, rf_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_report)


Random Forest Accuracy: 0.99985
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4000
           1       1.00      1.00      1.00      4000
           2       1.00      1.00      1.00      4000
           3       1.00      1.00      1.00      4000
           4       1.00      1.00      1.00      4000

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [9]:
from sklearn.ensemble import RandomForestClassifier

# Train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_report = classification_report(y_test, rf_predictions)

print("Random Forest Accuracy:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_report)


Random Forest Accuracy: 0.99985
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4000
           1       1.00      1.00      1.00      4000
           2       1.00      1.00      1.00      4000
           3       1.00      1.00      1.00      4000
           4       1.00      1.00      1.00      4000

    accuracy                           1.00     20000
   macro avg       1.00      1.00      1.00     20000
weighted avg       1.00      1.00      1.00     20000



In [10]:
import numpy as np
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten, Dropout
from tensorflow.keras.utils import to_categorical

# Reshape input for CNN (CNNs expect 3D input)
X_train_cnn = np.expand_dims(X_train, axis=2)
X_test_cnn = np.expand_dims(X_test, axis=2)

# One-hot encode the labels
y_train_cnn = to_categorical(y_train)
y_test_cnn = to_categorical(y_test)

# Build the CNN model
cnn_model = Sequential([
    Conv1D(32, kernel_size=3, activation='relu', input_shape=(66, 1)),
    Dropout(0.5),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(5, activation='softmax')
])

# Compile the model
cnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the CNN model
cnn_model.fit(X_train_cnn, y_train_cnn, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test_cnn)
print("CNN Accuracy:", cnn_accuracy)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.8918 - loss: 0.3735 - val_accuracy: 0.9753 - val_loss: 0.0796
Epoch 2/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9709 - loss: 0.0893 - val_accuracy: 0.9820 - val_loss: 0.0601
Epoch 3/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9770 - loss: 0.0693 - val_accuracy: 0.9827 - val_loss: 0.0579
Epoch 4/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9802 - loss: 0.0620 - val_accuracy: 0.9857 - val_loss: 0.0450
Epoch 5/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9812 - loss: 0.0574 - val_accuracy: 0.9856 - val_loss: 0.0445
Epoch 6/10
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9831 - loss: 0.0521 - val_accuracy: 0.9869 - val_loss: 0.0407
Epoch 7/10
[1m2000/2000[0

In [11]:
# Due to complexity, a simplified GAN code is provided
# You can experiment with tuning it later for better performance

import tensorflow as tf
from tensorflow.keras import layers

# Generator model
def build_generator():
    model = Sequential([
        layers.Dense(128, activation='relu', input_dim=66),
        layers.Dense(66, activation='tanh')
    ])
    return model

# Discriminator model
def build_discriminator():
    model = Sequential([
        layers.Dense(128, activation='relu', input_dim=66),
        layers.Dense(1, activation='sigmoid')
    ])
    return model

generator = build_generator()
discriminator = build_discriminator()

# Compile the discriminator
discriminator.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Build the GAN
discriminator.trainable = False
gan_input = layers.Input(shape=(66,))
generated_data = generator(gan_input)
gan_output = discriminator(generated_data)
gan = tf.keras.Model(gan_input, gan_output)

# Compile the GAN
gan.compile(optimizer='adam', loss='binary_crossentropy')

# Train GAN (simplified training loop)
def train_gan(epochs=1000, batch_size=32):
    for epoch in range(epochs):
        # Random noise for generator
        noise = np.random.normal(0, 1, (batch_size, 66))

        # Generate fake samples
        generated_samples = generator.predict(noise)

        # Get a random set of real samples
        idx = np.random.randint(0, X_train.shape[0], batch_size)
        real_samples = X_train[idx]

        # Train the discriminator
        d_loss_real = discriminator.train_on_batch(real_samples, np.ones((batch_size, 1)))
        d_loss_fake = discriminator.train_on_batch(generated_samples, np.zeros((batch_size, 1)))

        # Train the generator
        g_loss = gan.train_on_batch(noise, np.ones((batch_size, 1)))

        if epoch % 100 == 0:
            print(f"Epoch {epoch}: D Loss Real {d_loss_real}, D Loss Fake {d_loss_fake}, G Loss {g_loss}")

train_gan()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 539ms/step




Epoch 0: D Loss Real [array(0.43659908, dtype=float32), array(1., dtype=float32)], D Loss Fake [array(0.79065514, dtype=float32), array(0.5, dtype=float32)], G Loss [array(0.79065514, dtype=float32), array(0.79065514, dtype=float32), array(0.5, dtype=float32)]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 84ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 86ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31

In [12]:
from tensorflow.keras import layers, models

# Build the Autoencoder
autoencoder = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(66,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(66, activation='sigmoid')
])

# Compile the autoencoder
autoencoder.compile(optimizer='adam', loss='mse')

# Train the autoencoder
autoencoder.fit(X_train, X_train, epochs=20, batch_size=32, validation_split=0.2)

# Evaluate the autoencoder (anomaly detection)
reconstructed = autoencoder.predict(X_test)
reconstruction_loss = np.mean(np.abs(X_test - reconstructed), axis=1)
print("Autoencoder Reconstruction Loss:", reconstruction_loss)


Epoch 1/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.6255 - val_loss: 0.5432
Epoch 2/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.5681 - val_loss: 0.5417
Epoch 3/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - loss: 0.5612 - val_loss: 0.5404
Epoch 4/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.5611 - val_loss: 0.5400
Epoch 5/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - loss: 0.5775 - val_loss: 0.5397
Epoch 6/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.5584 - val_loss: 0.5395
Epoch 7/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.5662 - val_loss: 0.5397
Epoch 8/20
[1m2000/2000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - loss: 0.5381 - val_loss: 0.5394
Epoch 9/20
[1m2000/2000