In [1]:
import pandas as pd
import numpy as np

# tensorflow
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, Embedding, Flatten, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('/content/drive/MyDrive/intrusion detection/Data.csv')
label = pd.read_csv('/content/drive/MyDrive/intrusion detection/Label.csv')
df['label']  = label


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


df['original_label'] = df['label']

df['binary_label'] = df['label'].apply(lambda x: 0 if x == 0 else 1)

X_binary = df.drop(['label', 'binary_label', 'original_label'], axis=1)
y_binary = df['binary_label']

X_binary_train, X_binary_test, y_binary_train, y_binary_test = train_test_split(
    X_binary, y_binary, test_size=0.2, random_state=42
)




In [18]:
X_binary.columns.tolist()

['Flow Duration',
 'Total Fwd Packet',
 'Total Bwd packets',
 'Total Length of Fwd Packet',
 'Total Length of Bwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Min',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow IAT Max',
 'Flow IAT Min',
 'Fwd IAT Total',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Max',
 'Fwd IAT Min',
 'Bwd IAT Total',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd IAT Min',
 'Fwd PSH Flags',
 'Bwd PSH Flags',
 'Fwd URG Flags',
 'Bwd URG Flags',
 'Fwd Header Length',
 'Bwd Header Length',
 'Fwd Packets/s',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'FIN Flag Count',
 'SYN Flag Count',
 'RST Flag Count',
 'PSH Flag Count',
 'ACK Flag Count',
 'URG Flag Count',
 'CWR Flag Cou

In [5]:
# Train Binary Classifier
from sklearn.ensemble import RandomForestClassifier
binary_clf = RandomForestClassifier(random_state=42)
binary_clf.fit(X_binary_train, y_binary_train)

y_binary_pred = binary_clf.predict(X_binary_test)
print("Binary Classification Report:\n", classification_report(y_binary_test, y_binary_pred))

Binary Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     71701
           1       0.93      0.99      0.96     17882

    accuracy                           0.98     89583
   macro avg       0.96      0.98      0.97     89583
weighted avg       0.98      0.98      0.98     89583



In [7]:
import joblib

joblib.dump(binary_clf, "binary_classifier_rf.pkl")

['binary_classifier_rf.pkl']

In [8]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

non_benign_data = df[(df['binary_label'] == 1) & (df['original_label'].isin([3,4, 5, 7]))]

X_multi = non_benign_data.drop(['label', 'binary_label', 'original_label'], axis=1)
y_multi = non_benign_data['original_label']

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_multi_encoded = label_encoder.fit_transform(y_multi)


X_multi_train, X_multi_test, y_multi_train, y_multi_test = train_test_split(
    X_multi, y_multi_encoded, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_multi_train_scaled = scaler.fit_transform(X_multi_train)
X_multi_test_scaled = scaler.transform(X_multi_test)

In [9]:
# Save label encoder
joblib.dump(label_encoder, "label_encoder.pkl")
print("Label encoder saved successfully!")

# Save scaler
joblib.dump(scaler, "scaler.pkl")
print("Scaler saved successfully!")

Label encoder saved successfully!
Scaler saved successfully!


In [10]:
# Light Gradient Boost classifier

import lightgbm as lgb
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight(class_weight='balanced', y=y_multi_train)

lgb_clf = lgb.LGBMClassifier(n_estimators=100, random_state=42)
lgb_clf.fit(X_multi_train_scaled, y_multi_train, sample_weight=sample_weights)

y_multi_pred = lgb_clf.predict(X_multi_test_scaled)
print("LightGBM Classification Report:\n", classification_report(y_multi_test, y_multi_pred))


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.032319 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12567
[LightGBM] [Info] Number of data points in the train set: 65412, number of used features: 65
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294




LightGBM Classification Report:
               precision    recall  f1-score   support

           0       0.36      0.57      0.44       917
           1       0.91      0.64      0.75      6171
           2       0.74      0.93      0.82      5898
           3       0.78      0.72      0.75      3368

    accuracy                           0.76     16354
   macro avg       0.70      0.71      0.69     16354
weighted avg       0.79      0.76      0.76     16354



In [11]:
import joblib

# Save the LightGBM model
joblib.dump(lgb_clf, 'lgb_classifier_model.pkl')
print("LGB Model saved to 'lgb_classifier_model.pkl'")

LGB Model saved to 'lgb_classifier_model.pkl'


In [12]:
# Random Forest
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_clf.fit(X_multi_train_scaled, y_multi_train)

y_multi_pred = rf_clf.predict(X_multi_test_scaled)
print("Random Forest Classification Report:\n", classification_report(y_multi_test, y_multi_pred))


Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.71      0.31      0.43       917
           1       0.79      0.81      0.80      6171
           2       0.76      0.87      0.81      5898
           3       0.81      0.69      0.74      3368

    accuracy                           0.78     16354
   macro avg       0.77      0.67      0.70     16354
weighted avg       0.78      0.78      0.77     16354



In [13]:
import joblib

# Save the Random Forest model
joblib.dump(rf_clf, 'random_forest_model.pkl')
print("Random Forest model saved to 'random_forest_model.pkl'")


Random Forest model saved to 'random_forest_model.pkl'


In [14]:
# neural network model
model = Sequential([
    Dense(128, input_dim=X_multi_train_scaled.shape[1], activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(4, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

history = model.fit(X_multi_train_scaled, y_multi_train, epochs=10, batch_size=32, validation_split=0.2, verbose=1)

loss, accuracy = model.evaluate(X_multi_test_scaled, y_multi_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

y_multi_pred = np.argmax(model.predict(X_multi_test_scaled), axis=1)
print("Multiclass Classification Report:\n", classification_report(y_multi_test, y_multi_pred))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.5678 - loss: 0.9763 - val_accuracy: 0.6523 - val_loss: 0.7998
Epoch 2/10
[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.6532 - loss: 0.8122 - val_accuracy: 0.6956 - val_loss: 0.7330
Epoch 3/10
[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.6956 - loss: 0.7530 - val_accuracy: 0.7236 - val_loss: 0.6942
Epoch 4/10
[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.7177 - loss: 0.7097 - val_accuracy: 0.7269 - val_loss: 0.6801
Epoch 5/10
[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - accuracy: 0.7286 - loss: 0.6843 - val_accuracy: 0.7365 - val_loss: 0.6596
Epoch 6/10
[1m1636/1636[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.7297 - loss: 0.6854 - val_accuracy: 0.7408 - val_loss: 0.6643
Epoch 7/10
[1m1

In [16]:
model.save("multiclass_nn_model.h5")
print("Model saved successfully!")




Model saved successfully!


In [17]:
non_benign_data.head()


Unnamed: 0,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label,original_label,binary_label
0,214392,9,21,388.0,24564.0,194.0,0.0,43.111111,85.545959,1460.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4,1
1,2376792,9,3,752.0,0.0,188.0,0.0,83.555556,99.0847,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,7,1
2,131350,10,3,7564.0,0.0,1460.0,0.0,756.4,690.497277,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4,1
3,164796,6,3,770.0,0.0,385.0,0.0,128.333333,198.813145,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,3,1
5,235721,7,5,360.0,600.0,180.0,0.0,51.428571,87.831007,300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,4,1


In [102]:
# Step 2: Filter Non-Benign samples
non_benign_data = df[(df['binary_label'] == 1) & (df['original_label'].isin([4, 5, 7]))]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(non_benign_data)


X_multi = non_benign_data.drop(['label', 'binary_label', 'original_label'], axis=1)  # Features
y_multi = non_benign_data['original_label']


X_multi_train, X_multi_test, y_multi_train, y_multi_test = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42
)

# Train Multiclass Classifier
multi_clf = RandomForestClassifier(random_state=42)
multi_clf.fit(X_multi_train, y_multi_train)

# Evaluate
y_multi_pred = multi_clf.predict(X_multi_test)
print("Multiclass Classification Report:\n", classification_report(y_multi_test, y_multi_pred))



# Prepare data for multiclass classification
                          # Labels

# X_multi_train, X_multi_test, y_multi_train, y_multi_test = train_test_split(
#     X_multi, y_multi, test_size=0.2, random_state=42
# )

# # Train multiclass classifier
# multi_clf = RandomForestClassifier(random_state=42)
# multi_clf.fit(X_multi_train, y_multi_train)

# # Evaluate multiclass classifier
# y_multi_pred = multi_clf.predict(X_multi_test)
# print("Multiclass Classification Report:\n", classification_report(y_multi_test, y_multi_pred))


Multiclass Classification Report:
               precision    recall  f1-score   support

           4       0.86      0.81      0.84      6242
           5       0.78      0.89      0.83      5902
           7       0.82      0.70      0.75      3316

    accuracy                           0.82     15460
   macro avg       0.82      0.80      0.81     15460
weighted avg       0.82      0.82      0.82     15460



In [112]:
non_benign_data.head()

Unnamed: 0,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label,binary_label,original_label
0,214392,9,21,388.0,24564.0,194.0,0.0,43.111111,85.545959,1460.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,4
1,2376792,9,3,752.0,0.0,188.0,0.0,83.555556,99.0847,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,1,7
2,131350,10,3,7564.0,0.0,1460.0,0.0,756.4,690.497277,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,4
5,235721,7,5,360.0,600.0,180.0,0.0,51.428571,87.831007,300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,4
7,414846,7,7,394.0,552.0,197.0,0.0,56.285714,96.126157,276.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1,4


In [114]:
X_multi = non_benign_data.drop(['label', 'binary_label', 'original_label'], axis=1)  # Features
y_multi = non_benign_data['original_label']

imputer = SimpleImputer(strategy='mean')

X_imputed = imputer.fit_transform(non_benign_data)

num_columns = non_benign_data.shape[1]
k = min(20, num_columns)

k_best = SelectKBest(score_func=f_classif, k=k)
X_new = k_best.fit_transform(X_imputed, y_multi)

  f = msb / msw
  f = msb / msw


In [115]:
selected_features_mask = k_best.get_support()
print(selected_features_mask)

[False False False False False  True False False  True  True False  True
  True False False False False False  True False False False False False
 False False False False False False False False False False False False
 False False  True  True  True  True  True  True False  True False False
 False False False  True False  True False False False False False False
 False False False False  True  True False  True False False False False
 False False False False  True False  True]


In [116]:
elected_feature_names = df.columns[selected_features_mask]


In [117]:
elected_feature_names

Index(['Fwd Packet Length Max', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow IAT Min', 'Packet Length Max',
       'Packet Length Mean', 'Packet Length Std', 'Packet Length Variance',
       'FIN Flag Count', 'SYN Flag Count', 'PSH Flag Count',
       'Average Packet Size', 'Bwd Segment Size Avg', 'FWD Init Win Bytes',
       'Bwd Init Win Bytes', 'Fwd Seg Size Min', 'label', 'original_label'],
      dtype='object')

In [72]:
df

Unnamed: 0,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Fwd Act Data Pkts,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min
0,214392,9,21,388.0,24564.0,194.0,0.0,43.111111,85.545959,1460.0,...,2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2376792,9,3,752.0,0.0,188.0,0.0,83.555556,99.084700,0.0,...,4,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,131350,10,3,7564.0,0.0,1460.0,0.0,756.400000,690.497277,0.0,...,6,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,164796,6,3,770.0,0.0,385.0,0.0,128.333333,198.813145,0.0,...,2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,163418,6,3,400.0,0.0,200.0,0.0,66.666667,103.279556,0.0,...,2,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
447910,348,1,2,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447911,1639,4,4,408.0,192.0,116.0,88.0,102.000000,16.165808,72.0,...,3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447912,312,1,2,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
447913,358,1,2,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0,32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [74]:
df.head(30)

Unnamed: 0,Flow Duration,Total Fwd Packet,Total Bwd packets,Total Length of Fwd Packet,Total Length of Bwd Packet,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,label
0,214392,9,21,388.0,24564.0,194.0,0.0,43.111111,85.545959,1460.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,2376792,9,3,752.0,0.0,188.0,0.0,83.555556,99.0847,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
2,131350,10,3,7564.0,0.0,1460.0,0.0,756.4,690.497277,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
3,164796,6,3,770.0,0.0,385.0,0.0,128.333333,198.813145,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
4,163418,6,3,400.0,0.0,200.0,0.0,66.666667,103.279556,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6
5,235721,7,5,360.0,600.0,180.0,0.0,51.428571,87.831007,300.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
6,2570808,9,3,1472.0,0.0,368.0,0.0,163.555556,193.95303,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
7,414846,7,7,394.0,552.0,197.0,0.0,56.285714,96.126157,276.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
8,184968,7,7,384.0,3576.0,192.0,0.0,54.857143,93.686407,1460.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
9,182141,7,11,432.0,8858.0,216.0,0.0,61.714286,105.397208,1460.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4


In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

df['label'] = df['label'].apply(lambda x: 0 if x == 0 else 1)

y = df['label']               # Labels
X = df.drop('label', axis=1)  # Features


In [81]:
y

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1
...,...
447910,0
447911,0
447912,0
447913,0


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9821059799292277
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.98      0.99     71701
           1       0.93      0.99      0.96     17882

    accuracy                           0.98     89583
   macro avg       0.96      0.98      0.97     89583
weighted avg       0.98      0.98      0.98     89583



In [7]:
label

Unnamed: 0,Label
0,0
1,0
2,0
3,0
4,0
...,...
115495,10
115496,10
115497,10
115498,10


In [9]:
df=df.astype(int)

In [10]:
label.isnull().sum()

Unnamed: 0,0
Label,0


In [11]:
label.unique()

AttributeError: 'DataFrame' object has no attribute 'unique'

In [11]:
df.isnull().sum()

Unnamed: 0,0
Dst Port,0
Protocol,0
Fwd Pkt Len Min,0
Fwd Pkt Len Std,0
Bwd Pkt Len Min,0
Flow Byts/s,0
Flow IAT Mean,0
Flow IAT Std,0
Flow IAT Min,0
Fwd IAT Std,0


In [15]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

In [16]:
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(df)

num_columns = df.shape[1]
k = min(20, num_columns)

k_best = SelectKBest(score_func=f_classif, k=k)
X_new = k_best.fit_transform(X_imputed, label)

  y = column_or_1d(y, warn=True)
  f = msb / msw


In [17]:
selected_features_mask = k_best.get_support()
print(selected_features_mask)

[ True False False False False  True False  True  True  True False False
 False  True  True False False False False  True False False False False
 False False False False False False False False False False False  True
  True  True  True  True  True False  True  True False False False False
 False False  True  True  True False False False False False False False
 False False False False False False False  True False False False False
 False False False False]


In [18]:
df.columns

Index(['Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Min',
       'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Flow Bytes/s',
       'Flow Packets/s', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
       'Flow IAT Min', 'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std',
       'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean',
       'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags',
       'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Count', 'RST Flag Count', 'PSH Flag C

In [19]:
elected_feature_names = df.columns[selected_features_mask]


In [20]:
elected_feature_names


Index(['Flow Duration', 'Fwd Packet Length Max', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Flow Bytes/s',
       'Flow Packets/s', 'Fwd IAT Total', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'FIN Flag Count', 'SYN Flag Count',
       'Down/Up Ratio', 'Average Packet Size', 'Fwd Segment Size Avg',
       'Fwd Seg Size Min'],
      dtype='object')

In [21]:
new_columns = ['Flow Duration', 'Fwd Packet Length Max', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max', 'Flow Bytes/s',
       'Flow Packets/s', 'Fwd IAT Total', 'Fwd Packets/s', 'Bwd Packets/s',
       'Packet Length Min', 'Packet Length Max', 'Packet Length Mean',
       'Packet Length Std', 'FIN Flag Count', 'SYN Flag Count',
       'Down/Up Ratio', 'Average Packet Size', 'Fwd Segment Size Avg',
       'Fwd Seg Size Min']

In [22]:
len(new_columns)

20

In [23]:
df_new=df[new_columns]


In [24]:
df_new['label']=label

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['label']=label


In [25]:
df_new.head()

Unnamed: 0,Flow Duration,Fwd Packet Length Max,Fwd Packet Length Mean,Fwd Packet Length Std,Bwd Packet Length Max,Flow Bytes/s,Flow Packets/s,Fwd IAT Total,Fwd Packets/s,Bwd Packets/s,...,Packet Length Max,Packet Length Mean,Packet Length Std,FIN Flag Count,SYN Flag Count,Down/Up Ratio,Average Packet Size,Fwd Segment Size Avg,Fwd Seg Size Min,label
0,214392,194.0,43.111111,85.545959,1460.0,116384.939737,139.930594,213501.0,41.979178,97.951416,...,1460.0,804.903226,702.892469,2,4,2.0,831.733333,43.111111,20,4
1,2376792,188.0,83.555556,99.0847,0.0,316.392852,5.048822,2323484.0,3.786617,1.262206,...,188.0,57.846154,90.312279,2,4,0.0,62.666667,83.555556,20,7
2,131350,1460.0,756.4,690.497277,0.0,57586.600685,98.972212,119039.0,76.13247,22.839741,...,1460.0,540.285714,675.150516,1,4,0.0,581.846154,756.4,20,4
3,164796,385.0,128.333333,198.813145,0.0,4672.443506,54.612976,111397.0,36.408651,18.204325,...,385.0,77.0,162.330253,1,4,0.0,85.555556,128.333333,20,3
4,163418,200.0,66.666667,103.279556,0.0,2447.710778,55.073493,112374.0,36.715662,18.357831,...,200.0,40.0,84.327404,1,4,0.0,44.444444,66.666667,20,6


In [26]:
X1=df_new.iloc[:,:-1].values
y1=df_new.iloc[:,-1].values

In [27]:
n_classes = 10

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.3, random_state=42)


In [29]:
ann = Sequential()
ann.add(Dense(units=20, activation='sigmoid'))
ann.add(Dense(units=20, activation='sigmoid'))
ann.add(Dense(units=n_classes, activation='softmax'))

In [30]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10)


In [31]:
from tensorflow.keras.layers import  Add

class ClassificationModel:
    def __init__(self, numerical_feature_count, num_classes):
        """
        Initializes the ClassificationModel class.

        :param numerical_feature_count: Number of numerical features.
        :param num_classes: Number of classes in the target variable.
        """
        self.numerical_feature_count = numerical_feature_count
        self.num_classes = num_classes
        self.model = self._build_model()

    def _dense_block(self, x, units, dropout_rate):
        """
        Creates a dense block with normalization, activation, and dropout.

        :param x: Input tensor.
        :param units: Number of units for the dense layer.
        :param dropout_rate: Dropout rate.
        :return: Output tensor after the dense block.
        """
        x = Dense(units, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(dropout_rate)(x)
        return x

    def _build_model(self):
        """
        Builds the TensorFlow model for classification tasks.

        :return: Compiled TensorFlow model.
        """
        numerical_input = Input(shape=(self.numerical_feature_count,), name="numerical_input")

        # Input normalization
        x = BatchNormalization()(numerical_input)

        # Add deeper dense layers with residual connections
        x = self._dense_block(x, 256, 0.3)
        shortcut = x

        x = self._dense_block(x, 256, 0.3)
        x = Add()([x, shortcut])  # Residual connection

        x = self._dense_block(x, 256, 0.3)
        shortcut = x

        x = self._dense_block(x, 256, 0.3)
        x = Add()([x, shortcut])  # Residual connection

        x = self._dense_block(x, 128, 0.3)
        shortcut = x

        x = self._dense_block(x, 128, 0.3)
        x = Add()([x, shortcut])

        x = self._dense_block(x, 64, 0.3)

        # Output layer
        output = Dense(self.num_classes, activation='softmax', name="output")(x)

        model = Model(inputs=numerical_input, outputs=output)

        # Compile the model
        model.compile(
            optimizer=Adam(learning_rate=0.001),
            loss='sparse_categorical_crossentropy',
            metrics=['accuracy']
        )

        return model

    def summary(self):
        self.model.summary()

# Define the parameters
numerical_feature_count = 20
num_classes = 10

# Instantiate and summarize the model
model = ClassificationModel(numerical_feature_count, num_classes).model
model.summary()


In [33]:
model.fit(X_train,y_train,batch_size=32,epochs=5,callbacks=[early_stopping])


Epoch 1/5
[1m9799/9799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 14ms/step - accuracy: 0.8969 - loss: 0.2860
Epoch 2/5
[1m9799/9799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 13ms/step - accuracy: 0.9004 - loss: 0.2768
Epoch 3/5
[1m9799/9799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 13ms/step - accuracy: 0.9014 - loss: 0.2735
Epoch 4/5
[1m9799/9799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 13ms/step - accuracy: 0.9024 - loss: 0.2704
Epoch 5/5
[1m9799/9799[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 13ms/step - accuracy: 0.9039 - loss: 0.2664


<keras.src.callbacks.history.History at 0x7852eac299f0>

In [32]:
model.save('intrusion_model-84.h5')




In [34]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test)



[1m4200/4200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 4ms/step


In [36]:
from sklearn.metrics import classification_report

# y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=1)

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.96    107600
           1       0.00      0.00      0.00       106
           2       0.00      0.00      0.00       150
           3       0.00      0.00      0.00      1348
           4       0.74      0.68      0.71      9342
           5       0.58      0.55      0.56      8765
           6       0.65      0.31      0.42      1381
           7       0.69      0.41      0.52      5013
           8       0.00      0.00      0.00       599
           9       0.00      0.00      0.00        71

    accuracy                           0.89    134375
   macro avg       0.36      0.29      0.32    134375
weighted avg       0.87      0.89      0.88    134375



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
