In [3]:
import numpy as np
import pandas as pd
import time

import csv

from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split ,cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

from sklearn.preprocessing import StandardScaler



In [4]:

# Load data
df = pd.read_csv("Network_kali_dataset.csv")

# Map class labels
df['class'] = df['class'].map({'normal': 0, 'anomaly': 1})

# Drop duplicates and nulls
df.drop_duplicates(inplace=True)
df.dropna(how='any', inplace=True)

# Convert categorical to numeric
df['Service'] = df['Service'].map({'TCP': 0, 'UDP': 1, 'ICMP': 2})
df['Protocol'] = df['Protocol'].map({'QUIC': 1, 'UDP': 3, 'ICMP': 3, 'TCP': 4, 'DNS': 5, 'TLS': 6, 'HTTP': 7, 'MDNS': 8, 'Unknown': 9})
df['Ethernet Type'] = df['Ethernet Type'].map({'IPv4': 4, 'IPv6': 6})

# Convert hex fields to int
df['Checksum'] = df['Checksum'].apply(lambda x: int(x, 16))
df['TCP Flags'] = df['TCP Flags'].apply(lambda x: int(x, 16))

# Drop unnecessary columns
df.drop(columns=["Source IP", "Destination IP"], inplace=True)

# Now that all data is numeric, we can apply feature selection
filter_cols = VarianceThreshold(threshold=0.01)
filter_cols.fit(df)

# Drop low variance columns
col_ids = np.where(filter_cols.variances_ <= 0.01)[0]
dropped_cols = [df.columns[i] for i in col_ids]
df.drop(columns=dropped_cols, axis=1, inplace=True)

# Scale the data
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

# Print first few rows (you may want to turn df_scaled back to DataFrame if needed)
print(df_scaled[:5])


[[ 1.17118621  0.95642527  0.57306029 -0.67627827  1.3805297  -1.47430514
  -0.68677896 -0.61971721 -0.18068355 -0.20615433 -0.96314062]
 [ 1.17118621  0.80952503  0.57306029 -0.67627827  1.3609935  -1.47430514
  -0.68677896  0.0766374  -0.18068355 -0.20615433 -0.96314062]
 [-0.85383519 -0.55419883  0.57306029  0.32633433 -0.41619399 -0.09147478
  -0.68677896  1.19458729 -0.18068355 -0.20615433  1.03826999]
 [-0.85383519 -0.55419883  2.16789174 -0.69893127 -1.21860754 -0.09147478
  -0.68677896 -1.19938564 -0.18068355  4.85310699  1.03826999]
 [ 1.17118621  0.95642527  0.57306029 -0.67627827  1.03775824 -1.47430514
  -0.68677896  1.28272899 -0.18068355 -0.20615433 -0.96314062]]


In [6]:
(df.head())

Unnamed: 0,Ethernet Type,Packet Length,Service,Source Port,Destination Port,Protocol,TCP Flags,Checksum,TCP Window Size,ICMP Type,class
0,6,1294,1,443,60002,1,0,22671,0,0,0
1,6,1174,1,443,59551,1,0,36552,0,0,0
2,4,60,1,20050,18524,3,0,58837,0,0,1
3,4,60,2,0,0,3,0,11116,0,8,1
4,6,1294,1,443,52089,1,0,60594,0,0,0


In [7]:
# Splitting the model
y = df[['class']]
X = df.drop(columns=['class',], axis=1)

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
print("Input Training Set Shape:", X_train.shape)
print("Input Testing Set Shape:", X_test.shape)
print("Output Training Set Shape:", y_train.shape)
print("Output Testing Set Shape:", y_test.shape)

Input Training Set Shape: (16438, 10)
Input Testing Set Shape: (8097, 10)
Output Training Set Shape: (16438, 1)
Output Testing Set Shape: (8097, 1)


## Decision Tree

In [10]:
dec_classifier = DecisionTreeClassifier(ccp_alpha=0.01, random_state=42) # ccp 0.01 for prunning and and less overfitting
start_time = time.time()
dec_classifier.fit(X_train, y_train.values.ravel()) ## values = coverting dataframe values into numpy array ; ravel = it converts the numpy array to 1D array
end_time = time.time()
print("Total training time: ", end_time - start_time)


start_time = time.time()
y_test_pred = dec_classifier.predict(X_train)
y_test_pred2 = dec_classifier.predict(X_test)
end_time = time.time()

print("Total testing time: ", end_time - start_time)

Total training time:  0.03443455696105957
Total testing time:  0.0009987354278564453


In [11]:

print("Training accuracy is:", str(round(100 * dec_classifier.score(X_train, y_train), 2)) + str("%"))
print("Testing accuracy is:", str(round(100 * dec_classifier.score(X_test, y_test), 2)) + str("%"))
dec_tree_train_accuracy = 100 * dec_classifier.score(X_train, y_train)
dec_tree_test_accuracy = 100 * dec_classifier.score(X_test, y_test)

print("1 = Anomaly\n0 = Normal")
print(y_test['class'].value_counts())

# Compute confusion matrix
cm = confusion_matrix(y_test, y_test_pred2)
print(cm)
print("Decision Tree Prediction::::")
print(f"Correct Guess :: {cm[0][0] + cm[1][1]}\nWrong Guess :: {cm[0][1] + cm [1][0]}\nTotal Guess :: {cm[0][0] + cm[1][1] + cm[0][1] + cm [1][0]}\n\n  ")


# Accuracy
accuracy2 = (accuracy_score(y_test, y_test_pred2)) 
print(f"Accuracy : {accuracy2* 100:.2f} %")


# Generate classification report
dec_report = classification_report(y_test, y_test_pred2, target_names=['Normal', 'Anomaly'])
print("Decision tree Classification Report:")
print(dec_report)

from sklearn.model_selection import cross_val_score

scores = cross_val_score(dec_classifier, X_train, y_train, cv=5)
print(f"Cross-Validation Accuracy: {scores.mean():.4f} ± {scores.std():.4f}")


Training accuracy is: 99.22%
Testing accuracy is: 99.21%
1 = Anomaly
0 = Normal
class
0    4162
1    3935
Name: count, dtype: int64
[[4141   21]
 [  43 3892]]
Decision Tree Prediction::::
Correct Guess :: 8033
Wrong Guess :: 64
Total Guess :: 8097

  
Accuracy : 99.21 %
Decision tree Classification Report:
              precision    recall  f1-score   support

      Normal       0.99      0.99      0.99      4162
     Anomaly       0.99      0.99      0.99      3935

    accuracy                           0.99      8097
   macro avg       0.99      0.99      0.99      8097
weighted avg       0.99      0.99      0.99      8097

Cross-Validation Accuracy: 0.9922 ± 0.0009


## Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Initialize the Random Forest classifier
rf_classifier = RandomForestClassifier(
    n_estimators=100,         # Number of trees
    max_depth=None,           # You can set this to control overfitting
    random_state=42,
    n_jobs=-1,                # Use all CPU cores
    class_weight='balanced'  # Handles imbalanced data
)

# Train the model
start_time = time.time()
rf_classifier.fit(X_train, y_train.values.ravel())
end_time = time.time()
print("Random Forest - Total training time: ", end_time - start_time)

# Predict on both training and testing sets
start_time = time.time()
y_train_pred = rf_classifier.predict(X_train)
y_test_pred = rf_classifier.predict(X_test)
end_time = time.time()
print("Random Forest - Total testing time: ", end_time - start_time)

# Accuracy
train_acc = rf_classifier.score(X_train, y_train)
test_acc = rf_classifier.score(X_test, y_test)

print(f"Training Accuracy: {train_acc * 100:.2f}%")
print(f"Testing Accuracy: {test_acc * 100:.2f}%")

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
print("\n1 = Anomaly\n0 = Normal")
print(y_test['class'].value_counts())
print("\nConfusion Matrix:")
print(cm)
print(f"Correct Guess :: {cm[0][0] + cm[1][1]}")
print(f"Wrong Guess   :: {cm[0][1] + cm[1][0]}")
print(f"Total Guess   :: {cm.sum()}\n")

# Accuracy Score
accuracy_rf = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy_rf * 100:.2f}%")

# Classification Report
rf_report = classification_report(y_test, y_test_pred, target_names=['Normal', 'Anomaly'])
print("\nRandom Forest Classification Report:")
print(rf_report)

# Cross-Validation
rf_cv_scores = cross_val_score(rf_classifier, X_train, y_train.values.ravel(), cv=5)
print(f"Cross-Validation Accuracy: {rf_cv_scores.mean():.4f} ± {rf_cv_scores.std():.4f}")


Random Forest - Total training time:  0.305225133895874
Random Forest - Total testing time:  0.08390140533447266
Training Accuracy: 99.98%
Testing Accuracy: 99.80%

1 = Anomaly
0 = Normal
class
0    4162
1    3935
Name: count, dtype: int64

Confusion Matrix:
[[4158    4]
 [  12 3923]]
Correct Guess :: 8081
Wrong Guess   :: 16
Total Guess   :: 8097

Accuracy: 99.80%

Random Forest Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00      4162
     Anomaly       1.00      1.00      1.00      3935

    accuracy                           1.00      8097
   macro avg       1.00      1.00      1.00      8097
weighted avg       1.00      1.00      1.00      8097

Cross-Validation Accuracy: 0.9971 ± 0.0003


## K-NN

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize the KNN classifier
knn_classifier = KNeighborsClassifier(
    n_neighbors=5,      # You can tune this (odd numbers preferred)
    weights='uniform',  # 'distance' can also be used
    n_jobs=-1           # Use all CPU cores for parallel processing
)

# Train the model (KNN "trains" by storing the data)
start_time = time.time()
knn_classifier.fit(X_train, y_train.values.ravel())
end_time = time.time()
print("KNN - Total training time: ", end_time - start_time)

# Predict on both training and testing sets
start_time = time.time()
y_train_pred = knn_classifier.predict(X_train)
y_test_pred = knn_classifier.predict(X_test)
end_time = time.time()
print("KNN - Total testing time: ", end_time - start_time)

# Accuracy
train_acc = knn_classifier.score(X_train, y_train)
test_acc = knn_classifier.score(X_test, y_test)

print(f"Training Accuracy: {train_acc * 100:.2f}%")
print(f"Testing Accuracy: {test_acc * 100:.2f}%")

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
print("\n1 = Anomaly\n0 = Normal")
print(y_test['class'].value_counts())
print("\nConfusion Matrix:")
print(cm)
print(f"Correct Guess :: {cm[0][0] + cm[1][1]}")
print(f"Wrong Guess   :: {cm[0][1] + cm[1][0]}")
print(f"Total Guess   :: {cm.sum()}\n")

# Accuracy Score
accuracy_knn = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy_knn * 100:.2f}%")

# Classification Report
knn_report = classification_report(y_test, y_test_pred, target_names=['Normal', 'Anomaly'])
print("\nKNN Classification Report:")
print(knn_report)

# Cross-Validation
knn_cv_scores = cross_val_score(knn_classifier, X_train, y_train.values.ravel(), cv=5, n_jobs=-1)
print(f"Cross-Validation Accuracy: {knn_cv_scores.mean():.4f} ± {knn_cv_scores.std():.4f}")


KNN - Total training time:  0.1448221206665039
KNN - Total testing time:  1.299407720565796
Training Accuracy: 99.53%
Testing Accuracy: 99.32%

1 = Anomaly
0 = Normal
class
0    4162
1    3935
Name: count, dtype: int64

Confusion Matrix:
[[4134   28]
 [  27 3908]]
Correct Guess :: 8042
Wrong Guess   :: 55
Total Guess   :: 8097

Accuracy: 99.32%

KNN Classification Report:
              precision    recall  f1-score   support

      Normal       0.99      0.99      0.99      4162
     Anomaly       0.99      0.99      0.99      3935

    accuracy                           0.99      8097
   macro avg       0.99      0.99      0.99      8097
weighted avg       0.99      0.99      0.99      8097

Cross-Validation Accuracy: 0.9934 ± 0.0012


## XgBoost

In [16]:
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import time

# Initialize XGBoost Classifier
xgb_classifier = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    use_label_encoder=False,  # Prevents warning in newer versions
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1
)

# Train the model
start_time = time.time()
xgb_classifier.fit(X_train, y_train.values.ravel())
end_time = time.time()
print("XGBoost - Total training time: ", end_time - start_time)

# Predict on training and testing data
start_time = time.time()
y_train_pred = xgb_classifier.predict(X_train)
y_test_pred = xgb_classifier.predict(X_test)
end_time = time.time()
print("XGBoost - Total testing time: ", end_time - start_time)

# Accuracy
train_acc = xgb_classifier.score(X_train, y_train)
test_acc = xgb_classifier.score(X_test, y_test)

print(f"Training Accuracy: {train_acc * 100:.2f}%")
print(f"Testing Accuracy: {test_acc * 100:.2f}%")

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
print("\n1 = Anomaly\n0 = Normal")
print(y_test['class'].value_counts())
print("\nConfusion Matrix:")
print(cm)
print(f"Correct Guess :: {cm[0][0] + cm[1][1]}")
print(f"Wrong Guess   :: {cm[0][1] + cm[1][0]}")
print(f"Total Guess   :: {cm.sum()}\n")

# Accuracy Score
accuracy_xgb = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy_xgb * 100:.2f}%")

# Classification Report
xgb_report = classification_report(y_test, y_test_pred, target_names=['Normal', 'Anomaly'])
print("\nXGBoost Classification Report:")
print(xgb_report)

# Cross-Validation
xgb_cv_scores = cross_val_score(xgb_classifier, X_train, y_train.values.ravel(), cv=5, n_jobs=-1)
print(f"Cross-Validation Accuracy: {xgb_cv_scores.mean():.4f} ± {xgb_cv_scores.std():.4f}")


Parameters: { "use_label_encoder" } are not used.



XGBoost - Total training time:  0.13578391075134277
XGBoost - Total testing time:  0.011801004409790039
Training Accuracy: 99.94%
Testing Accuracy: 99.72%

1 = Anomaly
0 = Normal
class
0    4162
1    3935
Name: count, dtype: int64

Confusion Matrix:
[[4156    6]
 [  17 3918]]
Correct Guess :: 8074
Wrong Guess   :: 23
Total Guess   :: 8097

Accuracy: 99.72%

XGBoost Classification Report:
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00      4162
     Anomaly       1.00      1.00      1.00      3935

    accuracy                           1.00      8097
   macro avg       1.00      1.00      1.00      8097
weighted avg       1.00      1.00      1.00      8097

Cross-Validation Accuracy: 0.9974 ± 0.0006


## Naïve Bayes 

In [18]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.model_selection import cross_val_score
import time

# Initialize the Gaussian Naive Bayes classifier
nb_classifier = GaussianNB()

# Train the model
start_time = time.time()
nb_classifier.fit(X_train, y_train.values.ravel())
end_time = time.time()
print("Naive Bayes - Total training time: ", end_time - start_time)

# Predict on training and testing data
start_time = time.time()
y_train_pred = nb_classifier.predict(X_train)
y_test_pred = nb_classifier.predict(X_test)
end_time = time.time()
print("Naive Bayes - Total testing time: ", end_time - start_time)

# Accuracy
train_acc = nb_classifier.score(X_train, y_train)
test_acc = nb_classifier.score(X_test, y_test)

print(f"Training Accuracy: {train_acc * 100:.2f}%")
print(f"Testing Accuracy: {test_acc * 100:.2f}%")

# Confusion Matrix
cm = confusion_matrix(y_test, y_test_pred)
print("\n1 = Anomaly\n0 = Normal")
print(y_test['class'].value_counts())
print("\nConfusion Matrix:")
print(cm)
print(f"Correct Guess :: {cm[0][0] + cm[1][1]}")
print(f"Wrong Guess   :: {cm[0][1] + cm[1][0]}")
print(f"Total Guess   :: {cm.sum()}\n")

# Accuracy Score
accuracy_nb = accuracy_score(y_test, y_test_pred)
print(f"Accuracy: {accuracy_nb * 100:.2f}%")

# Classification Report
nb_report = classification_report(y_test, y_test_pred, target_names=['Normal', 'Anomaly'])
print("\nNaive Bayes Classification Report:")
print(nb_report)

# Cross-Validation
nb_cv_scores = cross_val_score(nb_classifier, X_train, y_train.values.ravel(), cv=5)
print(f"Cross-Validation Accuracy: {nb_cv_scores.mean():.4f} ± {nb_cv_scores.std():.4f}")


Naive Bayes - Total training time:  0.009032011032104492
Naive Bayes - Total testing time:  0.00749659538269043
Training Accuracy: 95.80%
Testing Accuracy: 95.65%

1 = Anomaly
0 = Normal
class
0    4162
1    3935
Name: count, dtype: int64

Confusion Matrix:
[[3842  320]
 [  32 3903]]
Correct Guess :: 7745
Wrong Guess   :: 352
Total Guess   :: 8097

Accuracy: 95.65%

Naive Bayes Classification Report:
              precision    recall  f1-score   support

      Normal       0.99      0.92      0.96      4162
     Anomaly       0.92      0.99      0.96      3935

    accuracy                           0.96      8097
   macro avg       0.96      0.96      0.96      8097
weighted avg       0.96      0.96      0.96      8097

Cross-Validation Accuracy: 0.9423 ± 0.0291
