In [30]:
import pandas as pd
import numpy as np

import xgboost as xgb



In [31]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix,classification_report

from sklearn.feature_selection import SelectKBest, mutual_info_classif
import seaborn as sns
import matplotlib.pyplot as plt

In [32]:
data=pd.read_csv('../data/Data.csv')
labels=pd.read_csv('../data/Label.csv')

In [33]:
# Replace all values >= 1 with 1 (converting all attack flows to 1)
labels['Label'] = labels['Label'].apply(lambda la: 1 if la >= 1 else 0)

# Verify the changes
print(labels['Label'].value_counts())

Label
0    358332
1     89583
Name: count, dtype: int64


In [34]:
print("Data shape before processing:", data.shape)

# Check for missing values in each column
missing_values_column = data.isnull().sum()
if missing_values_column.sum() > 0:
    print("Columns with missing values:")
    print(missing_values_column[missing_values_column > 0])  # Display columns with missing values
else:
    print("No missing values found in the data.")

# Find the number of unique values in each column
unique_values = data.nunique()
print("Number of unique values in each column:")
print(unique_values)

# Drop columns with only one unique value
for col in data.columns:
    if data[col].nunique() == 1:
        data.drop(col, axis=1, inplace=True)
        print(f"Dropped column: {col}")

Data shape before processing: (447915, 76)
No missing values found in the data.
Number of unique values in each column:
Flow Duration                 174373
Total Fwd Packet                 847
Total Bwd packets                928
Total Length of Fwd Packet      7697
Total Length of Bwd Packet      9322
                               ...  
Active Min                      1778
Idle Mean                       2280
Idle Std                         611
Idle Max                        2280
Idle Min                        2280
Length: 76, dtype: int64
Dropped column: Bwd PSH Flags
Dropped column: Fwd URG Flags
Dropped column: Bwd URG Flags
Dropped column: URG Flag Count
Dropped column: CWR Flag Count
Dropped column: ECE Flag Count
Dropped column: Fwd Bytes/Bulk Avg
Dropped column: Fwd Packet/Bulk Avg
Dropped column: Fwd Bulk Rate Avg


In [35]:
X_train, X_test, y_train, y_test = train_test_split(data, labels['Label'], test_size=0.2, random_state=42)

In [36]:
best_features = SelectKBest(score_func=mutual_info_classif, k=20).fit(data, labels['Label'])

In [37]:
X_train_selected = best_features.transform(X_train)
X_test_selected = best_features.transform(X_test)


In [38]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_selected, y_train)

In [39]:
y_pred=rf_classifier.predict(X_test_selected)

evaluation

In [40]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.979482714354286
Precision: 0.925931825422109
Recall: 0.9752264847332514
F1 Score: 0.9499400806188038

Confusion Matrix:
 [[70306  1395]
 [  443 17439]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99     71701
           1       0.93      0.98      0.95     17882

    accuracy                           0.98     89583
   macro avg       0.96      0.98      0.97     89583
weighted avg       0.98      0.98      0.98     89583

