In [11]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Load the feature table and the label column. The two CSVs are assumed to be
# row-aligned (row i of Label.csv labels row i of Data.csv) -- TODO confirm.
data1 = pd.read_csv('../data/Data.csv')
labels1 = pd.read_csv('../data/Label.csv')

# pd.concat(axis=1) aligns on the index, so a row-count mismatch would silently
# pad the shorter frame with NaN instead of failing. Check it up front.
assert len(data1) == len(labels1), (
    f"Data.csv has {len(data1)} rows but Label.csv has {len(labels1)} rows"
)

combined_data = pd.concat([data1, labels1], axis=1)

# Keep only rows whose 'Label' is greater than 0 (i.e. drop the Label == 0 rows)
filtered_data = combined_data[combined_data['Label'] > 0]

# Split the remaining rows back into a feature frame and a target series
filtered_data_only = filtered_data.drop(columns=['Label'])
filtered_labels_only = filtered_data['Label']

print("Filtered data shape:", filtered_data_only.shape)
print("Filtered labels shape:", filtered_labels_only.shape)

Filtered data shape: (89583, 76)
Filtered labels shape: (89583,)


In [4]:
# Hold out 20% of the filtered rows for evaluation, with a fixed seed.
# NOTE(review): the split is not stratified; with classes this imbalanced,
# consider passing stratify=filtered_labels_only (this would change the results).
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    filtered_data_only,
    filtered_labels_only,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Sanity-check that features and labels have matching row counts in each split
print(f"Training data shape: {X_train1.shape} {y_train1.shape}")
print(f"Testing data shape: {X_test1.shape} {y_test1.shape}")

Training data shape: (71666, 76) (71666,)
Testing data shape: (17917, 76) (17917,)


In [6]:
# Shift the class labels down by one so they start at 0 instead of 1
# (presumably because XGBClassifier expects 0-based class ids -- confirm).
#
# The shifted labels are recomputed from the untouched `filtered_labels_only`
# rather than from y_train1 / y_test1 themselves, so re-running this cell does
# not subtract 1 a second time. This relies on the row index being unique,
# which holds for the default index produced by read_csv.
y_train1 = filtered_labels_only.loc[X_train1.index] - 1
y_test1 = filtered_labels_only.loc[X_test1.index] - 1

In [7]:
# Show the distinct label values in the training split, then in the test split
for split_labels in (y_train1, y_test1):
    print(np.unique(split_labels))

[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8]


In [8]:
# Count the distinct label values that remain after filtering
num_label_values = filtered_labels_only.nunique()
print(num_label_values)

9


In [9]:
# XGBoost classifier with a fixed seed; it is fitted in a later cell.
xgb_params = {"random_state": 42}
xgb_classifier = xgb.XGBClassifier(**xgb_params)

In [13]:
# Score every feature against the training labels with the chi-squared test.
# Only `fit.scores_` is read afterwards (the top features are picked manually in
# a later cell), so `k` has no effect on the result; 'all' makes that explicit
# instead of suggesting that 10 features are selected here.
# NOTE(review): chi2 requires non-negative inputs and is intended for
# count/frequency-like features; on raw continuous columns the scores grow with
# feature magnitude -- consider scaling or a different score function.
best_features = SelectKBest(score_func=chi2, k='all')
fit = best_features.fit(X_train1, y_train1)

In [15]:
# One single-column frame for the feature names and one for their chi2 scores;
# both get the default positional index, so row i refers to the same feature.
dfcolumns = pd.DataFrame({'Feature': X_train1.columns})
dfscores = pd.DataFrame({'Score': fit.scores_})

In [16]:
# Put names and scores side by side (columns: 'Feature', 'Score'); the two
# frames share the same default positional index, so rows line up one-to-one.
feature_scores = dfcolumns.join(dfscores)

In [25]:
# How many of the highest-scoring features to keep.
N_TOP_FEATURES = 30

# Rank the features from highest to lowest chi2 score
feature_scores = feature_scores.sort_values(by='Score', ascending=False)

# NOTE: despite its name, `top_40_features` holds N_TOP_FEATURES (30) names.
# The name is kept because later cells reference it.
top_40_features = feature_scores.head(N_TOP_FEATURES)['Feature'].values

In [33]:
# Show the selected feature names, highest chi2 score first
# (30 names, despite the `_40_` in the variable name).
print(top_40_features)

['Flow Bytes/s' 'Fwd IAT Total' 'Flow Duration' 'Active Max' 'Active Mean'
 'Bwd Bulk Rate Avg' 'Active Std' 'Idle Max' 'Idle Mean' 'Idle Min'
 'Active Min' 'Flow IAT Max' 'Fwd IAT Max' 'Bwd IAT Total' 'Bwd IAT Max'
 'Fwd IAT Std' 'Idle Std' 'Flow IAT Std' 'Bwd IAT Std'
 'Packet Length Variance' 'Flow IAT Mean' 'Fwd IAT Mean'
 'Bwd Bytes/Bulk Avg' 'Fwd Packets/s' 'Flow Packets/s'
 'Total Length of Bwd Packet' 'Subflow Bwd Bytes'
 'Total Length of Fwd Packet' 'Bwd IAT Mean' 'Subflow Fwd Bytes']


In [26]:
# Restrict both splits to the selected columns, in ranked order
X_train_reduced = X_train1.loc[:, top_40_features]
X_test_reduced = X_test1.loc[:, top_40_features]

In [27]:
# Confirm the column count dropped to the number of selected features
print(f"Reduced X_train1 shape: {X_train_reduced.shape}")
print(f"Reduced X_test1 shape: {X_test_reduced.shape}")

Reduced X_train1 shape: (71666, 30)
Reduced X_test1 shape: (17917, 30)


In [28]:
# Train the XGBoost classifier on the reduced feature set and the 0-based labels
xgb_classifier.fit(X_train_reduced, y_train1)

In [29]:
# Predict class ids for the held-out split, using the same reduced columns
y_pred1 = xgb_classifier.predict(X_test_reduced)

In [30]:
# Build the class list from both the true and the predicted labels and pass it
# explicitly. If `target_names` were derived from y_test1 alone, a class that is
# predicted but absent from the test labels would make classification_report
# raise (length mismatch between its inferred labels and target_names).
class_ids = np.union1d(y_test1, y_pred1)
class_names = [f"Class {i}" for i in class_ids]

print("\nClassification Report:")
report = classification_report(y_test1, y_pred1, labels=class_ids, target_names=class_names)
print(report)


Classification Report:
              precision    recall  f1-score   support

     Class 0       0.44      0.41      0.43        68
     Class 1       0.89      0.40      0.55        77
     Class 2       0.71      0.17      0.27       917
     Class 3       0.78      0.76      0.77      6191
     Class 4       0.68      0.94      0.79      5859
     Class 5       0.83      0.59      0.69       929
     Class 6       0.88      0.67      0.76      3370
     Class 7       0.56      0.23      0.33       457
     Class 8       0.61      0.29      0.39        49

    accuracy                           0.75     17917
   macro avg       0.71      0.50      0.55     17917
weighted avg       0.76      0.75      0.73     17917



Using random forest

In [31]:
# Fit a seeded random forest on the same reduced feature set as the XGBoost
# model (`fit` returns the estimator itself, so it can be chained).
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train_reduced, y_train1)

# Class predictions for the held-out split
y_pred = rf_classifier.predict(X_test_reduced)

In [32]:
# Build the class list from both the true and the predicted labels and pass it
# explicitly. If `target_names` were derived from y_test1 alone, a class that is
# predicted but absent from the test labels would make classification_report
# raise (length mismatch between its inferred labels and target_names).
class_ids = np.union1d(y_test1, y_pred)
class_names = [f"Class {i}" for i in class_ids]

print("\nClassification Report:")
report = classification_report(y_test1, y_pred, labels=class_ids, target_names=class_names)
print(report)


Classification Report:
              precision    recall  f1-score   support

     Class 0       0.43      0.29      0.35        68
     Class 1       0.82      0.43      0.56        77
     Class 2       0.54      0.18      0.27       917
     Class 3       0.73      0.79      0.76      6191
     Class 4       0.71      0.84      0.77      5859
     Class 5       0.78      0.57      0.66       929
     Class 6       0.76      0.69      0.73      3370
     Class 7       0.47      0.23      0.30       457
     Class 8       0.22      0.04      0.07        49

    accuracy                           0.73     17917
   macro avg       0.61      0.45      0.50     17917
weighted avg       0.72      0.73      0.71     17917

