In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import xgboost as xgb
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data1 = pd.read_csv('../data/Data.csv')
labels1 = pd.read_csv('../data/Label.csv')

combined_data = pd.concat([data1, labels1], axis=1)

# Filter rows where 'Label' is 0
filtered_data = combined_data[combined_data['Label'] > 0]

# Extract filtered data and labels
filtered_data_only = filtered_data.drop(columns=['Label'])
filtered_labels_only = filtered_data['Label']

print("Filtered data shape:", filtered_data_only.shape)
print("Filtered labels shape:", filtered_labels_only.shape)

Filtered data shape: (89583, 76)
Filtered labels shape: (89583,)


In [3]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(filtered_data_only, filtered_labels_only, test_size=0.2, random_state=42)

In [4]:
print("Training data shape:", X_train1.shape, y_train1.shape)
print("Testing data shape:", X_test1.shape, y_test1.shape)
y_train1 = y_train1 - 1
y_test1 = y_test1 - 1
print(np.unique(y_train1))
print(np.unique(y_test1))

Training data shape: (71666, 76) (71666,)
Testing data shape: (17917, 76) (17917,)
[0 1 2 3 4 5 6 7 8]
[0 1 2 3 4 5 6 7 8]


In [9]:
best_features = SelectKBest(score_func=mutual_info_classif, k=30)

In [10]:
X_train_selected = best_features.fit_transform(X_train1, y_train1)
X_test_selected = best_features.transform(X_test1) 

In [11]:
# Print selected feature indices
selected_indices = best_features.get_support(indices=True)
print("Selected feature indices:", selected_indices)

# Optionally, get the feature names
selected_feature_names = filtered_data_only.columns[selected_indices]
print("Selected feature names:", selected_feature_names)

Selected feature indices: [ 0  1  2  3  4  5  6  7  8  9 11 12 13 25 33 34 37 38 39 40 41 45 46 51
 52 53 57 58 59 66]
Selected feature names: Index(['Flow Duration', 'Total Fwd Packet', 'Total Bwd packets',
       'Total Length of Fwd Packet', 'Total Length of Bwd Packet',
       'Fwd Packet Length Max', 'Fwd Packet Length Min',
       'Fwd Packet Length Mean', 'Fwd Packet Length Std',
       'Bwd Packet Length Max', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Bwd IAT Mean',
       'Fwd Header Length', 'Bwd Header Length', 'Packet Length Min',
       'Packet Length Max', 'Packet Length Mean', 'Packet Length Std',
       'Packet Length Variance', 'PSH Flag Count', 'ACK Flag Count',
       'Average Packet Size', 'Fwd Segment Size Avg', 'Bwd Segment Size Avg',
       'Bwd Bytes/Bulk Avg', 'Bwd Packet/Bulk Avg', 'Bwd Bulk Rate Avg',
       'Fwd Act Data Pkts'],
      dtype='object')


In [13]:
xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train_selected, y_train1)

In [14]:
y_pred = xgb_classifier.predict(X_test_selected)

In [15]:
print("\nClassification Report:")
report = classification_report(y_test1, y_pred, target_names=[f"Class {i}" for i in np.unique(y_test1)])
print(report)


Classification Report:
              precision    recall  f1-score   support

     Class 0       0.42      0.44      0.43        68
     Class 1       0.81      0.49      0.61        77
     Class 2       0.77      0.30      0.44       917
     Class 3       0.80      0.78      0.79      6191
     Class 4       0.69      0.94      0.80      5859
     Class 5       0.90      0.74      0.81       929
     Class 6       0.90      0.66      0.76      3370
     Class 7       0.59      0.28      0.38       457
     Class 8       0.63      0.35      0.45        49

    accuracy                           0.77     17917
   macro avg       0.72      0.55      0.61     17917
weighted avg       0.78      0.77      0.76     17917



using random forest

In [16]:
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_selected, y_train1)

# Predict on the test data
y_pred = rf_classifier.predict(X_test_selected)

In [17]:
print("\nClassification Report:")
report = classification_report(y_test1, y_pred, target_names=[f"Class {i}" for i in np.unique(y_test1)])
print(report)


Classification Report:
              precision    recall  f1-score   support

     Class 0       0.50      0.46      0.48        68
     Class 1       0.59      0.57      0.58        77
     Class 2       0.55      0.40      0.46       917
     Class 3       0.76      0.78      0.77      6191
     Class 4       0.76      0.77      0.77      5859
     Class 5       0.81      0.78      0.79       929
     Class 6       0.70      0.71      0.70      3370
     Class 7       0.40      0.39      0.40       457
     Class 8       0.45      0.45      0.45        49

    accuracy                           0.73     17917
   macro avg       0.61      0.59      0.60     17917
weighted avg       0.73      0.73      0.73     17917

