In [1]:
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score

from sklearn.model_selection import RandomizedSearchCV

from sklearn.model_selection import GridSearchCV

import numpy as np

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from collections import Counter





from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
data=pd.read_csv('../Data/Data.csv')
label=pd.read_csv('../Data/Label.csv')

In [29]:


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, label['Label'], test_size=0.2, random_state=42)

In [30]:
print(X_train.describe(), X_test.describe(), y_train.describe(), y_test.describe())

       Flow Duration  Total Fwd Packet  Total Bwd packets  \
count   3.583320e+05     358332.000000      358332.000000   
mean    5.965464e+05         22.586163          27.156129   
std     4.880156e+06        128.750057         114.994688   
min     1.000000e+00          1.000000           0.000000   
25%     3.490000e+02          1.000000           2.000000   
50%     5.779500e+03          3.000000           2.000000   
75%     1.780708e+05         17.000000          15.000000   
max     1.199975e+08      19616.000000       11021.000000   

       Total Length of Fwd Packet  Total Length of Bwd Packet  \
count                3.583320e+05                3.583320e+05   
mean                 4.798193e+03                2.159995e+04   
std                  1.529527e+05                1.455858e+05   
min                  0.000000e+00                0.000000e+00   
25%                  0.000000e+00                0.000000e+00   
50%                  9.000000e+01                1.060000e+0

In [31]:
under_sampler = RandomUnderSampler(sampling_strategy={0: 40000}, random_state=42)  # Undersample Class 0 to 20,000 samples
smote = SMOTE(sampling_strategy={2: 3000,1:2000,7: 14000, 8: 3000,9:3000}, random_state=42)  # Oversample Classes 2, 7, and 8

In [32]:
X_train_resampled, y_train_resampled = under_sampler.fit_resample(X_train, y_train)

In [33]:
X_train_final, y_train_final = smote.fit_resample(X_train_resampled, y_train_resampled)

In [34]:
print("New class distribution:")
print(pd.Series(y_train_final).value_counts())

New class distribution:
Label
0    40000
4    24692
5    23764
7    14000
6     3765
3     3551
2     3000
8     3000
9     3000
1     2000
Name: count, dtype: int64


In [35]:

scaler = StandardScaler()
X_train_final_scaled = scaler.fit_transform(X_train_final)

In [36]:
X_test_scaled = scaler.transform(X_test) 

In [37]:

pca_95 = PCA(n_components=0.95) 
X_train_final_pca_95 = pca_95.fit_transform(X_train_final_scaled)
X_test_pca_95 = pca_95.transform(X_test_scaled)


In [38]:

rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_final_pca_95, y_train_final)

In [39]:
y_pred = rf_classifier.predict(X_test_pca_95)

In [40]:
accuracy = accuracy_score(y_test, y_pred)
print(f"\nRandom Forest Accuracy after Resampling: {accuracy:.4f}")

print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=[f"Class {i}" for i in np.unique(y_test)])
print(report)


Random Forest Accuracy after Resampling: 0.9193

Classification Report:
              precision    recall  f1-score   support

     Class 0       1.00      0.98      0.99     71701
     Class 1       0.19      0.59      0.29        68
     Class 2       0.27      0.45      0.34       104
     Class 3       0.38      0.17      0.24       916
     Class 4       0.69      0.75      0.72      6259
     Class 5       0.62      0.77      0.69      5849
     Class 6       0.74      0.56      0.64       867
     Class 7       0.68      0.69      0.68      3392
     Class 8       0.27      0.26      0.27       381
     Class 9       0.09      0.33      0.14        46

    accuracy                           0.92     89583
   macro avg       0.49      0.56      0.50     89583
weighted avg       0.93      0.92      0.92     89583

