In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, \
    classification_report

In [2]:
labels = pd.read_csv('../data/Label.csv')
data = pd.read_csv('../data/Data.csv')

In [3]:
print("Data shape before processing:", data.shape)

# Check for missing values in each column
missing_values_column = data.isnull().sum()
if missing_values_column.sum() > 0:
    print("Columns with missing values:")
    print(missing_values_column[missing_values_column > 0])  # Display columns with missing values
else:
    print("No missing values found in the data.")

# Find the number of unique values in each column
unique_values = data.nunique()
print("Number of unique values in each column:")
print(unique_values)

# Drop columns with only one unique value
for col in data.columns:
    if data[col].nunique() == 1:
        data.drop(col, axis=1, inplace=True)
        print(f"Dropped column: {col}")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data, labels['Label'], test_size=0.2, random_state=42)


Data shape before processing: (447915, 76)
No missing values found in the data.
Number of unique values in each column:
Flow Duration                 174373
Total Fwd Packet                 847
Total Bwd packets                928
Total Length of Fwd Packet      7697
Total Length of Bwd Packet      9322
                               ...  
Active Min                      1778
Idle Mean                       2280
Idle Std                         611
Idle Max                        2280
Idle Min                        2280
Length: 76, dtype: int64
Dropped column: Bwd PSH Flags
Dropped column: Fwd URG Flags
Dropped column: Bwd URG Flags
Dropped column: URG Flag Count
Dropped column: CWR Flag Count
Dropped column: ECE Flag Count
Dropped column: Fwd Bytes/Bulk Avg
Dropped column: Fwd Packet/Bulk Avg
Dropped column: Fwd Bulk Rate Avg


In [4]:
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (358332, 67)
Testing set shape: (89583, 67)


In [13]:

# Feature selection using Chi-Square test on the training data
best_features = SelectKBest(score_func=chi2, k=10)
fit = best_features.fit(X_train, y_train)

# Get the scores for each feature
dfscores = pd.DataFrame(fit.scores_, columns=['Score'])
dfcolumns = pd.DataFrame(X_train.columns, columns=['Feature'])

# Concatenate feature names and scores
feature_scores = pd.concat([dfcolumns, dfscores], axis=1)

# Print the 10 best features based on Chi-Square score
print("Top 10 features based on Chi-Square Test:")


Top 10 features based on Chi-Square Test:


In [30]:
feature_scores = feature_scores.sort_values(by='Score', ascending=False)

top_30_features = feature_scores.head(40)['Feature'].values

In [31]:
X_train_reduced = X_train[top_30_features]
X_test_reduced = X_test[top_30_features]

In [32]:
print("Reduced X_train shape:", X_train_reduced.shape)
print("Reduced X_test shape:", X_test_reduced.shape)

Reduced X_train shape: (358332, 40)
Reduced X_test shape: (89583, 40)


In [33]:
rf_classifier = RandomForestClassifier(random_state=42)

In [34]:
rf_classifier.fit(X_train_reduced, y_train)

In [35]:
y_pred = rf_classifier.predict(X_test_reduced)

In [36]:
print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=[f"Class {i}" for i in np.unique(y_test)])
print(report)


Classification Report:
              precision    recall  f1-score   support

     Class 0       0.99      0.98      0.99     71701
     Class 1       0.31      0.38      0.34        68
     Class 2       0.87      0.46      0.60       104
     Class 3       0.57      0.24      0.34       916
     Class 4       0.74      0.79      0.77      6259
     Class 5       0.66      0.80      0.72      5849
     Class 6       0.82      0.71      0.76       867
     Class 7       0.75      0.69      0.72      3392
     Class 8       0.46      0.33      0.38       381
     Class 9       0.86      0.26      0.40        46

    accuracy                           0.93     89583
   macro avg       0.70      0.57      0.60     89583
weighted avg       0.93      0.93      0.93     89583

