In [ ]:
## DATA SPLITTING

In [2]:
import pandas as pd
from pathlib import Path

# Inspect the raw capture and make sure the 90/10 split files exist.
#
# The modelling cell further down reads "Training_2.csv", so this cell has to
# be able to create it; otherwise the notebook cannot be re-run from a fresh
# checkout. The split is only written when one of the two files is missing,
# so an existing split is never overwritten.
file_path = "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"
train_path = Path("Training_2.csv")
test_path = Path("Testing_2.csv")
split_ratio = 0.9  # fraction of rows written to the training file

try:
    # Load the original CSV file
    data = pd.read_csv(file_path)

    # Number of columns in the file (this count includes the label column)
    num_features = data.shape[1]
    print(f"Number of features (columns) in the dataset: {num_features}")

    # Count the occurrences of each class in the label column.
    # The column name really does start with a space in this CSV.
    label_counts = data[' Label'].value_counts()
    print(label_counts)

    if not (train_path.exists() and test_path.exists()):
        # Shuffle with a fixed seed so the split is reproducible. A separate
        # name is used so `data` keeps the original row order and re-running
        # the cell gives the same result.
        shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)
        split_index = int(len(shuffled) * split_ratio)

        shuffled[:split_index].to_csv(train_path, index=False)  # first 90%
        shuffled[split_index:].to_csv(test_path, index=False)   # remaining 10%

        print(
            f"Data successfully split into {train_path} ({split_ratio:.0%}) "
            f"and {test_path} ({1 - split_ratio:.0%})."
        )

except FileNotFoundError:
    print(f"Error: The file {file_path} was not found. Please check the path and try again")
except PermissionError:
    print("Error: Permission denied. Please check the path and try again")
except Exception as e:
    # Best-effort reporting only: the error is printed, not re-raised.
    print(f"An unexpected error occurred {e}")


Number of features (columns) in the dataset: 85
 Label
DDoS      128027
BENIGN     97718
Name: count, dtype: int64


## CLEAN MODEL

### RANDOM FOREST CLASSIFIER
#### Implementation for training, evaluating, and analyzing the performance of a Random Forest for network traffic classification

In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_score

# Train and evaluate a Random Forest on the training split.
#
# Steps: load the CSV, encode the label, drop identifier columns, build the
# feature matrix, replace inf/NaN, cross-validate, then fit on a 60/40
# hold-out split and report accuracy, a classification report and a
# confusion matrix.

# Load and preprocess the dataset
file_path = "Training_2.csv"
data = pd.read_csv(file_path)

# Encode labels as integers; label_encoder.classes_ keeps the original names
# and is reused for the confusion-matrix tick labels below.
label_encoder = LabelEncoder()
data[' Label'] = label_encoder.fit_transform(data[' Label'])

# # Visualize feature importances to check which column to drop
# # NOTE: rf_classifier and X are only defined further down in this cell, so
# # this snippet can only be run after the model has been fit.
# importances = rf_classifier.feature_importances_
# indices = np.argsort(importances)[::-1]
# features = data.drop([' Label'], axis=1).columns
# 
# plt.figure(figsize=(10, 6))
# plt.title("Feature Importances")
# plt.bar(range(X.shape[1]), importances[indices], align="center")
# plt.xticks(range(X.shape[1]), features[indices], rotation=90)
# plt.xlabel("Feature")
# plt.ylabel("Importance")
# plt.tight_layout()
# plt.show()

# Identify and drop high-cardinality non-numeric columns.
# errors='ignore' keeps this working if a column is absent from the file.
columns_to_drop = ['Flow ID', ' Source IP', ' Destination IP', ' Timestamp']
data = data.drop(columns=columns_to_drop, errors='ignore')

# Identify remaining non-numeric columns
non_numeric_columns = data.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {non_numeric_columns}")

# Build the feature matrix exactly once: numeric columns first, then any
# one-hot encoded columns appended on the right. Previously X was rebuilt from
# the numeric columns right after this step, which silently discarded the
# encoded features.
X = data.drop(columns=[' Label', *non_numeric_columns]).values
if len(non_numeric_columns) > 0:
    encoder = OneHotEncoder(sparse_output=False)
    encoded_features = encoder.fit_transform(data[non_numeric_columns])
    X = np.hstack((X, encoded_features))

# Handle infinity values and NaN
# NOTE(review): the imputer is fit on all rows before cross-validation and
# before the train/test split, so held-out rows contribute to the column
# means. Consider wrapping imputer + model in a sklearn Pipeline.
X = np.where(np.isinf(X), np.nan, X)  # Replace infinity with NaN
imputer = SimpleImputer(strategy='mean')  # Replace NaN with feature mean
X = imputer.fit_transform(X)

# Separate labels
y = data[' Label'].values

# Instantiate the Random Forest model with class_weight balanced
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

# Cross-validation to verify the impact of class_weight.
# cross_val_score fits clones, so rf_classifier itself is still unfitted here.
cv_scores = cross_val_score(rf_classifier, X, y, cv=3, scoring='accuracy')
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")

# Split the dataset into training and test sets (60% / 40%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Fit the Random Forest model to the training data
rf_classifier.fit(X_train, y_train)

# Make predictions
y_pred_train = rf_classifier.predict(X_train)
y_pred_test = rf_classifier.predict(X_test)

# Evaluate performance
train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(f"Training Accuracy: {train_accuracy:.4f}")
print(f"Testing Accuracy: {test_accuracy:.4f}")
print("\nClassification Report (Test Data):\n", classification_report(y_test, y_pred_test))

# Generate a confusion matrix to see the performance per class
cm = confusion_matrix(y_test, y_pred_test)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.show()


Non-numeric columns: Index([], dtype='object')
Cross-validation scores: [0.99988187 0.99994094 0.99992617]
Mean cross-validation score: 0.9999163263988429
Training Accuracy: 1.0000
Testing Accuracy: 0.9999

Classification Report (Test Data):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     34955
           1       1.00      1.00      1.00     46313

    accuracy                           1.00     81268
   macro avg       1.00      1.00      1.00     81268
weighted avg       1.00      1.00      1.00     81268
