In [1]:
# Importing required modules to develop an ML model

import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- Step 1: Load the Data -----
# Read the CICFlowMeter traffic CSV into `df`, then clean it (column names,
# infinite/missing values, duplicate rows) so it can be used for modelling.

file_path = 'data/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv'
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check your file path.")
    # Re-raise so execution stops here. Continuing would either fail a few
    # lines later with a NameError on `df`, or silently reuse a stale `df`
    # left in the kernel by an earlier run.
    raise

# --- Step 1a: Clean Column Names ---
# Remove leading/trailing spaces from column names so later lookups such as
# 'Label' match regardless of padding in the CSV header.
df.columns = df.columns.str.strip()

# --- Step 1b: Handle Infinite and Missing Values ---
# Replace +/- infinity with NaN so that a single dropna() removes both kinds
# of unusable value. Reassignment is used instead of inplace=True so each
# step yields a fresh frame from its input.
df = df.replace([np.inf, -np.inf], np.nan)

# Drop rows with any NaN values for simplicity (consider imputation for production)
initial_row_count = len(df)  # row count before any rows are removed
df = df.dropna()
print(f"Dropped rows with NaN values. Remaining rows: {len(df)}")

# --- Step 1c: Drop Duplicate Rows ---
# Rows that are identical across every column are collapsed to one.
df = df.drop_duplicates()
print(f"Total rows after cleaning: {len(df)}")

Dropped rows with NaN values. Remaining rows: 1044525
Total rows after cleaning: 1039072


In [3]:
# --- Step 2: Features (X) and Target (y) ---
# 'Label' is assumed to be the target column.
if 'Label' not in df.columns:
    # Fail fast: every later step needs X / y / le / the train-test split,
    # so printing a message and carrying on would only defer the failure
    # (or let stale kernel variables be used instead).
    raise KeyError("'Label' column not found. Please identify the correct target column.")

X = df.drop(columns='Label')
y = df['Label']

# --- Step 2a: Encode the Target Variable (y) ---
# LabelEncoder maps each distinct label string to an integer code.
le = LabelEncoder()
y_encoded = le.fit_transform(y)
# Store the class-name -> code mapping for later interpretation of predictions
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(f"Target Label Mapping: {label_mapping}")

# --- Step 2b: Drop Non-Numeric Feature Columns ---
# Object-dtype columns are removed rather than encoded.
object_cols = X.select_dtypes(include=['object']).columns
if len(object_cols) > 0:
    print(f"Non-numeric feature columns found and will be dropped: {list(object_cols)}")

    X = X.drop(columns=object_cols)

# --- Step 2c: Split the Data ---
# 70% for training, 30% for testing, with 'stratify' to maintain label proportions
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)
print(f"\nX_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

Target Label Mapping: {'Benign': np.int64(0), 'Bot': np.int64(1)}
Non-numeric feature columns found and will be dropped: ['Timestamp']

X_train shape: (727350, 78)
X_test shape: (311722, 78)


In [4]:
# --- Step 3: Train the Random Forest Model ---
# Build the classifier with one hyper-parameter per line, then fit it on the
# training split produced earlier.
rf_model = RandomForestClassifier(
    n_estimators=100,      # number of trees in the forest
    max_depth=15,          # upper bound on the depth of each tree
    min_samples_leaf=5,    # minimum number of samples required at a leaf
    random_state=42,       # fixed seed for reproducibility
    n_jobs=-1,             # use all available cores for faster training
)

print("\nStarting model training...")
rf_model.fit(X_train, y_train)
print("Model training complete.")


Starting model training...
Model training complete.


In [6]:
# --- Step 4a: Make Predictions ---
# Predict an encoded class label for every row of the held-out test split.
# y_pred holds integer codes, in the same encoding as y_test.
y_pred = rf_model.predict(X_test)

In [7]:
# --- 4. Creating the PKL File (Saving the Model) ---
# Serialize the fitted classifier to disk with pickle. Only `rf_model` is
# written; the label encoder and the feature column order are not part of
# this file.
MODEL_FILENAME = 'random_forest_traffic_classifier.pkl'
print(f"\n3. Saving the trained model to '{MODEL_FILENAME}'...")
try:
    with open(MODEL_FILENAME, mode='wb') as model_file:
        pickle.dump(rf_model, model_file)
    print(f"   ✅ Success! The model has been saved as {MODEL_FILENAME}")
except Exception as e:
    # Best-effort save: report the problem and let the notebook continue,
    # since the in-memory model is still usable for evaluation.
    print(f"   ❌ ERROR during saving: {e}")


3. Saving the trained model to 'random_forest_traffic_classifier.pkl'...
   ✅ Success! The model has been saved as random_forest_traffic_classifier.pkl


In [8]:
# --- Step 4b: Calculate Performance Metrics ---

# 1. Accuracy Score: fraction of test rows whose predicted code equals the
#    true code, reported as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print(f"\n--- Model Evaluation Results ---")
print(f"Accuracy: {accuracy * 100:.2f}%")

# 2. Classification Report: per-class precision / recall / F1 / support.
print("\nClassification Report:")
# The encoder's classes_ give readable names for the integer label codes.
target_names = le.classes_
report_text = classification_report(y_test, y_pred, target_names=target_names)
# NOTE(review): the report uses its default 2-decimal rounding, so scores very
# close to 1 all display as 1.00; consider passing `digits=4` if the residual
# errors matter.
print(report_text)

# 3. Confusion Matrix (for a visual breakdown of errors)
# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

# Use an explicit Figure/Axes pair so the heatmap, labels and the saved file
# all refer to the same figure rather than to pyplot's implicit "current" one.
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,    # write the count inside each cell
    fmt='d',       # counts are integers (a stray `xc` token after this argument was removed)
    cmap='Blues',
    cbar=False,
    xticklabels=target_names,
    yticklabels=target_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
fig.savefig('confusion_matrix.png')
# Close the figure: it is written to disk rather than displayed inline.
plt.close(fig)
print("Confusion matrix saved as confusion_matrix.png")

# --- Step 4c: Feature Importance (Optional but useful) ---
# Rank the features by the forest's importance scores and chart the top 15.
importances = rf_model.feature_importances_
feature_names = X.columns
# argsort is ascending, so reverse it to get most-important-first order.
sorted_indices = np.argsort(importances)[::-1]
top_idx = sorted_indices[:15]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    x=importances[top_idx],
    y=feature_names[top_idx],
    color='skyblue',
    ax=ax,
)
ax.set_title('Top 15 Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
fig.tight_layout()
fig.savefig('feature_importances.png')
# The chart is saved to disk, not shown inline, so release the figure.
plt.close(fig)
print("Feature importances plot saved as feature_importances.png")


--- Model Evaluation Results ---
Accuracy: 99.98%

Classification Report:
              precision    recall  f1-score   support

      Benign       1.00      1.00      1.00    227029
         Bot       1.00      1.00      1.00     84693

    accuracy                           1.00    311722
   macro avg       1.00      1.00      1.00    311722
weighted avg       1.00      1.00      1.00    311722

Confusion matrix saved as confusion_matrix.png
Feature importances plot saved as feature_importances.png
