In [1]:
import os
import time
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# Suppress every warning category for the whole session and pick the plot theme.
warnings.filterwarnings("ignore")
sns.set_style("whitegrid")

# Create the output directories; exist_ok=True makes this cell safe to re-run.
for output_dir in ("../saved_models", "../results/reports", "../results/figures"):
    os.makedirs(output_dir, exist_ok=True)

print("✓ Libraries loaded and directories ensured.")

✓ Libraries loaded and directories ensured.


In [2]:
# Load the processed feature matrices for the train and test splits.
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test  = pd.read_csv("../data/processed/X_test.csv")

# Rebuild the multi-class labels from the cleaned raw file: encode the
# "Attack Type" column as integers, then repeat a stratified 80/20 split.
df_orig = pd.read_csv("../data/raw/CICIDS2017_cleaned.csv")

le = LabelEncoder()
y_all = le.fit_transform(df_orig["Attack Type"].values)

# Same split parameters as the comment in the original cell states
# (random_state=42, test_size=0.2, stratify=y_all).
# NOTE(review): this only reproduces the labels belonging to X_train/X_test if
# the processed CSVs were produced from this same file, in the same row order,
# with exactly these split parameters — confirm against the preprocessing code.
X_all = df_orig.drop(columns=["Attack Type"])
X_train_full, X_test_full, y_train_multi, y_test_multi = train_test_split(
    X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
)


def _require_same_rows(split_name, features, labels):
    """Raise ValueError when a feature matrix and its label vector differ in length.

    An explicit check is used instead of ``assert`` so that it still runs under
    ``python -O`` and reports which split is wrong and by how much.
    """
    n_features, n_labels = features.shape[0], labels.shape[0]
    if n_features != n_labels:
        raise ValueError(
            f"{split_name} split mismatch: {n_features} feature rows vs "
            f"{n_labels} reconstructed labels"
        )


# Row-count sanity checks. Equal counts are necessary but do not by themselves
# prove that features and labels are aligned row-for-row.
_require_same_rows("train", X_train, y_train_multi)
_require_same_rows("test", X_test, y_test_multi)

class_names = list(le.classes_)
print("✓ Data ready:", X_train.shape, X_test.shape)
print("Classes:", class_names)

✓ Data ready: (2016600, 52) (504151, 52)
Classes: ['Bots', 'Brute Force', 'DDoS', 'DoS', 'Normal Traffic', 'Port Scanning', 'Web Attacks']


In [3]:
# Reduced RF config for large dataset (2M samples)
print("="*60)
print("CONFIGURING RANDOM FOREST FOR LARGE DATASET")
print("="*60)

# "balanced" class weights computed from the training label distribution,
# stored as a {class_label: weight} mapping for the classifier.
classes = np.unique(y_train_multi)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train_multi)
class_weight_dict = dict(zip(classes, weights))
print("Computed class weights:", class_weight_dict)

# Deliberately reduced complexity for 2M+ samples to avoid memory issues.
rf = RandomForestClassifier(
    n_estimators=100,           # reduced from 300 (faster, less memory)
    max_depth=25,               # cap tree depth
    min_samples_split=5,        # require more samples before splitting a node
    min_samples_leaf=2,         # require more samples in each leaf
    max_features="sqrt",        # sqrt(n_features) candidates per split
    max_samples=0.8,            # each tree's bootstrap draws 80% of the rows
    bootstrap=True,             # max_samples only applies when bootstrapping
    class_weight=class_weight_dict,
    n_jobs=2,                   # reduced from -1 (only 2 workers, to save memory)
    random_state=42,
    verbose=1                   # show training progress
)

# Report the settings by reading them back from the estimator, so the summary
# cannot drift from the constructor arguments above.
print("\n✓ RF configured for large dataset:")
print(f"  n_estimators: {rf.n_estimators}")
print(f"  max_depth: {rf.max_depth}")
print(f"  n_jobs: {rf.n_jobs} (serial-like to save memory)")
print(f"  max_samples: {rf.max_samples:.0%} per tree")
print("="*60)

CONFIGURING RANDOM FOREST FOR LARGE DATASET
Computed class weights: {np.int64(0): np.float64(184.78878401905985), np.int64(1): np.float64(39.3559718969555), np.int64(2): np.float64(2.8130348720910283), np.int64(3): np.float64(1.8586654770814361), np.int64(4): np.float64(0.1718842359755939), np.int64(5): np.float64(3.9705838920228005), np.int64(6): np.float64(168.07801300216704)}

✓ RF configured for large dataset:
  n_estimators: 100
  max_depth: 25
  n_jobs: 2 (serial-like to save memory)
  max_samples: 80% per tree


In [4]:
# Fit the forest on the training split and record how long it took.
# perf_counter() is a monotonic clock, so the measured duration is not affected
# by system clock adjustments during this multi-minute fit.
start = time.perf_counter()
rf.fit(X_train, y_train_multi)
train_time = time.perf_counter() - start
print(f"✓ RF trained in {train_time:.2f} sec ({train_time/60:.2f} min)")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  5.7min


✓ RF trained in 658.42 sec (10.97 min)


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed: 11.0min finished
