In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# ============================================
# 1. LOAD & CLEAN DATA
# ============================================

print("[+] Loading CICIDS dataset...")
# The path is relative to the working directory; point it at the dataset CSV.
# +/-inf is converted to NaN first so that a single dropna() removes every
# row holding either an infinite or a missing value.
df = (
    pd.read_csv("cicids.csv")
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

n_rows, n_cols = df.shape
print(f"[+] Dataset Loaded: {n_rows} rows, {n_cols} columns")

# ============================================
# 2. PREPARE FEATURES & LABELS
# ============================================

# "Label" is the prediction target; every other column is kept as a feature.
y = df["Label"]
X = df.drop(columns="Label")

print("[+] Splitting data...")
# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed seed so repeated runs produce the same partition.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# ============================================
# 3. SCALE FEATURES
# ============================================

print("[+] Scaling features...")
# Fit the scaler on the training split only, then apply that same fitted
# scaler to both splits so the test rows never influence the statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ============================================
# 4. TRAIN MODEL
# ============================================

print("[+] Training Random Forest model...")
# 200 trees, a fixed seed, and n_jobs=-1 to use all available cores.
forest_params = {"n_estimators": 200, "random_state": 42, "n_jobs": -1}
model = RandomForestClassifier(**forest_params)
model.fit(X_train_scaled, y_train)

# ============================================
# 5. EVALUATE MODEL
# ============================================

print("\n[+] Evaluation Results:\n")
# Score the held-out split and print the classification report for it.
preds = model.predict(X_test_scaled)
report = classification_report(y_test, preds)
print(report)

# ============================================
# 6. SAVE MODEL + SCALER
# ============================================

# Persist the classifier together with the scaler it was trained behind;
# both are written to the current working directory.
for artifact, filename in (
    (model, "cicids_model.pkl"),
    (scaler, "cicids_scaler.pkl"),
):
    joblib.dump(artifact, filename)
print("[+] Model and scaler saved.")



# ============================================
# 7. PREDICT ON NEW FLOWS
# ============================================

print("\n[+] Loading new flows for prediction...")
new_flows = pd.read_csv("new_flows.csv")   # <-- features only

# The scaler and model were fitted on the columns of X, in that order.
# Fail early with a readable message if any are absent, then select them in
# training order so a reordered or wider CSV is still scored correctly.
missing_cols = [col for col in X.columns if col not in new_flows.columns]
if missing_cols:
    raise ValueError(f"new_flows.csv is missing feature columns: {missing_cols}")
new_flows = new_flows[X.columns]

# Apply the same sanitisation as the training data (inf -> NaN, then drop
# incomplete rows). The model never saw non-finite values during training,
# and scikit-learn's input validation rejects infinities.
new_flows = new_flows.replace([np.inf, -np.inf], np.nan)
valid_rows = new_flows.notna().all(axis=1)
n_skipped = int((~valid_rows).sum())
if n_skipped:
    print(f"[!] Skipping {n_skipped} flow(s) with NaN/infinite values")
new_flows = new_flows[valid_rows]

if new_flows.empty:
    # Nothing left to score; avoid handing an empty array to scikit-learn.
    print("[!] No valid flows to predict.")
else:
    print("[+] Scaling new data...")
    new_scaled = scaler.transform(new_flows)

    print("[+] Predicting...")
    new_preds = model.predict(new_scaled)
    new_probs = model.predict_proba(new_scaled)
    # Confidence is the highest class probability of each row.
    confidences = new_probs.max(axis=1)

    print("\n=== Prediction Results ===")
    # Report each flow by its row label in the CSV so that skipped rows do
    # not shift the numbering of the remaining ones.
    for row_id, pred, conf in zip(new_flows.index, new_preds, confidences):
        print(f"Flow {row_id}: {pred} (confidence: {conf:.4f})")

print("\n[+] Done.")


[+] Loading CICIDS dataset...


FileNotFoundError: [Errno 2] No such file or directory: 'cicids.csv'