# Version 1

In [None]:
# !pip uninstall -y scikit-learn
# !pip install scikit-learn==1.8.0

In [None]:
# import os
# os.remove("/kaggle/working/ddos_model_realtime.pkl")

In [None]:

import os
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder

# --------------------------------------------------
# CONFIG
# --------------------------------------------------
DATA_PATH = '/kaggle/input/ddos-evaluation-dataset-cic-ddos2019'
SAMPLE_FRACTION = 0.1        # Increase if RAM allows
CHUNKSIZE = 10_000
N_ESTIMATORS = 150
RANDOM_STATE = 42
MODEL_PATH = '/kaggle/working/ddos_model.pkl'

# --------------------------------------------------
# STEP 1: LOAD DATASET (CHUNKED + SAMPLED)
# --------------------------------------------------
csv_files = [f for f in os.listdir(DATA_PATH) if f.endswith('.csv')]
if not csv_files:
    raise RuntimeError("No CSV files found in dataset path")

dataframes = []

for file in csv_files:
    print(f"Loading {file}")
    file_path = os.path.join(DATA_PATH, file)

    for chunk in pd.read_csv(file_path, chunksize=CHUNKSIZE, low_memory=False):
        chunk.columns = chunk.columns.str.strip()
        sampled = chunk.sample(frac=SAMPLE_FRACTION, random_state=RANDOM_STATE)
        dataframes.append(sampled)

df = pd.concat(dataframes, ignore_index=True)
print(f"Dataset loaded: {df.shape}")

# --------------------------------------------------
# STEP 2: PREPROCESS
# --------------------------------------------------
if 'Label' not in df.columns:
    raise KeyError(f"'Label' column not found. Available columns:\n{df.columns.tolist()}")

df['Label'] = df['Label'].astype(str).str.lower().apply(
    lambda x: 0 if x == 'benign' else 1
)

FEATURES = [
    'Source IP',
    'Destination IP',
    'Source Port',
    'Destination Port',
    'Protocol',
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Fwd Packet Length Max',
    'Bwd Packet Length Max',
    'SYN Flag Count',
    'ACK Flag Count'
]

AVAILABLE_FEATURES = [f for f in FEATURES if f in df.columns]
if not AVAILABLE_FEATURES:
    raise RuntimeError("None of the requested features exist in the dataset")

X = df[AVAILABLE_FEATURES].copy()
y = df['Label']

# Encode IPs safely
for col in ['Source IP', 'Destination IP']:
    if col in X.columns:
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))

X = X.fillna(0)

print(f"Preprocessing complete. X shape: {X.shape}")

# --------------------------------------------------
# STEP 3: TRAIN / TEST SPLIT
# --------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE
)

# --------------------------------------------------
# STEP 4: TRAIN MODEL
# --------------------------------------------------
model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    n_jobs=-1,
    class_weight='balanced',
    random_state=RANDOM_STATE
)

print("Training model...")
model.fit(X_train, y_train)

# --------------------------------------------------
# STEP 5: EVALUATE
# --------------------------------------------------
y_pred = model.predict(X_test)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, digits=4))

# --------------------------------------------------
# STEP 6: SAVE MODEL
# --------------------------------------------------
joblib.dump(model, MODEL_PATH)
print(f"Model saved to {MODEL_PATH}")


 

In [None]:
import sklearn
print("sklearn version:", sklearn.__version__)


# version 2

In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier

from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb

# -----------------------------------
# DATASET ROOT
# -----------------------------------

DATA_ROOT = "/kaggle/input/datasets/rodrigorosasilva/cic-ddos2019-30gb-full-dataset-csv-files"


csv_files = []
for root, dirs, files in os.walk(DATA_ROOT):
    for file in files:
        if file.endswith(".csv"):
            csv_files.append(os.path.join(root, file))

print(f"Found {len(csv_files)} CSV files")

# -----------------------------------
# LOAD DATA (Progress Bar)
# -----------------------------------

df_list = []
print("\nðŸ”„ Loading CSV files...")

for file in tqdm(csv_files, desc="Loading Files"):
    temp_df = pd.read_csv(file, low_memory=False, nrows=10000)
    df_list.append(temp_df)

df = pd.concat(df_list, ignore_index=True)
print("Dataset shape:", df.shape)

# -----------------------------------
# CLEAN DATA
# -----------------------------------

print("\nðŸ§¹ Cleaning dataset...")

df.columns = df.columns.str.strip()

drop_cols = ["Flow ID", "Source IP", "Destination IP", "Timestamp", "Unnamed: 0"]
for col in drop_cols:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

if "Label" not in df.columns:
    raise ValueError("Label column missing")

df["Label"] = df["Label"].astype("category").cat.codes
df = df.select_dtypes(include=[np.number])

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

X = df.drop(columns=["Label"])
y = df["Label"]

# -----------------------------------
# SPLIT
# -----------------------------------

print("\nâœ‚ Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# -----------------------------------
# SMOTE
# -----------------------------------

print("\nâš– Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# -----------------------------------
# DEFINE MODELS
# -----------------------------------

extra = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="mlogloss",
    random_state=42,
    verbosity=0
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    num_leaves=31,
    random_state=42,
    verbosity=-1
)



# -----------------------------------
# PIPELINE
# -----------------------------------

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=30)),
    ("classifier", VotingClassifier(
        estimators=[
            ("extra", extra),
            ("xgb", xgb_model),
            ("lgb", lgb_model)
        ],
        voting="soft",
        n_jobs=-1
    ))
])

# -----------------------------------
# TRAIN WITH PROGRESS BAR
# -----------------------------------

print("\nðŸš€ Training pipeline...\n")

total_steps = 3

with tqdm(total=total_steps, desc="Training Progress") as pbar:

    # Step 1: Scaling
    pbar.set_description("Scaling Data")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_train_bal)
    pbar.update(1)

    # Step 2: PCA
    pbar.set_description("Applying PCA")
    pca = PCA(n_components=30)
    X_pca = pca.fit_transform(X_scaled)
    pbar.update(1)

    # Step 3: Train Ensemble
    pbar.set_description("Training Ensemble (ExtraTrees + XGB + LGB)")
    pipeline.named_steps["classifier"].fit(X_pca, y_train_bal)
    pbar.update(1)

print("\nâœ… Training complete")

# -----------------------------------
# EVALUATE
# -----------------------------------

print("\nðŸ“Š Evaluating model...")

X_test_scaled = scaler.transform(X_test)
X_test_pca = pca.transform(X_test_scaled)

y_pred = pipeline.named_steps["classifier"].predict(X_test_pca)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# -----------------------------------
# SAVE MODEL (FULL PIPELINE)
# -----------------------------------

final_pipeline = Pipeline([
    ("scaler", scaler),
    ("pca", pca),
    ("classifier", pipeline.named_steps["classifier"])
])

joblib.dump(final_pipeline, "/kaggle/working/ddos_model-latest-1.pkl", compress=3)

print("\nðŸ”¥ Model saved as ddos_model_latest.pkl")


In [None]:
# !pip uninstall -y imbalanced-learn
# !pip install imbalanced-learn --upgrade


# version 3

In [1]:
import os
import time
import joblib
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier

from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb

# -----------------------------------
# DATASET ROOT
# -----------------------------------

DATA_ROOT = "/kaggle/input/datasets/rodrigorosasilva/cic-ddos2019-30gb-full-dataset-csv-files"

csv_files = []
for root, dirs, files in os.walk(DATA_ROOT):
    for file in files:
        if file.endswith(".csv"):
            csv_files.append(os.path.join(root, file))

print(f"Found {len(csv_files)} CSV files")

# -----------------------------------
# LOAD DATA
# -----------------------------------

df_list = []
print("\nðŸ”„ Loading CSV files...")

for file in tqdm(csv_files, desc="Loading Files"):
    temp_df = pd.read_csv(file, low_memory=False, nrows=10000)
    df_list.append(temp_df)

df = pd.concat(df_list, ignore_index=True)
print("Dataset shape:", df.shape)

# -----------------------------------
# CLEAN DATA
# -----------------------------------

print("\nðŸ§¹ Cleaning dataset...")

df.columns = df.columns.str.strip()

drop_cols = ["Flow ID", "Source IP", "Destination IP", "Timestamp", "Unnamed: 0"]
for col in drop_cols:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

if "Label" not in df.columns:
    raise ValueError("Label column missing")

df["Label"] = df["Label"].astype("category").cat.codes
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

# -----------------------------------
# SELECT ONLY DETECTOR FEATURES
# -----------------------------------

detector_features = [
    "Flow Duration",
    "Total Fwd Packets",
    "Total Backward Packets",
    "Total Length of Fwd Packets",
    "Total Length of Bwd Packets",
    "Flow Bytes/s",
    "Flow Packets/s",
    "Fwd Packet Length Mean",
    "Fwd Packet Length Std",
    "Bwd Packet Length Mean",
    "Bwd Packet Length Std",
    "Fwd IAT Mean",
    "Fwd IAT Std",
    "Bwd IAT Mean",
    "Bwd IAT Std",
    "SYN Flag Count",
    "ACK Flag Count",
    "FIN Flag Count",
    "RST Flag Count",
    "PSH Flag Count",
    "URG Flag Count"
]

missing = [col for col in detector_features if col not in df.columns]
if missing:
    raise Exception(f"Missing required columns: {missing}")

X = df[detector_features]
y = df["Label"]

# -----------------------------------
# SPLIT
# -----------------------------------

print("\nâœ‚ Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# -----------------------------------
# SMOTE
# -----------------------------------

print("\nâš– Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# -----------------------------------
# DEFINE MODELS
# -----------------------------------

extra = ExtraTreesClassifier(
    n_estimators=150,
    max_depth=20,
    random_state=42,
    n_jobs=-1
)

xgb_model = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="mlogloss",
    random_state=42,
    verbosity=0
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=150,
    max_depth=6,
    num_leaves=31,
    random_state=42,
    verbosity=-1
)

# -----------------------------------
# PIPELINE (NO PCA NOW)
# -----------------------------------

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("classifier", VotingClassifier(
        estimators=[
            ("extra", extra),
            ("xgb", xgb_model),
            ("lgb", lgb_model)
        ],
        voting="soft",
        n_jobs=-1
    ))
])

# -----------------------------------
# TRAIN
# -----------------------------------

print("\nðŸš€ Training model...\n")
pipeline.fit(X_train_bal, y_train_bal)

print("\nâœ… Training complete")

# -----------------------------------
# EVALUATE
# -----------------------------------

print("\nðŸ“Š Evaluating model...")

y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# -----------------------------------
# SAVE
# -----------------------------------

joblib.dump(pipeline, "/kaggle/working/ddos_model_realtime.pkl", compress=3)

print("\nðŸ”¥ Model saved as ddos_model_realtime.pkl")


Found 18 CSV files

ðŸ”„ Loading CSV files...


Loading Files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 18/18 [00:01<00:00, 10.45it/s]


Dataset shape: (180000, 88)

ðŸ§¹ Cleaning dataset...

âœ‚ Splitting dataset...

âš– Applying SMOTE...

ðŸš€ Training model...


âœ… Training complete

ðŸ“Š Evaluating model...




Accuracy: 0.5955487630146146
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      4135
           1       0.98      0.80      0.88      2733
           2       0.33      0.64      0.43      2946
           3       0.39      0.08      0.14      2958
           4       0.70      0.95      0.80       601
           5       0.57      0.61      0.59      2902
           6       0.52      0.23      0.32      2993
           7       0.48      0.93      0.63      2979
           8       0.37      0.69      0.48      2932
           9       0.34      0.01      0.02      2948
          10       0.59      0.52      0.55      2854
          11       0.61      0.90      0.73      5717
          12       0.64      0.22      0.33      1525
          13       0.98      0.71      0.82      5563
          14       0.62      1.00      0.76      2624
          15       0.44      0.28      0.34      2942
          16       0.42      0.21      0.28      299

# final version

In [1]:
import os
import time
import joblib
import numpy as np
import pandas as pd

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier

from imblearn.over_sampling import SMOTE
import xgboost as xgb
import lightgbm as lgb

# -----------------------------------
# DATASET ROOT
# -----------------------------------

DATA_ROOT = "/kaggle/input/datasets/rodrigorosasilva/cic-ddos2019-30gb-full-dataset-csv-files"
# DATA_ROOT = "/kaggle/input/ddos-evaluation-dataset-cic-ddos2019"

csv_files = []
for root, dirs, files in os.walk(DATA_ROOT):
    for file in files:
        if file.endswith(".csv"):
            csv_files.append(os.path.join(root, file))

print(f"Found {len(csv_files)} CSV files")

# -----------------------------------
# LOAD DATA (Progress Bar)
# -----------------------------------

df_list = []
print("\nðŸ”„ Loading CSV files...")

for file in tqdm(csv_files, desc="Loading Files"):
    temp_df = pd.read_csv(file, low_memory=False, nrows=200000)
    df_list.append(temp_df)

df = pd.concat(df_list, ignore_index=True)
print("Dataset shape:", df.shape)

# -----------------------------------
# FIXED: SELECT ONLY FEATURES USED IN DETECTION
# -----------------------------------

# Define the exact features computed in detection (to avoid mismatch)
detection_features = [
    "Flow Duration", "Total Fwd Packets", "Total Backward Packets",
    "Total Length of Fwd Packets", "Total Length of Bwd Packets",
    "Flow Bytes/s", "Flow Packets/s", "Fwd Packet Length Mean",
    "Fwd Packet Length Std", "Bwd Packet Length Mean", "Bwd Packet Length Std",
    "Fwd IAT Mean", "Fwd IAT Std", "Bwd IAT Mean", "Bwd IAT Std",
    "SYN Flag Count", "ACK Flag Count", "FIN Flag Count",
    "RST Flag Count", "PSH Flag Count", "URG Flag Count"
]

# Clean and process Label first
df.columns = df.columns.str.strip()
drop_cols = ["Flow ID", "Source IP", "Destination IP", "Timestamp", "Unnamed: 0"]
for col in drop_cols:
    if col in df.columns:
        df.drop(columns=[col], inplace=True)

if "Label" not in df.columns:
    raise ValueError("Label column missing")

df["Label"] = df["Label"].astype("category").cat.codes  # Convert to numeric codes

# Now select numeric columns (includes Label now)
df = df.select_dtypes(include=[np.number])

# Select only detection features + Label (exclude Label from available_features)
available_features = [col for col in detection_features if col in df.columns]
df = df[available_features + ["Label"]]  # Keep only matching features + label

print(f"Selected {len(available_features)} features for training: {available_features}")

df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

X = df.drop(columns=["Label"])
y = df["Label"]

# -----------------------------------
# SPLIT
# -----------------------------------

print("\nâœ‚ Splitting dataset...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# -----------------------------------
# SMOTE
# -----------------------------------

print("\nâš– Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# -----------------------------------
# DEFINE MODELS
# -----------------------------------

extra = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    eval_metric="mlogloss",
    random_state=42,
    verbosity=0
)

lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=6,
    num_leaves=31,
    random_state=42,
    verbosity=-1
)

# -----------------------------------
# PIPELINE (Fixed: Fit the full pipeline, not pieces)
# -----------------------------------

pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=min(30, len(available_features)))),  # Adjust PCA if fewer features
    ("classifier", VotingClassifier(
        estimators=[
            ("extra", extra),
            ("xgb", xgb_model),
            ("lgb", lgb_model)
        ],
        voting="soft",
        n_jobs=-1
    ))
])

# -----------------------------------
# TRAIN (Fixed: Fit the entire pipeline at once)
# -----------------------------------

print("\nðŸš€ Training pipeline...\n")

start_time = time.time()
pipeline.fit(X_train_bal, y_train_bal)  # Fit the whole pipeline
training_time = time.time() - start_time

print(f"âœ… Training complete in {training_time:.2f} seconds")

# -----------------------------------
# EVALUATE
# -----------------------------------

print("\nðŸ“Š Evaluating model...")

y_pred = pipeline.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# -----------------------------------
# SAVE MODEL (FULL PIPELINE)
# -----------------------------------

joblib.dump(pipeline, "/kaggle/working/ddos_model-latest-1.pkl", compress=3)

print("\nðŸ”¥ Model saved as ddos_model-latest-1.pkl")
print(f"Feature names in model: {pipeline.feature_names_in_}")

  if entities is not ():


Found 18 CSV files

ðŸ”„ Loading CSV files...


Loading Files: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 18/18 [00:52<00:00,  2.94s/it]


Dataset shape: (3591694, 88)
Selected 21 features for training: ['Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets', 'Total Length of Bwd Packets', 'Flow Bytes/s', 'Flow Packets/s', 'Fwd Packet Length Mean', 'Fwd Packet Length Std', 'Bwd Packet Length Mean', 'Bwd Packet Length Std', 'Fwd IAT Mean', 'Fwd IAT Std', 'Bwd IAT Mean', 'Bwd IAT Std', 'SYN Flag Count', 'ACK Flag Count', 'FIN Flag Count', 'RST Flag Count', 'PSH Flag Count', 'URG Flag Count']

âœ‚ Splitting dataset...

âš– Applying SMOTE...

ðŸš€ Training pipeline...

âœ… Training complete in 596.89 seconds

ðŸ“Š Evaluating model...




Accuracy: 0.7393902938466264
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      7904
           1       0.75      0.73      0.74      4849
           2       0.29      0.68      0.40       371
           3       0.31      0.11      0.16      1299
           4       1.00      0.96      0.98     52260
           5       0.16      0.36      0.22       824
           6       0.12      0.52      0.20       141
           7       0.32      0.25      0.28     18122
           8       0.32      0.49      0.38     21956
           9       0.05      0.55      0.09        65
          10       0.42      0.57      0.48      1495
          11       0.30      0.51      0.38       266
          12       0.09      0.34      0.14       322
          13       0.99      0.92      0.96     52682
          14       0.85      0.99      0.91     15482
          15       0.50      0.43      0.47     32362
          16       0.90      0.69      0.78     2381

test