 Step 1: Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


 Step 2: Load Dataset

In [None]:
# Location of the APA-DDoS CSV; point this at your local copy of the dataset.
file_path = 'YAAD/APA-DDoS-Dataset.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

# Report the frame's dimensions, then let the notebook render the first rows.
print("Dataset shape:", df.shape)
df.head(5)


 Step 3: Data Cleaning

In [None]:
# Keep only columns that are at least 90% populated. `thresh` is documented as
# an integer count of required non-null values, so the fractional threshold is
# rounded up to the next whole count.
min_non_null = int(np.ceil(len(df) * 0.9))

# Rebind `df` instead of mutating it in place: the cell then reads as a single
# pipeline and gives the same result when it is re-run.
df = (
    df.dropna(axis=1, thresh=min_non_null)  # drop columns with too many missing values
      .dropna()                             # drop rows that still contain NaNs
      .drop_duplicates()                    # drop exact duplicate rows
)


 Step 4: Label Encoding

In [None]:
# Replace with actual label column
# Binary target: 1 = attack (label text contains 'DDoS'), 0 = benign.
# The guard makes the cell safe to re-run: once the column holds 0/1 integers,
# no value contains 'DDoS', so a second pass would silently zero every label.
if not pd.api.types.is_integer_dtype(df['Label']):
    df['Label'] = (
        df['Label'].astype(str).str.contains('DDoS', regex=False).astype(int)
    )


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot from a copy so that nothing done for visualisation touches `df`.
dataplt = df.copy()

sns.set_style("whitegrid")

# Draw the pairplot exactly once (it is the expensive step of this cell).
# With `hue` set, seaborn adds a figure-level legend for the classes itself,
# so no manual legend is built. The previous manual legend passed
# `labels.all()`, a single boolean, where a list of class names was expected.
ppt = sns.pairplot(dataplt, hue="Label", height=3)
plt.show()

In [None]:
# Nothing to install here: `dataplt` is the DataFrame copy created in the
# plotting cell, not a Python package. The former `pip install dataplt` would
# have tried to fetch whatever unrelated package carries that name on PyPI.

Step 5: Feature Scaling

In [None]:
# Feature matrix: every numeric column except the target itself.
numeric_columns = df.select_dtypes(include=[np.number])
X = numeric_columns.drop(columns='Label')

# Target vector
y = df['Label']

# Standardise the features with a freshly fitted scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


Step 6: Split the Data

In [None]:
# Split the *unscaled* features first, then fit the scaler on the training
# partition only. Fitting it on the full dataset lets test-set statistics leak
# into preprocessing; refitting here also means the `scaler` object that is
# persisted later matches exactly what the model was trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # learn mean/scale from training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

print("Training samples:", X_train.shape[0])
print("Test samples:", X_test.shape[0])


Step 7: Train a Baseline Model (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a 100-tree random forest with a fixed seed. `fit` returns the
# estimator, so construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Class predictions for the held-out split
y_pred = rf_model.predict(X_test)


Step 8: Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fraction of test samples classified correctly
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", baseline_accuracy)

# Per-class precision / recall / F1
baseline_report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", baseline_report)

# Raw counts of actual vs. predicted classes
baseline_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", baseline_matrix)


Visualize Confusion Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of the current predictions, rendered as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)

# Tick labels shared by both axes (class 0 first, class 1 second).
class_names = ["Benign", "DDoS"]
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


Step 9: Feature Importance

In [None]:
import numpy as np

# Importance score of each feature, as reported by the fitted forest
importances = rf_model.feature_importances_

# Feature positions ordered from most to least important
indices = np.argsort(importances)[::-1]

# Names of the ten highest-ranked features
top_features = X.columns[indices[:10]].tolist()
print("Top 10 important features:", top_features)


Step 1: Hyperparameter Tuning (GridSearchCV)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 x 3 x 2 x 2 = 36 candidate settings, each scored with
# 5-fold cross-validation.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Untuned estimator that the search clones for every candidate
rf = RandomForestClassifier(random_state=42)

# Exhaustive search ranked by F1, using every available core
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search
best_model = grid_search.best_estimator_
print("Best parameters:", grid_search.best_params_)


Step 2: Model Evaluation (on Test Set)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Hard predictions from the tuned model
y_pred = best_model.predict(X_test)

# Second probability column, i.e. the score for classes_[1]
# (the attack class when labels are encoded as 0/1)
class_probabilities = best_model.predict_proba(X_test)
y_prob = class_probabilities[:, 1]

# Metrics
tuned_accuracy = accuracy_score(y_test, y_pred)
tuned_auc = roc_auc_score(y_test, y_prob)
print("Accuracy:", tuned_accuracy)
print("AUC Score:", tuned_auc)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


 Step 3: Model Deployment Preparation

In [None]:
import joblib

# Persist the tuned classifier to disk
model_path = 'ddos_detector_model.pkl'
joblib.dump(best_model, model_path)

# Persist the fitted scaler next to it so inference can reuse it
scaler_path = 'scaler.pkl'
joblib.dump(scaler, scaler_path)


In [None]:
# Save this as app.py
from flask import Flask, request, jsonify
import joblib
import numpy as np

app = Flask(__name__)
# NOTE: joblib artifacts are pickles and can execute code when loaded;
# only load files you produced yourself.
model = joblib.load('ddos_detector_model.pkl')
scaler = joblib.load('scaler.pkl')


def _bad_request(message):
    """Build a JSON 400 response carrying a human-readable error message."""
    return jsonify({'error': message}), 400


@app.route('/predict', methods=['POST'])
def predict():
    """Classify one feature vector posted as JSON ``{"data": [...]}``.

    ``data`` must be a flat list of raw (unscaled) numeric features; the
    handler applies the loaded scaler before calling the model.

    Returns:
        ``{"prediction": <int>}`` on success, or HTTP 400 with
        ``{"error": <message>}`` when the payload is missing or malformed.
    """
    # silent=True yields None instead of raising on a missing/invalid JSON body.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or 'data' not in payload:
        return _bad_request("Request body must be JSON of the form {\"data\": [...]}.")

    try:
        features = np.asarray(payload['data'], dtype=float)
    except (TypeError, ValueError):
        return _bad_request("'data' must be a flat list of numbers.")

    if features.ndim != 1 or features.size == 0:
        return _bad_request("'data' must be a non-empty, one-dimensional list.")
    if not np.isfinite(features).all():
        return _bad_request("'data' must not contain null, NaN or infinite values.")

    # Check the vector length up front when the scaler exposes the count it
    # was fitted with, so the client gets a clear 400 instead of a 500.
    expected = getattr(scaler, 'n_features_in_', None)
    if expected is not None and features.size != expected:
        return _bad_request(f"Expected {expected} features, got {features.size}.")

    data_scaled = scaler.transform(features.reshape(1, -1))
    prediction = model.predict(data_scaled)[0]
    return jsonify({'prediction': int(prediction)})


if __name__ == '__main__':
    # Debug mode enables the interactive Werkzeug debugger, which allows
    # arbitrary code execution from the browser; keep it off.
    app.run(debug=False)


In [None]:
{
  "data": [0.1, 0.23, 4.5, 23, 0, ...]  # raw (unscaled) feature vector, same order as training; the /predict handler applies the scaler itself
}


 Step 1: Setup Real-Time Traffic Capture

In [None]:
from scapy.all import sniff
import joblib
import numpy as np

# Restore the persisted classifier and scaler (in that order) from disk
model, scaler = (
    joblib.load(artifact)
    for artifact in ('ddos_detector_model.pkl', 'scaler.pkl')
)

# Feature extraction function (very basic example)
def extract_features(pkt):
    """Turn one captured packet into a flat feature list.

    Args:
        pkt: Packet-like object supporting ``len()`` and exposing ``time``;
            ``ttl`` is optional.

    Returns:
        ``[size, ttl, timestamp]``, where ``ttl`` falls back to 64 when the
        packet has no such attribute, or ``None`` if any value cannot be read.
    """
    # NOTE(review): this yields 3 values, while the scaler is fitted on every
    # numeric column of the training frame. Confirm the two feature sets match.
    try:
        return [
            len(pkt),                 # packet size
            getattr(pkt, 'ttl', 64),  # time to live, defaulting to 64
            pkt.time,                 # timestamp
        ]
    except Exception:
        # Best effort: skip packets that cannot be read. Catching Exception,
        # not everything, lets KeyboardInterrupt/SystemExit stop the capture.
        return None

# Callback for each packet
def process_packet(pkt):
    """Classify a single captured packet and print the verdict.

    Packets for which ``extract_features`` returns nothing are ignored.
    Otherwise the features are scaled with the loaded scaler, passed to the
    loaded model, and a message is printed for the predicted class.
    """
    features = extract_features(pkt)
    if not features:
        return

    # NOTE(review): the scaler must have been fitted on the same number of
    # features that extract_features returns. Confirm before running live.
    data_scaled = scaler.transform([features])
    prediction = model.predict(data_scaled)[0]

    verdict = "🚨 DDoS Attack Detected" if prediction == 1 else "✅ Normal Traffic"
    print(verdict)

# Begin sniffing IP traffic: each packet is handed to process_packet and is
# not kept in memory afterwards.
sniff(
    prn=process_packet,
    filter="ip",
    store=0,
)


In [None]:
pip install scapy


 Step 2: Integration into Network Infrastructure

 Step 3: Live Testing & Validation