In [1]:
!pip install numpy pandas scikit-learn xgboost flask requests boto3 joblib



In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import xgboost as xgb
import joblib
from flask import Flask, request, jsonify

# Load dataset
df = pd.read_csv("DDos.csv")

df.columns = df.columns.str.strip()  # Remove spaces in column names

df = df.dropna()  # Drop missing values

df['Label'] = df['Label'].astype(str).str.strip()  # Ensure labels are strings
df['Label'] = df['Label'].apply(lambda x: 1 if 'DDoS' in x else 0)  # Convert to binary

# Check label distribution
print("Class distribution before SMOTE:")
print(df['Label'].value_counts())

# Define features
df['Total_Bytes'] = df['Total Length of Fwd Packets'] + df['Total Length of Bwd Packets']
df['Packet_Count'] = df['Total Fwd Packets'] + df['Total Backward Packets']
features = ['Flow Duration', 'Total_Bytes', 'Average Packet Size', 'Packet_Count']

X = df[features]
y = df['Label']

if len(y.unique()) > 1:
    smote = SMOTE(sampling_strategy='auto', random_state=42)
    X, y = smote.fit_resample(X, y)
    print("Applied SMOTE successfully.")
else:
    print("Skipping SMOTE: Only one class found in the dataset.")

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train XGBoost model
model = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    subsample=0.8,
    use_label_encoder=False,
    eval_metric="logloss"
)
model.fit(X_train, y_train)

# Save model and scaler
joblib.dump(model, "xgboost_ddos.pkl")
joblib.dump(scaler, "scaler.pkl")
print("Model and scaler saved successfully.")


Class distribution before SMOTE:
Label
1    34952
0    31284
Name: count, dtype: int64
Applied SMOTE successfully.


Parameters: { "use_label_encoder" } are not used.



Model and scaler saved successfully.


In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Make predictions
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.4f}")

# Display classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Model Accuracy: 0.9894
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6954
           1       0.99      0.99      0.99      7027

    accuracy                           0.99     13981
   macro avg       0.99      0.99      0.99     13981
weighted avg       0.99      0.99      0.99     13981

Confusion Matrix:
[[6876   78]
 [  70 6957]]


In [24]:
df = pd.read_csv("DDos.csv")
print(df.columns)

Index([' Destination Port', ' Flow Duration', ' Total Fwd Packets',
       ' Total Backward Packets', 'Total Length of Fwd Packets',
       ' Total Length of Bwd Packets', ' Fwd Packet Length Max',
       ' Fwd Packet Length Min', ' Fwd Packet Length Mean',
       ' Fwd Packet Length Std', 'Bwd Packet Length Max',
       ' Bwd Packet Length Min', ' Bwd Packet Length Mean',
       ' Bwd Packet Length Std', 'Flow Bytes/s', ' Flow Packets/s',
       ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min',
       'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max',
       ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std',
       ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', ' Max Packet Length', ' Packet Length Mean',
       ' Packet Length Std', ' Packet Length Variance', '

In [26]:
df.columns = df.columns.str.strip()  # Remove spaces
print(df.columns)  # Check again

Index(['Destination Port', 'Flow Duration', 'Total Fwd Packets',
       'Total Backward Packets', 'Total Length of Fwd Packets',
       'Total Length of Bwd Packets', 'Fwd Packet Length Max',
       'Fwd Packet Length Min', 'Fwd Packet Length Mean',
       'Fwd Packet Length Std', 'Bwd Packet Length Max',
       'Bwd Packet Length Min', 'Bwd Packet Length Mean',
       'Bwd Packet Length Std', 'Flow Bytes/s', 'Flow Packets/s',
       'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
       'Fwd IAT Total', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max',
       'Fwd IAT Min', 'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std',
       'Bwd IAT Max', 'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags',
       'Fwd URG Flags', 'Bwd URG Flags', 'Fwd Header Length',
       'Bwd Header Length', 'Fwd Packets/s', 'Bwd Packets/s',
       'Min Packet Length', 'Max Packet Length', 'Packet Length Mean',
       'Packet Length Std', 'Packet Length Variance', 'FIN Flag Count',
       'SYN Flag Co

In [28]:
for col in df.columns:
    if "label" in col.lower():
        print("Possible label column:", col)

Possible label column: Label


In [30]:
df.rename(columns={" Label ": "Label"}, inplace=True)

In [32]:
print(df['Label'].value_counts())
print(df['Label'].unique())

Label
DDoS      34952
BENIGN    31284
Name: count, dtype: int64
['BENIGN' 'DDoS' nan]


In [34]:
import joblib

scaler = joblib.load("scaler.pkl")
print("Scaler input shape:", scaler.n_features_in_)

Scaler input shape: 4
