<a href="https://colab.research.google.com/github/VINOTH1916/offi/blob/main/Copy_of_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import time
import psutil
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from scipy.sparse import csr_matrix
from sklearn.impute import SimpleImputer

# Memory Usage Function
def memory_usage():
    """Return the resident set size (RSS) of the current process in MiB."""
    bytes_per_mib = 1024 ** 2
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / bytes_per_mib

# Local imports: only this block needs them, and keeping them here keeps the cell self-contained.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

# Load Dataset
data = pd.read_csv('/content/sample_data/balanced_ddos_data.csv')  # Replace with the actual path to your dataset

# Fix Column Names: Remove leading/trailing spaces
data.columns = data.columns.str.strip()

# Encode 'Label': BENIGN -> 0, Other Attack Types -> 1
data['Label'] = data['Label'].map(lambda x: 0 if x == 'BENIGN' else 1)

# Data Preprocessing: every column except the label and the two IP columns is used as a feature
X = data.drop(['Label', 'Source IP', 'Destination IP'], axis=1).astype(np.float32)
y = data['Label']

# Train-Test Split
# Split BEFORE imputing so that the imputation means are learned from the
# training rows only; fitting the imputer on the full dataset would leak
# test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle Missing Values: impute with the training-set column mean
imputer = SimpleImputer(strategy='mean')
# Features are handed to the model as sparse CSR matrices (kept from the original design)
X_train = csr_matrix(imputer.fit_transform(X_train_raw))
X_test = csr_matrix(imputer.transform(X_test_raw))

# Model Training
start_time = time.time()
clf = ExtraTreesClassifier(n_estimators=100, max_depth=20, max_features='sqrt', random_state=42)
clf.fit(X_train, y_train)
training_time = time.time() - start_time
# NOTE(review): another cell loads 'trained_model.joblib', but this cell never
# writes it - confirm where that file is produced.

# Training Metrics
y_pred_train = clf.predict(X_train)
train_cm = confusion_matrix(y_train, y_pred_train)
train_cr = classification_report(y_train, y_pred_train)
train_acc = accuracy_score(y_train, y_pred_train)

# Testing Metrics
y_pred_test = clf.predict(X_test)
test_cm = confusion_matrix(y_test, y_pred_test)
test_cr = classification_report(y_test, y_pred_test)
test_acc = accuracy_score(y_test, y_pred_test)

# Cross-validation on the full dataset.
# The imputer lives inside the pipeline so it is re-fitted on each training
# fold; cross_val_predict clones the pipeline, so the fitted `clf` above is
# left untouched.
cv_model = make_pipeline(SimpleImputer(strategy='mean'), FunctionTransformer(csr_matrix), clf)
predicted = cross_val_predict(cv_model, X, y, cv=5)
# NOTE: `bias` is the mean signed error of the out-of-fold predictions and
# `variance` is the variance of the predicted labels; neither is a formal
# bias-variance decomposition.
bias = np.mean(predicted - y)
variance = np.var(predicted)

# Results
print("Training Accuracy:", train_acc)
print("Training Confusion Matrix:\n", train_cm)
print("Training Classification Report:\n", train_cr)
print("Testing Accuracy:", test_acc)
print("Testing Confusion Matrix:\n", test_cm)
print("Testing Classification Report:\n", test_cr)
print("Training Time:", training_time, "seconds")
print("Memory Usage:", memory_usage(), "MB")
print("Bias:", bias)
print("Variance:", variance)

# Optional: Calculate AUC-ROC
if len(set(y)) == 2:  # Only calculate if it's binary classification
    # Column 1 of predict_proba is the probability of class 1 (attack)
    y_pred_prob = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)
    print("AUC-ROC Score:", auc)


In [None]:
# capture_network.py
import psutil
import time
import pandas as pd

# Simulated capture (replace with actual traffic monitoring logic)
def capture_network_data():
    """Return a one-row DataFrame describing a single hard-coded example flow.

    This is a stand-in for real traffic capture: the values are fixed
    examples, not measurements.
    """
    example_flow = [
        ('Source IP', '192.168.1.1'),
        ('Destination IP', '8.8.8.8'),
        ('Protocol', 17),  # Example: 17 for UDP, replace with actual protocols
        ('Flow Duration', 1000),
        ('Total Length of Fwd Packets', 1500),
        ('Total Length of Bwd Packets', 0),
        ('Flow IAT Mean', 10),
        ('Flow IAT Max', 50),
        ('Flow IAT Min', 1),
        ('Packet Length Mean', 60),
        ('FIN Flag Count', 0),
        ('SYN Flag Count', 0),
        ('PSH Flag Count', 0),
        ('ACK Flag Count', 1),
    ]
    # One single-element list per column -> a frame with exactly one row
    return pd.DataFrame({column: [value] for column, value in example_flow})

# Script entry point: capture one (simulated) batch and write it to CSV so
# that later steps can read it from 'captured_network_data.csv'.
if __name__ == "__main__":
    captured_data = capture_network_data()
    # index=False: do not write the DataFrame index as an extra CSV column
    captured_data.to_csv('captured_network_data.csv', index=False)
    print("Captured data saved to 'captured_network_data.csv'.")


In [63]:
# preprocess_data.py
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Function to encode IP addresses (simple version)
def encode_ip(df):
    """Return a copy of ``df`` with the IP columns replaced by numeric codes.

    Each dotted address in 'Source IP' and 'Destination IP' is replaced by the
    sum of its octets (e.g. '192.168.1.1' -> 362). This is a simple example
    encoding and is not unique: different addresses can share a code.

    The input frame is NOT modified. The previous implementation overwrote the
    caller's columns in place, so calling it twice on the same frame crashed
    (integers have no ``split``).

    Parameters:
        df: DataFrame containing string columns 'Source IP' and 'Destination IP'.

    Returns:
        A new DataFrame with the same columns, the two IP columns numeric.
    """
    def _octet_sum(address):
        return sum(int(octet) for octet in address.split('.'))

    # assign() builds a new frame; the ** dict is needed because the column
    # names contain spaces and cannot be written as keyword arguments.
    return df.assign(**{
        'Source IP': df['Source IP'].map(_octet_sum),
        'Destination IP': df['Destination IP'].map(_octet_sum),
    })

# Preprocess data (scale and handle missing values)
def preprocess_data(df, scaler=None):
    """Turn a captured-traffic frame into an all-numeric feature frame.

    Steps:
      1. Encode the IP columns with ``encode_ip`` (on a copy, so the caller's
         frame is never modified).
      2. Coerce every column to numeric; values that cannot be parsed become
         NaN and are then replaced by 0. (Filling before coercing, as the code
         did previously, let the coerced NaNs survive.)
      3. Optionally apply an ALREADY FITTED scaler.

    Parameters:
        df: raw captured data, including 'Source IP' and 'Destination IP'.
        scaler: optional fitted transformer exposing ``transform``. When
            omitted, no scaling is applied. A scaler is deliberately never
            fitted here: fitting on the inference batch does not reproduce any
            training-time scaling, and on a single-row batch it maps every
            feature to 0.

    Returns:
        A new float DataFrame with the same columns and a fresh 0..n-1 index.
    """
    numeric = encode_ip(df.copy())  # copy guards against in-place encoders
    numeric = numeric.apply(pd.to_numeric, errors='coerce').fillna(0)

    if scaler is None:
        return numeric.astype(float).reset_index(drop=True)

    return pd.DataFrame(scaler.transform(numeric), columns=numeric.columns)

# Load captured data (expects the CSV written by the capture step to exist
# in the current working directory)
captured_data = pd.read_csv('captured_network_data.csv')

# Preprocess the data and persist the result next to the input file
processed_data = preprocess_data(captured_data)
# index=False: do not write the DataFrame index as an extra CSV column
processed_data.to_csv('processed_data.csv', index=False)
print("Processed data saved to 'processed_data.csv'.")


Processed data saved to 'processed_data.csv'.


In [None]:
import pandas as pd
import joblib
from sklearn.preprocessing import StandardScaler

# Load the trained model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
# NOTE(review): no cell visible in this notebook writes 'trained_model.joblib';
# confirm where it is produced and which features it was trained on.
clf = joblib.load('trained_model.joblib')

# Function to preprocess the data
def preprocess_data(df, scaler=None):
    """Turn a captured-traffic frame into an all-numeric feature frame.

    Steps:
      1. Replace each dotted address in 'Source IP' / 'Destination IP' by the
         sum of its octets (simple example encoding; the ``ipaddress`` module
         would give a unique integer per address). This is done on a new
         frame, so the caller's data is never modified.
      2. Coerce every column to numeric; values that cannot be parsed become
         NaN and are then replaced by 0. (Filling before coercing, as the code
         did previously, let the coerced NaNs survive.)
      3. Optionally apply an ALREADY FITTED scaler.

    Parameters:
        df: raw captured data, including 'Source IP' and 'Destination IP'.
        scaler: optional fitted transformer exposing ``transform``. When
            omitted, no scaling is applied. A scaler is deliberately never
            fitted here: fitting on the inference batch does not reproduce any
            training-time scaling, and on a single-row batch it maps every
            feature to 0.

    Returns:
        A new float DataFrame with the same columns and a fresh 0..n-1 index.
    """
    def _octet_sum(address):
        return sum(int(octet) for octet in address.split('.'))

    # assign() returns a new frame; the ** dict is required because the
    # column names contain spaces.
    numeric = df.assign(**{
        'Source IP': df['Source IP'].map(_octet_sum),
        'Destination IP': df['Destination IP'].map(_octet_sum),
    })
    numeric = numeric.apply(pd.to_numeric, errors='coerce').fillna(0)

    if scaler is None:
        return numeric.astype(float).reset_index(drop=True)

    return pd.DataFrame(scaler.transform(numeric), columns=numeric.columns)

# Read the captured flows and run them through the preprocessing step
captured_data = pd.read_csv('captured_network_data.csv')
processed_data = preprocess_data(captured_data)

# Normalise the column names (drop surrounding whitespace) before selecting by name
processed_data.columns = processed_data.columns.str.strip()

# Keep only the 12 model features, in this order
processed_data = processed_data.loc[:, [
    'Flow Duration',
    'Total Length of Fwd Packets',
    'Total Length of Bwd Packets',
    'Flow IAT Mean',
    'Flow IAT Max',
    'Flow IAT Min',
    'Packet Length Mean',
    'FIN Flag Count',
    'SYN Flag Count',
    'PSH Flag Count',
    'ACK Flag Count',
    'Protocol',
]]

# The classifier is fed a plain numpy array
processed_data_np = processed_data.to_numpy()
predictions = clf.predict(processed_data_np)

# Class 0 is reported as 'BENIGN'; anything else as 'DDoS'
predicted_labels = ['DDoS' if pred != 0 else 'BENIGN' for pred in predictions]
print(predicted_labels)


In [62]:
# monitor_network.py
import time
import pandas as pd
import joblib
from capture_network import capture_network_data  # Your network capture function
from preprocess_data import preprocess_data  # Your preprocessing function

# detect_ddos.py
# detect_ddos.py
def detect_ddos(clf, data):
    """Classify each record in ``data`` and print one line per prediction.

    Parameters:
        clf: object exposing ``predict(data)``.
        data: records to classify, passed to ``clf.predict`` unchanged.

    Prints 'BENIGN' for a prediction equal to 0 and 'DDoS' otherwise, followed
    by the total number of predictions. Returns nothing.
    """
    total = 0
    for record_number, pred in enumerate(clf.predict(data), start=1):
        label = 'DDoS' if pred != 0 else 'BENIGN'
        print(f"Prediction for record {record_number}: {label}")
        total = record_number
    print(f"Total predictions: {total}")

# monitor_network.py
def monitor_network(model=None, interval=5, max_cycles=None):
    """Repeatedly capture, preprocess and classify network data.

    Each cycle calls ``capture_network_data()``, runs the result through
    ``preprocess_data``, selects the 12 model features and hands them to
    ``detect_ddos``. Empty captures are reported and skipped.

    Parameters:
        model: classifier passed to ``detect_ddos``. Defaults to the
            module-level ``clf``, which must then already exist (this function
            does not load it).
        interval: seconds to sleep between cycles (default 5).
        max_cycles: stop after this many cycles. ``None`` (default) loops
            until interrupted, as before.

    Calling ``monitor_network()`` with no arguments behaves exactly like the
    original unbounded loop.
    """
    feature_columns = ['Flow Duration',
                       'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
                       'Flow IAT Mean', 'Flow IAT Max', 'Flow IAT Min',
                       'Packet Length Mean', 'FIN Flag Count',
                       'SYN Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'Protocol']

    cycles_done = 0
    while max_cycles is None or cycles_done < max_cycles:
        # Capture network data (replace with real-time capture logic)
        captured_data = capture_network_data()

        if captured_data is None or captured_data.empty:
            print("No network data captured. Skipping prediction cycle.")
        else:
            processed_data = preprocess_data(captured_data)

            # Normalise column names, then keep exactly the 12 model features
            processed_data.columns = processed_data.columns.str.strip()
            processed_data = processed_data[feature_columns]

            # The global `clf` is looked up only when no model was supplied
            detect_ddos(clf if model is None else model, processed_data.to_numpy())

        cycles_done += 1
        # Skip the pause after the final cycle of a bounded run
        if max_cycles is None or cycles_done < max_cycles:
            time.sleep(interval)


# Script entry point: start monitoring. In a notebook this condition is true,
# so running the cell starts the loop immediately; the recorded run was
# stopped with a KeyboardInterrupt.
if __name__ == "__main__":
    monitor_network()


Prediction for record 1: BENIGN
Total predictions: 1
Prediction for record 1: BENIGN
Total predictions: 1
Prediction for record 1: BENIGN
Total predictions: 1
Prediction for record 1: BENIGN
Total predictions: 1
Prediction for record 1: BENIGN
Total predictions: 1
Prediction for record 1: BENIGN
Total predictions: 1
Prediction for record 1: BENIGN
Total predictions: 1


KeyboardInterrupt: 

In [8]:
import time
import psutil
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from scipy.sparse import csr_matrix
from sklearn.impute import SimpleImputer

# Memory Usage Function
def memory_usage():
    """Return the resident set size (RSS) of the current process in MiB."""
    bytes_per_mib = 1024 ** 2
    rss_bytes = psutil.Process().memory_info().rss
    return rss_bytes / bytes_per_mib

# Local imports: only this block needs them, and keeping them here keeps the cell self-contained.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer

# Load Dataset
data = pd.read_csv('/content/sample_data/balanced_ddos_data.csv')  # Replace with the actual path to your dataset

# Fix Column Names: Remove leading/trailing spaces
data.columns = data.columns.str.strip()

# Encode 'Label': BENIGN -> 0, Other Attack Types -> 1
data['Label'] = data['Label'].map(lambda x: 0 if x == 'BENIGN' else 1)

# Data Preprocessing: every column except the label and the two IP columns is used as a feature
X = data.drop(['Label', 'Source IP', 'Destination IP'], axis=1).astype(np.float32)
y = data['Label']

# Train-Test Split
# Split BEFORE imputing so that the imputation means are learned from the
# training rows only; fitting the imputer on the full dataset would leak
# test-set statistics into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle Missing Values: impute with the training-set column mean
imputer = SimpleImputer(strategy='mean')
# Features are handed to the model as sparse CSR matrices (kept from the original design)
X_train = csr_matrix(imputer.fit_transform(X_train_raw))
X_test = csr_matrix(imputer.transform(X_test_raw))

# Model Training
start_time = time.time()
clf = ExtraTreesClassifier(n_estimators=100, max_depth=20, max_features='sqrt', random_state=42)
clf.fit(X_train, y_train)
training_time = time.time() - start_time
# NOTE(review): another cell loads 'trained_model.joblib', but this cell never
# writes it - confirm where that file is produced.

# Training Metrics
y_pred_train = clf.predict(X_train)
train_cm = confusion_matrix(y_train, y_pred_train)
train_cr = classification_report(y_train, y_pred_train)
train_acc = accuracy_score(y_train, y_pred_train)

# Testing Metrics
y_pred_test = clf.predict(X_test)
test_cm = confusion_matrix(y_test, y_pred_test)
test_cr = classification_report(y_test, y_pred_test)
test_acc = accuracy_score(y_test, y_pred_test)

# Cross-validation on the full dataset.
# The imputer lives inside the pipeline so it is re-fitted on each training
# fold; cross_val_predict clones the pipeline, so the fitted `clf` above is
# left untouched.
cv_model = make_pipeline(SimpleImputer(strategy='mean'), FunctionTransformer(csr_matrix), clf)
predicted = cross_val_predict(cv_model, X, y, cv=5)
# NOTE: `bias` is the mean signed error of the out-of-fold predictions and
# `variance` is the variance of the predicted labels; neither is a formal
# bias-variance decomposition.
bias = np.mean(predicted - y)
variance = np.var(predicted)

# Results
print("Training Accuracy:", train_acc)
print("Training Confusion Matrix:\n", train_cm)
print("Training Classification Report:\n", train_cr)
print("Testing Accuracy:", test_acc)
print("Testing Confusion Matrix:\n", test_cm)
print("Testing Classification Report:\n", test_cr)
print("Training Time:", training_time, "seconds")
print("Memory Usage:", memory_usage(), "MB")
print("Bias:", bias)
print("Variance:", variance)

# Optional: Calculate AUC-ROC
if len(set(y)) == 2:  # Only calculate if it's binary classification
    # Column 1 of predict_proba is the probability of class 1 (attack)
    y_pred_prob = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)
    print("AUC-ROC Score:", auc)


Training Accuracy: 0.9974401419867911
Training Confusion Matrix:
 [[90886   360]
 [   90 84455]]
Training Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     91246
           1       1.00      1.00      1.00     84545

    accuracy                           1.00    175791
   macro avg       1.00      1.00      1.00    175791
weighted avg       1.00      1.00      1.00    175791

Testing Accuracy: 0.9972467461545463
Testing Confusion Matrix:
 [[22499    83]
 [   38 21328]]
Testing Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     22582
           1       1.00      1.00      1.00     21366

    accuracy                           1.00     43948
   macro avg       1.00      1.00      1.00     43948
weighted avg       1.00      1.00      1.00     43948

Training Time: 31.288391590118408 seconds
Memory Usage: 453.9453125 MB
Bias: 0.0013698069