# Generate DDoS logs based on an ICMP flood pattern

In [1]:
import random
from datetime import datetime, timedelta
import json

# Function to generate synthetic DDoS logs based on the pattern
def generate_synthetic_ddos_logs(num_pairs_per_ip=500, ip_list=None):
    """Generate synthetic ICMP-flood (DDoS) request/reply log entries.

    For every IP in ``ip_list`` a burst of ``num_pairs_per_ip`` request/reply
    pairs is produced. Within a burst, pair ``i`` is stamped at
    ``base_time + random.uniform(0.01, 0.1) * i`` seconds and its reply
    0.1 seconds later. After each burst the base time is moved to the last
    request timestamp plus a random 1-5 minutes, so bursts from successive
    IPs are staggered.

    Args:
        num_pairs_per_ip: Number of request/reply pairs generated per IP.
            Zero (or a negative value) yields no entries for that IP.
        ip_list: Source IPs of the attackers. Defaults to a built-in list
            of six addresses when ``None``.

    Returns:
        A list of dicts, two per pair (request first, then reply), all with
        ``"label": 1``. Timestamps are formatted as ``HH:MM:SS.mmm`` and
        carry no date component.
    """
    if ip_list is None:
        # Default attacker IPs
        ip_list = [
            "192.168.1.64",
            "192.168.1.65",
            "192.168.1.66",
            "192.168.1.67",
            "192.168.1.2",
            "192.168.1.10",
        ]

    logs = []

    # Read the clock once, outside the loop: resetting it per IP would
    # discard the stagger applied at the end of the previous burst.
    base_time = datetime.now()

    for ip in ip_list:
        # Tracks the most recent request time of this burst; stays at
        # base_time when the burst is empty so the stagger below is still
        # well defined.
        last_timestamp = base_time

        for i in range(num_pairs_per_ip):
            # Very short spacing between requests (simulates a flood)
            time_diff = random.uniform(0.01, 0.1)
            timestamp = base_time + timedelta(seconds=time_diff * i)
            # Reply follows shortly after the request
            reply_timestamp = timestamp + timedelta(seconds=0.1)
            seq = str(1290 + i)  # Sequence number increments with each pair

            # Request entry (label 1 = DDoS)
            logs.append({
                "timestamp": timestamp.strftime("%H:%M:%S.%f")[:-3],  # HH:MM:SS.sss
                "dst_host": "mekky-VirtualBox",
                "id": "1",
                "length": "40",
                "type": "request",
                "seq": seq,
                "src_ip": ip,
                "label": 1,
            })

            # Matching reply entry (label 1 = DDoS)
            logs.append({
                "dst_ip": ip,
                "timestamp": reply_timestamp.strftime("%H:%M:%S.%f")[:-3],  # HH:MM:SS.sss
                "src_host": "mekky-VirtualBox",
                "id": "1",
                "length": "40",
                "type": "reply",
                "seq": seq,
                "label": 1,
            })

            last_timestamp = timestamp

        # Stagger the next IP's burst by a random 1-5 minutes
        base_time = last_timestamp + timedelta(minutes=random.randint(1, 5))

    return logs

# Function to generate normal logs with several different IPs and random timestamps
def generate_normal_logs(num_logs=100, ip_list=None):
    """Generate request/reply log entries representing normal traffic.

    Each pair is stamped at the current time plus a random 60-300 second
    offset, with the reply 0.1 seconds after the request, and uses a source
    IP picked at random from ``ip_list``.

    Args:
        num_logs: Number of request/reply pairs to generate.
        ip_list: Candidate source IPs. Defaults to a built-in list of seven
            addresses when ``None``.

    Returns:
        A list of dicts, two per pair (request first, then reply), all with
        ``"label": 0``. Timestamps are formatted as ``HH:MM:SS.mmm``.
    """
    if ip_list is None:
        ip_list = [
            "192.168.1.1",
            "192.168.1.2",
            "192.168.1.10",
            "192.168.1.5",
            "192.168.1.86",
            "192.168.1.17",
            "192.168.1.34",
        ]

    time_format = "%H:%M:%S.%f"
    reply_delay = timedelta(seconds=0.1)
    entries = []

    for index in range(num_logs):
        # Request time: now plus a random 1-5 minute offset
        request_time = datetime.now() + timedelta(seconds=random.uniform(60, 300))
        # Source IP is drawn after the offset, one draw per pair
        chosen_ip = random.choice(ip_list)
        sequence = str(1290 + index)

        request_entry = {
            "timestamp": request_time.strftime(time_format)[:-3],  # trim to milliseconds
            "dst_host": "mekky-VirtualBox",
            "id": "1",
            "length": "40",
            "type": "request",
            "seq": sequence,
            "src_ip": chosen_ip,
            "label": 0,  # 0 = normal traffic
        }
        reply_entry = {
            "dst_ip": chosen_ip,
            "timestamp": (request_time + reply_delay).strftime(time_format)[:-3],
            "src_host": "mekky-VirtualBox",
            "id": "1",
            "length": "40",
            "type": "reply",
            "seq": sequence,
            "label": 0,  # 0 = normal traffic
        }

        entries.extend((request_entry, reply_entry))

    return entries

# Function to generate a second batch of normal logs, drawn from a different default set of IPs
def generate_normal_logs2(num_logs=100, ip_list=None):
    """Generate normal-traffic request/reply pairs from a second IP pool.

    Produces exactly what ``generate_normal_logs`` produces; the only
    difference is the default ``ip_list`` used when none is supplied.

    Args:
        num_logs: Number of request/reply pairs to generate.
        ip_list: Candidate source IPs. Defaults to this function's own
            built-in list of seven addresses when ``None``.

    Returns:
        A list of dicts, two per pair (request first, then reply), all with
        ``"label": 0``.
    """
    if ip_list is None:
        ip_list = [
            "192.168.1.12",
            "192.168.1.13",
            "192.168.1.22",
            "192.168.1.25",
            "192.168.1.28",
            "192.168.1.34",
            "192.168.1.39",
        ]

    # Delegate instead of duplicating the generation loop
    return generate_normal_logs(num_logs, ip_list)

# Save logs to a JSON Lines file (one JSON object per line)
def save_logs_to_file(logs, filename="synthetic_fw_logs.json"):
    """Write ``logs`` to ``filename`` as JSON Lines and print a confirmation.

    Args:
        logs: Iterable of JSON-serializable log entries.
        filename: Destination path; an existing file is overwritten.
    """
    with open(filename, 'w') as out:
        # One serialized entry per line
        out.writelines(json.dumps(entry) + "\n" for entry in logs)
    print(f"Logs saved to {filename}")

# Build the three batches: normal traffic, the attack bursts, then more normal traffic
normal_logs1 = generate_normal_logs(100)  # 100 request-reply pairs of normal traffic
synthetic_ddos_logs = generate_synthetic_ddos_logs(num_pairs_per_ip=200)  # 200 request-reply pairs per attacking IP
normal_logs2 = generate_normal_logs2(100)  # another 100 request-reply pairs of normal traffic

# Merge all batches and order them by timestamp (time of day), ascending
combined_logs = sorted(
    normal_logs1 + synthetic_ddos_logs + normal_logs2,
    key=lambda entry: datetime.strptime(entry['timestamp'], "%H:%M:%S.%f"),
)

# Tally entries per class (label 1 = DDoS, label 0 = normal)
num_ddos_logs = sum(1 for entry in combined_logs if entry['label'] == 1)
num_normal_logs = sum(1 for entry in combined_logs if entry['label'] == 0)

print(f"Number of DDoS logs generated: {num_ddos_logs}")
print(f"Number of Normal logs generated: {num_normal_logs}")

# Persist the merged dataset
save_logs_to_file(combined_logs, "synthetic_fw_logs2.json")


Number of DDoS logs generated: 2400
Number of Normal logs generated: 400
Logs saved to synthetic_fw_logs2.json


# Implement a model to detect DDoS attacks

In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import joblib
from collections import deque

# Detection parameters
ddos_threshold = 100  # request count within the window at which a source IP is labelled DDoS
window_size = 60  # sliding window length, in seconds

# Shared sliding-window buffer used by the feature-extraction function
log_window = deque()

# Read the generated logs (one JSON object per line)
json_file_path = "synthetic_fw_logs2.json"
df = pd.read_json(json_file_path, lines=True)

# Function to process logs in a sliding window
def process_and_extract_features_realtime(df, ddos_threshold=100):
    """
    Process logs using sliding window logic to dynamically calculate counts.

    Rows are visited in ascending 'timestamp' order. Rows whose 'src_ip' is
    not a string (missing values included) are skipped. Every other row is
    appended to the module-level ``log_window``; entries more than
    ``window_size`` seconds (module-level setting) older than the current row
    are evicted, and the number of 'request' entries from the row's source IP
    still in the window becomes that row's count.

    Args:
        df: DataFrame with at least 'timestamp', 'src_ip' and 'type' columns.
        ddos_threshold: A row is labelled 1 when its count reaches this
            value, otherwise 0.

    Returns:
        A tuple ``(features, labels, processed_logs_df)``: the counts as an
        ``(n, 1)`` array, the labels as an ``(n,)`` array, and a DataFrame
        with columns 'timestamp', 'src_ip', 'count' and 'label'. All three
        are empty (but well-formed) when no row qualifies.

    NOTE(review): the label is derived from the same count that is returned
    as the only feature, and any 'label' column in ``df`` is ignored -- confirm
    this is the intended ground truth.
    """
    global log_window
    processed_logs = []

    def _is_counted(entry):
        # Only 'request' entries with a string source IP contribute to counts
        return entry['type'] == 'request' and isinstance(entry['src_ip'], str)

    # Per-IP request counts for the current window contents. Maintained
    # incrementally (on append and on eviction) rather than recounting the
    # whole window for every row. Seeded from anything already in the window.
    counts = {}
    for window_log in log_window:
        if _is_counted(window_log):
            counts[window_log['src_ip']] = counts.get(window_log['src_ip'], 0) + 1

    # Sort logs by timestamp
    df = df.sort_values(by='timestamp')

    for _, log in df.iterrows():
        log['timestamp'] = pd.to_datetime(log['timestamp'])

        # Skip logs with missing or invalid 'src_ip' (NaN/None are not str)
        src_ip = log['src_ip']
        if not isinstance(src_ip, str):
            continue

        log_window.append(log)
        if _is_counted(log):
            counts[src_ip] = counts.get(src_ip, 0) + 1

        # Remove logs older than the sliding window
        current_time = log['timestamp']
        while log_window and (current_time - log_window[0]['timestamp']).total_seconds() > window_size:
            expired = log_window.popleft()
            if _is_counted(expired):
                counts[expired['src_ip']] -= 1

        # Add processed log with count
        count = counts.get(src_ip, 0)
        processed_logs.append({
            "timestamp": log['timestamp'],
            "src_ip": src_ip,
            "count": count,
            "label": 1 if count >= ddos_threshold else 0
        })

    # Explicit columns keep the selections below valid when nothing qualified
    processed_logs_df = pd.DataFrame(processed_logs, columns=["timestamp", "src_ip", "count", "label"])
    return processed_logs_df[['count']].values, processed_logs_df['label'].values, processed_logs_df

# Turn the raw logs into per-row window counts (features) and threshold labels
features, labels, processed_logs = process_and_extract_features_realtime(
    df, ddos_threshold=ddos_threshold
)

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
)

# Fit a class-balanced Random Forest on the training split
rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
rf_model.fit(X_train, y_train)

# Persist the fitted model to disk
joblib.dump(rf_model, "simplified_random_forest_model_realtime.pkl")
print("Random Forest Classifier Model training completed and saved.")

# Score the model on the held-out split
y_pred_lr = rf_model.predict(X_test)
print("Random Forest Evaluation:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(classification_report(y_test, y_pred_lr))


Random Forest Classifier Model training completed and saved.
Random Forest Evaluation:
Accuracy: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       148
           1       1.00      1.00      1.00       132

    accuracy                           1.00       280
   macro avg       1.00      1.00      1.00       280
weighted avg       1.00      1.00      1.00       280



Model Validation

In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
import joblib
from collections import deque

# Detection parameters (same values as used for training)
ddos_threshold = 100  # request count within the window at which a source IP is labelled DDoS
window_size = 60  # sliding window length, in seconds

# Fresh sliding-window buffer for the validation pass
log_window = deque()

# Restore the trained classifier.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
rf_model = joblib.load("simplified_random_forest_model_realtime.pkl")

# Read the validation logs (one JSON object per line)
json_file_path_test = "synthetic_fw_logs_validation.json"
df_test = pd.read_json(json_file_path_test, lines=True)

# Function to process logs in a sliding window for validation
def process_and_extract_features_realtime_validation(df, ddos_threshold=100):
    """
    Process logs using sliding window logic to dynamically calculate counts for validation.

    Rows are visited in ascending 'timestamp' order. Rows whose 'src_ip' is
    not a string (missing values included) are skipped. Every other row is
    appended to the module-level ``log_window``; entries more than
    ``window_size`` seconds (module-level setting) older than the current row
    are evicted, and the number of 'request' entries from the row's source IP
    still in the window becomes that row's count.

    Args:
        df: DataFrame with at least 'timestamp', 'src_ip' and 'type' columns.
        ddos_threshold: A row is labelled 1 when its count reaches this
            value, otherwise 0.

    Returns:
        A tuple ``(features, labels, processed_logs_df)``: the counts as an
        ``(n, 1)`` array, the labels as an ``(n,)`` array, and a DataFrame
        with columns 'timestamp', 'src_ip', 'count' and 'label'. All three
        are empty (but well-formed) when no row qualifies.

    NOTE(review): the label is derived from the same count that is returned
    as the only feature, and any 'label' column in ``df`` is ignored -- confirm
    this is the intended ground truth.
    """
    global log_window
    processed_logs = []

    def _is_counted(entry):
        # Only 'request' entries with a string source IP contribute to counts
        return entry['type'] == 'request' and isinstance(entry['src_ip'], str)

    # Per-IP request counts for the current window contents. Maintained
    # incrementally (on append and on eviction) rather than recounting the
    # whole window for every row. Seeded from anything already in the window.
    counts = {}
    for window_log in log_window:
        if _is_counted(window_log):
            counts[window_log['src_ip']] = counts.get(window_log['src_ip'], 0) + 1

    # Sort logs by timestamp
    df = df.sort_values(by='timestamp')

    for _, log in df.iterrows():
        log['timestamp'] = pd.to_datetime(log['timestamp'])

        # Skip logs with missing or invalid 'src_ip' (NaN/None are not str)
        src_ip = log['src_ip']
        if not isinstance(src_ip, str):
            continue

        log_window.append(log)
        if _is_counted(log):
            counts[src_ip] = counts.get(src_ip, 0) + 1

        # Remove logs older than the sliding window
        current_time = log['timestamp']
        while log_window and (current_time - log_window[0]['timestamp']).total_seconds() > window_size:
            expired = log_window.popleft()
            if _is_counted(expired):
                counts[expired['src_ip']] -= 1

        # Add processed log with count
        count = counts.get(src_ip, 0)
        processed_logs.append({
            "timestamp": log['timestamp'],
            "src_ip": src_ip,
            "count": count,
            "label": 1 if count >= ddos_threshold else 0
        })

    # Explicit columns keep the selections below valid when nothing qualified
    processed_logs_df = pd.DataFrame(processed_logs, columns=["timestamp", "src_ip", "count", "label"])
    return processed_logs_df[['count']].values, processed_logs_df['label'].values, processed_logs_df

# Extract window-count features and threshold labels from the validation logs
features_test, labels_test, processed_logs = process_and_extract_features_realtime_validation(
    df_test, ddos_threshold=ddos_threshold
)

# Run the trained classifier over the validation features
y_pred = rf_model.predict(features_test)

# Report overall accuracy and the per-class breakdown
accuracy = accuracy_score(labels_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(classification_report(labels_test, y_pred))

# Attach human-readable predicted and actual class names to each processed row
processed_logs['predicted_label'] = ["DDoS" if pred == 1 else "Normal" for pred in y_pred]
processed_logs['actual_label'] = ["DDoS" if actual == 1 else "Normal" for actual in processed_logs['label']]

# Show the first 10 rows for manual inspection
print("\nSample Results with Counts and Predictions:")
print(processed_logs[['timestamp', 'src_ip', 'count', 'actual_label', 'predicted_label']].head(10))

# Write the full results to CSV for detailed inspection (optional)
processed_logs.to_csv("validation_results.csv", index=False)


Accuracy: 1.0000
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2564
           1       1.00      1.00      1.00       126

    accuracy                           1.00      2690
   macro avg       1.00      1.00      1.00      2690
weighted avg       1.00      1.00      1.00      2690


Sample Results with Counts and Predictions:
                timestamp           src_ip  count actual_label predicted_label
0 2024-12-14 02:00:31.751   192.168.70.147      1       Normal          Normal
1 2024-12-14 02:00:31.752  192.168.123.250      1       Normal          Normal
2 2024-12-14 02:00:31.753  192.168.164.103      1       Normal          Normal
3 2024-12-14 02:00:31.754  192.168.234.232      1       Normal          Normal
4 2024-12-14 02:00:31.756   192.168.161.19      1       Normal          Normal
5 2024-12-14 02:00:31.758  192.168.221.178      1       Normal          Normal
6 2024-12-14 02:00:31.769  192.168.