# Detection of DDoS Attacks from Access Logs Using Machine Learning

In [None]:
# Basic imports
import numpy as np
import pandas as pd


In [None]:
# Pre-processing imports
from sklearn.preprocessing import LabelEncoder, StandardScaler


## Detection per Log/Request

### Dataset Pre-processing (per request)

In [2]:
# General helper functions

def load_dataset(path: str) -> pd.DataFrame:
    """Parse a pipe-delimited access log into a DataFrame of strings.

    Every field is loaded as text; numeric conversion is done later by
    ``preprocess``.

    Args:
        path: Location of the log file; one record per line, fields
            separated by ``|``.

    Returns:
        A DataFrame with one row per record and the 20 named columns below.
    """
    columns = [
        "ip", "logname", "user", "timestamp", "method", "path", "query",
        "protocol", "status", "response_size", "referer", "user_agent",
        "bytes_received", "bytes_sent", "bytes_transferred",
        "connection_status", "keepalive_count", "processing_time",
        "error_log_id", "label",
    ]

    raw = np.genfromtxt(
        path,
        delimiter="|",
        dtype=str,
        encoding="utf-8",
        # genfromtxt treats "#" as a comment marker by default, which would
        # cut off any record whose path/referer/user agent contains "#".
        comments=None,
    )

    if raw.size == 0:
        # No records at all: still return a frame with the expected schema.
        raw = np.empty((0, len(columns)), dtype=str)
    elif raw.ndim < 2:
        # A file holding a single record is squeezed to 1-D by genfromtxt.
        raw = raw.reshape(1, -1)

    return pd.DataFrame(raw, columns=columns)


def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` without unlabelled rows and unused columns.

    Rows whose ``label`` is the placeholder ``"-"`` are removed, then the
    columns considered static or spoofable are dropped. The input frame is
    left untouched.
    """
    # Columns which are static or could be spoofed
    unused_columns = [
        'logname', 'user', 'method', 'protocol', 'error_log_id',
        "user_agent", "query", "status", "referer",
    ]
    has_label = df["label"] != "-"
    labelled_rows = df.loc[has_label]
    return labelled_rows.drop(columns=unused_columns)


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the textual numeric fields of ``df`` to numbers.

    The conversion is done in place, column by column, and the same frame
    is returned for convenient chaining.
    """
    numeric_columns = (
        "timestamp",
        "response_size",
        "bytes_received",
        "bytes_sent",
        "bytes_transferred",
        "keepalive_count",
        "processing_time",
    )
    for column_name in numeric_columns:
        df[column_name] = pd.to_numeric(df[column_name])
    return df

In [2]:
# Load the training log, then discard unlabelled rows and unused columns
df = drop_columns(load_dataset("../dataset/train1_access.log"))
df.describe()

In [3]:
def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` with its categorical columns expanded to indicator columns.

    ``path`` and ``connection_status`` are each replaced by one
    ``<column>_<value>`` column per distinct value found in ``df``.
    """
    categorical_columns = ["path", "connection_status"]
    return pd.get_dummies(data=df, columns=categorical_columns)


# Numeric conversion first, then expand the categorical columns
df = one_hot_encode(preprocess(df))
df.head()

Unnamed: 0,ip,timestamp,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,label,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,172.21.250.180,1746986686795,100396,457,100684,101141,0,22028,0,True,False,True,False
1,172.19.123.119,1746986687918,100396,510,100684,101194,0,12603,0,True,False,True,False
2,172.17.86.108,1746986689457,100396,467,100684,101151,0,14308,0,True,False,True,False
3,172.20.216.143,1746986690686,100396,716,100684,101400,0,13694,0,True,False,True,False
4,172.19.165.119,1746986691804,100396,410,100684,101094,0,12692,0,True,False,True,False


In [4]:
def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the ``label`` column of ``df`` with integer class codes, in place.

    NOTE(review): a fresh ``LabelEncoder`` is fitted on every call, so the
    code assigned to a label depends on which labels occur in this frame.
    """
    encoder = LabelEncoder()
    encoded_labels = encoder.fit_transform(df["label"])
    df["label"] = encoded_labels
    return df

def normalize_numerical_features(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize the numeric feature columns of ``df`` in place.

    NOTE(review): the ``StandardScaler`` is fitted on the frame passed in,
    so every dataset is scaled with its own statistics.
    """
    numerical_features = [
        "response_size",
        "bytes_received",
        "bytes_sent",
        "bytes_transferred",
        "keepalive_count",
        "processing_time",
    ]
    standardized = StandardScaler().fit_transform(df[numerical_features])
    df[numerical_features] = standardized
    return df


# Encode the target first, then standardize the numeric features
df = normalize_numerical_features(encode_labels(df))
df.head()

Unnamed: 0,ip,timestamp,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,label,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,172.21.250.180,1746986686795,0.829257,-0.347614,0.829251,0.829325,-0.281424,-0.137208,0,True,False,True,False
1,172.19.123.119,1746986687918,0.829257,0.233566,0.829251,0.830514,-0.281424,-0.140673,0,True,False,True,False
2,172.17.86.108,1746986689457,0.829257,-0.237957,0.829251,0.82955,-0.281424,-0.140047,0,True,False,True,False
3,172.20.216.143,1746986690686,0.829257,2.492493,0.829251,0.835137,-0.281424,-0.140272,0,True,False,True,False
4,172.19.165.119,1746986691804,0.829257,-0.863,0.829251,0.828271,-0.281424,-0.140641,0,True,False,True,False


In [5]:
def get_dataset_for_model(df):
    """Split a prepared frame into request identifiers, features and target.

    Returns a tuple ``(ip_timestamps, X, y)``: the ``ip``/``timestamp``
    columns, every remaining column except ``label``, and the ``label``
    column itself.
    """
    identifier_columns = ["ip", "timestamp"]
    target_column = "label"

    identifiers = df[identifier_columns]
    target = df[target_column]
    features = df.drop(columns=identifier_columns + [target_column])
    return identifiers, features, target


# Split the prepared training frame into request identifiers, features (X) and target (y)
ip_timestamps, X, y = get_dataset_for_model(df)

In [6]:
# Preview the feature matrix returned by get_dataset_for_model
X.head()

Unnamed: 0,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,0.829257,-0.347614,0.829251,0.829325,-0.281424,-0.137208,True,False,True,False
1,0.829257,0.233566,0.829251,0.830514,-0.281424,-0.140673,True,False,True,False
2,0.829257,-0.237957,0.829251,0.82955,-0.281424,-0.140047,True,False,True,False
3,0.829257,2.492493,0.829251,0.835137,-0.281424,-0.140272,True,False,True,False
4,0.829257,-0.863,0.829251,0.828271,-0.281424,-0.140641,True,False,True,False


### Common Helper Functions for Testing and Evaluation of the Models

In [82]:
def get_true_positive_count(y_true, y_pred) -> int:
    """Count samples that are labelled 1 and predicted 1."""
    actual_positive = y_true == 1
    predicted_positive = y_pred == 1
    return np.sum(actual_positive & predicted_positive)

def get_true_negative_count(y_true, y_pred) -> int:
    """Count samples that are labelled 0 and predicted 0."""
    actual_negative = y_true == 0
    predicted_negative = y_pred == 0
    return np.sum(actual_negative & predicted_negative)

def get_false_positive_count(y_true, y_pred) -> int:
    """Count samples that are labelled 0 but predicted 1."""
    actual_negative = y_true == 0
    predicted_positive = y_pred == 1
    return np.sum(actual_negative & predicted_positive)

def get_false_negative_count(y_true, y_pred) -> int:
    """Count samples that are labelled 1 but predicted 0."""
    actual_positive = y_true == 1
    predicted_negative = y_pred == 0
    return np.sum(actual_positive & predicted_negative)

def _safe_ratio(numerator, denominator) -> float:
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    if denominator == 0:
        return 0.0
    return numerator / denominator


def predict_and_evaluate(model, X, y, output_modifier = lambda x: x) -> dict:
    """Run ``model`` on ``X``, print and return binary classification metrics.

    Args:
        model: Object exposing ``predict(X)``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels; 1 is counted as the positive class and 0 as the
            negative class.
        output_modifier: Callable applied to the raw predictions before
            they are compared with ``y`` (identity by default).

    Returns:
        A dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``tp_rate``, ``tn_rate``, ``fp_rate`` and ``fn_rate``. A metric whose
        denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as 0.0 instead of NaN.
    """
    # Prediction
    y_predict = model.predict(X)
    y_predict = output_modifier(y_predict)

    # Evaluation: confusion-matrix cells
    tp = get_true_positive_count(y, y_predict)
    tn = get_true_negative_count(y, y_predict)
    fp = get_false_positive_count(y, y_predict)
    fn = get_false_negative_count(y, y_predict)

    accuracy = _safe_ratio(tp + tn, tp + tn + fp + fn)
    precision = _safe_ratio(tp, tp + fp)
    recall = _safe_ratio(tp, tp + fn)
    f1 = _safe_ratio(2 * (precision * recall), precision + recall)

    tp_rate = recall  # same definition: tp / (tp + fn)
    tn_rate = _safe_ratio(tn, tn + fp)
    fp_rate = _safe_ratio(fp, fp + tn)
    fn_rate = _safe_ratio(fn, fn + tp)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"True Positive Rate: {tp_rate}")
    print(f"True Negative Rate: {tn_rate}")
    print(f"False Positive Rate: {fp_rate}")
    print(f"False Negative Rate: {fn_rate}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp_rate": tp_rate,
        "tn_rate": tn_rate,
        "fp_rate": fp_rate,
        "fn_rate": fn_rate
    }

def prepare_and_test(_model, dataset_path: str, output_modifier = lambda x: x) -> dict:
    """Load a log file, prepare it like the training data and evaluate ``_model``.

    Args:
        _model: Fitted model exposing ``predict``.
        dataset_path: Path of the access log to evaluate on.
        output_modifier: Callable applied to the raw predictions before
            scoring (identity by default).

    Returns:
        The metrics dict produced by ``predict_and_evaluate``.

    NOTE(review): ``encode_labels`` and ``normalize_numerical_features`` fit
    a new encoder/scaler on this dataset rather than reusing the ones fitted
    on the training data.
    """
    print(f"Using dataset '{dataset_path}'")
    test_df = load_dataset(dataset_path)
    test_df = drop_columns(test_df)
    test_df = preprocess(test_df)
    test_df = one_hot_encode(test_df)
    test_df = encode_labels(test_df)
    test_df = normalize_numerical_features(test_df)

    # One-hot columns only exist for values present in this dataset; add any
    # that are absent (as all-False) so the feature layout matches training.
    one_hot_columns = [
        "path_/index.html",
        "path_/item.html",
        "connection_status_+",
        "connection_status_-",
    ]
    for column in one_hot_columns:
        if column not in test_df.columns:
            test_df[column] = False

    # Fix the column order (and drop any one-hot column unseen in training).
    base_columns = [
        "ip", "timestamp", "response_size", "bytes_received", "bytes_sent",
        "bytes_transferred", "keepalive_count", "processing_time", "label",
    ]
    test_df = test_df[base_columns + one_hot_columns]

    _, X_test, y_test = get_dataset_for_model(test_df)
    model_performance = predict_and_evaluate(_model, X_test, y_test, output_modifier)
    print("--------------------------------")
    return model_performance


### Random Forest Classifier

In [104]:
from sklearn.ensemble import RandomForestClassifier

# Training: fit a 43-tree random forest (Gini criterion, fixed seed) on the per-request features
model_rf = RandomForestClassifier(n_estimators=43, criterion="gini", random_state=42)
model_rf.fit(X, y)

# Testing
# Training dataset: score the model on the same data it was fitted on
model_rf_performance_train = predict_and_evaluate(model_rf, X, y)
print("--------------------------------")

# Simple test dataset, no overlapping scenarios - benign traffic, slowloris, hulk
model_rf_performance_test = prepare_and_test(model_rf, "../dataset/test_access.log")

# Dataset with overlapping scenarios - benign traffic, slowloris, hulk, flash event
model_rf_performance_test1 = prepare_and_test(model_rf, "../dataset/test1_access.log")

Accuracy: 0.9993340239751369
Precision: 0.9993755552735119
Recall: 0.9992081539835945
F1 Score: 0.9992918476177906
True Positive Rate: 0.9992081539835945
True Negative Rate: 0.9994457611996792
False Positive Rate: 0.0005542388003208277
False Negative Rate: 0.0007918460164054251
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.8914198936977981
Precision: 0.9985367281240854
Recall: 0.8277535177098496
F1 Score: 0.9051598355219524
True Positive Rate: 0.8277535177098496
True Negative Rate: 0.9979699553390174
False Positive Rate: 0.0020300446609825416
False Negative Rate: 0.1722464822901504
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.8164824476080997
Precision: 0.9991958041958042
Recall: 0.7340422799311602
F1 Score: 0.8463372377130501
True Positive Rate: 0.7340422799311602
True Negative Rate: 0.9986942205064153
False Positive Rate: 0.0013057794935846486
False Negative Rate: 0.2659577200688397
----------------

### Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Training: fit logistic regression (fixed seed, otherwise default settings) on the per-request features
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X, y)

In [11]:
# Testing
# Training dataset: score the model on the same data it was fitted on
model_lr_performance_train = predict_and_evaluate(model_lr, X, y)
print("--------------------------------")

# First held-out log
model_lr_performance_test = prepare_and_test(model_lr, "../dataset/test_access.log")

# Second held-out log
model_lr_performance_test1 = prepare_and_test(model_lr, "../dataset/test1_access.log")

Accuracy: 0.9755737180558274
Precision: 0.9561205999716718
Recall: 0.993660155932754
F1 Score: 0.9745289993055399
True Positive Rate: 0.993660155932754
True Negative Rate: 0.9595180375440462
False Positive Rate: 0.040481962455953785
False Negative Rate: 0.006339844067246
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.6318906605922551
Precision: 1.0
Recall: 0.4119359534206696
F1 Score: 0.5835051546391752
True Positive Rate: 0.4119359534206696
True Negative Rate: 1.0
False Positive Rate: 0.0
False Negative Rate: 0.5880640465793304
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.5117340171544787
Precision: 1.0
Recall: 0.2908222239346536
F1 Score: 0.45059996418124293
True Positive Rate: 0.2908222239346536
True Negative Rate: 1.0
False Positive Rate: 0.0
False Negative Rate: 0.7091777760653464
--------------------------------


### Gradient Boosting Classifier

In [99]:
from sklearn.ensemble import GradientBoostingClassifier

# Training: fit gradient boosting with 28 estimators (fixed seed) on the per-request features
model_gb = GradientBoostingClassifier(n_estimators=28, random_state=42)
model_gb.fit(X, y)
# Testing
# Training dataset: score the model on the same data it was fitted on
model_gb_performance_train = predict_and_evaluate(model_gb, X, y)
print("--------------------------------")

# First held-out log
model_gb_performance_test = prepare_and_test(model_gb, "../dataset/test_access.log")

# Second held-out log
model_gb_performance_test1 = prepare_and_test(model_gb, "../dataset/test1_access.log")

Accuracy: 0.9867473158063084
Precision: 0.993163670083973
Recall: 0.9785541703890197
F1 Score: 0.9858047954837159
True Positive Rate: 0.9785541703890197
True Negative Rate: 0.9940205293656444
False Positive Rate: 0.005979470634355596
False Negative Rate: 0.021445829610980266
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.8914198936977981
Precision: 0.9985367281240854
Recall: 0.8277535177098496
F1 Score: 0.9051598355219524
True Positive Rate: 0.8277535177098496
True Negative Rate: 0.9979699553390174
False Positive Rate: 0.0020300446609825416
False Negative Rate: 0.1722464822901504
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.8164824476080997
Precision: 0.9991958041958042
Recall: 0.7340422799311602
F1 Score: 0.8463372377130501
True Positive Rate: 0.7340422799311602
True Negative Rate: 0.9986942205064153
False Positive Rate: 0.0013057794935846486
False Negative Rate: 0.2659577200688397
-------------------

### Performance Metrics Summary

In [100]:
## Summary table of the metrics measured on the training data (one row per model)
model_performance_table_train = pd.DataFrame(
    [
        {
            "Model": model_name,
            "Accuracy": performance["accuracy"],
            "Precision": performance["precision"],
            "Recall": performance["recall"],
            "F1 Score": performance["f1"],
        }
        for model_name, performance in (
            ("Random Forest", model_rf_performance_train),
            ("Logistic Regression", model_lr_performance_train),
            ("Gradient Boosting", model_gb_performance_train),
        )
    ]
)

model_performance_table_train

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.999334,0.999376,0.999208,0.999292
1,Logistic Regression,0.975574,0.956121,0.99366,0.974529
2,Gradient Boosting,0.986747,0.993164,0.978554,0.985805


In [101]:
# Summary table of the metrics measured on test_access.log (one row per model)
model_performance_table_test = pd.DataFrame(
    [
        {
            "Model": model_name,
            "Accuracy": performance["accuracy"],
            "Precision": performance["precision"],
            "Recall": performance["recall"],
            "F1 Score": performance["f1"],
        }
        for model_name, performance in (
            ("Random Forest", model_rf_performance_test),
            ("Logistic Regression", model_lr_performance_test),
            ("Gradient Boosting", model_gb_performance_test),
        )
    ]
)
model_performance_table_test

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.89142,0.998537,0.827754,0.90516
1,Logistic Regression,0.631891,1.0,0.411936,0.583505
2,Gradient Boosting,0.89142,0.998537,0.827754,0.90516


In [102]:
# Summary table of the metrics measured on test1_access.log (one row per model)
model_performance_table_test1 = pd.DataFrame(
    [
        {
            "Model": model_name,
            "Accuracy": performance["accuracy"],
            "Precision": performance["precision"],
            "Recall": performance["recall"],
            "F1 Score": performance["f1"],
        }
        for model_name, performance in (
            ("Random Forest", model_rf_performance_test1),
            ("Logistic Regression", model_lr_performance_test1),
            ("Gradient Boosting", model_gb_performance_test1),
        )
    ]
)
model_performance_table_test1

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.816482,0.999196,0.734042,0.846337
1,Logistic Regression,0.511734,1.0,0.290822,0.4506
2,Gradient Boosting,0.816482,0.999196,0.734042,0.846337


## Detection per Time Window

In [3]:
# Reload the raw training log for the time-window experiment
df = load_dataset("../dataset/train1_access.log")
# Reuse the shared helper instead of repeating its logic here: it drops rows
# which are missing a label and columns which are static or could be spoofed
df = drop_columns(df)
df.describe()

Unnamed: 0,ip,timestamp,path,response_size,bytes_received,bytes_sent,bytes_transferred,connection_status,keepalive_count,processing_time,label
count,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934
unique,194761,329748,2,4,372,7,422,2,8,102841,2
top,172.19.227.19,1746989729109,/index.html,100396,471,100684,10860,+,0,9166,0
freq,81,23,255691,248147,9404,242068,9130,411386,383172,52,221926


In [12]:
df = preprocess(df)

# Aggregate stats per IP within 5 minute time windows
# Convert timestamp to datetime for easier time window grouping
df["datetime"] = pd.to_datetime(df["timestamp"], unit="ms")

# Define the time window (5 minutes)
time_window = "5min"

# Build the (IP, time window) grouping once and reuse it for every aggregate
ip_windows = df.groupby(["ip", pd.Grouper(key="datetime", freq=time_window)])

df_grouped = ip_windows.agg({
    "response_size": ["count", "mean", "sum"],
    "bytes_received": ["mean", "sum"],
    "bytes_sent": ["mean", "sum"],
    "bytes_transferred": ["mean", "sum"],
    "processing_time": ["mean", "sum"],
    "label": ["max"]  # Use max to identify if any request in the window was malicious
})

# Flatten the column hierarchy
df_grouped.columns = ["_".join(col).strip() for col in df_grouped.columns.values]

# Rename columns for clarity
df_grouped.rename(columns={"response_size_count": "request_count", "label_max": "label"}, inplace=True)

# Add features about connection patterns
connection_stats = ip_windows["connection_status"].value_counts().unstack(fill_value=0)
df_grouped = df_grouped.join(connection_stats)
df_grouped.rename(columns={"+": "connection_status_+_count", "-": "connection_status_-_count"}, inplace=True)

# Add path diversity feature. It is joined on the (ip, datetime) index so each
# count is matched to its own group rather than relying on row position.
path_counts = ip_windows["path"].nunique()
df_grouped = df_grouped.join(path_counts.rename("unique_paths"))

# Reset index to make IP and time window regular columns
df_grouped.reset_index(inplace=True)

# Display the aggregated data
print(f"Aggregated data shape: {df_grouped.shape}")
df_grouped


Aggregated data shape: (224757, 17)


Unnamed: 0,ip,datetime,request_count,response_size_mean,response_size_sum,bytes_received_mean,bytes_received_sum,bytes_sent_mean,bytes_sent_sum,bytes_transferred_mean,bytes_transferred_sum,processing_time_mean,processing_time_sum,label,connection_status_+_count,connection_status_-_count,unique_paths
0,172.16.0.139,2025-05-11 20:45:00,7,48800.000000,341600,512.857143,3590,49086.714286,343607,49599.571429,347197,1.771700e+04,124019,0,7,0,2
1,172.16.0.148,2025-05-11 20:30:00,1,10103.000000,10103,526.000000,526,10389.000000,10389,10915.000000,10915,2.165500e+04,21655,0,1,0,1
2,172.16.0.168,2025-05-11 20:45:00,24,28914.041667,693937,525.000000,12600,29200.250000,700806,29725.250000,713406,1.534454e+04,368269,0,24,0,2
3,172.16.0.174,2025-05-11 20:20:00,3,40200.666667,120602,513.000000,1539,40487.333333,121462,41000.333333,123001,2.113500e+04,63405,0,3,0,2
4,172.16.0.183,2025-05-11 22:05:00,5,28161.600000,140808,469.200000,2346,28447.800000,142239,28917.000000,144585,1.266140e+04,63307,0,5,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224752,172.31.99.81,2025-05-11 19:45:00,1,100396.000000,100396,393.000000,393,100684.000000,100684,101077.000000,101077,9.176400e+04,91764,1,1,0,1
224753,172.31.99.84,2025-05-11 19:20:00,1,100396.000000,100396,415.000000,415,100684.000000,100684,101099.000000,101099,2.198000e+04,21980,1,1,0,1
224754,172.31.99.9,2025-05-11 18:50:00,1,221.000000,221,290.000000,290,405.000000,405,695.000000,695,2.113878e+07,21138782,1,0,1,1
224755,172.31.99.9,2025-05-11 19:35:00,1,100396.000000,100396,419.000000,419,100684.000000,100684,101103.000000,101103,1.464400e+04,14644,1,1,0,1


In [14]:
# Sort df_grouped by datetime
# Order the aggregated (IP, window) rows by window start time; the original row index is kept
df_grouped = df_grouped.sort_values(by="datetime")
df_grouped

Unnamed: 0,ip,datetime,request_count,response_size_mean,response_size_sum,bytes_received_mean,bytes_received_sum,bytes_sent_mean,bytes_sent_sum,bytes_transferred_mean,bytes_transferred_sum,processing_time_mean,processing_time_sum,label,connection_status_+_count,connection_status_-_count,unique_paths
8003,172.17.86.108,2025-05-11 18:00:00,3,70298.333333,210895,505.666667,1517,70585.333333,211756,71091.000000,213273,13380.666667,40142,0,3,0,2
13619,172.19.165.119,2025-05-11 18:00:00,1,100396.000000,100396,410.000000,410,100684.000000,100684,101094.000000,101094,12692.000000,12692,0,1,0,1
26971,172.22.208.215,2025-05-11 18:00:00,1,100396.000000,100396,717.000000,717,100684.000000,100684,101401.000000,101401,12260.000000,12260,0,1,0,1
12904,172.19.123.119,2025-05-11 18:00:00,2,100396.000000,200792,538.000000,1076,100683.500000,201367,101221.500000,202443,11768.000000,23536,0,2,0,1
23462,172.21.250.180,2025-05-11 18:00:00,2,55249.500000,110499,487.000000,974,55536.500000,111073,56023.500000,112047,14721.500000,29443,0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30930,172.23.200.168,2025-05-11 23:35:00,5,46220.200000,231101,580.000000,2900,46506.800000,232534,47086.800000,235434,12361.600000,61808,0,5,0,2
24805,172.21.93.185,2025-05-11 23:35:00,10,46220.200000,462202,504.600000,5046,46507.000000,465070,47011.600000,470116,9183.400000,91834,0,10,0,2
8467,172.18.110.159,2025-05-11 23:35:00,8,43962.875000,351703,492.500000,3940,44249.500000,353996,44742.000000,357936,11448.375000,91587,0,8,0,2
21805,172.21.157.185,2025-05-11 23:35:00,18,20135.555556,362440,494.500000,8901,20421.611111,367589,20916.111111,376490,9413.333333,169440,0,18,0,2


In [15]:
# Train Random Forest classifier per time windows
