# Detection of DDoS Attacks from Access Logs Using Machine Learning

In [1]:
# Basic imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Model imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


## Detection per Log/Request

### Dataset Pre-processing

In [3]:
# General helper functions

def load_dataset(path: str) -> pd.DataFrame:
    """Read a pipe-delimited access log and parse it into a DataFrame of strings.

    Args:
        path: Location of the log file. Every record must contain the 20
            ``|``-separated fields named in ``columns`` below.

    Returns:
        DataFrame with one row per record; every value is kept as ``str``.
    """
    columns = ["ip", "logname", "user", "timestamp", "method", "path", "query",
            "protocol", "status", "response_size", "referer", "user_agent", "bytes_received",
            "bytes_sent", "bytes_transferred", "connection_status", "keepalive_count",
            "processing_time", "error_log_id", "label"]

    # comments=None: by default genfromtxt treats "#" as a comment marker and
    # would cut off any record whose path, query, referer or user agent has one.
    records = np.genfromtxt(path, delimiter="|", dtype=str, encoding="utf-8", comments=None)

    # A file holding a single record is returned as a 1-D array; restore the
    # (rows, fields) layout that the DataFrame constructor needs.
    if records.ndim == 1:
        records = records[np.newaxis, :]

    return pd.DataFrame(records, columns=columns)


def drop_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Discard unlabeled records and the fields that are static or could be spoofed.

    Rows whose label is the "-" placeholder are removed first, then the
    listed columns are dropped from what remains. The input frame is not
    modified; a new frame is returned.
    """
    unwanted = ['logname', 'user', 'method', 'protocol', 'error_log_id', "user_agent", "query", "status", "referer"]
    has_label = df["label"].ne("-")
    return df.loc[has_label].drop(columns=unwanted)


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the string-typed measurement columns to numeric dtypes.

    The conversion is written back into ``df`` itself, and that same frame is
    returned for convenient chaining.
    """
    numeric_columns = (
        "timestamp",
        "response_size",
        "bytes_received",
        "bytes_sent",
        "bytes_transferred",
        "keepalive_count",
        "processing_time",
    )
    for name in numeric_columns:
        df[name] = pd.to_numeric(df[name])
    return df

def encode_labels(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the "label" column of ``df`` with integer class codes and return ``df``.

    NOTE(review): a fresh LabelEncoder is fitted on every call, so the
    label-to-code mapping is derived from whichever labels the given frame
    contains. Confirm every dataset passed here carries the same label set.
    """
    df["label"] = LabelEncoder().fit_transform(df["label"])
    return df

def normalize_numerical_features(df: pd.DataFrame, scaler=None) -> pd.DataFrame:
    """Standardize the per-request numerical feature columns of ``df`` in place.

    Args:
        df: Frame holding the columns listed in ``numerical_features``.
        scaler: Optional, already fitted scaler exposing ``transform``. Pass
            the scaler fitted on the training data when preparing evaluation
            data so both are scaled with the same statistics. When omitted, a
            new StandardScaler is fitted on ``df`` itself (original behavior).

    Returns:
        The same ``df`` object with the numerical columns replaced by their
        scaled values.
    """
    numerical_features = ["response_size", "bytes_received", "bytes_sent", "bytes_transferred", "keepalive_count", "processing_time"]
    if scaler is None:
        scaler = StandardScaler()
        df[numerical_features] = scaler.fit_transform(df[numerical_features])
    else:
        # Reuse the statistics learned elsewhere; do not refit on this frame
        df[numerical_features] = scaler.transform(df[numerical_features])
    return df

In [4]:
# Load the training log and keep only labeled rows and the modeling columns
df = drop_columns(load_dataset("../dataset/train1_access.log"))
df.describe()

Unnamed: 0,ip,timestamp,path,response_size,bytes_received,bytes_sent,bytes_transferred,connection_status,keepalive_count,processing_time,label
count,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934
unique,194761,329748,2,4,372,7,422,2,8,102841,2
top,172.19.227.19,1746989729109,/index.html,100396,471,100684,10860,+,0,9166,0
freq,81,23,255691,248147,9404,242068,9130,411386,383172,52,221926


In [5]:
def one_hot_encode(df: pd.DataFrame) -> pd.DataFrame:
    """Expand the categorical "path" and "connection_status" columns into indicator columns."""
    categorical = ["path", "connection_status"]
    return pd.get_dummies(df, columns=categorical)


# Convert measurements to numbers, then expand the categorical columns
df = one_hot_encode(preprocess(df))
df.head()

Unnamed: 0,ip,timestamp,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,label,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,172.21.250.180,1746986686795,100396,457,100684,101141,0,22028,0,True,False,True,False
1,172.19.123.119,1746986687918,100396,510,100684,101194,0,12603,0,True,False,True,False
2,172.17.86.108,1746986689457,100396,467,100684,101151,0,14308,0,True,False,True,False
3,172.20.216.143,1746986690686,100396,716,100684,101400,0,13694,0,True,False,True,False
4,172.19.165.119,1746986691804,100396,410,100684,101094,0,12692,0,True,False,True,False


In [6]:
# Encode the target as integers, then standardize the numerical features
df = normalize_numerical_features(encode_labels(df))
df.head()

Unnamed: 0,ip,timestamp,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,label,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,172.21.250.180,1746986686795,0.829257,-0.347614,0.829251,0.829325,-0.281424,-0.137208,0,True,False,True,False
1,172.19.123.119,1746986687918,0.829257,0.233566,0.829251,0.830514,-0.281424,-0.140673,0,True,False,True,False
2,172.17.86.108,1746986689457,0.829257,-0.237957,0.829251,0.82955,-0.281424,-0.140047,0,True,False,True,False
3,172.20.216.143,1746986690686,0.829257,2.492493,0.829251,0.835137,-0.281424,-0.140272,0,True,False,True,False
4,172.19.165.119,1746986691804,0.829257,-0.863,0.829251,0.828271,-0.281424,-0.140641,0,True,False,True,False


In [7]:
def get_dataset_for_model(df):
    """Split a prepared per-request frame into identifiers, features and target.

    Returns:
        Tuple ``(ip_timestamps, X, y)``: the "ip"/"timestamp" columns, every
        remaining column except "label", and the "label" column.
    """
    identifier_columns = ["ip", "timestamp"]
    features = df.drop(columns=identifier_columns + ["label"])
    return df[identifier_columns], features, df["label"]


# Split the prepared per-request frame into identifiers, feature matrix and labels
ip_timestamps, X, y = get_dataset_for_model(df)

In [8]:
# Preview the feature matrix that the per-request models are fitted on
X.head()

Unnamed: 0,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,0.829257,-0.347614,0.829251,0.829325,-0.281424,-0.137208,True,False,True,False
1,0.829257,0.233566,0.829251,0.830514,-0.281424,-0.140673,True,False,True,False
2,0.829257,-0.237957,0.829251,0.82955,-0.281424,-0.140047,True,False,True,False
3,0.829257,2.492493,0.829251,0.835137,-0.281424,-0.140272,True,False,True,False
4,0.829257,-0.863,0.829251,0.828271,-0.281424,-0.140641,True,False,True,False


### Common Helper Functions for Testing and Evaluation of the Models

In [9]:
def get_true_positive_count(y_true, y_pred) -> int:
    """Count samples labeled 1 that were also predicted as 1."""
    is_actual_positive = y_true == 1
    is_predicted_positive = y_pred == 1
    return np.sum(np.logical_and(is_actual_positive, is_predicted_positive))

def get_true_negative_count(y_true, y_pred) -> int:
    """Count samples labeled 0 that were also predicted as 0."""
    is_actual_negative = y_true == 0
    is_predicted_negative = y_pred == 0
    return np.sum(np.logical_and(is_actual_negative, is_predicted_negative))

def get_false_positive_count(y_true, y_pred) -> int:
    """Count samples labeled 0 that were predicted as 1."""
    is_actual_negative = y_true == 0
    is_predicted_positive = y_pred == 1
    return np.sum(np.logical_and(is_actual_negative, is_predicted_positive))

def get_false_negative_count(y_true, y_pred) -> int:
    """Count samples labeled 1 that were predicted as 0."""
    is_actual_positive = y_true == 1
    is_predicted_negative = y_pred == 0
    return np.sum(np.logical_and(is_actual_positive, is_predicted_negative))

def predict_and_evaluate(model, X, y, output_modifier=None) -> dict:
    """Run ``model`` on ``X``, print and return binary-classification metrics against ``y``.

    Class 1 is treated as the positive class. A ratio whose denominator is
    zero (for example precision when nothing is predicted positive, or the
    true-negative rate on a dataset without negatives) is reported as 0.0
    instead of dividing by zero.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y: True labels, compared element-wise with the predictions.
        output_modifier: Optional callable applied to the raw predictions
            before they are scored.

    Returns:
        Dict with the keys "accuracy", "precision", "recall", "f1",
        "tp_rate", "tn_rate", "fp_rate" and "fn_rate".
    """
    def _ratio(numerator, denominator):
        # Undefined ratios are reported as 0.0 so summary tables stay numeric
        return numerator / denominator if denominator else 0.0

    # Prediction
    y_predict = model.predict(X)
    if output_modifier:
        y_predict = output_modifier(y_predict)  # prep for neural networks

    # Confusion-matrix cells
    tp = get_true_positive_count(y, y_predict)
    tn = get_true_negative_count(y, y_predict)
    fp = get_false_positive_count(y, y_predict)
    fn = get_false_negative_count(y, y_predict)

    # Headline metrics
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * (precision * recall), precision + recall)

    # Per-class rates; the true positive rate is recall by definition
    tp_rate = recall
    tn_rate = _ratio(tn, tn + fp)
    fp_rate = _ratio(fp, fp + tn)
    fn_rate = _ratio(fn, fn + tp)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    print(f"True Positive Rate: {tp_rate}")
    print(f"True Negative Rate: {tn_rate}")
    print(f"False Positive Rate: {fp_rate}")
    print(f"False Negative Rate: {fn_rate}")

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "tp_rate": tp_rate,
        "tn_rate": tn_rate,
        "fp_rate": fp_rate,
        "fn_rate": fn_rate
    }

def prepare_and_test(_model, dataset_path: str, output_modifier = lambda x: x, scaler=None) -> dict:
    """Load the dataset, pre-process, encode, normalize, feed to the model, and evaluate performance metrics.

    Args:
        _model: Fitted estimator exposing ``predict``.
        dataset_path: Path of the access log to evaluate on.
        output_modifier: Callable applied to the raw predictions before scoring.
        scaler: Optional fitted scaler exposing ``transform``. Pass the scaler
            fitted on the training features so the evaluation data is scaled
            with the same statistics the model was trained on. When omitted,
            the numerical features are standardized with statistics computed
            from this dataset alone (original behavior).

    Returns:
        The metrics dict produced by ``predict_and_evaluate``.
    """
    print(f"Using dataset '{dataset_path}'")
    test_df = load_dataset(dataset_path)
    test_df = drop_columns(test_df)
    test_df = preprocess(test_df)
    test_df = one_hot_encode(test_df)
    test_df = encode_labels(test_df)
    if scaler is None:
        test_df = normalize_numerical_features(test_df)
    else:
        numerical_features = ["response_size", "bytes_received", "bytes_sent", "bytes_transferred", "keepalive_count", "processing_time"]
        test_df[numerical_features] = scaler.transform(test_df[numerical_features])

    # One-hot encoding only creates columns for values present in this dataset;
    # add any indicator the model expects but this dataset never produced.
    indicator_columns = ["path_/index.html", "path_/item.html", "connection_status_+", "connection_status_-"]
    for column in indicator_columns:
        if column not in test_df.columns:
            test_df[column] = False

    # Fix the column order to the one used when the model was fitted
    test_df = test_df[["ip", "timestamp", "response_size", "bytes_received", "bytes_sent", "bytes_transferred", "keepalive_count", "processing_time", "label", "path_/index.html", "path_/item.html", "connection_status_+", "connection_status_-"]]

    _, X_test, y_test = get_dataset_for_model(test_df)
    model_performance = predict_and_evaluate(_model, X_test, y_test, output_modifier)
    print("--------------------------------")
    return model_performance


### Random Forest Classifier

In [10]:
# Training: fit a random forest on the per-request features (fixed seed for repeatability)
model_rf = RandomForestClassifier(n_estimators=43, criterion="gini", random_state=42)
model_rf.fit(X, y)

# Testing
# Training dataset: scored on the same rows the model was fitted on,
# so these numbers only indicate how well the model fits, not how it generalizes
model_rf_performance_train = predict_and_evaluate(model_rf, X, y)
print("--------------------------------")

# Simple test dataset, no overlapping scenarios - benign traffic, slowloris, hulk
model_rf_performance_test = prepare_and_test(model_rf, "../dataset/test_access.log")

# Dataset with overlapping scenarios - benign traffic, slowloris, hulk, flash event
model_rf_performance_test1 = prepare_and_test(model_rf, "../dataset/test1_access.log")

Accuracy: 0.9993340239751369
Precision: 0.9993755552735119
Recall: 0.9992081539835945
F1 Score: 0.9992918476177906
True Positive Rate: 0.9992081539835945
True Negative Rate: 0.9994457611996792
False Positive Rate: 0.0005542388003208277
False Negative Rate: 0.0007918460164054251
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.8914198936977981
Precision: 0.9985367281240854
Recall: 0.8277535177098496
F1 Score: 0.9051598355219524
True Positive Rate: 0.8277535177098496
True Negative Rate: 0.9979699553390174
False Positive Rate: 0.0020300446609825416
False Negative Rate: 0.1722464822901504
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.9753525629846651
Precision: 0.9483569218776644
Recall: 1.0
F1 Score: 0.9734940361581356
True Positive Rate: 1.0
True Negative Rate: 0.95497217620322
False Positive Rate: 0.04502782379678009
False Negative Rate: 0.0
--------------------------------


### Logistic Regression

In [11]:
# Training: fit logistic regression on the per-request features
model_lr = LogisticRegression(random_state=42)
model_lr.fit(X, y)

# Testing
# Scored first on the training rows themselves (fit quality, not generalization)
model_lr_performance_train = predict_and_evaluate(model_lr, X, y)
print("--------------------------------")

# Held-out log file "test_access.log"
model_lr_performance_test = prepare_and_test(model_lr, "../dataset/test_access.log")

# Held-out log file "test1_access.log"
model_lr_performance_test1 = prepare_and_test(model_lr, "../dataset/test1_access.log")

Accuracy: 0.9755737180558274
Precision: 0.9561205999716718
Recall: 0.993660155932754
F1 Score: 0.9745289993055399
True Positive Rate: 0.993660155932754
True Negative Rate: 0.9595180375440462
False Positive Rate: 0.040481962455953785
False Negative Rate: 0.006339844067246
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.6318906605922551
Precision: 1.0
Recall: 0.4119359534206696
F1 Score: 0.5835051546391752
True Positive Rate: 0.4119359534206696
True Negative Rate: 1.0
False Positive Rate: 0.0
False Negative Rate: 0.5880640465793304
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.9302896073849302
Precision: 0.9927586103713456
Recall: 0.8522000462356477
F1 Score: 0.9171250863856254
True Positive Rate: 0.8522000462356477
True Negative Rate: 0.9948600314345185
False Positive Rate: 0.0051399685654815
False Negative Rate: 0.14779995376435232
--------------------------------


### Gradient Boosting Classifier

In [12]:
# Training: fit gradient boosting on the per-request features
model_gb = GradientBoostingClassifier(n_estimators=28, random_state=42)
model_gb.fit(X, y)

# Testing
# Scored first on the training rows themselves (fit quality, not generalization)
model_gb_performance_train = predict_and_evaluate(model_gb, X, y)
print("--------------------------------")

# Held-out log file "test_access.log"
model_gb_performance_test = prepare_and_test(model_gb, "../dataset/test_access.log")

# Held-out log file "test1_access.log"
model_gb_performance_test1 = prepare_and_test(model_gb, "../dataset/test1_access.log")

Accuracy: 0.9867473158063084
Precision: 0.993163670083973
Recall: 0.9785541703890197
F1 Score: 0.9858047954837159
True Positive Rate: 0.9785541703890197
True Negative Rate: 0.9940205293656444
False Positive Rate: 0.005979470634355596
False Negative Rate: 0.021445829610980266
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.8914198936977981
Precision: 0.9985367281240854
Recall: 0.8277535177098496
F1 Score: 0.9051598355219524
True Positive Rate: 0.8277535177098496
True Negative Rate: 0.9979699553390174
False Positive Rate: 0.0020300446609825416
False Negative Rate: 0.1722464822901504
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.9730157069280225
Precision: 0.9437360612818773
Recall: 1.0
F1 Score: 0.9710537146277889
True Positive Rate: 1.0
True Negative Rate: 0.9507030287583366
False Positive Rate: 0.04929697124166348
False Negative Rate: 0.0
--------------------------------


### Performance Metrics Summary

In [13]:
# One row per model, built from the metrics measured on the training data
model_performance_table_train = pd.DataFrame([
    {
        "Model": model_name,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "F1 Score": metrics["f1"],
    }
    for model_name, metrics in (
        ("Random Forest", model_rf_performance_train),
        ("Logistic Regression", model_lr_performance_train),
        ("Gradient Boosting", model_gb_performance_train),
    )
])

model_performance_table_train

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.999334,0.999376,0.999208,0.999292
1,Logistic Regression,0.975574,0.956121,0.99366,0.974529
2,Gradient Boosting,0.986747,0.993164,0.978554,0.985805


In [14]:
# One row per model, built from the metrics measured on "test_access.log"
model_performance_table_test = pd.DataFrame([
    {
        "Model": model_name,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "F1 Score": metrics["f1"],
    }
    for model_name, metrics in (
        ("Random Forest", model_rf_performance_test),
        ("Logistic Regression", model_lr_performance_test),
        ("Gradient Boosting", model_gb_performance_test),
    )
])
model_performance_table_test

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.89142,0.998537,0.827754,0.90516
1,Logistic Regression,0.631891,1.0,0.411936,0.583505
2,Gradient Boosting,0.89142,0.998537,0.827754,0.90516


In [15]:
# One row per model, built from the metrics measured on "test1_access.log"
model_performance_table_test1 = pd.DataFrame([
    {
        "Model": model_name,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "F1 Score": metrics["f1"],
    }
    for model_name, metrics in (
        ("Random Forest", model_rf_performance_test1),
        ("Logistic Regression", model_lr_performance_test1),
        ("Gradient Boosting", model_gb_performance_test1),
    )
])
model_performance_table_test1

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.975353,0.948357,1.0,0.973494
1,Logistic Regression,0.93029,0.992759,0.8522,0.917125
2,Gradient Boosting,0.973016,0.943736,1.0,0.971054


In [16]:
# Create a combined table with dataset information:
# tag each per-dataset summary with the name of the dataset it was measured on
training_data = model_performance_table_train.assign(Dataset="Training")
test_data = model_performance_table_test.assign(Dataset="Test")
test1_data = model_performance_table_test1.assign(Dataset="Test1")

# Add blank rows between datasets (purely a visual separator; the empty
# strings mean the metric columns of the combined table also hold non-numeric values)
blank_row = pd.DataFrame({col: [''] for col in training_data.columns}, index=[0])
combined_performance = pd.concat([
    training_data,
    blank_row,
    test_data,
    blank_row.copy(),
    test1_data
], ignore_index=True)

# Reorder columns to put Dataset first
column_order = ["Dataset", "Model", "Accuracy", "Precision", "Recall", "F1 Score"]
combined_performance = combined_performance[column_order]

# Display the combined summary
combined_performance


Unnamed: 0,Dataset,Model,Accuracy,Precision,Recall,F1 Score
0,Training,Random Forest,0.999334,0.999376,0.999208,0.999292
1,Training,Logistic Regression,0.975574,0.956121,0.99366,0.974529
2,Training,Gradient Boosting,0.986747,0.993164,0.978554,0.985805
3,,,,,,
4,Test,Random Forest,0.89142,0.998537,0.827754,0.90516
5,Test,Logistic Regression,0.631891,1.0,0.411936,0.583505
6,Test,Gradient Boosting,0.89142,0.998537,0.827754,0.90516
7,,,,,,
8,Test1,Random Forest,0.975353,0.948357,1.0,0.973494
9,Test1,Logistic Regression,0.93029,0.992759,0.8522,0.917125


## Detection per Time Window

### Common Helper Functions for Testing and Evaluation of the Models

In [17]:
def aggregate(df: pd.DataFrame, time_window: str = "5min") -> pd.DataFrame:
    """Aggregate per-request records into one row per IP address and time window.

    Args:
        df: Per-request frame with numeric "timestamp" (milliseconds), the
            traffic measurement columns, "connection_status", "path" and a
            numeric "label". A "datetime" column derived from "timestamp" is
            added to this frame as a side effect.
        time_window: Pandas frequency string giving the window length.

    Returns:
        Frame sorted by window start with the columns "ip", "datetime",
        "request_count", mean/sum statistics of the traffic measurements,
        "label" (maximum label within the window),
        "connection_status_+_count", "connection_status_-_count" and
        "unique_paths". Both connection-status counters are always present,
        even when a status never occurs in ``df``.
    """
    def _window_keys():
        # A fresh Grouper per groupby call avoids sharing state between calls
        return ["ip", pd.Grouper(key="datetime", freq=time_window)]

    # Convert timestamp to datetime for easier time window grouping
    df["datetime"] = pd.to_datetime(df["timestamp"], unit="ms")

    # Traffic statistics per IP and time window
    df_grouped = df.groupby(_window_keys()).agg({
        "response_size": ["count", "mean", "sum"],
        "bytes_received": ["mean", "sum"],
        "bytes_sent": ["mean", "sum"],
        "bytes_transferred": ["mean", "sum"],
        "processing_time": ["mean", "sum"],
        "label": ["max"]  # Use max to identify if any request in the window was malicious
    })

    # Flatten the column hierarchy and rename columns for clarity
    df_grouped.columns = ["_".join(col).strip() for col in df_grouped.columns.values]
    df_grouped = df_grouped.rename(columns={"response_size_count": "request_count", "label_max": "label"})

    # Connection-status counters. A status that never occurs would otherwise
    # produce no column at all, so the feature set would differ between datasets.
    connection_stats = df.groupby(_window_keys())["connection_status"].value_counts().unstack(fill_value=0)
    known_statuses = ["+", "-"]
    other_statuses = [status for status in connection_stats.columns if status not in known_statuses]
    for status in known_statuses:
        if status not in connection_stats.columns:
            connection_stats[status] = 0
    connection_stats = connection_stats[known_statuses + other_statuses]
    df_grouped = df_grouped.join(connection_stats)
    df_grouped = df_grouped.rename(columns={"+": "connection_status_+_count", "-": "connection_status_-_count"})

    # Path diversity, joined on the (ip, window) index so rows are matched by
    # key rather than by position
    unique_paths = df.groupby(_window_keys())["path"].nunique().rename("unique_paths")
    df_grouped = df_grouped.join(unique_paths)

    # Reset index to make IP and time window regular columns
    df_grouped = df_grouped.reset_index()

    # Sort by window start
    return df_grouped.sort_values(by="datetime")

def normalize_numerical_features_tw(df: pd.DataFrame, scaler=None) -> pd.DataFrame:
    """Standardize the time-window feature columns of ``df`` in place.

    Args:
        df: Aggregated frame holding the columns listed in ``numerical_features``.
        scaler: Optional, already fitted scaler exposing ``transform``. Pass
            the scaler fitted on the training windows when preparing
            evaluation data so both are scaled with the same statistics. When
            omitted, a new StandardScaler is fitted on ``df`` itself
            (original behavior).

    Returns:
        The same ``df`` object with the numerical columns replaced by their
        scaled values.
    """
    numerical_features = [
        "request_count",
        "response_size_mean",
        "response_size_sum",
        "bytes_received_mean",
        "bytes_received_sum",
        "bytes_sent_mean",
        "bytes_sent_sum",
        "bytes_transferred_mean",
        "bytes_transferred_sum",
        "processing_time_mean",
        "processing_time_sum",
        "connection_status_+_count",
        "connection_status_-_count",
        "unique_paths"
    ]
    if scaler is None:
        scaler = StandardScaler()
        df[numerical_features] = scaler.fit_transform(df[numerical_features])
    else:
        # Reuse the statistics learned elsewhere; do not refit on this frame
        df[numerical_features] = scaler.transform(df[numerical_features])
    return df

def get_dataset_for_model_tw(df):
    """Split an aggregated time-window frame into identifiers, features and target.

    Returns:
        Tuple ``(ip_timestamps, X, y)``: the "ip"/"datetime" columns, every
        remaining column except "label", and the "label" column.
    """
    identifier_columns = ["ip", "datetime"]
    features = df.drop(columns=identifier_columns + ["label"])
    return df[identifier_columns], features, df["label"]

def prepare_and_test_tw(_model, dataset_path: str, output_modifier=None):
    """Load a log file, aggregate it into time windows and score ``_model`` on the result.

    Returns the metrics dict produced by ``predict_and_evaluate``.
    """
    print(f"Using dataset '{dataset_path}'")
    requests = preprocess(drop_columns(load_dataset(dataset_path)))
    windows = aggregate(encode_labels(requests))
    # Features are left unscaled on purpose: normalize_numerical_features_tw is
    # not applied because it was found to break the following models.

    _, X_test, y_test = get_dataset_for_model_tw(windows)
    model_performance = predict_and_evaluate(_model, X_test, y_test, output_modifier)
    print("--------------------------------")
    return model_performance

### Dataset Pre-processing

In [18]:
# Reload the training log from scratch for the time-window pipeline
df = drop_columns(load_dataset("../dataset/train1_access.log"))
df.describe()

Unnamed: 0,ip,timestamp,path,response_size,bytes_received,bytes_sent,bytes_transferred,connection_status,keepalive_count,processing_time,label
count,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934
unique,194761,329748,2,4,372,7,422,2,8,102841,2
top,172.19.227.19,1746989729109,/index.html,100396,471,100684,10860,+,0,9166,0
freq,81,23,255691,248147,9404,242068,9130,411386,383172,52,221926


In [19]:
# Numeric conversion followed by integer label encoding
df = encode_labels(preprocess(df))

# Aggregate stats per IP within 5 minute time windows
df_grouped = aggregate(df)
df_grouped.describe()

Unnamed: 0,datetime,request_count,response_size_mean,response_size_sum,bytes_received_mean,bytes_received_sum,bytes_sent_mean,bytes_sent_sum,bytes_transferred_mean,bytes_transferred_sum,processing_time_mean,processing_time_sum,label,connection_status_+_count,connection_status_-_count,unique_paths
count,224757,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0,224757.0
mean,2025-05-11 19:40:43.600421120,1.863942,88855.503733,118189.3,437.385539,910.908848,89139.800299,118721.0,89577.185838,119631.9,721944.9,736649.0,0.85224,1.830359,0.033583,1.110114
min,2025-05-11 18:00:00,1.0,221.0,221.0,190.0,190.0,405.0,405.0,596.0,596.0,39.0,39.0,0.0,0.0,0.0,1.0
25%,2025-05-11 19:25:00,1.0,100396.0,100396.0,403.0,403.0,100684.0,100684.0,101074.0,101086.0,9314.0,9917.0,1.0,1.0,0.0,1.0
50%,2025-05-11 19:35:00,1.0,100396.0,100396.0,428.0,430.0,100684.0,100684.0,101102.0,101112.0,20821.0,37372.0,1.0,1.0,0.0,1.0
75%,2025-05-11 19:45:00,1.0,100396.0,100396.0,459.0,461.0,100684.0,100684.0,101124.0,101143.0,72815.0,89351.0,1.0,1.0,0.0,1.0
max,2025-05-11 23:35:00,81.0,100396.0,2463189.0,777.666667,46969.0,100684.0,2484378.0,101456.0,2525351.0,21624310.0,42610230.0,1.0,81.0,2.0,2.0
std,,3.300364,27699.857161,91017.65,65.976268,1869.384989,27711.479861,91885.72,27696.819163,93579.48,3679731.0,3684428.0,0.354862,3.314032,0.180376,0.313033


In [20]:
# Scaling is deliberately skipped: applying normalize_numerical_features_tw(df_grouped)
# here was found to break the following models, so the raw aggregates are used as-is.
df_grouped


Unnamed: 0,ip,datetime,request_count,response_size_mean,response_size_sum,bytes_received_mean,bytes_received_sum,bytes_sent_mean,bytes_sent_sum,bytes_transferred_mean,bytes_transferred_sum,processing_time_mean,processing_time_sum,label,connection_status_+_count,connection_status_-_count,unique_paths
8003,172.17.86.108,2025-05-11 18:00:00,3,70298.333333,210895,505.666667,1517,70585.333333,211756,71091.000000,213273,13380.666667,40142,0,3,0,2
13619,172.19.165.119,2025-05-11 18:00:00,1,100396.000000,100396,410.000000,410,100684.000000,100684,101094.000000,101094,12692.000000,12692,0,1,0,1
26971,172.22.208.215,2025-05-11 18:00:00,1,100396.000000,100396,717.000000,717,100684.000000,100684,101401.000000,101401,12260.000000,12260,0,1,0,1
12904,172.19.123.119,2025-05-11 18:00:00,2,100396.000000,200792,538.000000,1076,100683.500000,201367,101221.500000,202443,11768.000000,23536,0,2,0,1
23462,172.21.250.180,2025-05-11 18:00:00,2,55249.500000,110499,487.000000,974,55536.500000,111073,56023.500000,112047,14721.500000,29443,0,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30930,172.23.200.168,2025-05-11 23:35:00,5,46220.200000,231101,580.000000,2900,46506.800000,232534,47086.800000,235434,12361.600000,61808,0,5,0,2
24805,172.21.93.185,2025-05-11 23:35:00,10,46220.200000,462202,504.600000,5046,46507.000000,465070,47011.600000,470116,9183.400000,91834,0,10,0,2
8467,172.18.110.159,2025-05-11 23:35:00,8,43962.875000,351703,492.500000,3940,44249.500000,353996,44742.000000,357936,11448.375000,91587,0,8,0,2
21805,172.21.157.185,2025-05-11 23:35:00,18,20135.555556,362440,494.500000,8901,20421.611111,367589,20916.111111,376490,9413.333333,169440,0,18,0,2


In [21]:
# Separate ip, datetime and labels from the dataset.
# NOTE: this rebinds X and y, replacing the per-request features and labels
# assigned earlier; every model fitted below uses the time-window data.
_, X, y = get_dataset_for_model_tw(df_grouped)

### Random Forest

In [22]:
# Training: fit a random forest on the time-window features
model_rf_tw = RandomForestClassifier(n_estimators=10, random_state=42)
model_rf_tw.fit(X, y)

# Testing: first on the training windows themselves, then on both test logs
model_rf_tw_performance_train = predict_and_evaluate(model_rf_tw, X, y)
print("--------------------------------")
model_rf_tw_performance_test = prepare_and_test_tw(model_rf_tw, "../dataset/test_access.log")
model_rf_tw_performance_test1 = prepare_and_test_tw(model_rf_tw, "../dataset/test1_access.log")


Accuracy: 0.9994171482979396
Precision: 0.9996293550778355
Recall: 0.9996867609516202
F1 Score: 0.9996580571905874
True Positive Rate: 0.9996867609516202
True Negative Rate: 0.9978620897320084
False Positive Rate: 0.002137910267991569
False Negative Rate: 0.00031323904837977104
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.9972801450589301
Precision: 0.997093023255814
Recall: 1.0
F1 Score: 0.9985443959243085
True Positive Rate: 1.0
True Negative Rate: 0.9594594594594594
False Positive Rate: 0.04054054054054054
False Negative Rate: 0.0
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.9917427669198601
Precision: 0.9927598065834045
Recall: 0.9972207792207792
F1 Score: 0.9949852927837457
True Positive Rate: 0.9972207792207792
True Negative Rate: 0.9665391969407265
False Positive Rate: 0.033460803059273424
False Negative Rate: 0.0027792207792207793
--------------------------------


In [23]:

# Get feature importances of the time-window random forest, most important first
feature_importances = pd.DataFrame({
    'Feature': X.columns,
    'Importance': model_rf_tw.feature_importances_
}).sort_values('Importance', ascending=False)

# Show the ten highest-ranked features
print("\nTop 10 most important features:")
print(feature_importances.head(10))


Top 10 most important features:
                      Feature  Importance
8       bytes_transferred_sum    0.246445
4          bytes_received_sum    0.189826
0               request_count    0.137979
3         bytes_received_mean    0.121403
13               unique_paths    0.072725
1          response_size_mean    0.066758
11  connection_status_+_count    0.034378
9        processing_time_mean    0.026015
12  connection_status_-_count    0.025862
2           response_size_sum    0.024999


### Logistic Regression

In [24]:
# Training: fit logistic regression on the (unscaled) time-window features
model_lr_tw = LogisticRegression(max_iter=200, random_state=42)
model_lr_tw.fit(X, y)

# Testing: first on the training windows themselves, then on both test logs
model_lr_tw_performance_train = predict_and_evaluate(model_lr_tw, X, y)
print("--------------------------------")
model_lr_tw_performance_test = prepare_and_test_tw(model_lr_tw, "../dataset/test_access.log")
model_lr_tw_performance_test1 = prepare_and_test_tw(model_lr_tw, "../dataset/test1_access.log")

Accuracy: 0.9961425005672794
Precision: 0.9956383410099918
Recall: 0.9998538217774228
F1 Score: 0.997741628788372
True Positive Rate: 0.9998538217774228
True Negative Rate: 0.9747365251430292
False Positive Rate: 0.02526347485697079
False Negative Rate: 0.0001461782225772265
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.9972801450589301
Precision: 0.997093023255814
Recall: 1.0
F1 Score: 0.9985443959243085
True Positive Rate: 1.0
True Negative Rate: 0.9594594594594594
False Positive Rate: 0.04054054054054054
False Negative Rate: 0.0
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.992596227703337
Precision: 0.9910675213015162
Recall: 1.0
F1 Score: 0.9955137238677647
True Positive Rate: 1.0
True Negative Rate: 0.9585325047801148
False Positive Rate: 0.04146749521988528
False Negative Rate: 0.0
--------------------------------


### K-Nearest Neighbors

In [25]:
# Training: fit a 3-nearest-neighbors classifier on the (unscaled) time-window features
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X, y)

# Testing: first on the training windows themselves, then on both test logs
model_knn_tw_performance_train = predict_and_evaluate(knn, X, y)
print("--------------------------------")
model_knn_tw_performance_test = prepare_and_test_tw(knn, "../dataset/test_access.log")
model_knn_tw_performance_test1 = prepare_and_test_tw(knn, "../dataset/test1_access.log")

Accuracy: 0.9956219383600956
Precision: 0.9955892832065079
Recall: 0.9992899914903391
F1 Score: 0.9974362047492742
True Positive Rate: 0.9992899914903391
True Negative Rate: 0.9744655224330021
False Positive Rate: 0.025534477566997893
False Negative Rate: 0.0007100085096608143
--------------------------------
Using dataset '../dataset/test_access.log'
Accuracy: 0.993427017225748
Precision: 0.9930036188178528
Recall: 1.0
F1 Score: 0.9964895291126982
True Positive Rate: 1.0
True Negative Rate: 0.902027027027027
False Positive Rate: 0.09797297297297297
False Negative Rate: 0.0
--------------------------------
Using dataset '../dataset/test1_access.log'
Accuracy: 0.9863873005035418
Precision: 0.984739079223639
Recall: 0.9989090909090909
F1 Score: 0.9917734739665266
True Positive Rate: 0.9989090909090909
True Negative Rate: 0.9287762906309751
False Positive Rate: 0.07122370936902486
False Negative Rate: 0.001090909090909091
--------------------------------


### Performance Metrics Summary

In [26]:
# One row per time-window model, built from the metrics measured on the training data
model_tw_performance_table_train = pd.DataFrame([
    {
        "Model": model_name,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "F1 Score": metrics["f1"],
    }
    for model_name, metrics in (
        ("Random Forest", model_rf_tw_performance_train),
        ("Logistic Regression", model_lr_tw_performance_train),
        ("KNN", model_knn_tw_performance_train),
    )
])

model_tw_performance_table_train

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.999417,0.999629,0.999687,0.999658
1,Logistic Regression,0.996143,0.995638,0.999854,0.997742
2,KNN,0.995622,0.995589,0.99929,0.997436


In [27]:
# One row per time-window model, built from the metrics measured on "test_access.log"
model_tw_performance_table_test = pd.DataFrame([
    {
        "Model": model_name,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "F1 Score": metrics["f1"],
    }
    for model_name, metrics in (
        ("Random Forest", model_rf_tw_performance_test),
        ("Logistic Regression", model_lr_tw_performance_test),
        ("KNN", model_knn_tw_performance_test),
    )
])
model_tw_performance_table_test

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.99728,0.997093,1.0,0.998544
1,Logistic Regression,0.99728,0.997093,1.0,0.998544
2,KNN,0.993427,0.993004,1.0,0.99649


In [28]:
# One row per time-window model, built from the metrics measured on "test1_access.log"
model_tw_performance_table_test1 = pd.DataFrame([
    {
        "Model": model_name,
        "Accuracy": metrics["accuracy"],
        "Precision": metrics["precision"],
        "Recall": metrics["recall"],
        "F1 Score": metrics["f1"],
    }
    for model_name, metrics in (
        ("Random Forest", model_rf_tw_performance_test1),
        ("Logistic Regression", model_lr_tw_performance_test1),
        ("KNN", model_knn_tw_performance_test1),
    )
])
model_tw_performance_table_test1

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Random Forest,0.991743,0.99276,0.997221,0.994985
1,Logistic Regression,0.992596,0.991068,1.0,0.995514
2,KNN,0.986387,0.984739,0.998909,0.991773


In [29]:
# Create a combined table with dataset information:
# tag each time-window summary with the name of the dataset it was measured on
training_data_tw = model_tw_performance_table_train.assign(Dataset="Training")
test_data_tw = model_tw_performance_table_test.assign(Dataset="Test")
test1_data_tw = model_tw_performance_table_test1.assign(Dataset="Test1")

# Add blank rows to visually separate used datasets (the empty strings mean
# the metric columns of the combined table also hold non-numeric values)
blank_row_tw = pd.DataFrame({col: [''] for col in training_data_tw.columns}, index=[0])
combined_performance_tw = pd.concat([
    training_data_tw,
    blank_row_tw,
    test_data_tw,
    blank_row_tw.copy(),
    test1_data_tw
], ignore_index=True)

# Reorder columns to put Dataset first
column_order_tw = ["Dataset", "Model", "Accuracy", "Precision", "Recall", "F1 Score"]
combined_performance_tw = combined_performance_tw[column_order_tw]

# Display the combined time-window summary
combined_performance_tw

Unnamed: 0,Dataset,Model,Accuracy,Precision,Recall,F1 Score
0,Training,Random Forest,0.999417,0.999629,0.999687,0.999658
1,Training,Logistic Regression,0.996143,0.995638,0.999854,0.997742
2,Training,KNN,0.995622,0.995589,0.99929,0.997436
3,,,,,,
4,Test,Random Forest,0.99728,0.997093,1.0,0.998544
5,Test,Logistic Regression,0.99728,0.997093,1.0,0.998544
6,Test,KNN,0.993427,0.993004,1.0,0.99649
7,,,,,,
8,Test1,Random Forest,0.991743,0.99276,0.997221,0.994985
9,Test1,Logistic Regression,0.992596,0.991068,1.0,0.995514


In [30]:
# Show the per-request summary again, next to the time-window summary displayed just before
combined_performance

Unnamed: 0,Dataset,Model,Accuracy,Precision,Recall,F1 Score
0,Training,Random Forest,0.999334,0.999376,0.999208,0.999292
1,Training,Logistic Regression,0.975574,0.956121,0.99366,0.974529
2,Training,Gradient Boosting,0.986747,0.993164,0.978554,0.985805
3,,,,,,
4,Test,Random Forest,0.89142,0.998537,0.827754,0.90516
5,Test,Logistic Regression,0.631891,1.0,0.411936,0.583505
6,Test,Gradient Boosting,0.89142,0.998537,0.827754,0.90516
7,,,,,,
8,Test1,Random Forest,0.975353,0.948357,1.0,0.973494
9,Test1,Logistic Regression,0.93029,0.992759,0.8522,0.917125
