# Detection of DDoS Attack Using Machine Learning

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Parse logs into DataFrame
columns = ["ip", "logname", "user", "timestamp", "method", "path", "query", 
           "protocol", "status", "response_size", "referer", "user_agent", "bytes_received",
           "bytes_sent", "bytes_transferred", "connection_status", "keepalive_count",
           "processing_time", "error_log_id", "label"]

df = pd.DataFrame(
    np.genfromtxt("../dataset/train1_access.log", delimiter="|", dtype=str, encoding="utf-8"),
    columns=columns
)
df = df[df["label"] != "-"]
df = df.drop(['logname', 'user', 'method', 'protocol', 'error_log_id', "user_agent", "query", "status", "referer"], axis=1)
df.describe()

Unnamed: 0,ip,timestamp,path,response_size,bytes_received,bytes_sent,bytes_transferred,connection_status,keepalive_count,processing_time,label
count,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934,418934
unique,194761,329748,2,4,372,7,422,2,8,102841,2
top,172.19.227.19,1746989729109,/index.html,100396,471,100684,10860,+,0,9166,0
freq,81,23,255691,248147,9404,242068,9130,411386,383172,52,221926


In [2]:
# Preprocessing
df["timestamp"] = pd.to_numeric(df["timestamp"])
df["response_size"] = pd.to_numeric(df["response_size"])
df["bytes_received"] = pd.to_numeric(df["bytes_received"])
df["bytes_sent"] = pd.to_numeric(df["bytes_sent"])
df["bytes_transferred"] = pd.to_numeric(df["bytes_transferred"])
df["keepalive_count"] = pd.to_numeric(df["keepalive_count"])
df["processing_time"] = pd.to_numeric(df["processing_time"])

# One-hot encoding
df = pd.get_dummies(df, columns=["path", "connection_status"])
df.head()

Unnamed: 0,ip,timestamp,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,label,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,172.21.250.180,1746986686795,100396,457,100684,101141,0,22028,0,True,False,True,False
1,172.19.123.119,1746986687918,100396,510,100684,101194,0,12603,0,True,False,True,False
2,172.17.86.108,1746986689457,100396,467,100684,101151,0,14308,0,True,False,True,False
3,172.20.216.143,1746986690686,100396,716,100684,101400,0,13694,0,True,False,True,False
4,172.19.165.119,1746986691804,100396,410,100684,101094,0,12692,0,True,False,True,False


In [3]:

le = LabelEncoder()
df["label"] = le.fit_transform(df["label"])

scaler = StandardScaler()
numerical_features = ["response_size", "bytes_received", "bytes_sent", "bytes_transferred", "keepalive_count", "processing_time"]
df[numerical_features] = scaler.fit_transform(df[numerical_features])

In [5]:
# Dataset preparation
ip_timestamps = df[["ip", "timestamp"]]
X = df[[
    "response_size",
    "bytes_received",
    "bytes_sent",
    "bytes_transferred",
    "keepalive_count",
    "processing_time",
    "path_/index.html",
    "path_/item.html",
    "connection_status_+",
    "connection_status_-"
]]
y = df["label"]

In [6]:
X

Unnamed: 0,response_size,bytes_received,bytes_sent,bytes_transferred,keepalive_count,processing_time,path_/index.html,path_/item.html,connection_status_+,connection_status_-
0,0.829257,-0.347614,0.829251,0.829325,-0.281424,-0.137208,True,False,True,False
1,0.829257,0.233566,0.829251,0.830514,-0.281424,-0.140673,True,False,True,False
2,0.829257,-0.237957,0.829251,0.829550,-0.281424,-0.140047,True,False,True,False
3,0.829257,2.492493,0.829251,0.835137,-0.281424,-0.140272,True,False,True,False
4,0.829257,-0.863000,0.829251,0.828271,-0.281424,-0.140641,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...
418929,-1.195094,0.288395,-1.194979,-1.195521,-0.281424,-0.141999,False,True,True,False
418930,-1.195094,1.088888,-1.195001,-1.193905,2.477720,-0.140931,False,True,True,False
418931,-1.195094,3.139468,-1.194979,-1.189686,-0.281424,-0.141360,False,True,True,False
418932,-1.195094,-0.183129,-1.194979,-1.196486,-0.281424,-0.140076,False,True,True,False


In [7]:
y

0         0
1         0
2         0
3         0
4         0
         ..
418929    0
418930    0
418931    0
418932    0
418933    0
Name: label, Length: 418934, dtype: int64

In [8]:
# Training
model = RandomForestClassifier(random_state=42)
model.fit(X, y)

In [9]:
# Prediction
y_predict = model.predict(X)

# Evaluation
accuracy = accuracy_score(y, y_predict)
precision = precision_score(y, y_predict)
recall = recall_score(y, y_predict)
f1 = f1_score(y, y_predict)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

Accuracy: 0.9994509875063853
Precision: 0.9995278324973091
Recall: 0.9993045967676439
F1 Score: 0.9994162021666514
