In [1]:
import pandas as pd

from data_helper import load_data, load_true_labels, add_binary_true_labels_to_dataset, add_labels_to_dataset

In [2]:
# PATHS

# 1. santos | intranet_server | apache2 | intranet.smith.santos.com-access.log
# Dataset root, given relative to the notebook's working directory.
path_santos = "../../AIT_LD-v2/santos"
# Raw Apache access log; appended to path_santos when the file is read.
path_log_file = "/gather/intranet_server/logs/apache2/intranet.smith.santos.com-access.log"
# Label file with the same relative name under /labels; appended to path_santos when loaded.
path_true_labels = "/labels/intranet_server/logs/apache2/intranet.smith.santos.com-access.log"

In [3]:
# Names for the nine fields of each access-log line (the file has no header row)
columns = [
    'ip', 'identity', 'user', 'datetime', 'request',
    'status', 'size', 'referer', 'user_agent',
]

# Field separator: one whitespace character that is
#   - outside a double-quoted string (an even number of '"' remains up to the
#     end of the line), and
#   - outside a [...] group (no ']' follows before the next '[').
# This keeps the bracketed timestamp and the quoted request / referer /
# user-agent strings together as single fields.
field_separator = r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])'

access_log_path = path_santos + path_log_file

# A regex separator requires the python parsing engine.
# quoting=3 (csv.QUOTE_NONE) keeps the double quotes as part of the values;
# a field that is exactly '-' is read as NaN.
df_raw = pd.read_csv(
    access_log_path,
    names=columns,
    header=None,
    sep=field_separator,
    engine='python',
    quoting=3,
    na_values='-',
)

df_raw.head()

Unnamed: 0,ip,identity,user,datetime,request,status,size,referer,user_agent
0,172.21.128.119,,,[17/Jan/2022:06:25:38 +0000],"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,3598,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86..."
1,172.21.128.119,,,[17/Jan/2022:06:26:38 +0000],"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86..."
2,172.21.128.119,,,[17/Jan/2022:06:27:38 +0000],"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86..."
3,172.21.128.119,,,[17/Jan/2022:06:28:38 +0000],"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86..."
4,172.21.128.119,,,[17/Jan/2022:06:29:38 +0000],"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86..."


In [4]:
# Summary statistics of the numeric columns
df_raw.describe()

Unnamed: 0,identity,user,status,size
count,0.0,0.0,9462.0,9462.0
mean,,,367.807123,4388.614458
std,,,77.887448,31888.408164
min,,,200.0,0.0
25%,,,404.0,146.0
50%,,,404.0,363.0
75%,,,404.0,363.0
max,,,500.0,569760.0


In [5]:
# Column dtypes and non-null counts
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9462 entries, 0 to 9461
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ip          9462 non-null   object 
 1   identity    0 non-null      float64
 2   user        0 non-null      float64
 3   datetime    9462 non-null   object 
 4   request     9462 non-null   object 
 5   status      9462 non-null   int64  
 6   size        9462 non-null   int64  
 7   referer     9462 non-null   object 
 8   user_agent  9462 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 665.4+ KB


In [6]:
# Drop the two columns that carry no data: `identity` and `user` have
# 0 non-null values in this log. This removes columns, not rows.
# errors="ignore" keeps the cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone from df_raw.
df_raw = df_raw.drop(columns=["identity", "user"], errors="ignore")

df_raw.describe()

Unnamed: 0,status,size
count,9462.0,9462.0
mean,367.807123,4388.614458
std,77.887448,31888.408164
min,200.0,0.0
25%,404.0,146.0
50%,404.0,363.0
75%,404.0,363.0
max,500.0,569760.0


In [7]:
# Count of missing values in each column
df_raw.isna().sum()

ip            0
datetime      0
request       0
status        0
size          0
referer       0
user_agent    0
dtype: int64

In [8]:
# Number of rows that repeat an earlier row exactly (the first occurrence is not counted)
df_raw.duplicated().sum()

5

In [9]:
# Inspect the rows that are exact repeats of an earlier row
duplicate_mask = df_raw.duplicated()
df_raw[duplicate_mask]

Unnamed: 0,ip,datetime,request,status,size,referer,user_agent
40,192.168.104.125,[17/Jan/2022:06:36:12 +0000],"""-""",408,3002,"""-""","""-"""
8245,192.168.104.4,[17/Jan/2022:11:43:19 +0000],"""-""",408,3002,"""-""","""-"""
8246,192.168.104.4,[17/Jan/2022:11:43:19 +0000],"""-""",408,3002,"""-""","""-"""
9313,192.168.104.4,[17/Jan/2022:18:39:44 +0000],"""-""",408,3002,"""-""","""-"""
9375,192.168.104.141,[17/Jan/2022:18:56:51 +0000],"""-""",408,3002,"""-""","""-"""


In [10]:
# Load the label file that sits at the access log's relative path under /labels
true_labels_path = path_santos + path_true_labels
df_true_labels = load_true_labels(true_labels_path)

# Preview the first ten label records
df_true_labels.head(10)

Unnamed: 0,line,labels,rules
0,431,"[service_scan, foothold]","{'service_scan': ['attacker.service_scan'], 'f..."
1,432,"[service_scan, foothold]","{'service_scan': ['attacker.service_scan'], 'f..."
2,433,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
3,434,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
4,435,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
5,436,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
6,437,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
7,438,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
8,439,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
9,440,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...


In [11]:
# Manually extract extra features

# Possible features:

# o Request
#  - request_method
#  - path_analysis -> len, depth, has query
#  - sql_patterns (maybe True / False if found) | select, insert, update, delete, union, join ?CHECK, CITE
#  - suspicious_file_extensions (True / False) | php, asp, exe, dll, sh, bat ?CHECK, CITE
# o Response
#  - response_size
#  - status_code
# o User Agent
#  - search for specific terms (bot, crawler, spider, scan, WPscan, curl, wget, python, perl) ?CHECK, CITE