In [21]:
import pandas as pd
import numpy as np

from helpers.data_helper import load_true_labels
from helpers.intranet_access_log_helper import train_log_classifier, extract_features, create_sequence_features, create_lstm_model

In [2]:
# PATHS 

# 1. santos | intranet_server | apache2 | intranet.smith.santos.com.access.log 
path_santos = "../AIT_LD-v2/santos"
path_log_file = "/gather/intranet_server/logs/apache2/intranet.smith.santos.com-access.log"
path_true_labels = "/labels/intranet_server/logs/apache2/intranet.smith.santos.com-access.log"

In [3]:
# Define the column names for the log format
columns = ['ip', 'logname', 'username', 'timestamp', 'requestmethod', 'statuscode', 'responsesize', 'referrer', 'user_agent']

# Read the log file with space as delimiter
df_raw = pd.read_csv(
    path_santos + path_log_file,
    sep=r'\s(?=(?:[^"]*"[^"]*")*[^"]*$)(?![^\[]*\])',
    engine='python',
    header=None,
    names=columns,
    na_values='-',
    quoting=3
)

# Post-process the timestamp and request method columns
# Remove brackets from timestamp
df_raw['timestamp'] = df_raw['timestamp'].str.strip('[]')

# Split the request method column (which contains "GET /path HTTP/1.1")
# Create new columns for method, path, and protocol
df_raw[['method', 'path', 'protocol']] = df_raw['requestmethod'].str.strip('"').str.split(' ', n=2, expand=True)

# Drop the original requestmethod column
#df_raw = df_raw.drop('requestmethod', axis=1)

df_raw.head()

Unnamed: 0,ip,logname,username,timestamp,requestmethod,statuscode,responsesize,referrer,user_agent,method,path,protocol
0,172.21.128.119,,,17/Jan/2022:06:25:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,3598,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
1,172.21.128.119,,,17/Jan/2022:06:26:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
2,172.21.128.119,,,17/Jan/2022:06:27:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
3,172.21.128.119,,,17/Jan/2022:06:28:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
4,172.21.128.119,,,17/Jan/2022:06:29:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1


In [4]:
# Load true labels
true_labels = load_true_labels(path_santos + path_true_labels)

true_labels

Unnamed: 0,line,labels,rules
0,431,"[service_scan, foothold]","{'service_scan': ['attacker.service_scan'], 'f..."
1,432,"[service_scan, foothold]","{'service_scan': ['attacker.service_scan'], 'f..."
2,433,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
3,434,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
4,435,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
...,...,...,...
7789,8221,"[attacker_http, foothold, webshell_cmd]",{'attacker_http': ['attacker.foothold.apache.a...
7790,8222,"[webshell_cmd, escalate]",{'webshell_cmd': ['attacker.escalate.webshell....
7791,8223,"[webshell_cmd, escalate]",{'webshell_cmd': ['attacker.escalate.webshell....
7792,8224,"[webshell_cmd, escalate]",{'webshell_cmd': ['attacker.escalate.webshell....


In [5]:
df_raw

Unnamed: 0,ip,logname,username,timestamp,requestmethod,statuscode,responsesize,referrer,user_agent,method,path,protocol
0,172.21.128.119,,,17/Jan/2022:06:25:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,3598,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
1,172.21.128.119,,,17/Jan/2022:06:26:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
2,172.21.128.119,,,17/Jan/2022:06:27:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
3,172.21.128.119,,,17/Jan/2022:06:28:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
4,172.21.128.119,,,17/Jan/2022:06:29:38 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1171,"""https://intranet.smith.santos.com/wp-admin/up...","""Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:86...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
...,...,...,...,...,...,...,...,...,...,...,...,...
9457,172.21.128.119,,,17/Jan/2022:20:26:02 +0000,"""GET /wp-content/plugins/wpdiscuz/assets/img/l...",200,4069,"""https://intranet.smith.santos.com/wp-content/...","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",GET,/wp-content/plugins/wpdiscuz/assets/img/loadin...,HTTP/1.1
9458,172.21.128.119,,,17/Jan/2022:20:26:02 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,5673,"""https://intranet.smith.santos.com/?p=5""","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
9459,172.21.128.119,,,17/Jan/2022:20:26:02 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,540,"""https://intranet.smith.santos.com/?p=5""","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",POST,/wp-admin/admin-ajax.php,HTTP/1.1
9460,172.21.128.119,,,17/Jan/2022:20:26:31 +0000,"""POST /wp-admin/admin-ajax.php HTTP/1.1""",200,1230,"""https://intranet.smith.santos.com/?p=5""","""Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/5...",POST,/wp-admin/admin-ajax.php,HTTP/1.1


In [6]:
df_raw.describe()

Unnamed: 0,logname,username,statuscode,responsesize
count,0.0,0.0,9462.0,9462.0
mean,,,367.807123,4388.614458
std,,,77.887448,31888.408164
min,,,200.0,0.0
25%,,,404.0,146.0
50%,,,404.0,363.0
75%,,,404.0,363.0
max,,,500.0,569760.0


In [7]:
# Change object types to strings

df_raw['ip'] = df_raw['ip'].astype(str)
df_raw['timestamp'] = df_raw['timestamp'].astype(str)
df_raw['requestmethod'] = df_raw['requestmethod'].astype(str)
df_raw['referrer'] = df_raw['referrer'].astype(str)
df_raw['user_agent'] = df_raw['user_agent'].astype(str)
df_raw['method'] = df_raw['method'].astype(str)
df_raw['path'] = df_raw['path'].astype(str)
df_raw['protocol'] = df_raw['protocol'].astype(str)

# Drop username and logname (never used)
df_raw.drop(['username', 'logname'], axis=1, inplace=True)


In [8]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9462 entries, 0 to 9461
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ip             9462 non-null   object
 1   timestamp      9462 non-null   object
 2   requestmethod  9462 non-null   object
 3   statuscode     9462 non-null   int64 
 4   responsesize   9462 non-null   int64 
 5   referrer       9462 non-null   object
 6   user_agent     9462 non-null   object
 7   method         9462 non-null   object
 8   path           9462 non-null   object
 9   protocol       9462 non-null   object
dtypes: int64(2), object(8)
memory usage: 739.3+ KB


In [9]:
df_features, encoders = extract_features(df_raw)

df_features

Unnamed: 0,method_encoded,path_encoded,protocol_encoded,statuscode,responsesize,path_length,has_php,has_wp,is_scan
0,4,4505,1,200,3598,24,1,1,0
1,4,4505,1,200,1171,24,1,1,0
2,4,4505,1,200,1171,24,1,1,0
3,4,4505,1,200,1171,24,1,1,0
4,4,4505,1,200,1171,24,1,1,0
...,...,...,...,...,...,...,...,...,...
9457,1,4925,1,200,4069,51,0,1,0
9458,4,4505,1,200,5673,24,1,1,0
9459,4,4505,1,200,540,24,1,1,0
9460,4,4505,1,200,1230,24,1,1,0


In [14]:
# Create sequences
window_size = 10
X = create_sequence_features(df_features, window_size)

X

array([[[   4, 4505,    1, ...,    1,    1,    0],
        [   4, 4505,    1, ...,    1,    1,    0],
        [   4, 4505,    1, ...,    1,    1,    0],
        ...,
        [   1,    1,    1, ...,    0,    0,    0],
        [   1, 7764,    1, ...,    0,    1,    0],
        [   1, 6426,    1, ...,    0,    1,    0]],

       [[   4, 4505,    1, ...,    1,    1,    0],
        [   4, 4505,    1, ...,    1,    1,    0],
        [   4, 4505,    1, ...,    1,    1,    0],
        ...,
        [   1, 7764,    1, ...,    0,    1,    0],
        [   1, 6426,    1, ...,    0,    1,    0],
        [   1, 7794,    1, ...,    0,    1,    0]],

       [[   4, 4505,    1, ...,    1,    1,    0],
        [   4, 4505,    1, ...,    1,    1,    0],
        [   4, 4505,    1, ...,    1,    1,    0],
        ...,
        [   1, 6426,    1, ...,    0,    1,    0],
        [   1, 7794,    1, ...,    0,    1,    0],
        [   1, 6428,    1, ...,    0,    1,    0]],

       ...,

       [[   1, 4942,    

In [18]:
# Prepare labels
y = np.array(true_labels)

y

array([[431, list(['service_scan', 'foothold']),
        {'service_scan': ['attacker.service_scan'], 'foothold': ['attacker.service_scan']}],
       [432, list(['service_scan', 'foothold']),
        {'service_scan': ['attacker.service_scan'], 'foothold': ['attacker.service_scan']}],
       [433, list(['attacker_http', 'foothold', 'service_scan']),
        {'attacker_http': ['attacker.foothold.apache.access'], 'foothold': ['attacker.foothold.apache.access', 'attacker.service_scan'], 'service_scan': ['attacker.service_scan']}],
       ...,
       [8223, list(['webshell_cmd', 'escalate']),
        {'webshell_cmd': ['attacker.escalate.webshell.cmd.http_prepare_crack'], 'escalate': ['attacker.escalate.webshell.cmd.http_prepare_crack']}],
       [8224, list(['webshell_cmd', 'escalate']),
        {'webshell_cmd': ['attacker.escalate.webshell.cmd.http_prepare_crack'], 'escalate': ['attacker.escalate.webshell.cmd.http_prepare_crack']}],
       [8322, list(['webshell_cmd', 'escalate']),
        

In [22]:
# Create and train model
input_shape = (window_size, X.shape[2])
num_classes = y.shape[1]  # number of label classes

model = create_lstm_model(input_shape, num_classes)

model

  super().__init__(**kwargs)


<Sequential name=sequential, built=True>

In [None]:
# Split data #TODO: Use data_helper instead
train_split = int(len(X) * 0.8)
X_train, X_test = X[:train_split], X[train_split:]
y_train, y_test = y[:train_split], y[train_split:]

# Train
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32
)


# TODO: Hier weiter und nochmal die features durchgehen!!!

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type int).

In [None]:
# Train model
model, history, encoders = train_log_classifier(df_raw, true_labels)

# Make predictions
new_logs = [...] # New log lines
predictions = predict_sequences(model, new_logs, encoders)# Load your data
log_data = [...] # Your log lines
labels = [...] # Your labels

# Train model
model, history, encoders = train_log_classifier(log_data, labels)

# Make predictions
new_logs = [...] # New log lines
predictions = predict_sequences(model, new_logs, encoders)

KeyError: 'requestmethod'

In [None]:
df_raw.describe()

Unnamed: 0,identity,user,status,size
count,0.0,0.0,9462.0,9462.0
mean,,,367.807123,4388.614458
std,,,77.887448,31888.408164
min,,,200.0,0.0
25%,,,404.0,146.0
50%,,,404.0,363.0
75%,,,404.0,363.0
max,,,500.0,569760.0


In [None]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9462 entries, 0 to 9461
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ip          9462 non-null   object 
 1   identity    0 non-null      float64
 2   user        0 non-null      float64
 3   datetime    9462 non-null   object 
 4   request     9462 non-null   object 
 5   status      9462 non-null   int64  
 6   size        9462 non-null   int64  
 7   referer     9462 non-null   object 
 8   user_agent  9462 non-null   object 
dtypes: float64(2), int64(2), object(5)
memory usage: 665.4+ KB


In [None]:
# Remove the rows with missing values
df_raw = df_raw.drop(["identity", "user"], axis=1)

df_raw.describe()

Unnamed: 0,status,size
count,9462.0,9462.0
mean,367.807123,4388.614458
std,77.887448,31888.408164
min,200.0,0.0
25%,404.0,146.0
50%,404.0,363.0
75%,404.0,363.0
max,500.0,569760.0


In [None]:
df_raw.isnull().sum()

ip            0
datetime      0
request       0
status        0
size          0
referer       0
user_agent    0
dtype: int64

In [None]:
df_raw.duplicated().sum()

5

In [None]:
# Check duplicates

df_raw[df_raw.duplicated()]

Unnamed: 0,ip,datetime,request,status,size,referer,user_agent
40,192.168.104.125,[17/Jan/2022:06:36:12 +0000],"""-""",408,3002,"""-""","""-"""
8245,192.168.104.4,[17/Jan/2022:11:43:19 +0000],"""-""",408,3002,"""-""","""-"""
8246,192.168.104.4,[17/Jan/2022:11:43:19 +0000],"""-""",408,3002,"""-""","""-"""
9313,192.168.104.4,[17/Jan/2022:18:39:44 +0000],"""-""",408,3002,"""-""","""-"""
9375,192.168.104.141,[17/Jan/2022:18:56:51 +0000],"""-""",408,3002,"""-""","""-"""


In [None]:
df_true_labels = load_true_labels(path_santos + path_true_labels)

df_true_labels.head(10)

Unnamed: 0,line,labels,rules
0,431,"[service_scan, foothold]","{'service_scan': ['attacker.service_scan'], 'f..."
1,432,"[service_scan, foothold]","{'service_scan': ['attacker.service_scan'], 'f..."
2,433,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
3,434,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
4,435,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
5,436,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
6,437,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
7,438,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
8,439,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...
9,440,"[attacker_http, foothold, service_scan]",{'attacker_http': ['attacker.foothold.apache.a...


In [None]:
# Notes about the log lines (santos):
# - First two big blocks of nmap and WPscan scans
# - Webshell upload: 3 messages (8191 - 8193): 2x GET then POST /wp-admin/admin-ajax.php HTTP/1.1
# - There are more lines with sus IP (172.21.128.119) which are not labeled as attacks

In [None]:
# Manually extract extra features

# Possible features:

# o Request
#  - IP address 
#  - request_path
#  - request_method
#  - response status codes (maybe even patterns e.g. many 404s followed by a 200?)
#  - path_analysis -> len, depth, has query
#  - payload analysis: Base64 encoded content, sql_patterns (maybe True / false if found) | select, insert, update, delete, union, join ?CHECK, CITE
#  - suspicious_file_extensions (true / false) | php, asp, exe, dll, sh, bat ?CHECK, CITE
#  - Time based: frequency, intervals
# o Response
#  - response_size
#  - status_code
# o User Agent
#  - search for specific terms (bot, crawler, spider, scan, WPscan, curl, wget, python, perl) ?CHECK, CITE

In [None]:
# Log line structure:
# 1. Client IP Address
# 2. Logname (not used, always -)
# 3. User Name (not used, always -)
# 4. Timestamp format [DD7Mon/YYYY:HH:MM:SS +0000]
# 5. Request (method, path, protocol version): "GET /path/to/file HTTP/1.1"
# 6. Response Status Code: 200, 404 etc.
# 7. Response Size in Bytes: 100, 6137 etc.
# 8. Referrer (page that linked to this url): "https://intranet.smith.santos.com/", "-"
# 9. User Agent (browser, OS, device): 
#   "python-requests/2.27.1",
#    "WPScan v3.8.20 (https://wpscan.com/wordpress-security-scanner)",
#    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)"