## Global parameters and notes

In [2]:
# Global Imports
import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Custom Imports
from helpers.data_helper import load_data_robust, load_models_from_disk, classify_unseen_data

In [3]:
PATH_DATASET_TO_CLASSIFY = "../AIT_LD-v2/russellmitchell"

#PATH_DATASET_TO_CLASSIFY = "../AIT_LD-v2/harrison"
#PATH_DATASET_TO_CLASSIFY = "../AIT_LD-v2/shaw"    
#PATH_DATASET_TO_CLASSIFY = "../AIT_LD-v2/wheeler" 
#PATH_DATASET_TO_CLASSIFY = "../AIT_LD-v2/wardbeck"
#PATH_DATASET_TO_CLASSIFY = "../AIT_LD-v2/fox"     
#PATH_DATASET_TO_CLASSIFY = "../AIT_LD-v2/wilson"  

# -> santos was used to train the models

### Get events from file: intranet / auth.log
* Contributes:
  * Compromised User Account -> IOC and Asset
  * Root Access Events -> Event
  * Which files were accessed -> Event
  * Which commands were executed and in which PWD -> Event
* Trained Classifiers: 
  * RandomForest 
  * GradientBoost 
  * SVM
  * MLP 
* Uses binary classification
* Features are simply booleans depending on presence of keywords

In [4]:
# Step by Step:

# 1. Import log file
# 2. Extract features from log file to be used for classification
# 3. Load trained classifier(s)
# 4. Classify log file
# 5. Keep attack-related logs

#6. TODO: Think about how to correlate them

#7. Upload Iris Report

In [5]:
# 1. Import log file

path_intranet_auth_log = "/gather/intranet_server/logs/auth.log"
df_intranet_auth = load_data_robust(PATH_DATASET_TO_CLASSIFY + path_intranet_auth_log)

#df_intranet_auth.head()

In [6]:
# 2. Extract features from log file to be used for classification

from helpers.intranet_auth_log_helper import extract_features

df_intranet_auth_features = extract_features(df_intranet_auth)

#df_intranet_auth_features.head()

In [7]:
# 3. Load trained classifier(s)

models = load_models_from_disk("trained-models/intranet_auth_log")

Loaded intr_auth_gradientboost from trained-models/intranet_auth_log\intr_auth_gradientboost.joblib
Loaded intr_auth_mlp from trained-models/intranet_auth_log\intr_auth_mlp.joblib
Loaded intr_auth_randomforest from trained-models/intranet_auth_log\intr_auth_randomforest.joblib
Loaded intr_auth_svc from trained-models/intranet_auth_log\intr_auth_svc.joblib


In [8]:
# 4. Classify log file
all_predictions = classify_unseen_data(models, df_intranet_auth_features)

all_predictions

{'intr_auth_gradientboost': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]),
 'intr_auth_mlp': array([0, 0, 0, 0, 0, 0, 0, 0, 0,

In [9]:
# TODO: Get some Stats / Evaluations, choose predictions of best model 

In [10]:
# Choose predictions of one model

predictions_grad_boost = all_predictions["intr_auth_gradientboost"]

In [11]:
# 5. Keep attack-related logs

df_attack_related_intranet_auth = df_intranet_auth[predictions_grad_boost == 1]

df_attack_related_intranet_auth


Unnamed: 0,timestamp,message
66,Jan 23 16:23:04,systemd-logind[957]: Removed session 111.
69,Jan 23 16:30:47,systemd: pam_unix(systemd-user:session): sessi...
70,Jan 23 16:30:47,systemd-logind[957]: New session 271 of user j...
144,Jan 24 04:37:40,su[27950]: Successful su for jhall by www-data
145,Jan 24 04:37:40,su[27950]: + /dev/pts/1 www-data:jhall
146,Jan 24 04:37:40,su[27950]: pam_unix(su:session): session opene...
147,Jan 24 04:37:40,systemd-logind[957]: New session c1 of user jh...
148,Jan 24 04:37:58,sudo: jhall : TTY=pts/1 ; PWD=/var/www/intr...
149,Jan 24 04:38:06,sudo: jhall : TTY=pts/1 ; PWD=/var/www/intr...
150,Jan 24 04:38:06,sudo: pam_unix(sudo:session): session opened f...


In [12]:
#6. TODO: Think about how to correlate them

#7. Upload Iris Report

### Get suspicious events and info from the dnsmasq logs
* Contributes:
  * IP address of where files are extracted to
  * potentially file names of extracted files
  * Time(frame) of extraction
* Trained Classifiers: 
  * RandomForest for MultiClass classification

* Includes DNS Exfiltration Attack Step
* Extract domain and message type using regex pattern matching
* Message types:
  * queries (a, aaaa, srv, txt, ptr, ,mx)
  * forwarded
  * reply
  * cached
  * nameserver
* Domain features:
  * Lenght
  * Parts
  * Avg_part_length
  * Max_part_length
  * Special_char_count
  * Numeric_char_count
  * Alpha_char_count
  * Entropy (Shannon)

In [13]:
# Step by Step:

# 1. Import log file
# 2. Extract features from log file to be used for classification
# 3. Load trained classifier(s)
# 4. Classify log file
# 5. Keep attack-related logs

#6. TODO: Think about how to correlate them

#7. Upload Iris Report

In [14]:
# 1. Import log file

path_inet_dnsmasq = "/gather/inet-firewall/logs/dnsmasq.log"
df_inet_dnsmasq = load_data_robust(PATH_DATASET_TO_CLASSIFY + path_inet_dnsmasq)

df_inet_dnsmasq.head()

Unnamed: 0,timestamp,message
0,Jan 21 00:00:09,query[A] 3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1z...
1,Jan 21 00:00:09,forwarded 3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1...
2,Jan 21 00:00:09,reply 3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1zAf*...
3,Jan 21 00:00:31,query[A] 3x6-.597-.L**fA/ib4pGEIb5*uJ223L5A/pW...
4,Jan 21 00:00:31,forwarded 3x6-.597-.L**fA/ib4pGEIb5*uJ223L5A/p...


In [15]:
# 2. Extract features from log file to be used for classification

from helpers.inet_dnsmasq_log_helper import extract_dns_features

df_inet_dnsmasq_features = extract_dns_features(df_inet_dnsmasq)

Unmatched rows:
  Row 17181: failed to access /etc/dnsmasq.d/dnsmasq-resolv.conf: No such file or directory


In [16]:
print("Whole dataset Count: ", df_inet_dnsmasq.__len__())
print("Dataset after feature extraction: ", df_inet_dnsmasq_features.__len__())

if(df_inet_dnsmasq.__len__() != df_inet_dnsmasq_features.__len__()):
    print("WARNING: Dataset length mismatch")

Whole dataset Count:  275900
Dataset after feature extraction:  275899


In [17]:
df_inet_dnsmasq_features

Unnamed: 0,message_type,timestamp,domain,domain_length,domain_parts,avg_part_length,max_part_length,special_char_count,numeric_char_count,alpha_char_count,entropy
0,query_a,Jan 21 00:00:09,3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1zAf*Wgpq-....,203,11,17.545455,35,13,30,150,5.768733
1,forwarded,Jan 21 00:00:09,3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1zAf*Wgpq-....,203,11,17.545455,35,13,30,150,5.768733
2,reply,Jan 21 00:00:09,3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1zAf*Wgpq-....,203,11,17.545455,35,13,30,150,5.768733
3,query_a,Jan 21 00:00:31,3x6-.597-.L**fA/ib4pGEIb5*uJ223L5A/pWGilEyrR-....,203,11,17.545455,35,17,42,134,5.763903
4,forwarded,Jan 21 00:00:31,3x6-.597-.L**fA/ib4pGEIb5*uJ223L5A/pWGilEyrR-....,203,11,17.545455,35,17,42,134,5.763903
...,...,...,...,...,...,...,...,...,...,...,...
275894,query_a,Jan 24 23:55:43,e6410.d.akamaiedge.net,22,4,4.750000,10,0,4,15,3.572624
275895,forwarded,Jan 24 23:55:43,e6410.d.akamaiedge.net,22,4,4.750000,10,0,4,15,3.572624
275896,reply,Jan 24 23:55:43,e6410.d.akamaiedge.net,22,4,4.750000,10,0,4,15,3.572624
275897,query_aaaa,Jan 24 23:58:27,mail,4,1,4.000000,4,0,0,4,2.000000


In [18]:
# Preprocess data before classification

preprocessor = load_models_from_disk("trained-models/inet_dnsmasq_log/more")["santos_dnsmasq_preprocessor1"]

df_inet_dnsmasq_features = df_inet_dnsmasq_features.drop(["timestamp"], axis=1)
df_inet_dnsmasq_features = preprocessor.fit_transform(df_inet_dnsmasq_features)

df_inet_dnsmasq_features

Loaded santos_dnsmasq_preprocessor1 from trained-models/inet_dnsmasq_log/more\santos_dnsmasq_preprocessor1.joblib


array([[ 2.05036893,  1.89019734,  1.93916484, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.05036893,  1.89019734,  1.93916484, ...,  0.        ,
         0.        ,  0.        ],
       [ 2.05036893,  1.89019734,  1.93916484, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.51002789, -0.39718352, -0.54399952, ...,  0.        ,
         0.        ,  1.        ],
       [-0.76465298, -1.3774896 , -0.68954912, ...,  0.        ,
         0.        ,  0.        ],
       [-0.76465298, -1.3774896 , -0.68954912, ...,  0.        ,
         0.        ,  0.        ]])

In [19]:
# 3. Load trained classifier(s) and preprocessor
# Only one: Random Forest Multiclass Classifier

models = load_models_from_disk("trained-models/inet_dnsmasq_log")
clf_rf = models["santos_dnsmasq_rf1"]

Loaded santos_dnsmasq_rf1 from trained-models/inet_dnsmasq_log\santos_dnsmasq_rf1.joblib


In [20]:
# 4. Classify log file
y_pred = clf_rf.predict(df_inet_dnsmasq_features)
y_pred

array([[1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
# Multi-label classification (one-hot encoded labels)
label_columns = ["attacker", "dns_scan", "dnsteal", "dnsteal-received", 
                 "escalate", "foothold", "network_scan", 
                 "service_scan", "traceroute", "webshell_cmd", "wpscan"]

In [22]:
# Some evaluation:

# 1. Basic counts for each class
predicted_label_counts = pd.DataFrame({
    'label': label_columns,
    'count': np.sum(y_pred, axis=0)
})
predicted_label_counts = predicted_label_counts.sort_values('count', ascending=False)

print("1. Predicted label counts:")
print(predicted_label_counts)


# 2. Distribution of number of labels per instance
labels_per_instance = np.sum(y_pred, axis=1)
label_distribution = pd.Series(labels_per_instance).value_counts().sort_index()
label_distribution_df = pd.DataFrame({
    'num_labels': label_distribution.index,
    'count': label_distribution.values,
    'percentage': (label_distribution.values / len(y_pred) * 100).round(2)
})

print("\n2. Distribution of number of labels per instance (0 means benign):")
print(label_distribution_df)


# 3. Co-occurrence matrix - which labels appear together
cooccurrence = np.zeros((len(label_columns), len(label_columns)))
for i in range(len(label_columns)):
    for j in range(len(label_columns)):
        if i == j:
            # Count instances where this label appears
            cooccurrence[i, j] = np.sum(y_pred[:, i])
        else:
            # Count instances where both labels appear together
            cooccurrence[i, j] = np.sum(np.logical_and(y_pred[:, i], y_pred[:, j]))

cooccurrence_df = pd.DataFrame(cooccurrence, 
                               index=label_columns,
                               columns=label_columns)

print("\n3. Co-occurrence matrix:")
print(cooccurrence_df)

1. Predicted label counts:
               label  count
0           attacker  52985
2            dnsteal  52985
3   dnsteal-received  52985
1           dns_scan   1755
5           foothold   1526
4           escalate      0
6       network_scan      0
7       service_scan      0
8         traceroute      0
9       webshell_cmd      0
10            wpscan      0

2. Distribution of number of labels per instance (0 means benign):
   num_labels   count  percentage
0           0  221145       80.15
1           1     257        0.09
2           2    1512        0.55
3           3   52985       19.20

3. Co-occurrence matrix:
                  attacker  dns_scan  dnsteal  dnsteal-received  escalate  \
attacker           52985.0       0.0  52985.0           52985.0       0.0   
dns_scan               0.0    1755.0      0.0               0.0       0.0   
dnsteal            52985.0       0.0  52985.0           52985.0       0.0   
dnsteal-received   52985.0       0.0  52985.0           52985.0  

In [23]:
# 5. Keep attack-related logs

# Get the indices where at least one class is predicted
positive_indices = np.where(np.sum(y_pred, axis=1) > 0)[0]

# Create a new dataframe with just those entries
df_attack_related_intranet_auth = df_inet_dnsmasq.iloc[positive_indices].copy()

# Add a column with the predicted classes for each entry
df_attack_related_intranet_auth['predicted_labels'] = [
    [label_columns[j] for j in range(len(label_columns)) if y_pred[i, j] == 1]
    for i in positive_indices
]

# Add individual class columns if needed
for i, label in enumerate(label_columns):
    df_attack_related_intranet_auth[label] = y_pred[positive_indices, i]

# Preview the result
df_attack_related_intranet_auth.head()

Unnamed: 0,timestamp,message,predicted_labels,attacker,dns_scan,dnsteal,dnsteal-received,escalate,foothold,network_scan,service_scan,traceroute,webshell_cmd,wpscan
0,Jan 21 00:00:09,query[A] 3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1z...,"[attacker, dnsteal, dnsteal-received]",1,0,1,1,0,0,0,0,0,0,0
1,Jan 21 00:00:09,forwarded 3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1...,"[attacker, dnsteal, dnsteal-received]",1,0,1,1,0,0,0,0,0,0,0
2,Jan 21 00:00:09,reply 3x6-.596-.IunWTzebVlyAhhHj*ZfWjOBun1zAf*...,"[attacker, dnsteal, dnsteal-received]",1,0,1,1,0,0,0,0,0,0,0
3,Jan 21 00:00:31,query[A] 3x6-.597-.L**fA/ib4pGEIb5*uJ223L5A/pW...,"[attacker, dnsteal, dnsteal-received]",1,0,1,1,0,0,0,0,0,0,0
4,Jan 21 00:00:31,forwarded 3x6-.597-.L**fA/ib4pGEIb5*uJ223L5A/p...,"[attacker, dnsteal, dnsteal-received]",1,0,1,1,0,0,0,0,0,0,0


In [24]:
print(df_attack_related_intranet_auth.__len__())
print(df_inet_dnsmasq.__len__())
print(df_inet_dnsmasq_features.__len__())

54754
275900
275899


In [25]:
#6. TODO: Think about how to correlate them

#7. Upload Iris Report

### Get events from file: intranet / access.log
* Contributes:

* Trained Classifiers: 



In [26]:
# Step by Step:

# 1. Import log file
# 2. Extract features from log file to be used for classification
# 3. Load trained classifier(s)
# 4. Classify log file
# 5. Keep attack-related logs

#6. TODO: Think about how to correlate them

#7. Upload Iris Report

In [28]:
# 1. Import log file

path_intranet_access_log = "/gather/intranet_server/logs/apache2/intranet.smith.santos.com-access.log"
df_intranet_access = load_data_robust(PATH_DATASET_TO_CLASSIFY + path_intranet_access_log)

df_intranet_access.head()

FileNotFoundError: [Errno 2] No such file or directory: '../AIT_LD-v2/russellmitchell/gather/intranet_server/logs/apache2/intranet.smith.santos.com-access.log'

In [None]:
# 2. Extract features from log file to be used for classification

from helpers.intranet_access_log_helper import extract_features


In [None]:
# 3. Load trained classifier(s)
# 4. Classify log file
# 5. Keep attack-related logs

#6. TODO: Think about how to correlate them

#7. Upload Iris Report