In [98]:
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 30)


In [99]:
# Custom imports

from data_helper import load_data, load_true_labels, add_true_labels_to_dataset

In [100]:
# PATHS - Change these to use different data from the dataset!

# 1. santos | inet-firewall | dnsmasq.log
path_santos = "../../AIT_LD-v2/santos"
path_log_file = "/gather/inet-firewall/logs/dnsmasq.log"
path_true_labels_file = "/labels/inet-firewall/logs/dnsmasq.log"

In [101]:
# Load the AIT log data set v2
df_raw = load_data(path_santos + path_log_file)

# Display the first few rows of the data set
#df_raw.head(5)

# Display the entries corresponding to attack (rows 144-151)
df_raw.iloc[144:152]

Unnamed: 0,0
144,Jan 14 00:07:10 dnsmasq[14...
145,Jan 14 00:07:10 dnsmasq[14...
146,Jan 14 00:07:27 dnsmasq[14...
147,Jan 14 00:07:27 dnsmasq[14...
148,Jan 14 00:07:27 dnsmasq[14...
149,Jan 14 00:07:41 dnsmasq[14...
150,Jan 14 00:07:41 dnsmasq[14...
151,Jan 14 00:07:41 dnsmasq[14...


In [102]:
# load true labels from json
df_true_labels = load_true_labels(path_santos + path_true_labels_file)

df_true_labels.head(10)

Unnamed: 0,line,labels,rules
0,1,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
1,2,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
2,3,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
3,50,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
4,51,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
5,52,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
6,53,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
7,54,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
8,55,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...
9,72,"[dnsteal, attacker, dnstea...",{'dnsteal': ['dnsteal.doma...


In [103]:
# Add the true labels to the dataset
df_raw = add_true_labels_to_dataset(df_raw, df_true_labels)

In [104]:
df_raw.head()

Unnamed: 0,0,true_type,labels
0,Jan 14 00:00:09 dnsmasq[14...,1.0,"['dnsteal', 'attacker', 'd..."
1,Jan 14 00:00:09 dnsmasq[14...,1.0,"['dnsteal', 'attacker', 'd..."
2,Jan 14 00:00:09 dnsmasq[14...,1.0,"['dnsteal', 'attacker', 'd..."
3,Jan 14 00:00:23 dnsmasq[14...,,
4,Jan 14 00:00:23 dnsmasq[14...,,


In [105]:
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275667 entries, 0 to 275666
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   0          275667 non-null  object
 1   true_type  39426 non-null   object
 2   labels     39426 non-null   object
dtypes: object(3)
memory usage: 6.3+ MB


In [106]:
df_raw.describe()

Unnamed: 0,0,true_type,labels
count,275667,39426,39426
unique,207381,1,7
top,Jan 14 11:22:17 dnsmasq[14...,1,"['dnsteal', 'attacker', 'd..."
freq,53,39426,38561


In [107]:
df_raw.describe(include="object")

Unnamed: 0,0,true_type,labels
count,275667,39426,39426
unique,207381,1,7
top,Jan 14 11:22:17 dnsmasq[14...,1,"['dnsteal', 'attacker', 'd..."
freq,53,39426,38561


In [108]:
# Check for null values
df_raw.isnull().sum()

0                 0
true_type    236241
labels       236241
dtype: int64

In [109]:
# Check for duplicates
# 1. santos | inet-firewall | dnsmasq.log               duplicates: 68286, isnull: true_type: 236241, labels: 236241

df_raw.duplicated().sum()

68286

In [110]:
# -> Duplicates normal if logs are written quickly
# TODO: investigate which lines are duplicated and how to treat them. e.g. combine them and keep info about count or keep them as they are

df_raw[df_raw.duplicated()]

Unnamed: 0,0,true_type,labels
11,Jan 14 00:00:23 dnsmasq[14...,,
12,Jan 14 00:00:23 dnsmasq[14...,,
13,Jan 14 00:00:23 dnsmasq[14...,,
14,Jan 14 00:00:23 dnsmasq[14...,,
18,Jan 14 00:00:23 dnsmasq[14...,,
...,...,...,...
275644,Jan 17 23:39:13 dnsmasq[14...,,
275645,Jan 17 23:39:13 dnsmasq[14...,,
275653,Jan 17 23:39:13 dnsmasq[14...,,
275654,Jan 17 23:39:13 dnsmasq[14...,,


In [111]:
# IMPORTANT: For CountVectorizer we will remove duplicates #TODO: Test later if this is a good idea

df_raw = df_raw.drop_duplicates()
df_raw.duplicated().sum()


0

In [112]:
# Plot the distribution of true labels into benign and attack-related

#import seaborn as sns
#import matplotlib.pyplot as plt

#sns.countplot(x=df_raw['true_type'])
#plt.title("Label Distribution")
#plt.xticks(ticks=[0,1], labels=['Benign', 'Attack-related'])

#for i in range(2):
#    count = df_raw['true_type'].value_counts().values[i]
#    plt.text(i, count, str(count), ha = 'center')
#plt.show()

In [113]:
# Rename column name
df_raw.columns = ['raw', 'true_type', 'labels']

In [114]:
# Use CountVectorizer to convert the raw text data into a matrix of token counts
from sklearn.feature_extraction.text import CountVectorizer

# Initialize the CountVectorizer
vectorizer = CountVectorizer() # Using "english" stop-words: 107431, without: 107431 #TODO: Test difference
# TODO: Try max_df and min_df (to ignore words that appear often or rarely)
vocab = vectorizer.fit(df_raw['raw'])

In [115]:
print("CountVectorizer vocabulary size: ", vocab.vocabulary_.__len__())

print(vocab.get_feature_names_out())

CountVectorizer vocabulary size:  107493
['00' '000' '0001' ... 'zzzvm4py7ktn' 'zzzzbpkx08ybkoomdsplpwpznfwjp6'
 'zzzzw7m']


In [119]:
# 1. Binary Classification (Benign vs Attack)

from sklearn.preprocessing import LabelBinarizer

# LabelBinarizer to convert the true_type column into binary labels
label_binarizer = LabelBinarizer()
y = label_binarizer.fit_transform(
    df_raw['true_type'].apply(lambda x: 1 if x == "1" else 0)
).ravel()   # ravel() to flatten to 1D array


In [120]:
df_raw['true_type'].values

array(['1', '1', '1', ..., nan, nan, nan], dtype=object)

In [121]:
# Create Feature Matrix X from Vectorizer
X = vectorizer.transform(df_raw['raw'])

X

<207381x107493 sparse matrix of type '<class 'numpy.int64'>'
	with 3767438 stored elements in Compressed Sparse Row format>

In [122]:
from sklearn.model_selection import train_test_split

# Split into train, test and validation sets
# 70% train, 10% validation, 20% test #TODO: Calculate if this is correct
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.125, random_state=42, stratify=y_temp
)

In [126]:
from collections import Counter
print("\nClass distribution in training set:")
print(Counter(y_train))

print("\nClass distribution in validation set:")
print(Counter(y_val))

print("\nClass distribution in test set:")
print(Counter(y_test))



Class distribution in training set:
Counter({0: 117766, 1: 27400})

Class distribution in validation set:
Counter({0: 16824, 1: 3914})

Class distribution in test set:
Counter({0: 33648, 1: 7829})


In [123]:
# Optional garbage collection:
import gc

del X_temp, y_temp, df_raw, df_true_labels
gc.collect()

24

In [139]:
# DECIDE IF TRAINING SHOULD BE DONE OR IF CLASSIFIERS SHOULD BE LOADED FROM DISK INSTEAD
run_training = False

# Save the trained classifiers to disk, OVERWRITING existing ones!
save_trained_classifiers = False 

In [132]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
#from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

# Run Binary Classifiers
  
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Naive Bayes": MultinomialNB(),
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "KNN": KNeighborsClassifier(),
    #"XGBoost": XGBClassifier(use_label_encoder=False),
    "MLP": MLPClassifier(max_iter=1000),
    "AdaBoost": AdaBoostClassifier()
}

if(run_training):
    for name, classifier in classifiers.items():
        print("\n\n-----Training", name, "-----")

        # Train
        classifier.fit(X_train, y_train)
        
        # Predict
        y_pred = classifier.predict(X_val)

        # Evaluate
        print(f"Validation Results for {name}:\n", classification_report(y_val, y_pred))
        print("Confusion Matrix for {name}: ")
        print(confusion_matrix(y_val, y_pred))



-----Training Logistic Regression -----
Validation Results for Logistic Regression:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     16824
           1       1.00      1.00      1.00      3914

    accuracy                           1.00     20738
   macro avg       1.00      1.00      1.00     20738
weighted avg       1.00      1.00      1.00     20738

Confusion Matrix for {name}: 
[[16823     1]
 [    1  3913]]


-----Training Random Forest -----
Validation Results for Random Forest:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     16824
           1       1.00      1.00      1.00      3914

    accuracy                           1.00     20738
   macro avg       1.00      1.00      1.00     20738
weighted avg       1.00      1.00      1.00     20738

Confusion Matrix for {name}: 
[[16824     0]
 [    1  3913]]


-----Training Naive Bayes -----
Validation Results for Naive

In [140]:
import joblib
import os

# Save the trained classifiers and vectorizer to disk
if(run_training and save_trained_classifiers):
    if not os.path.exists('models'):
        os.makedirs('models')

    for name, classifier in classifiers.items():
        filename = f"models/{name.replace(' ', '_')}.joblib"
        joblib.dump(classifier, filename)
    
    joblib.dump(vectorizer, "models/vectorizer.joblib")

In [None]:
if(not run_training):
    classifiers = {}
    for name in classifiers.keys():
        classifiers[name] = joblib.load(f"models/{name.replace(" ", "_")}.joblib")
    
    vectorizer = joblib.load("models/vectorizer.joblib")

In [144]:
X_val[2]

<1x107493 sparse matrix of type '<class 'numpy.int64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [145]:
y_val[2]

0

In [151]:
# Manually check a few hardcoded example messages from the validation set

def predict_message(message, classifier):
    # Transform the new message using same vectorizer
    message_vectorized = vectorizer.transform([message])
    # Make prediction
    prediction = classifier.predict(message_vectorized)[0]
    # Get probability scores
    prob = classifier.predict_proba(message_vectorized)[0]
    return prediction, prob

# A few test messages from the validation set 
test_messages = [
    "DHCP REQUEST received from 192.168.1.100",  # normal message example
    "Multiple failed login attempts from IP 10.0.0.5",  # potential attack example
    "DNS query www.example.com from 192.168.1.50"  # another normal message example
]

# Test each message with each classifier
for name, clf in classifiers.items():
    print(f"\nResults from {name}:")

    if(not hasattr(clf, "predict_proba")):
        print("This classifier does not support predict_proba().")
        continue

    for msg in test_messages:
        prediction, probabilities = predict_message(msg, clf)
        print(f"\nMessage: {msg}")
        print(f"Prediction: {'attack' if prediction == 1 else 'normal'}")
        print(f"Probability scores: normal: {probabilities[0]:.3f}, attack: {probabilities[1]:.3f}")


Results from Logistic Regression:

Message: DHCP REQUEST received from 192.168.1.100
Prediction: normal
Probability scores: normal: 1.000, attack: 0.000

Message: Multiple failed login attempts from IP 10.0.0.5
Prediction: normal
Probability scores: normal: 1.000, attack: 0.000

Message: DNS query www.example.com from 192.168.1.50
Prediction: normal
Probability scores: normal: 1.000, attack: 0.000

Results from Random Forest:

Message: DHCP REQUEST received from 192.168.1.100
Prediction: normal
Probability scores: normal: 0.960, attack: 0.040

Message: Multiple failed login attempts from IP 10.0.0.5
Prediction: normal
Probability scores: normal: 1.000, attack: 0.000

Message: DNS query www.example.com from 192.168.1.50
Prediction: normal
Probability scores: normal: 1.000, attack: 0.000

Results from Naive Bayes:

Message: DHCP REQUEST received from 192.168.1.100
Prediction: normal
Probability scores: normal: 0.783, attack: 0.217

Message: Multiple failed login attempts from IP 10.0.0.

In [67]:
# Troubleshooting torch installation

import sys
import torch

print("Python version:", sys.version)
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("MPS available:", torch.backends.mps.is_available())

Python version: 3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]
PyTorch version: 2.5.1+cu118
CUDA available: True
MPS available: False


In [66]:
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

x = torch.rand(5, 3)
print(x)


# Choose Hardware, Cuda uses GPU
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")



tensor([[0.5973, 0.0086, 0.2944],
        [0.0028, 0.6524, 0.6644],
        [0.4380, 0.2283, 0.6385],
        [0.0467, 0.1260, 0.8122],
        [0.9741, 0.1510, 0.3118]])
Using cuda device


In [None]:
# Test PyTorch Model using their tutorial (mainly to test installation & hardware)
X = torch.rand(1, 28, 28, device=device)
logits = nn_model(X)
pred_probab = nn.Softmax(dim=1)(logits)
y_pred = pred_probab.argmax(1)
print(f"Predicted class: {y_pred}")

Predicted class: tensor([6], device='cuda:0')


In [107]:
#Define class, (currently pytorch default)
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(1000, 11005),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [108]:
#Create model instance
nn_model = NeuralNetwork().to(device)
print(nn_model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=1000, out_features=11005, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


In [109]:
df_with_o_h.describe(include="object")

Unnamed: 0,true_type,labels
count,24050,24050
unique,1,1
top,1,"['dnsteal', 'attacker', 'd..."
freq,24050,24050


In [110]:
df_with_o_h.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Columns: 11007 entries, timestamp to 9_msgt
dtypes: datetime64[ns](1), float64(11001), int64(3), object(2)
memory usage: 8.2+ GB


In [111]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
import torch
import numpy as np

# Preprocessing pipeline
# Only take first 1000 rows for now (memory limit and speed up)
X = df_with_o_h.iloc[0:1000].drop(["true_type", "labels"], axis=1)

# Ensure all column names are strings
X.columns = X.columns.astype(str)

# Convert datetime columns to numerical features
for col in X.select_dtypes(include=['datetime64']).columns:
    X[col] = X[col].astype('int64')  # Convert datetime to timestamp

# Drop any remaining non-numeric columns
X = X.select_dtypes(include=[np.number])

# Prepare labels
y = df_with_o_h.iloc[0:1000]["true_type"]

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert to tensors
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

# Print some information
print("Unique labels:", label_encoder.classes_)
print("X tensor shape:", X_tensor.shape)
print("y tensor shape:", y_tensor.shape)

Unique labels: ['1' nan]
X tensor shape: torch.Size([1000, 11005])
y tensor shape: torch.Size([1000])


In [112]:
logits = nn_model(X_tensor.to(device))

RuntimeError: mat1 and mat2 shapes cannot be multiplied (1000x11005 and 1000x11005)