In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from node2vec import Node2Vec

In [2]:
import os
from gensim.models import Word2Vec

In [3]:
from sklearn.preprocessing import LabelEncoder
# from pycaret.classification import *
from time import perf_counter

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [5]:
# Loading the phase2 data

# loc_phase2="/home/erfan/Desktop/Experiment/Final_Dataset/Provenance_Logs/Phase2_Provenance.csv"

# loc_phase2 = "D:\00 Datasets\CICAPT-IIoT Dataset\Provenance_Logs\Phase2_Provenance.csv"

phase2_raw = pd.read_csv(r'D:\00 Datasets\CICAPT-IIoT Dataset\Provenance_Logs\Phase2_Provenance.csv', low_memory=False)
# phase2_raw = pd.read_csv(loc_phase2)


In [11]:
phase2_raw

Unnamed: 0,id,type,from,to,uid,egid,exe,gid,euid,name,pid,seen time,source,ppid,command line,start time,event id,time,operation,path,subtype,permissions,epoch,version,flags,local address,remote port,protocol,remote address,local port,tgid,fd,mode,label,subLabel
0,ca5b322fa4d4cb63aa5dd9fbd88e37e4,Process,,,0.0,0.0,/usr/bin/dash,0.0,0.0,ethtool,917.0,1.701469e+09,syscall,913.0,,,,,,,,,,,,,,,,,,,,0,0
1,e36715b62c8cdc32e47483b3600712f1,Process,,,0.0,0.0,/usr/bin/dash,0.0,0.0,ethtool,917.0,,syscall,913.0,/bin/sh /etc/network/if-up.d/ethtool,1.701469e+09,,,,,,,,,,,,,,,,,,0,0
2,,WasTriggeredBy,e36715b62c8cdc32e47483b3600712f1,ca5b322fa4d4cb63aa5dd9fbd88e37e4,,,,,,,,,syscall,,,,246.0,1.701469e+09,execve,,,,,,,,,,,,,,,0,0
3,216f4d4893942a66d649c4e02b5722c4,Artifact,,,,,,,,,,,syscall,,,,,,,/etc/network/if-up.d/ethtool,file,755.0,0.0,0.0,,,,,,,,,,0,0
4,,Used,e36715b62c8cdc32e47483b3600712f1,216f4d4893942a66d649c4e02b5722c4,,,,,,,,,syscall,,,,246.0,1.701469e+09,load,,,,,,,,,,,,,,,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
196730,e97a4f8ea72d1ccbbd9a4fd26692c38e,Process,,,0.0,0.0,/usr/bin/systemd-tty-ask-password-agent,0.0,0.0,systemd-tty-ask,244020.0,,syscall,244019.0,/bin/systemd-tty-ask-password-agent --watch,1.701728e+09,,,,,,,,,,,,,,,,,,0,0
196731,,WasTriggeredBy,e97a4f8ea72d1ccbbd9a4fd26692c38e,cc3aa5736c336ef1edaa2d2c54191f8e,,,,,,,,,syscall,,,,404003.0,1.701728e+09,execve,,,,,,,,,,,,,,,0,0
196732,12b6df6ce5e36fc5608f5c2d916718ae,Artifact,,,,,,,,,,,syscall,,,,,,,/bin/systemd-tty-ask-password-agent,file,755.0,0.0,0.0,,,,,,,,,,0,0
196733,,Used,e97a4f8ea72d1ccbbd9a4fd26692c38e,12b6df6ce5e36fc5608f5c2d916718ae,,,,,,,,,syscall,,,,404003.0,1.701728e+09,load,,,,,,,,,,,,,,,0,0


In [6]:
# Separating the nodes and edges

# 1. Filter rows that have a non-null value in the "to" column
raw_edges= phase2_raw.dropna(subset=["to"])

# 2. Create a boolean mask to mark rows that are present in the raw_edges
node_mask = phase2_raw.index.isin(raw_edges.index)

# 3. Filter rows that are not part of the edges (i.e., rows where the "to" column is null)
raw_nodes=phase2_raw[~node_mask]

In [7]:
# Processing edges; Dropping some null features and encoding valuable edge features
edges=raw_edges.copy()
nodes=raw_nodes.copy()

edge_attributes_to_drop=["id","uid","egid","exe","gid","euid","name","pid","seen time", "source","ppid",
                         "command line","start time","event id","path","subtype","permissions","epoch","version","flags",
                        "local address","remote port","protocol","remote address","local port","tgid","fd","mode"]

edge_attributes_to_encode=["type","operation"]

edges.drop(columns=edge_attributes_to_drop,inplace=True)

type_encoder = LabelEncoder()
operation_encoder = LabelEncoder()

edges["type"]=type_encoder.fit_transform(edges["type"])
edges["operation"]=operation_encoder.fit_transform(edges["operation"])

In [8]:
#Dropping some null features from nodes
node_attributes_to_drop=["from", "to" , "pid" , "seen time" , "source", "ppid", "command line", "start time" ,
                        "event id" , "time" , "operation" , "version", "flags","local address" , "local port",
                        "tgid","fd","mode","epoch","protocol"]
process_attributes=["uid", "egid" , "exe" , "gid" , "euid" , "name" ,]
artifact_attributes=["path","permissions","epoch","remote port","protocol","remote address"]

nodes.drop(columns=node_attributes_to_drop,inplace=True)

In [9]:
# Encoding the node features

attr_to_encode=["type","subtype","uid","egid","exe","gid","euid","name","path","subtype","permissions","remote port","remote address"]

nodes_encoder = LabelEncoder()
for i in attr_to_encode:
    nodes[i]=nodes_encoder.fit_transform(nodes[i])



In [10]:
# Creating the graph using networkX

graph= nx.from_pandas_edgelist(edges,source="from",target="to", edge_attr=["type","operation"], create_using=nx.MultiDiGraph())

In [11]:
print("The number of graph nodes are: {} and the number of edges are: {}".format(graph.number_of_nodes(), graph.number_of_edges()))

The number of graph nodes are: 53286 and the number of edges are: 143449


In [12]:
# Assigning the node features to the graph's node attribute

node_attributes_dict = nodes.drop(columns=["label","subLabel"]).set_index('id').to_dict(orient='index')
nx.set_node_attributes(graph,node_attributes_dict)

In [13]:
for node, attrs in graph.nodes(data=True):
    print(f"Node {node}: {attrs}")

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)




Node 1f894898aeea053f3e132a1c60c26fba: {'type': 1, 'uid': 3, 'egid': 12, 'exe': 10, 'gid': 12, 'euid': 4, 'name': 30, 'path': 13425, 'subtype': 5, 'permissions': 12, 'remote port': 8, 'remote address': 73}
Node 18199c4c68ac74ad24e26e6d35dee15a: {'type': 1, 'uid': 3, 'egid': 12, 'exe': 26, 'gid': 12, 'euid': 4, 'name': 68, 'path': 13425, 'subtype': 5, 'permissions': 12, 'remote port': 8, 'remote address': 73}
Node 6720ec214ede1d960cb83a8e5e285a12: {'type': 1, 'uid': 3, 'egid': 12, 'exe': 26, 'gid': 12, 'euid': 4, 'name': 68, 'path': 13425, 'subtype': 5, 'permissions': 12, 'remote port': 8, 'remote address': 73}
Node ff1a1e6327749d014d77bffddc566eef: {'type': 1, 'uid': 3, 'egid': 12, 'exe': 10, 'gid': 12, 'euid': 4, 'name': 30, 'path': 13425, 'subtype': 5, 'permissions': 12, 'remote port': 8, 'remote address': 73}
Node 97bccda171cf7974ddf70ee5fe2763c1: {'type': 1, 'uid': 3, 'egid': 12, 'exe': 10, 'gid': 12, 'euid': 4, 'name': 30, 'path': 13425, 'subtype': 5, 'permissions': 12, 'remote p

In [14]:
# Using node2vec on the graph to get node embeddings 
node2vec_phase2 = Node2Vec(graph, dimensions=64, walk_length=10, num_walks=100, workers=8)

Computing transition probabilities: 100%|███████████████████████████████████████| 53286/53286 [04:39<00:00, 190.53it/s]


PicklingError: Could not pickle the task to send it to the workers.

In [None]:
model = node2vec_phase2.fit(window=10, min_count=1, batch_words=4)

In [None]:
# Saving the trained Node2Vec models for further uses

model.save("./Sample_Node2Vec_64sized_phase2.model")

In [None]:
# Loading the saved Node2Vec models 

path_to_phase2_model="./Sample_Node2Vec_64sized_phase2.model"
model2=Word2Vec.load(path_to_phase2_model)

In [None]:
# Creating the phase2 dataframe; including the embeddings, labels, and sub-labels 

embeddings2 = {node: model2.wv[node] for node in graph.nodes()}
embedding_df2 = pd.DataFrame.from_dict(embeddings2, orient='index')

labels_dict2 = {node["id"]: node["label"] for _ ,node in nodes.iterrows()}
sub_labels_dict2 = {node["id"]: node["subLabel"] for _ ,node in nodes.iterrows()}

label_df2 = pd.DataFrame.from_dict(labels_dict2, orient='index', columns=['label'])
sub_label_df2 = pd.DataFrame.from_dict(sub_labels_dict2, orient='index', columns=['subLabel'])

phase2_processed = embedding_df2.join(label_df2).join(sub_label_df2)

In [None]:
phase2_processed

In [None]:
# The first model; data consists of all attack sub-classes and only from phase2
# Binary classification; all stages of attack are labelled as 1 

print("Training the first model")
s = setup(data = phase2_processed.drop(columns=["subLabel"]), target = 'label', session_id=123)
best_model = compare_models()