https://machinelearningmastery.com/one-class-classification-algorithms/

https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

# One Class Classification

## Initialization 

In [1]:
from google.colab import drive
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!ls gdrive/MyDrive/feature_csvs/

0492_features.csv   5736_features.csv	     benchmark3_features.csv
14271_features.csv  benchmark1_features.csv
15664_features.csv  benchmark2_features.csv


In [3]:
def add_stars_to_path(df):
  result_df = df.replace(to_replace='/var/lib/docker/containers/[a-zA-Z0-9-_]+/hostname', value='/var/lib/docker/containers/*/hostname', regex=True)
  result_df = result_df.replace(to_replace='/var/lib/docker/overlay2/[a-zA-Z0-9-_]+/diff/etc/hostname', value='/var/lib/docker/overlay2/*/diff/etc/hostname', regex=True)
  result_df = result_df.replace(to_replace='/var/lib/docker/overlay2/[a-zA-Z0-9-_]+/diff/etc/resolv.conf', value='/var/lib/docker/overlay2/*/diff/etc/resolv.conf', regex=True)
  result_df = result_df.replace(to_replace='/systemd/netif/links/[\w.#]+', value='/systemd/netif/links/*', regex=True)
  result_df = result_df.replace(to_replace='/systemd/netif/[\w.#]+', value='/systemd/netif/*', regex=True)
  result_df = result_df.replace(to_replace='/systemd/resolve/[\w.#-]+', value='/systemd/resolve/*', regex=True)
  result_df = result_df.replace(to_replace='/var/lib/docker/containers/[\w]+/hosts', value='/var/lib/docker/containers/*/hosts', regex=True)
  result_df = result_df.replace(to_replace='/var/lib/docker/containers/[\w]+/resolv.conf', value='/var/lib/docker/containers/*/resolv.conf', regex=True)
  result_df = result_df.replace(to_replace='/docker/runtime-runc/moby/[\w.#]+/exec.fifo', value='/docker/runtime-runc/moby/*/exec.fifo', regex=True)

  return result_df

## Reading/Importing Flows

In [4]:
df1 = pd.read_csv("gdrive/MyDrive/feature_csvs/benchmark1_features.csv")
df2 = pd.read_csv("gdrive/MyDrive/feature_csvs/benchmark2_features.csv")
df3 = pd.read_csv("gdrive/MyDrive/feature_csvs/benchmark3_features.csv")
df = pd.concat([df1, df2, df3])
df

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,['/var/lib/docker/containers/e717d550b5f83c105...,['/'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0
1,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-journald'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['memory_write', 'clone_mem']",[],[],0
2,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-journald'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['memory_write', 'clone_mem']",[],[],0
3,file,"['/systemd/netif/.#stateEAyBSe', '/systemd/net...",['/bin/networkctl'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]",['perm_check'],"['write', 'rename']",[],[],0
4,file,"['/systemd/netif/links/.#54SX1QcY', '/systemd/...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
...,...,...,...,...,...,...,...,...,...,...
961,process_memory,['/usr/bin/dockerd'],['/usr/bin/dockerd'],"['/', '/usr/bin/dockerd']","[0, 0, 1, 0, 0]",['memory_read'],['memory_write'],[],[],0
962,file,['/xtables.lock'],['/usr/bin/dockerd'],['/usr/bin/dockerd'],"[0, 0, 1, 0, 0]","['open', 'perm_check']",['file_lock'],"['127.0.0.11:42753', '--wait', '-I', '34925', ...","['127.0.0.11:42753', '--wait', '-I', '34925', ...",0
963,file,['/var/lib/docker/overlay2/d8bce2271ec901d6660...,"['/', '/usr/bin/dockerd']",['/'],"[1, 1, 1, 1, 1]","['open', 'perm_check', 'read']",['munmap'],"['daemon off;', '/usr/local/openresty/bin/open...",[],0
964,file,['/var/lib/docker/overlay2/d8bce2271ec901d6660...,"['/', '/usr/bin/dockerd']","['/usr/local/openresty/nginx/sbin/nginx', '/']","[1, 1, 1, 1, 1]","['open', 'perm_check', 'read']",['munmap'],"['daemon off;', '/usr/local/openresty/bin/open...",[],0


In [5]:
df = df[(df['object_type'] == 'file') | (df['object_type'] == 'link')]
df

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,['/var/lib/docker/containers/e717d550b5f83c105...,['/'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0
3,file,"['/systemd/netif/.#stateEAyBSe', '/systemd/net...",['/bin/networkctl'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]",['perm_check'],"['write', 'rename']",[],[],0
4,file,"['/systemd/netif/links/.#54SX1QcY', '/systemd/...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
5,file,"['/systemd/resolve/.#stub-resolv.confC9gey9', ...",['/usr/bin/dockerd'],['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],[],0
9,file,"['/systemd/netif/links/56', '/systemd/netif/li...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
...,...,...,...,...,...,...,...,...,...,...
959,file,"['/systemd/resolve/.#stub-resolv.confHjVq91', ...","['/bin/bash', '/usr/bin/sudo']",['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],['./scripts/social-network/read-user-timeline....,0
962,file,['/xtables.lock'],['/usr/bin/dockerd'],['/usr/bin/dockerd'],"[0, 0, 1, 0, 0]","['open', 'perm_check']",['file_lock'],"['127.0.0.11:42753', '--wait', '-I', '34925', ...","['127.0.0.11:42753', '--wait', '-I', '34925', ...",0
963,file,['/var/lib/docker/overlay2/d8bce2271ec901d6660...,"['/', '/usr/bin/dockerd']",['/'],"[1, 1, 1, 1, 1]","['open', 'perm_check', 'read']",['munmap'],"['daemon off;', '/usr/local/openresty/bin/open...",[],0
964,file,['/var/lib/docker/overlay2/d8bce2271ec901d6660...,"['/', '/usr/bin/dockerd']","['/usr/local/openresty/nginx/sbin/nginx', '/']","[1, 1, 1, 1, 1]","['open', 'perm_check', 'read']",['munmap'],"['daemon off;', '/usr/local/openresty/bin/open...",[],0


## Printing Info 

In [6]:
for i in df.columns:
  print(i)
  print(len(df[i].unique()))
  print()

object_type
1

entity_path
901

reader_path
16

writer_path
9

namespaces
3

reader_relation_types
14

writer_relation_types
8

writer_argvs
8

reader_argvs
143

priviledged_flow
1



## Feature Engineering 

### Count Vectorizer 

In [7]:
feature = "entity_path"
text = df[feature]

vectorizer1 = CountVectorizer()

vectorizer1.fit(text)

print(vectorizer1.vocabulary_)

vector = vectorizer1.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df = pd.DataFrame(vector.todense(), columns=vectorizer1.get_feature_names())
x = count_vect_df.to_numpy()

for i in range(0, len(df)):
  df[feature].iloc[i] = x[i]
df.head()

{'var': 818, 'lib': 722, 'docker': 694, 'containers': 679, 'e717d550b5f83c10579c936a1e75de3e26ca354f11df21b9d2747c92930b6c71': 700, 'hostname': 719, 'systemd': 815, 'netif': 742, 'stateeaybse': 769, 'state': 750, 'links': 724, '54sx1qcy': 296, '54': 291, 'resolve': 746, 'stub': 813, 'resolv': 745, 'confc9gey9': 569, 'conf': 532, '56': 300, '56o7gptq': 302, '54cftket': 295, '53myecpm': 290, '53': 288, '55': 297, '55iwl6ap': 299, '56govs5h': 301, 'conf0k0uh0': 534, '58yqzw8k': 308, '58': 306, 'statei0l5eu': 777, '48s4fbe': 283, 'confqj1wlt': 631, '60kqvqih': 317, '60': 315, '57': 303, '57uvgmra': 305, '59mkn6ed': 310, '59': 309, '60e0mav6': 316, '58qstsfa': 307, 'confofqkrp': 621, 'statekifa3j': 784, '62': 326, '62m3fjr3': 329, 'conf8vbvuj': 554, '61kbwjqx': 322, '61': 319, '62egqrp1': 328, 'stateu8itpl': 795, '64gnr5ov': 339, '64': 336, 'conf8rumth': 553, '63': 330, '63oiqvk0': 334, '66': 348, '66qc6aqu': 351, '64acn0lz': 338, 'stateazfjcj': 760, '68': 361, '682oidst': 362, '66qptzty': 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['/'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0
3,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['/bin/networkctl'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]",['perm_check'],"['write', 'rename']",[],[],0
4,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['/usr/bin/dockerd'],['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],[],0
9,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0


In [8]:
feature = "reader_path"
text = df[feature]

vectorizer2 = CountVectorizer()

vectorizer2.fit(text)

print(vectorizer2.vocabulary_)

vector = vectorizer2.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df2 = pd.DataFrame(vector.todense(), columns=vectorizer2.get_feature_names())
x = count_vect_df2.to_numpy()

for i in range(0, len(df)):
  df[feature].iloc[i] = x[i]
df.head()

{'bin': 1, 'networkctl': 6, 'lib': 4, 'systemd': 15, 'resolved': 11, 'usr': 16, 'python3': 10, 'dockerd': 2, 'runc': 12, 'networkd': 7, 'local': 5, 'gosu': 3, 'bash': 0, 'openresty': 9, 'nginx': 8, 'sbin': 13, 'sudo': 14}
(903, 17)
[[0 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 1 0 ... 0 2 1]
 ...
 [0 1 1 ... 0 0 1]
 [0 1 1 ... 0 0 1]
 [0 2 1 ... 0 0 2]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0
3,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]",['perm_check'],"['write', 'rename']",[],[],0
4,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],[],0
9,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0


In [9]:
feature = "writer_path"
text = df[feature]

vectorizer3 = CountVectorizer()

vectorizer3.fit(text)

print(vectorizer3.vocabulary_)

vector = vectorizer3.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df3 = pd.DataFrame(vector.todense(), columns=vectorizer3.get_feature_names())
x = count_vect_df3.to_numpy()

for i in range(0, len(df)):
  df[feature].iloc[i] = x[i]
df.head()

{'usr': 12, 'bin': 0, 'dockerd': 1, 'lib': 3, 'systemd': 10, 'networkd': 5, 'resolved': 8, 'udevd': 11, 'local': 4, 'gosu': 2, 'openresty': 7, 'nginx': 6, 'sbin': 9}
(903, 13)
[[1 1 0 ... 0 0 1]
 [0 0 0 ... 2 0 0]
 [0 0 0 ... 2 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 1]
 [1 0 1 ... 0 0 1]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0
3,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]",['perm_check'],"['write', 'rename']",[],[],0
4,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],[],0
9,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","['read', 'open', 'getattr', 'perm_check']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0


In [10]:
# (((df['namespaces'].to_list().split('[')[1]).split(']')[0]).split(','))
type(df['namespaces'].to_list()[0])

str

In [11]:
df['namespaces'] = list(map(lambda x: list(map(lambda y: int(y),x.split('[')[1].split(']')[0].split(','))), df['namespaces'].to_numpy()))
df['namespaces'] = df['namespaces'].apply(lambda x: np.array(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
feature = "reader_relation_types"
text = df[feature]

vectorizer4 = CountVectorizer()

vectorizer4.fit(text)

print(vectorizer4.vocabulary_)

vector = vectorizer4.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df4 = pd.DataFrame(vector.todense(), columns=vectorizer4.get_feature_names())
x = count_vect_df4.to_numpy()

for i in range(0, len(df)):
  df[feature].iloc[i] = x[i]
df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


{'getattr': 0, 'perm_check': 2, 'read': 3, 'open': 1, 'read_ioctl': 4}
(903, 5)
[[1 0 0 0 0]
 [0 0 1 0 0]
 [1 1 1 1 0]
 ...
 [0 1 1 1 0]
 [0 1 1 1 0]
 [1 1 1 1 1]]


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]",['write'],[],[],0
3,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[0, 0, 1, 0, 0]","['write', 'rename']",[],[],0
4,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","['write', 'rename']",[],[],0
9,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0


In [13]:
feature = "writer_relation_types"
text = df[feature]

vectorizer5 = CountVectorizer()

vectorizer5.fit(text)

print(vectorizer5.vocabulary_)

vector = vectorizer5.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df5 = pd.DataFrame(vector.todense(), columns=vectorizer5.get_feature_names())
x = count_vect_df5.to_numpy()

for i in range(0, len(df)):
  df[feature].iloc[i] = x[i]
df.head()

{'write': 4, 'rename': 2, 'file_lock': 0, 'write_ioctl': 5, 'unlink': 3, 'munmap': 1}
(903, 6)
[[0 0 0 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 ...
 [0 1 0 0 0 0]
 [0 1 0 0 0 0]
 [0 0 0 0 0 1]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0]",[],[],0
3,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[0, 0, 1, 0, 0]","[0, 0, 1, 0, 1, 0]",[],[],0
4,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],[],0
9,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0


In [14]:
feature = "reader_argvs"
text = df[feature]

vectorizer6 = CountVectorizer()

vectorizer6.fit(text)

print(vectorizer6.vocabulary_)

vector = vectorizer6.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df6 = pd.DataFrame(vector.todense(), columns=vectorizer6.get_feature_names())
x = count_vect_df6.to_numpy()

for i in range(0, len(df)):
  df[feature].iloc[i] = x[i]
df.head()

{'no': 392, 'legend': 375, 'bin': 306, 'networkctl': 391, 'pager': 397, 'list': 378, '305e3e1e26fdae2768dd44e2326fd17332939db66947879660270af8493db7d1': 49, '0225c94e7e42': 2, 'exec': 355, 'root': 407, 'var': 434, 'run': 408, 'docker': 335, 'libnetwork': 376, 'setkey': 413, '79709dcecfef3fa68918c4312dcf78ab34fb5df08bcaa5cd1edb34ae129ce1e0': 260, '29335add0d96e45f3baa44a4793bce08013a9510a7f6052931b9bd5de43b72ed': 47, 'netns': 389, 'cb7b10faf75c': 315, 'false': 366, 'set': 412, 'ipv6': 374, 'all': 295, 'setup': 414, 'resolver': 405, '127': 19, '11': 16, '47915': 176, 'de3ff6cac546cd8e94d5fac4dfd4a9287ebd60931f2390a98212979d66bc4085': 330, '41973': 140, 'bd73b0f55af5e8c4dee14d7e97706cc5f7bd69ff84dc5f9af091f11bad6b2e76': 303, 'bf6cf0481ca3': 305, '99f9b0a8a5b1': 282, '40586': 124, 'd9fd7251c6b4': 324, 'cb286ba50fe29e96ae28162e5caaf31fdb8422a1219018e700cee6e2e0bffac3': 314, '78d59ee10560': 259, '34103': 62, '60383': 236, '33909': 61, '59623': 229, '42777': 146, '38697': 103, 'efc65dc22c73':

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[0, 0, 1, 0, 0]","[0, 0, 1, 0, 1, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [15]:
feature = "writer_argvs"
text = df[feature]

vectorizer7 = CountVectorizer()

vectorizer7.fit(text)

print(vectorizer7.vocabulary_)

vector = vectorizer7.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df7 = pd.DataFrame(vector.todense(), columns=vectorizer7.get_feature_names())
x = count_vect_df7.to_numpy()

for i in range(0, len(df)):
  df[feature].iloc[i] = x[i]
df.head()

{'16686': 10, '35825': 44, '39477': 80, '42777': 107, '35313': 42, 'dport': 205, '41973': 101, '6831': 184, '47457': 131, '172': 11, '18': 12, '8400': 190, '127': 3, '11': 1, '37757': 63, '5000': 144, '59623': 176, '8500': 191, '53': 153, '38453': 67, '12': 2, '14269': 6, '38697': 70, 'nat': 212, 'to': 223, 'destination': 200, 'output': 215, '48066': 133, '15': 7, '44799': 118, '40586': 86, '60383': 179, '33359': 26, '39689': 81, 'dnat': 201, '33081': 23, '33909': 29, '41481': 96, '49156': 141, '36912': 56, '33863': 28, '45104': 120, '49155': 140, 'sport': 221, '51542': 151, 'udp': 224, '58365': 175, '58286': 174, 'br': 198, '9dd2b9211053': 194, '47915': 132, '6832': 185, '34535': 34, '34103': 30, '5775': 171, 'accept': 195, '60972': 183, '42645': 104, '5778': 172, '8600': 192, '45355': 122, 'docker': 202, '56890': 165, '14267': 4, 'source': 220, '32775': 21, '33039': 22, '56903': 166, 'snat': 219, 'sbin': 217, 'iptables': 208, 'docker_output': 203, '60532': 180, 'masquerade': 210, '83

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[0, 0, 1, 0, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [16]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(df[["object_type"]])

print(enc.categories_)
onehottt = enc.transform(df[["object_type"]]).toarray()
feature = "object_type"

for i in range(0, len(df)):
  df[feature].iloc[i] = onehottt[i]
df.head()

[array(['file'], dtype=object)]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow
0,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[0, 0, 1, 0, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
5,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [17]:
df['object_type'][6].shape

(2,)

## Training

In [18]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

# fit on majority class
trainX = []
temp = []
tempdf = df[['object_type','entity_path', 'reader_path', 'writer_path', 'namespaces', 'reader_relation_types', 'writer_relation_types', 'reader_argvs', 'writer_argvs', 'priviledged_flow']]
# tempdf = df[['namespaces','priviledged_flow']]
for i in range(0, len(tempdf)):
  for j in tempdf.columns:
    try: 
      temp = temp + tempdf[j].iloc[i].tolist()
    except:
      temp.append(tempdf[j].iloc[i])
  trainX.append(temp)
  temp = []
trainX = np.array(trainX)
trainX.shape

(903, 6)

In [19]:
trainX

array([[1, 1, 1, 1, 1, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       ...,
       [1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 0]])

## Testing Feature Eng

In [20]:
df2 = pd.read_csv("gdrive/MyDrive/feature_csvs/5736_features.csv")
df3 = pd.read_csv("gdrive/MyDrive/feature_csvs/14271_features.csv")
df4 = pd.read_csv("gdrive/MyDrive/feature_csvs/15664_features.csv")
df5 = pd.read_csv("gdrive/MyDrive/feature_csvs/0492_features.csv")
df1 = pd.concat([df2, df3, df4, df5])
df1

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
3,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-journald'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['memory_write', 'clone_mem']",[],[],0,0
4,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-journald'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['memory_write', 'clone_mem']",[],[],0,0
...,...,...,...,...,...,...,...,...,...,...,...
28,file,"['/systemd/netif/links/3', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
29,process_memory,['/'],['/usr/bin/containerd-shim-runc-v2'],['/'],"[1, 1, 1, 1, 1]",['ptrace_read'],"['clone_mem', 'memory_write']",[],"['--console-socket', 'json', '--bundle', '--lo...",1,0
30,file,"['/systemd/netif/links/8', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
31,file,"['/systemd/resolve/stub-resolv.conf', '/system...",['/usr/bin/sudo'],['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0


In [21]:
df1 = df1[(df1['object_type'] == 'file') | (df1['object_type'] == 'link')]
df1

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
5,file,"['/systemd/netif/state', '/systemd/netif/.#sta...","['/lib/systemd/systemd-timesyncd', '/bin/netwo...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'perm_check', 'open', 'read']","['write', 'rename']",[],[],0,0
6,file,"['/systemd/netif/links/13', '/systemd/netif/li...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'open', 'perm_check', 'read']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0
...,...,...,...,...,...,...,...,...,...,...,...
27,file,"['/systemd/netif/links/.#8FCP8GC', '/systemd/n...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
28,file,"['/systemd/netif/links/3', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
30,file,"['/systemd/netif/links/8', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
31,file,"['/systemd/resolve/stub-resolv.conf', '/system...",['/usr/bin/sudo'],['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0


In [22]:
test_labels = df1['labels']
test_labels

0     0
1     0
2     0
5     0
6     0
     ..
27    0
28    0
30    0
31    0
32    1
Name: labels, Length: 74, dtype: int64

In [23]:
text1 = df1['entity_path']



vector1 = vectorizer1.transform(text1)

print(vector1.shape)
print(vector1.toarray())

count_vect_df1 = pd.DataFrame(vector1.todense(), columns=vectorizer1.get_feature_names())
x1 = count_vect_df1.to_numpy()
x1.shape

for i in range(0, len(df1)):
  df1['entity_path'].iloc[i] = x1[i]
df1.head()

(74, 820)
[[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['/lib/systemd/systemd-timesyncd', '/bin/netwo...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'perm_check', 'open', 'read']","['write', 'rename']",[],[],0,0
6,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'open', 'perm_check', 'read']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0


In [24]:
feature = "reader_path"
text = df1[feature]

print(vectorizer2.vocabulary_)

vector = vectorizer2.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df2 = pd.DataFrame(vector.todense(), columns=vectorizer2.get_feature_names())
x = count_vect_df2.to_numpy()

for i in range(0, len(df1)):
  df1[feature].iloc[i] = x[i]
df1.head()

{'bin': 1, 'networkctl': 6, 'lib': 4, 'systemd': 15, 'resolved': 11, 'usr': 16, 'python3': 10, 'dockerd': 2, 'runc': 12, 'networkd': 7, 'local': 5, 'gosu': 3, 'bash': 0, 'openresty': 9, 'nginx': 8, 'sbin': 13, 'sudo': 14}
(74, 17)
[[0 2 1 ... 0 0 2]
 [0 2 1 ... 0 0 2]
 [0 2 1 ... 0 0 2]
 ...
 [0 0 0 ... 0 2 0]
 [0 1 0 ... 1 0 1]
 [1 1 0 ... 0 0 0]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'perm_check', 'open', 'read']","['write', 'rename']",[],[],0,0
6,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'open', 'perm_check', 'read']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0


In [25]:
feature = "writer_path"
text = df1[feature]

vector = vectorizer3.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df3 = pd.DataFrame(vector.todense(), columns=vectorizer3.get_feature_names())
x = count_vect_df3.to_numpy()

for i in range(0, len(df1)):
  df1[feature].iloc[i] = x[i]
df1.head()

(74, 13)
[[1 1 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 0 0 0 1 0 2 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 0 0 0 1 0 2 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 0 0 0 1 0 2 0 0]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0 0 1 0 1 0 0 0 0 2 0 0]
 [0 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","['getattr', 'perm_check', 'open', 'read']","['write', 'rename']",[],[],0,0
6,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","['getattr', 'open', 'perm_check', 'read']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0


In [26]:
df1['namespaces'] = list(map(lambda x: list(map(lambda y: int(y),x.split('[')[1].split(']')[0].split(','))), df1['namespaces'].to_numpy()))
df1['namespaces'] = df1['namespaces'].apply(lambda x: np.array(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [27]:
feature = "reader_relation_types"
text = df1[feature]

vector = vectorizer4.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df4 = pd.DataFrame(vector.todense(), columns=vectorizer4.get_feature_names())
x = count_vect_df4.to_numpy()

for i in range(0, len(df1)):
  df1[feature].iloc[i] = x[i]
df1.head()

(74, 5)
[[1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 1]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 0 0 0 0]
 [0 0 1 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [0 0 1 0 0]
 [1 1 1 1 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 0 0 0 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]
 [1 1 1 1 0]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]",['rename'],[],[],0,0
1,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]",['rename'],[],[],0,0
2,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]",['rename'],[],[],0,0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","['write', 'rename']",[],[],0,0
6,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0


In [28]:
feature = "writer_relation_types"
text = df1[feature]

vector = vectorizer5.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df15 = pd.DataFrame(vector.todense(), columns=vectorizer5.get_feature_names())
x = count_vect_df15.to_numpy()

for i in range(0, len(df1)):
  df1[feature].iloc[i] = x[i]
df1.head()

(74, 6)
[[0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 0 0 1 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 0 0 1 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 0 0]
 [0 0 0 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 1 0 1 0]
 [0 0 1 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [0 0 0 0 1 0]
 [

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",[],[],0,0
1,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",[],[],0,0
2,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",[],[],0,0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],[],0,0
6,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0


In [29]:
feature = "reader_argvs"
text = df1[feature]

vector = vectorizer6.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df16 = pd.DataFrame(vector.todense(), columns=vectorizer6.get_feature_names())
x = count_vect_df16.to_numpy()

for i in range(0, len(df1)):
  df1[feature].iloc[i] = x[i]
df1.head()

(74, 438)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
1,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
2,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
6,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]",[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0


In [30]:
feature = "writer_argvs"
text = df1[feature]

vector = vectorizer7.transform(text)

print(vector.shape)
print(vector.toarray())

count_vect_df17 = pd.DataFrame(vector.todense(), columns=vectorizer7.get_feature_names())
x = count_vect_df17.to_numpy()

for i in range(0, len(df1)):
  df1[feature].iloc[i] = x[i]
df1.head()

(74, 227)
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
1,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
2,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
5,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
6,file,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0


In [31]:
onehottt1 = enc.transform(df1[["object_type"]]).toarray()
feature = "object_type"

for i in range(0, len(df1)):
  df1[feature].iloc[i] = onehottt1[i]
df1.head()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
1,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
2,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[1, 1, 1, 1, 1]","[1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
5,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0
6,[1.0],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0]","[0, 1, 0, 0, 0]","[1, 1, 1, 1, 0]","[0, 0, 1, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0,0


## Testing

In [32]:
testX = []
temp1 = []
tempdf1 = df1[['object_type','entity_path', 'reader_path', 'writer_path', 'namespaces', 'reader_relation_types', 'writer_relation_types', 'reader_argvs', 'writer_argvs', 'priviledged_flow']]
# tempdf1 = df1[['namespaces','priviledged_flow']]
for i in range(0, len(tempdf1)):
  for j in tempdf1.columns:
    try: 
      temp1 = temp1 + tempdf1[j].iloc[i].tolist()
    except: 
      temp1.append(tempdf1[j].iloc[i])
  testX.append(temp1)
  temp1 = []
testX = np.array(testX)
testX.shape

(74, 6)

In [33]:
from sklearn.svm import OneClassSVM
model = OneClassSVM(gamma='scale', nu=0.01, kernel='rbf') 

In [34]:
# from sklearn.neighbors import LocalOutlierFactor
# model = LocalOutlierFactor(n_neighbors=50, contamination=0.25, algorithm='kd_tree')

In [35]:
model.fit(trainX)

OneClassSVM(nu=0.01)

In [36]:
yhat = model.fit_predict(testX)

## Results

In [37]:
yhat

array([-1, -1, -1,  1,  1,  1, -1, -1, -1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1, -1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1, -1, -1, -1,  1,  1, -1, -1, -1,  1,  1,
        1,  1,  1,  1,  1,  1])

In [38]:
#yhat1 = np.where(yhat == -1, 1, yhat)
yhat[yhat == 1] = 0
yhat[yhat == -1] = 1
yhat

array([1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0])

In [39]:
def print_stats(predictions, labels):
  print("Accuracy = {}".format(accuracy_score(labels, predictions)))
  print("Precision = {}".format(precision_score(labels, predictions)))
  print("Recall = {}".format(recall_score(labels, predictions)))

In [40]:
df2 = pd.read_csv("gdrive/MyDrive/feature_csvs/5736_features.csv")
df3 = pd.read_csv("gdrive/MyDrive/feature_csvs/14271_features.csv")
df4 = pd.read_csv("gdrive/MyDrive/feature_csvs/15664_features.csv")
df5 = pd.read_csv("gdrive/MyDrive/feature_csvs/0492_features.csv")
review_df = pd.concat([df2, df3, df4, df5])
review_df

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
3,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-journald'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['memory_write', 'clone_mem']",[],[],0,0
4,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-journald'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['memory_write', 'clone_mem']",[],[],0,0
...,...,...,...,...,...,...,...,...,...,...,...
28,file,"['/systemd/netif/links/3', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
29,process_memory,['/'],['/usr/bin/containerd-shim-runc-v2'],['/'],"[1, 1, 1, 1, 1]",['ptrace_read'],"['clone_mem', 'memory_write']",[],"['--console-socket', 'json', '--bundle', '--lo...",1,0
30,file,"['/systemd/netif/links/8', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
31,file,"['/systemd/resolve/stub-resolv.conf', '/system...",['/usr/bin/sudo'],['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0


In [41]:
review_df = review_df[(review_df['object_type'] == 'file') | (review_df['object_type'] == 'link')]
review_df

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels
0,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
5,file,"['/systemd/netif/state', '/systemd/netif/.#sta...","['/lib/systemd/systemd-timesyncd', '/bin/netwo...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'perm_check', 'open', 'read']","['write', 'rename']",[],[],0,0
6,file,"['/systemd/netif/links/13', '/systemd/netif/li...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'open', 'perm_check', 'read']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0
...,...,...,...,...,...,...,...,...,...,...,...
27,file,"['/systemd/netif/links/.#8FCP8GC', '/systemd/n...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
28,file,"['/systemd/netif/links/3', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
30,file,"['/systemd/netif/links/8', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
31,file,"['/systemd/resolve/stub-resolv.conf', '/system...",['/usr/bin/sudo'],['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0


In [42]:
review_df['predictions'] = yhat
review_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels,predictions
0,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
1,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
2,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
5,file,"['/systemd/netif/state', '/systemd/netif/.#sta...","['/lib/systemd/systemd-timesyncd', '/bin/netwo...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'perm_check', 'open', 'read']","['write', 'rename']",[],[],0,0,0
6,file,"['/systemd/netif/links/13', '/systemd/netif/li...","['/lib/systemd/systemd-resolved', '/usr/bin/py...",['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['getattr', 'open', 'perm_check', 'read']","['write', 'rename']",[],"['--no-legend', '/bin/networkctl', '--no-pager...",0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
27,file,"['/systemd/netif/links/.#8FCP8GC', '/systemd/n...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0,0
28,file,"['/systemd/netif/links/3', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0,0
30,file,"['/systemd/netif/links/8', '/systemd/netif/lin...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0,0
31,file,"['/systemd/resolve/stub-resolv.conf', '/system...",['/usr/bin/sudo'],['/lib/systemd/systemd-resolved'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0,0


In [43]:
review_df.shape

(74, 12)

In [44]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, losses
from tensorflow.keras.datasets import fashion_mnist
from tensorflow.keras.models import Model

In [45]:
print_stats(review_df['predictions'], review_df['labels'])

Accuracy = 0.6486486486486487
Precision = 0.0
Recall = 0.0


In [46]:
# Anomalies that are correctly identified
# TP
correct_anomaly = review_df.loc[(review_df['labels'] == 1) & (review_df['predictions'] == 1)]
correct_anomaly

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels,predictions


In [47]:
# Benign data that is wrongly labelled as Anomalies -> False Positives -> Prescisions
# FP
wrong_anomaly = review_df.loc[(review_df['labels'] == 0) & (review_df['predictions'] == 1)]
wrong_anomaly

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels,predictions
0,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
1,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
2,file,['/var/lib/docker/overlay2/21efa2c02f6440f41f4...,"['/usr/bin/dockerd', '/usr/bin/docker-runc']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
8,file,['/var/lib/docker/containers/ea9022a5e365a2f85...,['/usr/bin/docker-runc'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0,0,1
9,file,['/var/lib/docker/containers/ea9022a5e365a2f85...,['/usr/bin/docker-runc'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0,0,1
10,file,['/var/lib/docker/containers/ea9022a5e365a2f85...,['/usr/bin/docker-runc'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0,0,1
0,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,"['/', '/usr/bin/dockerd']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
1,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,"['/', '/usr/bin/dockerd']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0,1
7,file,['/var/lib/docker/containers/067b997e15622699d...,['/'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0,0,1
8,file,['/var/lib/docker/containers/067b997e15622699d...,['/'],['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['write'],[],[],0,0,1


In [48]:
# Anomalous data what is wrongly labelled as Benign -> False Negatives -> Recall 
wrong_benign = review_df.loc[(review_df['labels'] == 1) & (review_df['predictions'] == 0)]
wrong_benign

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels,predictions
42,file,['/usr/bin/docker-runc'],"['/usr/bin/docker-containerd-shim', '/bin/bash...",['/usr/bin/docker-runc'],"[1, 1, 1, 1, 1]","['getattr', 'open', 'perm_check', 'read_ioctl'...",['write'],"['/proc/self/fd/3', '/overwrite_runc']","['--root', 'cat', '-address', '/bin/bash', 'st...",1,1,0
25,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,['/usr/bin/dockerd'],"['/bin/cp', '/bin/bash', '/usr/bin/dockerd']","[1, 1, 1, 1, 1]","['open', 'read', 'getattr', 'mmap_private', 'p...","['rename', 'write']","['evil_libnss_files.so.2', 'cp', '/lib/x86_64-...","['docker-tar', '/var/lib/docker/overlay2/3e383...",1,1,0
26,file,['/evil'],['/bin/bash'],['/'],"[1, 1, 1, 1, 1]","['open', 'perm_check', 'read', 'getattr']",['write'],['bash'],"['cat', '/evil']",1,1,0
33,link,['/var/lib/docker/overlay2/b57f7ff1d23b59e7ca9...,['/usr/bin/dockerd'],"['/usr/bin/dockerd', '/usr/bin/docker-runc']","[1, 1, 1, 1, 1]","['getattr', 'read_link']",['rename'],"['/totally_safe_path', '/symlink_swap']",[],1,1,0
32,file,['/release_agent'],['/bin/bash'],['/'],"[1, 1, 1, 1, 1]","['open', 'getattr', 'perm_check', 'read']",['write'],['bash'],"['cat', '/sys/fs/cgroup/memory/release_agent']",1,1,0


In [49]:
total_anomalies = review_df.loc[(review_df['labels'] == 1)]
print(total_anomalies.shape)
total_anomalies

(5, 12)


Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,priviledged_flow,labels,predictions
42,file,['/usr/bin/docker-runc'],"['/usr/bin/docker-containerd-shim', '/bin/bash...",['/usr/bin/docker-runc'],"[1, 1, 1, 1, 1]","['getattr', 'open', 'perm_check', 'read_ioctl'...",['write'],"['/proc/self/fd/3', '/overwrite_runc']","['--root', 'cat', '-address', '/bin/bash', 'st...",1,1,0
25,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,['/usr/bin/dockerd'],"['/bin/cp', '/bin/bash', '/usr/bin/dockerd']","[1, 1, 1, 1, 1]","['open', 'read', 'getattr', 'mmap_private', 'p...","['rename', 'write']","['evil_libnss_files.so.2', 'cp', '/lib/x86_64-...","['docker-tar', '/var/lib/docker/overlay2/3e383...",1,1,0
26,file,['/evil'],['/bin/bash'],['/'],"[1, 1, 1, 1, 1]","['open', 'perm_check', 'read', 'getattr']",['write'],['bash'],"['cat', '/evil']",1,1,0
33,link,['/var/lib/docker/overlay2/b57f7ff1d23b59e7ca9...,['/usr/bin/dockerd'],"['/usr/bin/dockerd', '/usr/bin/docker-runc']","[1, 1, 1, 1, 1]","['getattr', 'read_link']",['rename'],"['/totally_safe_path', '/symlink_swap']",[],1,1,0
32,file,['/release_agent'],['/bin/bash'],['/'],"[1, 1, 1, 1, 1]","['open', 'getattr', 'perm_check', 'read']",['write'],['bash'],"['cat', '/sys/fs/cgroup/memory/release_agent']",1,1,0


## Misc

In [None]:
resultdf.to_csv("gdrive/MyDrive/Data_Provenance_Anomaly_Detection/5736_per_flow_features_with_yhat.csv", index=False)

In [None]:
from sklearn.ensemble import IsolationForest
model2 = IsolationForest(contamination=0.2, bootstrap=True)

In [None]:
model2.fit(trainX)

IsolationForest(bootstrap=True, contamination=0.2)

In [None]:
yhat2 = model2.predict(testX)

In [None]:
print(yhat2)
len(yhat2)

[ 1  1  1  1  1  1  1  1  1  1  1 -1  1 -1 -1 -1  1  1  1  1  1 -1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1
  1  1  1  1  1 -1  1  1  1  1  1  1 -1 -1  1  1  1  1  1  1  1  1  1  1
  1 -1  1  1  1  1  1  1  1 -1  1  1  1  1  1  1  1  1 -1 -1  1  1  1  1
 -1  1  1  1  1  1  1  1  1 -1 -1  1  1  1  1  1  1  1  1  1  1  1]


118

In [None]:
#yhat1 = np.where(yhat == -1, 1, yhat)
yhat2[yhat2 == 1] = 0
yhat2[yhat2 == -1] = 1
yhat2

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
df2 = pd.read_csv("gdrive/MyDrive/Data_Provenance_Anomaly_Detection/cve_labeled_features_14271.csv")
df3 = pd.read_csv("gdrive/MyDrive/Data_Provenance_Anomaly_Detection/cve_labeled_features_5736.csv")
df4 = pd.read_csv("gdrive/MyDrive/Data_Provenance_Anomaly_Detection/cve_labeled_features_15664.csv")
review_df = pd.concat([df2, df3, df4])
review_df

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,labels
0,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,"['/', '/usr/bin/dockerd']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0
1,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,"['/', '/usr/bin/dockerd']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0
2,process_memory,['/usr/bin/dockerd'],"['/bin/touch', '/bin/umount', '/bin/mkdir', '/...",['/usr/bin/dockerd'],"[0, 0, 0, 0, 0]",['memory_read'],"['clone_mem', 'memory_write']",[],[],0
3,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['clone_mem', 'memory_write']",[],[],0
4,file,"['/systemd/netif/state', '/systemd/netif/.#sta...",['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['perm_check', 'getattr', 'open', 'read']","['rename', 'write']",[],[],0
...,...,...,...,...,...,...,...,...,...,...
39,file,"['/systemd/netif/.#stateYdoDkt', '/systemd/net...",['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0
40,file,"['/systemd/netif/links/.#13Wjlzqv', '/systemd/...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0
41,process_memory,['/lib/systemd/systemd'],['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd'],"[0, 1, 0, 0, 0]",['ptrace_read'],['memory_write'],[],"['--prefix=/net/ipv6/neigh/vethd44be3c', '--pr...",0
42,process_memory,['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-timesyncd'],"[0, 1, 0, 0, 0]",['ptrace_read'],['memory_write'],[],[],0


In [None]:
review_df['predictions'] = yhat2
review_df

Unnamed: 0,object_type,entity_path,reader_path,writer_path,namespaces,reader_relation_types,writer_relation_types,writer_argvs,reader_argvs,labels,predictions
0,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,"['/', '/usr/bin/dockerd']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
1,file,['/var/lib/docker/overlay2/3e383c63e41c4026836...,"['/', '/usr/bin/dockerd']",['/usr/bin/dockerd'],"[1, 1, 1, 1, 1]",['getattr'],['rename'],[],[],0,0
2,process_memory,['/usr/bin/dockerd'],"['/bin/touch', '/bin/umount', '/bin/mkdir', '/...",['/usr/bin/dockerd'],"[0, 0, 0, 0, 0]",['memory_read'],"['clone_mem', 'memory_write']",[],[],0,0
3,process_memory,['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd-udevd'],"[0, 1, 0, 0, 0]",['ptrace_read'],"['clone_mem', 'memory_write']",[],[],0,0
4,file,"['/systemd/netif/state', '/systemd/netif/.#sta...",['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['perm_check', 'getattr', 'open', 'read']","['rename', 'write']",[],[],0,0
...,...,...,...,...,...,...,...,...,...,...,...
39,file,"['/systemd/netif/.#stateYdoDkt', '/systemd/net...",['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
40,file,"['/systemd/netif/links/.#13Wjlzqv', '/systemd/...",['/lib/systemd/systemd-resolved'],['/lib/systemd/systemd-networkd'],"[0, 1, 0, 0, 0]","['open', 'getattr', 'perm_check', 'read']","['write', 'rename']",[],[],0,0
41,process_memory,['/lib/systemd/systemd'],['/lib/systemd/systemd-udevd'],['/lib/systemd/systemd'],"[0, 1, 0, 0, 0]",['ptrace_read'],['memory_write'],[],"['--prefix=/net/ipv6/neigh/vethd44be3c', '--pr...",0,0
42,process_memory,['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-timesyncd'],['/lib/systemd/systemd-timesyncd'],"[0, 1, 0, 0, 0]",['ptrace_read'],['memory_write'],[],[],0,0


In [None]:
print_stats(review_df['predictions'], review_df['labels'])


Accuracy = 0.8813559322033898
Precision = 0.4117647058823529
Recall = 0.6363636363636364


In [None]:
from sklearn.covariance import EllipticEnvelope
model3 = EllipticEnvelope(contamination=0.2)

In [None]:
model3.fit(trainX)

  "The covariance matrix associated to your dataset is not full rank"


EllipticEnvelope(contamination=0.2)