In [1]:
from attack_predictor_libs.dataset.guide_dataset import GuideDataset
import polars as pl
import warnings
from pgmpy.estimators import BicScore, HillClimbSearch
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


## dataset

In [2]:
dataset = GuideDataset()

INFO:attack_predictor_libs.dataset.dataset:load cache file from: /home/work/dataset/guide/guide.csv


In [3]:
train_df = dataset.data.head()

In [4]:
train_df = dataset.data.filter(pl.col("dataset_type")=='train')
test_df  = dataset.data.filter(pl.col("dataset_type")=='test')

In [5]:
train_df = train_df.with_columns(
    pl.col("MitreTechniques").str.split(";").alias("technique")
).explode("technique")
train_df = train_df.filter(pl.col("IncidentGrade") == "TruePositive")

train_df = train_df.with_columns(
    pl.col("technique").map_elements(lambda x:  x.split(".")[0] if "."  in x else x).alias('parent-technique')
)
train_df = train_df.with_columns( (pl.col("OrgId").cast(str) + "_" + pl.col("IncidentId").cast(str)).alias("primary_key"))

In [6]:
train_df.head()
groubpy_train_df = train_df.group_by(["primary_key", "parent-technique"]).agg((pl.count()>=1).cast(int))

In [7]:
key_counts = groubpy_train_df.group_by("primary_key").agg(
    pl.col("parent-technique").n_unique().alias("technique_count")
)

# technique_countが1のprimary_keyを抽出
single_keys = key_counts.filter(pl.col("technique_count") == 1).select("primary_key")

# technique_countが1のprimary_keyを除外
filtered_df = groubpy_train_df.filter(~pl.col("primary_key").is_in(single_keys["primary_key"]))

In [114]:
pivot_df = filtered_df.pivot(
    values="count",
    columns="parent-technique",
    index="primary_key"
).fill_null(0)

In [112]:
pdf = pivot_df.to_pandas()
pdf = pdf.head(100)

In [110]:
pdf

Unnamed: 0,primary_key,T1219,T1078,T1047,T1205,T1620,T1059,T1110,T1518,T1543,...,T1039,T1037,T1496,T0859,T1615,T1578,T0829,T0816,T1537,T1585
100,0_37605,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
101,0_384,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
102,0_3867,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
103,0_38747,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
104,0_392,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1016_155011,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
196,101_168,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
197,101_253,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
198,101_4798,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
for i in range(0,100):
    columns_with_one = pdf.columns[pdf.iloc[i] == 1].tolist()
    print(i,columns_with_one)

0 ['T1190', 'T1133']
1 ['T1190', 'T1133']
2 ['T1190', 'T1133']
3 ['T1190', 'T1133']
4 ['T1219', 'T1078', 'T1110', 'T1190', 'T1098', 'T1133', 'T1105', 'T1189', 'T1570', 'T1203', 'T1071']
5 ['T1190', 'T1133']
6 ['T1190', 'T1133']
7 ['T1190', 'T1133']
8 ['T1190', 'T1133']
9 ['T1190', 'T1133']
10 ['T1190', 'T1133']
11 ['T1190', 'T1133']
12 ['T1190', 'T1133']
13 ['T1190', 'T1133']
14 ['T1190', 'T1133']
15 ['T1190', 'T1133']
16 ['T1190', 'T1133']
17 ['T1190', 'T1133']
18 ['T1190', 'T1133']
19 ['T1190', 'T1133']
20 ['T1190', 'T1133']
21 ['T1190', 'T1133']
22 ['T1190', 'T1133']
23 ['T1190', 'T1133']
24 ['T1189', 'T1203', 'T1071']
25 ['T1190', 'T1133']
26 ['T1190', 'T1133']
27 ['T1190', 'T1133']
28 ['T1190', 'T1133']
29 ['T1190', 'T1133']
30 ['T1190', 'T1133']
31 ['T1190', 'T1133']
32 ['T1190', 'T1133']
33 ['T1190', 'T1133']
34 ['T1190', 'T1133']
35 ['T1190', 'T1133']
36 ['T1190', 'T1133']
37 ['T1190', 'T1133']
38 ['T1190', 'T1133']
39 ['T1190', 'T1133']
40 ['T1190', 'T1133']
41 ['T1190', 'T113

In [22]:
score_function = BicScore(pdf)
search = HillClimbSearch(pdf)
network = search.estimate(max_iter=300)

100%|██████████| 3/3 [00:53<00:00, 17.79s/it]


In [11]:
from pgmpy.models import BayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.sampling import GibbsSampling


In [31]:
# 学習したDAGをBayesianNetworkに変換
bayesian_network = BayesianNetwork(network.edges())

# パラメータを学習
bayesian_network.fit(pdf, estimator=BayesianEstimator)

In [44]:
nodes = list(bayesian_network.nodes)

In [68]:
# サンプリング初期パラメータ
from pgmpy.factors.discrete import State
import random
columns_with_one = pdf.columns[pdf.iloc[0] == 1].tolist()

state = {}
for node in nodes:
    state[node] = random.randint(0, 1)


for column in columns_with_one:
    if (column in state) and (column != "T1030"):
        state[column] = 1

evidence = [ State(node, state[node]) for node in state]

In [69]:
sampling = GibbsSampling(bayesian_network)

In [70]:
sampling.sample(size=1, start_state=evidence)

0it [00:00, ?it/s]


Unnamed: 0,T1135,T1018,T1087,T1049,T1016
0,1,1,0,0,1
