In [1]:
from attack_predictor_libs.dataset.guide_dataset import GuideDataset
import polars as pl
from attack_predictor_libs.visualize.scenario_visualizer import visualize_scenario
from tqdm import tqdm

## 仮説検証内容
仮説：攻撃は、Preparation, Intrusion, Compromiseのこの３つの順番に処理が進んでいる。
この仮説が正しいか検証する。  

### 仮説検証 (Trainデータ)

In [2]:
dataset = GuideDataset()

In [3]:
df  = dataset.data.filter(pl.col("dataset_type") == 'train').filter(pl.col("IncidentGrade") == "TruePositive")

In [None]:
df.n_uniqu

In [54]:
ret_dict = {}
gdf = df.group_by(["OrgId","IncidentId"])

for key, group_df in tqdm(gdf, total=df.n_unique(subset=["OrgId","IncidentId"])):
    
    tmpdf = group_df.with_columns(
        pl.max_horizontal(
            pl.col("is_preparation_alert").cast(int),
            pl.col("is_intrusion_alert").cast(int)*2,
            pl.col("is_compromise_alert").cast(int)*4
        ).alias("phase"),
    ).sort("Timestamp")

    min_value = tmpdf.select(
        pl.col("phase").diff().fill_null(0).alias("phase_diff")
    ).get_column("phase_diff").min()
    
    # keyはincident_id, min_valueはマイナスであれば、Preparation, Intrusino, Compromiseの順でアラートが来て以内ということを示す。
    ret_dict[str(key[0]) + "_" + str(key[1])] = min_value

100%|██████████| 45535/45535 [02:20<00:00, 323.30it/s]


In [56]:
incident_df = df.group_by(["OrgId", "IncidentId"]).agg([pl.col("is_preparation_alert").any(), pl.col("is_intrusion_alert").any(), pl.col("is_compromise_alert").any()])
incident_df =  incident_df.with_columns(    (
            (pl.col("is_preparation_alert").cast(int) *(2**0)) +  # is_preparation_alert をビット2にシフト
            (pl.col("is_intrusion_alert").cast(int) *(2**1)) +   # is_intrusion_alert をビット1にシフト
            pl.col("is_compromise_alert").cast(int)  *(2**2)           # is_compromise_alert をビット0に
    ).alias("alert_bit_value")
)

In [62]:
incident_df = incident_df.with_columns((pl.col("OrgId").cast(str) + "_" + pl.col("IncidentId").cast(str)).replace_strict(ret_dict).cast(int).alias("ret_value"))
not_expected_scenario_dict = {}
for key, group_df in incident_df.group_by(["OrgId", "IncidentId"]):
    key = str(key[0]) + "_" + str(key[1])
    count = group_df.filter(pl.col("ret_value") < 0).shape[0]
    not_expected_scenario_dict[key] = count
    # if count > 0:
        # print(key)

In [67]:
for key in not_expected_scenario_dict.keys():
    print(key, not_expected_scenario_dict[key])


8_84149 0
129_75435 0
309_394337 0
38_455573 0
786_33895 0
461_48305 0
8_524202 0
374_27445 0
338_30959 0
239_51732 0
247_48544 0
516_75603 0
900_41068 0
55_179391 1
2046_49847 0
54_229891 0
195_57084 0
5_501085 0
806_78402 0
289_247259 0
875_36769 0
195_33399 0
253_100639 0
56_114583 0
5_207513 0
830_297194 0
740_31743 0
714_251306 0
1999_53101 0
205_40662 0
445_68956 0
107_15008 0
107_68524 0
56_182592 0
5_360130 0
465_19184 0
502_12895 0
28_180452 0
28_211186 0
1298_28466 0
376_85627 0
715_141159 0
38_239406 0
38_458175 0
457_141406 0
718_388734 0
38_456804 0
457_185820 0
2115_278418 0
56_147198 0
280_17532 0
360_127617 0
947_113296 0
2725_240949 0
130_65984 0
374_47165 0
231_67953 0
5_501997 0
216_71655 0
108_165093 0
335_38 0
56_103062 0
395_37287 0
56_248461 0
59_227581 0
387_31808 0
35_284221 0
547_16584 0
417_38036 0
704_75873 0
28_211116 0
892_43946 0
8_84151 0
595_172177 0
216_110509 0
38_458346 0
398_108527 0
853_16111 0
5_501380 0
1453_34787 0
689_105724 0
242_151256 0
665_

In [47]:
incident_df.group_by("alert_bit_value").agg(pl.count("IncidentId").alias("count")).sort("alert_bit_value").with_columns(pl.col("alert_bit_value").replace_strict(not_expected_scenario_dict).alias("not_expected_scenario_count"))

alert_bit_value,count,not_expected_scenario_count
i64,u32,i64
0,19,0
1,11360,0
2,2274,0
3,5835,91
4,86,0
6,36,4
7,3,3


In [None]:
incident_df.filter( pl.col("alert_bit_value")==7).with_columns( (pl.col("OrgId").cast(str) + "_" + pl.col("IncidentId").cast(str))).get_column("OrgId").to_list()

### 仮説検証 (Testデータ)

In [8]:
test_df  = dataset.data.filter(pl.col("dataset_type") == 'test').filter(pl.col("IncidentGrade") == "TruePositive")

In [42]:
ret_dict = {}
gdf = test_df.group_by(["OrgId","IncidentId"])

for key, group_df in tqdm(gdf, total=test_df.n_unique(subset=["OrgId","IncidentId"])):
    
    tmpdf = group_df.with_columns(
        pl.max_horizontal(
            pl.col("is_preparation_alert").cast(int),
            pl.col("is_intrusion_alert").cast(int)*2,
            pl.col("is_compromise_alert").cast(int)*4
        ).alias("phase"),
    ).sort("Timestamp")

    min_value = tmpdf.select(
        pl.col("phase").diff().fill_null(0).alias("phase_diff")
    ).get_column("phase_diff").min()
    
    # keyはincident_id, min_valueはマイナスであれば、Preparation, Intrusino, Compromiseの順でアラートが来て以内ということを示す。
    ret_dict[str(key[0]) + "_" + str(key[1])] = min_value

  0%|          | 0/19613 [00:00<?, ?it/s]

100%|██████████| 19613/19613 [01:00<00:00, 323.34it/s]


In [43]:
incident_df = test_df.group_by(["OrgId", "IncidentId"]).agg([pl.col("is_preparation_alert").any(), pl.col("is_intrusion_alert").any(), pl.col("is_compromise_alert").any()])
# incident_df = test_df.group_by("IncidentId").agg([pl.col("is_preparation_alert").any(), pl.col("is_intrusion_alert").any(), pl.col("is_compromise_alert").any()])
incident_df =  incident_df.with_columns(    (
            (pl.col("is_preparation_alert").cast(int) *(2**0)) +  # is_preparation_alert をビット2にシフト
            (pl.col("is_intrusion_alert").cast(int) *(2**1)) +   # is_intrusion_alert をビット1にシフト
            pl.col("is_compromise_alert").cast(int)  *(2**2)           # is_compromise_alert をビット0に
    ).alias("alert_bit_value")
)

In [45]:
incident_df = incident_df.with_columns((pl.col("OrgId").cast(str) + "_" + pl.col("IncidentId").cast(str)).replace_strict(ret_dict).cast(int).alias("ret_value"))
not_expected_scenario_dict = {}
for key, group_df in incident_df.group_by("alert_bit_value"):
    key = key[0]
    count = group_df.filter(pl.col("ret_value") < 0).shape[0]
    not_expected_scenario_dict[key] = count

In [46]:
incident_df.group_by("alert_bbit_value").agg(pl.count("IncidentId").alias("count")).sort("alert_bit_value").with_columns(pl.col("alert_bit_value").replace_strict(not_expected_scenario_dict).alias("not_expected_scenario_count"))

alert_bit_value,count,not_expected_scenario_count
i64,u32,i64
0,19,0
1,11360,0
2,2274,0
3,5835,91
4,86,0
6,36,4
7,3,3


In [53]:
incident_df.filter( pl.col("alert_bit_value")==7).with_columns( (pl.col("OrgId").cast(str) + "_" + pl.col("IncidentId").cast(str))).get_column("OrgId").to_list()


['899_81161', '445_2613', '240_704']

### hogehoge
preparation, intrusion, compromise全てが含まれる７にて、どのような結果になっているか確認する。

In [13]:
df  = dataset.data.filter(pl.col("dataset_type") == 'train').filter(pl.col("IncidentGrade") == "TruePositive")

In [15]:
ret_dict = {}
gdf = df.group_by("IncidentId")
for key, group_df in tqdm(gdf, total=df.n_unique("IncidentId")):
    
    tmpdf = group_df.with_columns(
        pl.max_horizontal(
            pl.col("is_preparation_alert").cast(int),
            pl.col("is_intrusion_alert").cast(int)*2,
            pl.col("is_compromise_alert").cast(int)*4
        ).alias("phase"),
    ).sort("Timestamp")

    min_value = tmpdf.select(
        pl.col("phase").diff().fill_null(0).alias("phase_diff")
    ).get_column("phase_diff").min()
    
    # keyはincident_id, min_valueはマイナスであれば、Preparation, Intrusino, Compromiseの順でアラートが来て以内ということを示す。
    ret_dict[key[0]] = min_value

  0%|          | 0/42701 [00:00<?, ?it/s]

  1%|          | 272/42701 [00:00<02:32, 278.04it/s]


KeyboardInterrupt: 

In [49]:
incident_df = df.group_by("IncidentId").agg([pl.col("is_preparation_alert").any(), pl.col("is_intrusion_alert").any(), pl.col("is_compromise_alert").any()])
incident_df =  incident_df.with_columns(    (
            (pl.col("is_preparation_alert").cast(int) *(2**0)) +  # is_preparation_alert をビット2にシフト
            (pl.col("is_intrusion_alert").cast(int) *(2**1)) +   # is_intrusion_alert をビット1にシフト
            pl.col("is_compromise_alert").cast(int)  *(2**2)           # is_compromise_alert をビット0に
    ).alias("alert_bit_value")
)

In [50]:
incident_df = incident_df.with_columns(
    pl.col("IncidentId").replace_strict(ret_dict).cast(int).alias("ret_value")
)
not_expected_scenario_dict = {}
for key, group_df in incident_df.group_by("alert_bit_value"):
    key = key[0]
    count = group_df.filter(pl.col("ret_value") < 0).shape[0]
    not_expected_scenario_dict[key] = count

In [65]:
incident_df.filter(pl.col("alert_bit_value") == 7).filter(pl.col("ret_value") >= 0)

IncidentId,is_preparation_alert,is_intrusion_alert,is_compromise_alert,alert_bit_value,ret_value
i64,bool,bool,bool,i64,i64
76576,true,true,true,7,0
82346,true,true,true,7,0
22884,true,true,true,7,0
68383,true,true,true,7,0
13658,true,true,true,7,0
…,…,…,…,…,…
18652,true,true,true,7,0
14335,true,true,true,7,0
49860,true,true,true,7,0
71377,true,true,true,7,0
