In [1]:
import polars as pl
from attack_predictor_libs.dataset.guide_dataset import GuideDataset

## 目的
GUIDEデータセットには大量のデータがあるが、使用困難なデータもあると思われる。
そこで本ノートブックでは、インシデント単位でアラート段階（準備・侵入・侵害）の組み合わせを集計し、train / test 間でのインシデントIDの重複を確認する。


In [2]:
# Build the project's GuideDataset wrapper once; every later cell reads its
# `.data` frame and filters it with polars expressions.
dataset = GuideDataset()

## 学習用データ（TP, FP, BP 全て含む）

In [3]:
# Training split, all incident grades.
train_df = dataset.data.filter(pl.col("dataset_type") == "train")

# Collapse the alerts to one row per incident: each stage flag becomes True
# when at least one alert of that incident has the flag set.
# The three flags are then packed into a 3-bit code (0-7):
#   bit 0 (value 1) = is_preparation_alert
#   bit 1 (value 2) = is_intrusion_alert
#   bit 2 (value 4) = is_compromise_alert
# e.g. 3 = preparation + intrusion, 7 = all three stages seen.
incident_df = (
    train_df.group_by("IncidentId")
    .agg(
        [
            pl.col("is_preparation_alert").any(),
            pl.col("is_intrusion_alert").any(),
            pl.col("is_compromise_alert").any(),
        ]
    )
    .with_columns(
        (
            pl.col("is_preparation_alert").cast(int) * (1 << 0)
            + pl.col("is_intrusion_alert").cast(int) * (1 << 1)
            + pl.col("is_compromise_alert").cast(int) * (1 << 2)
        ).alias("alert_bit_value")
    )
)

In [4]:
# Count the incidents that fall under each stage-combination code and list
# the codes in ascending order.
(
    incident_df.group_by("alert_bit_value")
    .agg(pl.count("IncidentId").alias("count"))
    .sort("alert_bit_value")
)

alert_bit_value,count
i64,u32
0,695
1,54347
2,20191
3,99721
4,1915
5,1005
6,1377
7,3801


## テスト用データ（TP, FP, BP 全て含む）

In [5]:
# Test split, all incident grades.
test_df = dataset.data.filter(pl.col("dataset_type") == "test")

# One row per incident; a stage flag is True if any alert of the incident
# carries it. The flags are encoded as a 3-bit value (0-7):
#   bit 0 (value 1) = is_preparation_alert
#   bit 1 (value 2) = is_intrusion_alert
#   bit 2 (value 4) = is_compromise_alert
incident_df = (
    test_df.group_by("IncidentId")
    .agg(
        [
            pl.col("is_preparation_alert").any(),
            pl.col("is_intrusion_alert").any(),
            pl.col("is_compromise_alert").any(),
        ]
    )
    .with_columns(
        (
            pl.col("is_preparation_alert").cast(int) * (1 << 0)
            + pl.col("is_intrusion_alert").cast(int) * (1 << 1)
            + pl.col("is_compromise_alert").cast(int) * (1 << 2)
        ).alias("alert_bit_value")
    )
)

In [6]:
# Incidents per alert_bit_value code, sorted by the code.
(
    incident_df.group_by("alert_bit_value")
    .agg(pl.count("IncidentId").alias("count"))
    .sort("alert_bit_value")
)

alert_bit_value,count
i64,u32
0,347
1,30872
2,12194
3,44535
4,1432
5,377
6,785
7,760


## 学習用データ(True Positive)

In [7]:
# Show the first rows of the full frame to check the available columns
# (IncidentGrade, dataset_type and the three stage flags are used in this notebook).
dataset.data.head()

Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,MitreTechniques,IncidentGrade,ActionGrouped,ActionGranular,EntityType,EvidenceRole,DeviceId,Sha256,IpAddress,Url,AccountSid,AccountUpn,AccountObjectId,AccountName,DeviceName,NetworkMessageId,EmailClusterId,RegistryKey,RegistryValueName,RegistryValueData,ApplicationId,ApplicationName,OAuthApplicationId,ThreatFamily,FileName,FolderPath,ResourceIdName,ResourceType,Roles,OSFamily,OSVersion,AntispamDirection,SuspicionLevel,LastVerdict,CountryCode,State,City,dataset_type,is_preparation_alert,is_intrusion_alert,is_compromise_alert,alert_bit_value
i64,i64,i64,i64,datetime[μs],i64,i64,str,str,str,str,str,str,str,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,i64,i64,i64,i64,i64,i64,str,i64,i64,i64,str,str,i64,i64,str,str,str,i64,i64,i64,str,bool,bool,bool,i64
1056561957389,809,58352,712507,2024-06-13 04:52:55,423,298,"""InitialAccess""","""T1189""","""FalsePositive""",,,"""Url""","""Related""",98799,138268,360606,68652,441377,673934,425863,453297,153085,529644,,1631,635,860,2251,3421,881,,289573,117668,3586,,,5,66,,"""Suspicious""","""Suspicious""",242,1445,10630,"""train""",True,False,False,1
1322849927433,11,417400,825450,2024-06-10 13:30:56,0,0,"""InitialAccess""","""T1078;T1078.004""","""FalsePositive""",,,"""Ip""","""Related""",98799,138268,30410,160396,441377,673934,425863,453297,153085,529644,,1631,635,860,2251,3421,881,,289573,117668,3586,,,5,66,,,,8,6,3,"""train""",True,True,False,3
781684051738,2119,6622,23284,2024-06-10 10:28:29,13,11,"""InitialAccess""","""T1566""","""BenignPositive""",,,"""MailMessage""","""Related""",98799,138268,360606,160396,441377,160691,425863,453297,153085,1842,,1631,635,860,2251,3421,881,,289573,117668,3586,,,5,66,,,,242,1445,10630,"""train""",True,False,False,1
635655163305,261,110412,41503,2024-06-03 17:05:40,1794,344,"""Collection""","""T1098;T1114""","""BenignPositive""",,,"""User""","""Impacted""",98799,138268,360606,160396,172996,268738,173595,184537,153085,529644,,1631,635,860,2251,3421,881,,289573,117668,3586,,,5,66,,,,242,1445,10630,"""train""",False,True,False,2
429496732853,51,84683,134887,2024-06-05 04:17:50,39,26,"""Execution""","""T1559;T1106;T1059.005""","""BenignPositive""",,,"""File""","""Related""",98799,4,360606,160396,441377,673934,425863,453297,153085,529644,,1631,635,860,2251,3421,881,,3,6,3586,,,5,66,,,,242,1445,10630,"""train""",False,True,False,2


In [1]:
# Training split restricted to alerts whose IncidentGrade is "TruePositive".
train_df = dataset.data.filter(
    (pl.col("dataset_type") == "train")
    & pl.col("IncidentGrade").is_in(["TruePositive"])
)

# Aggregate to one row per incident (flag is True if any alert has it) and
# pack the stage flags into a 3-bit code (0-7):
#   bit 0 (value 1) = is_preparation_alert
#   bit 1 (value 2) = is_intrusion_alert
#   bit 2 (value 4) = is_compromise_alert
incident_df = (
    train_df.group_by("IncidentId")
    .agg(
        [
            pl.col("is_preparation_alert").any(),
            pl.col("is_intrusion_alert").any(),
            pl.col("is_compromise_alert").any(),
        ]
    )
    .with_columns(
        (
            pl.col("is_preparation_alert").cast(int) * (1 << 0)
            + pl.col("is_intrusion_alert").cast(int) * (1 << 1)
            + pl.col("is_compromise_alert").cast(int) * (1 << 2)
        ).alias("alert_bit_value")
    )
)

NameError: name 'dataset' is not defined

In [9]:
# Distribution of incidents over the stage-combination codes, in code order.
(
    incident_df.group_by("alert_bit_value")
    .agg(pl.count("IncidentId").alias("count"))
    .sort("alert_bit_value")
)

alert_bit_value,count
i64,u32
0,683
1,58145
2,19321
3,29420
4,2360
5,1241
6,1776
7,1947


## テスト用データ(True Positive)

In [10]:
# Test split restricted to alerts whose IncidentGrade is "TruePositive"
# (no other grade is included by this filter).
test_df = dataset.data.filter(
    (pl.col("dataset_type") == "test")
    & pl.col("IncidentGrade").is_in(["TruePositive"])
)

# One row per incident with any()-reduced stage flags, encoded as a 3-bit
# value (0-7):
#   bit 0 (value 1) = is_preparation_alert
#   bit 1 (value 2) = is_intrusion_alert
#   bit 2 (value 4) = is_compromise_alert
incident_df = (
    test_df.group_by("IncidentId")
    .agg(
        [
            pl.col("is_preparation_alert").any(),
            pl.col("is_intrusion_alert").any(),
            pl.col("is_compromise_alert").any(),
        ]
    )
    .with_columns(
        (
            pl.col("is_preparation_alert").cast(int) * (1 << 0)
            + pl.col("is_intrusion_alert").cast(int) * (1 << 1)
            + pl.col("is_compromise_alert").cast(int) * (1 << 2)
        ).alias("alert_bit_value")
    )
)

In [11]:
# Number of incidents for each alert_bit_value, sorted ascending by value.
(
    incident_df.group_by("alert_bit_value")
    .agg(pl.count("IncidentId").alias("count"))
    .sort("alert_bit_value")
)

alert_bit_value,count
i64,u32
0,341
1,30058
2,10530
3,12104
4,1572
5,355
6,835
7,348


## インシデントIDの重複確認

In [12]:
# Distinct incident IDs seen in each split, as Python sets so the overlap
# can be computed with a set operation.
train_incident_Id_set = set(
    dataset.data.filter(pl.col("dataset_type") == "train")
    .get_column("IncidentId")
    .to_list()
)
test_incident_Id_set = set(
    dataset.data.filter(pl.col("dataset_type") == "test")
    .get_column("IncidentId")
    .to_list()
)

In [13]:
# Report how many incident IDs occur in both the train and the test split.
shared_incident_ids = train_incident_Id_set.intersection(test_incident_Id_set)
print(f"共通Incident ID= {len(shared_incident_ids)}")

共通Incident ID= 36088
