In [94]:
pip install dask[dataframe]



In [95]:
pip install dask-ml



In [96]:
import dask.dataframe as dd
import dask.array as da
import numpy as np
from dask_ml.model_selection import train_test_split
from xgboost import XGBClassifier
from dask_ml.wrappers import ParallelPostFit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from dask_ml.preprocessing import DummyEncoder

In [97]:
import pandas as pd


In [98]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [99]:
train_df = dd.read_csv('/content/drive/MyDrive/microsoft_projData/GUIDE_Train.csv', dtype={'ActionGranular': 'object',
       'ActionGrouped': 'object',
       'AntispamDirection': 'object',
       'ResourceType': 'object',
       'Roles': 'object',
       'ThreatFamily': 'object'})
test_df = dd.read_csv('/content/drive/MyDrive/microsoft_projData/GUIDE_Test.csv', dtype={'ActionGranular': 'object',
       'ActionGrouped': 'object',
       'AntispamDirection': 'object',
       'ResourceType': 'object',
       'Roles': 'object',
       'ThreatFamily': 'object'})

In [100]:
test_df.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,MitreTechniques,IncidentGrade,...,Roles,OSFamily,OSVersion,AntispamDirection,SuspicionLevel,LastVerdict,CountryCode,State,City,Usage
0,1245540519230,657,11767,87199,2024-06-04T22:56:27.000Z,524,563,LateralMovement,T1021;T1047;T1105;T1569.002,BenignPositive,...,,5,66,,Suspicious,Suspicious,242,1445,10630,Private
1,1400159342154,3,91158,632273,2024-06-03T12:58:26.000Z,2,2,CommandAndControl,,BenignPositive,...,,0,0,,Suspicious,Suspicious,242,1445,10630,Public
2,1279900255923,145,32247,131719,2024-06-08T03:20:49.000Z,2932,10807,LateralMovement,T1021;T1027.002;T1027.005;T1105,BenignPositive,...,,5,66,,Suspicious,Suspicious,242,1445,10630,Public
3,60129547292,222,15294,917686,2024-06-12T12:07:31.000Z,0,0,InitialAccess,T1078;T1078.004,FalsePositive,...,,5,66,,,,242,1445,10630,Public
4,515396080539,363,7615,5944,2024-06-06T17:42:05.000Z,27,18,Discovery,T1087;T1087.002,BenignPositive,...,Suspicious,5,66,,,,242,1445,10630,Public


In [101]:
train_df = train_df.sample(frac=0.002, random_state=42)
test_df = test_df.sample(frac=0.001, random_state=42)

In [102]:
test_df = test_df.drop(['Usage'], axis=1)

In [103]:
train_df = train_df.drop(['Id'], axis=1)
test_df = test_df.drop(['Id'], axis=1)

In [104]:
train_df = train_df.drop(['MitreTechniques','ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType','Roles','AntispamDirection','SuspicionLevel','LastVerdict'], axis=1)
test_df = test_df.drop(['MitreTechniques','ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType','Roles','AntispamDirection','SuspicionLevel','LastVerdict'], axis=1)

In [105]:
train_df = train_df.dropna(subset=['IncidentGrade'])
test_df = test_df.dropna(subset=['IncidentGrade'])

In [106]:
train_df['Timestamp'] = train_df['Timestamp'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
train_df['Timestamp'] = dd.to_datetime(train_df['Timestamp'], format='%Y-%m-%d')

In [107]:
test_df['Timestamp'] = test_df['Timestamp'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)
test_df['Timestamp'] = dd.to_datetime(test_df['Timestamp'], format='%Y-%m-%d')

In [108]:
train_df['Day_of_Week'] = train_df['Timestamp'].dt.day_name()
test_df['Day_of_Week'] = test_df['Timestamp'].dt.day_name()

In [109]:
train_x = train_df.drop('IncidentGrade', axis=1)
train_y = train_df['IncidentGrade']

In [110]:
test_x = test_df.drop('IncidentGrade', axis=1)
test_y = test_df['IncidentGrade']

In [111]:
for col in train_x.columns:
    if train_x[col].dtype == 'object' or train_x[col].dtype == 'string':
        train_x[col] = train_x[col].astype('category')

In [112]:
for col in test_x.columns:
    if test_x[col].dtype == 'object' or test_x[col].dtype == 'string':
        test_x[col] = test_x[col].astype('category')

In [113]:
categorical_features = test_x.select_dtypes(include=['category']).columns
categorical_features

Index(['Category', 'EntityType', 'EvidenceRole', 'Day_of_Week'], dtype='object')

In [114]:
train_x_cat = train_x.categorize(columns=categorical_features)
test_x_cat = test_x.categorize(columns=categorical_features)

In [115]:
encoder = DummyEncoder()
train_x = encoder.fit_transform(train_x_cat)
train_x.head()

Unnamed: 0,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,DeviceId,Sha256,IpAddress,Url,...,EntityType_User,EvidenceRole_Impacted,EvidenceRole_Related,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday
208502,283,335,509559,2024-06-11,0,0,98799,138268,360606,160396,...,True,True,False,False,False,False,False,False,True,False
57767,51,44733,295881,2024-06-03,39,26,98799,138268,360606,160396,...,True,False,True,False,True,False,False,False,False,False
235357,13,6868,10238,2024-06-10,34,1135,98799,412,360606,160396,...,False,True,False,False,True,False,False,False,False,False
21126,9,64,112355,2024-06-12,90,4415,98799,138268,360606,160396,...,True,True,False,False,False,False,False,False,False,True
128964,16,99016,132381,2024-06-03,24,21634,98799,138268,360606,160396,...,False,True,False,False,True,False,False,False,False,False


In [116]:
for col in train_x.columns:
    if train_x[col].dtype == 'bool':
        train_x[col] = train_x[col].astype(int)

In [117]:
test_x = encoder.transform(test_x_cat)
test_x.head()

Unnamed: 0,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,DeviceId,Sha256,IpAddress,Url,...,EntityType_User,EvidenceRole_Impacted,EvidenceRole_Related,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday
244231,1183,86623,370109,2024-05-22,3422,492,98799,138268,125459,160396,...,False,False,True,False,False,False,False,False,False,True
57535,49,188424,269539,2024-06-04,2,2,98799,138268,360606,834,...,False,False,True,False,False,False,False,False,True,False
43951,54,32601,20563,2024-06-07,156,9604,98799,138268,360606,160396,...,False,True,False,True,False,False,False,False,False,False
158655,66,21295,416356,2024-06-10,5,34,98799,138268,2770,160396,...,False,False,True,False,True,False,False,False,False,False
63950,75,74079,318110,2024-06-06,0,0,98799,138268,0,160396,...,False,False,True,False,False,False,False,True,False,False


In [118]:
for col in test_x.columns:
    if test_x[col].dtype == 'bool':
        test_x[col] = test_x[col].astype(int)

In [119]:
test_x.head()

Unnamed: 0,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,DeviceId,Sha256,IpAddress,Url,...,EntityType_User,EvidenceRole_Impacted,EvidenceRole_Related,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday
244231,1183,86623,370109,2024-05-22,3422,492,98799,138268,125459,160396,...,0,0,1,0,0,0,0,0,0,1
57535,49,188424,269539,2024-06-04,2,2,98799,138268,360606,834,...,0,0,1,0,0,0,0,0,1,0
43951,54,32601,20563,2024-06-07,156,9604,98799,138268,360606,160396,...,0,1,0,1,0,0,0,0,0,0
158655,66,21295,416356,2024-06-10,5,34,98799,138268,2770,160396,...,0,0,1,0,1,0,0,0,0,0
63950,75,74079,318110,2024-06-06,0,0,98799,138268,0,160396,...,0,0,1,0,0,0,0,1,0,0


## SMOTE

In [120]:
from imblearn.over_sampling import SMOTE

In [121]:
combined_df = dd.concat([train_x, train_y], axis=1)

In [122]:
combined_df_test = dd.concat([test_x, test_y], axis=1)

In [123]:
def apply_smote(chunk):
    smote = SMOTE()
    #Exclude datetime columns from X
    X = chunk.select_dtypes(exclude=['datetime']).drop('IncidentGrade', axis=1)
    y = chunk['IncidentGrade']
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return pd.concat([X_resampled, y_resampled], axis=1)

In [124]:
# Apply SMOTE on each chunk
meta = pd.DataFrame(columns=[col for col in combined_df.columns if col != 'Timestamp'])
resampled_chunks = combined_df.map_partitions(apply_smote, meta=meta)

In [125]:
resampled_df = dd.concat([resampled_chunks], axis=0)

## split into train and test for Evaluation

In [126]:
# split resampled data into train_x and y
train_x = resampled_df.drop('IncidentGrade', axis=1)
train_y = resampled_df['IncidentGrade']

In [127]:
test_x = combined_df_test.drop('IncidentGrade', axis=1)
test_y = combined_df_test['IncidentGrade']

In [128]:
test_x = test_x.drop('Timestamp', axis=1)

In [132]:
mapping = {'FalsePositive': 0.0, 'TruePositive': 1.0, 'BenignPositive': 2.0}

In [None]:
train_y = train_y.map(mapping, meta=('IncidentGrade', 'float64'))

In [133]:
test_y = test_y.map(mapping, meta=('IncidentGrade', 'float64'))

In [134]:
train_x = train_x.astype(np.float64)  # Convert train_x and test_x to float64

test_x = test_x.astype(np.float64)

In [135]:
train_x = train_x.drop(['EntityType_Blob'], axis=1)

In [136]:
train_x = train_x.drop('EntityType_MailboxConfiguration', axis=1)

In [137]:
train_x = train_x.to_dask_array(lengths=True)
test_x = test_x.to_dask_array(lengths=True)

In [138]:
test_y = test_y.to_dask_array(lengths=True)
train_y = train_y.to_dask_array(lengths=True)

In [139]:
xgmodel = ParallelPostFit(estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'))

In [142]:
xgmodel.fit(train_x, train_y)

Parameters: { "use_label_encoder" } are not used.



In [143]:
predictions = xgmodel.predict(test_x)

In [144]:
print(classification_report(test_y, predictions))

              precision    recall  f1-score   support

         0.0       0.80      0.77      0.78       844
         1.0       0.91      0.85      0.88      1505
         2.0       0.84      0.90      0.87      1795

    accuracy                           0.86      4144
   macro avg       0.85      0.84      0.85      4144
weighted avg       0.86      0.86      0.86      4144

