In [88]:
pip install dask[dataframe]



In [89]:
pip install dask-ml



In [None]:
import dask.dataframe as dd
import dask.array as da
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from scipy.stats import chi2_contingency
from dask_ml.model_selection import train_test_split



In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from dask_ml.wrappers import ParallelPostFit
from dask_ml.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
from dask_ml.linear_model import LogisticRegression

In [None]:
from dask_ml.preprocessing import DummyEncoder


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

# Read the CSV files using Dask and specify dtypes
train_df = dd.read_csv('/content/drive/MyDrive/microsoft_projData/GUIDE_Train.csv', dtype={'ActionGranular': 'object',
       'ActionGrouped': 'object',
       'AntispamDirection': 'object',
       'ResourceType': 'object',
       'Roles': 'object',
       'ThreatFamily': 'object'})
test_df = dd.read_csv('/content/drive/MyDrive/microsoft_projData/GUIDE_Test.csv', dtype={'ActionGranular': 'object',
       'ActionGrouped': 'object',
       'AntispamDirection': 'object',
       'ResourceType': 'object',
       'Roles': 'object',
       'ThreatFamily': 'object'})



In [None]:
train_df = train_df.sample(frac=0.002, random_state=42)
test_df = test_df.sample(frac=0.001, random_state=42)

In [None]:

# train_df.shape[0].compute(), train_df.shape[1]

In [None]:
# test_df.shape[0].compute(), test_df.shape[1]

In [None]:
train_df.columns

Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'Timestamp', 'DetectorId',
       'AlertTitle', 'Category', 'MitreTechniques', 'IncidentGrade',
       'ActionGrouped', 'ActionGranular', 'EntityType', 'EvidenceRole',
       'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
       'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
       'EmailClusterId', 'RegistryKey', 'RegistryValueName',
       'RegistryValueData', 'ApplicationId', 'ApplicationName',
       'OAuthApplicationId', 'ThreatFamily', 'FileName', 'FolderPath',
       'ResourceIdName', 'ResourceType', 'Roles', 'OSFamily', 'OSVersion',
       'AntispamDirection', 'SuspicionLevel', 'LastVerdict', 'CountryCode',
       'State', 'City'],
      dtype='object')

In [None]:
test_df.columns

Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'Timestamp', 'DetectorId',
       'AlertTitle', 'Category', 'MitreTechniques', 'IncidentGrade',
       'ActionGrouped', 'ActionGranular', 'EntityType', 'EvidenceRole',
       'DeviceId', 'Sha256', 'IpAddress', 'Url', 'AccountSid', 'AccountUpn',
       'AccountObjectId', 'AccountName', 'DeviceName', 'NetworkMessageId',
       'EmailClusterId', 'RegistryKey', 'RegistryValueName',
       'RegistryValueData', 'ApplicationId', 'ApplicationName',
       'OAuthApplicationId', 'ThreatFamily', 'FileName', 'FolderPath',
       'ResourceIdName', 'ResourceType', 'Roles', 'OSFamily', 'OSVersion',
       'AntispamDirection', 'SuspicionLevel', 'LastVerdict', 'CountryCode',
       'State', 'City', 'Usage'],
      dtype='object')

In [None]:
test_df['Usage'].head()

Unnamed: 0,Usage
244231,Public
57535,Public
43951,Public
158655,Public
63950,Private


In [None]:
test_df = test_df.drop(['Usage'], axis=1)

In [None]:
train_df.isnull().sum().compute()

Unnamed: 0,0
Id,0
OrgId,0
IncidentId,0
AlertId,0
Timestamp,0
DetectorId,0
AlertTitle,0
Category,0
MitreTechniques,10900
IncidentGrade,98


In [None]:
test_df.isnull().sum().compute()

Unnamed: 0,0
Id,0
OrgId,0
IncidentId,0
AlertId,0
Timestamp,0
DetectorId,0
AlertTitle,0
Category,0
MitreTechniques,2305
IncidentGrade,0


In [None]:
train_df.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,MitreTechniques,IncidentGrade,...,ResourceType,Roles,OSFamily,OSVersion,AntispamDirection,SuspicionLevel,LastVerdict,CountryCode,State,City
208502,8589935127,283,335,509559,2024-06-11T14:15:54.000Z,0,0,InitialAccess,T1078;T1078.004,TruePositive,...,,,5,66,,,,242,1445,10630
57767,635655163043,51,44733,295881,2024-06-03T23:17:54.000Z,39,26,Execution,T1559;T1106;T1059.005,BenignPositive,...,,,5,66,,,,242,1445,10630
235357,1451698948195,13,6868,10238,2024-06-10T18:15:14.000Z,34,1135,Exfiltration,,BenignPositive,...,,,5,66,,,,242,1445,10630
21126,816043791031,9,64,112355,2024-06-12T15:28:38.000Z,90,4415,Exfiltration,,BenignPositive,...,,,5,66,,,,242,1445,10630
128964,1408749274237,16,99016,132381,2024-06-03T20:07:56.000Z,24,21634,Exfiltration,,BenignPositive,...,,,5,66,,,,242,1445,10630


In [None]:
train_df = train_df.drop(['Id'], axis=1)
test_df = test_df.drop(['Id'], axis=1)

In [None]:
train_df = train_df.drop(['MitreTechniques','ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType','Roles','AntispamDirection','SuspicionLevel','LastVerdict'], axis=1)

In [None]:
test_df = test_df.drop(['MitreTechniques','ActionGrouped', 'ActionGranular', 'EmailClusterId', 'ThreatFamily', 'ResourceType','Roles','AntispamDirection','SuspicionLevel','LastVerdict'], axis=1)

In [None]:
train_df = train_df.dropna(subset=['IncidentGrade'])

In [None]:
test_df = test_df.dropna(subset=['IncidentGrade'])

In [None]:
train_df.isnull().sum().compute()

Unnamed: 0,0
OrgId,0
IncidentId,0
AlertId,0
Timestamp,0
DetectorId,0
AlertTitle,0
Category,0
IncidentGrade,0
EntityType,0
EvidenceRole,0


In [None]:
train_df.info()

<class 'dask_expr.DataFrame'>
Columns: 34 entries, OrgId to City
dtypes: int64(29), string(5)

In [None]:
train_df['Timestamp'] = train_df['Timestamp'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)

In [None]:
train_df['Timestamp'] = dd.to_datetime(train_df['Timestamp'], format='%Y-%m-%d')

In [None]:
test_df['Timestamp'] = test_df['Timestamp'].str.extract(r'(\d{4}-\d{2}-\d{2})', expand=False)

In [None]:
test_df['Timestamp'] = dd.to_datetime(test_df['Timestamp'], format='%Y-%m-%d')

## create day of week column

In [None]:
train_df['Day_of_Week'] = train_df['Timestamp'].dt.day_name()

In [None]:
test_df['Day_of_Week'] = test_df['Timestamp'].dt.day_name()

## split train_df into x and y

In [None]:
# first split features and target
train_x = train_df.drop('IncidentGrade', axis=1)
train_y = train_df['IncidentGrade']

In [None]:
test_x = test_df.drop('IncidentGrade', axis=1)
test_y = test_df['IncidentGrade']

# encoding categorical features

In [None]:
for col in train_x.columns:
    if train_x[col].dtype == 'object' or train_x[col].dtype == 'string':
        train_x[col] = train_x[col].astype('category')

In [None]:
for col in test_x.columns:
    if test_x[col].dtype == 'object' or test_x[col].dtype == 'string':
        test_x[col] = test_x[col].astype('category')

In [None]:
categorical_features = train_x.select_dtypes(include=['category']).columns
categorical_features

Index(['Category', 'EntityType', 'EvidenceRole', 'Day_of_Week'], dtype='object')

In [None]:
train_x_cat = train_x.categorize(columns=categorical_features)
test_x_cat = test_x.categorize(columns=categorical_features)

In [None]:
encoder = DummyEncoder()
train_x = encoder.fit_transform(train_x_cat)
train_x.head()

Unnamed: 0,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,DeviceId,Sha256,IpAddress,Url,...,EntityType_User,EvidenceRole_Impacted,EvidenceRole_Related,Day_of_Week_Friday,Day_of_Week_Monday,Day_of_Week_Saturday,Day_of_Week_Sunday,Day_of_Week_Thursday,Day_of_Week_Tuesday,Day_of_Week_Wednesday
208502,283,335,509559,2024-06-11,0,0,98799,138268,360606,160396,...,True,True,False,False,False,False,False,False,True,False
57767,51,44733,295881,2024-06-03,39,26,98799,138268,360606,160396,...,True,False,True,False,True,False,False,False,False,False
235357,13,6868,10238,2024-06-10,34,1135,98799,412,360606,160396,...,False,True,False,False,True,False,False,False,False,False
21126,9,64,112355,2024-06-12,90,4415,98799,138268,360606,160396,...,True,True,False,False,False,False,False,False,False,True
128964,16,99016,132381,2024-06-03,24,21634,98799,138268,360606,160396,...,False,True,False,False,True,False,False,False,False,False


In [None]:
# convert only boolean columns to int data type
for col in train_x.columns:
    if train_x[col].dtype == 'bool':
        train_x[col] = train_x[col].astype(int)

In [None]:
test_x = encoder.transform(test_x_cat)

In [None]:
for col in test_x.columns:
    if test_x[col].dtype == 'bool':
        test_x[col] = test_x[col].astype(int)

## SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
train_y.value_counts().compute()

Unnamed: 0_level_0,count
IncidentGrade,Unnamed: 1_level_1
BenignPositive,8210
TruePositive,6604
FalsePositive,4109


In [None]:
combined_df = dd.concat([train_x, train_y], axis=1)

In [None]:
def apply_smote(chunk):
    smote = SMOTE()
    #Exclude datetime columns from X
    X = chunk.select_dtypes(exclude=['datetime']).drop('IncidentGrade', axis=1)
    y = chunk['IncidentGrade']
    X_resampled, y_resampled = smote.fit_resample(X, y)
    return pd.concat([X_resampled, y_resampled], axis=1)

In [None]:
# combined_df['IncidentGrade'].isnull().sum().compute()

In [None]:
# Apply SMOTE on each chunk
meta = pd.DataFrame(columns=[col for col in combined_df.columns if col != 'Timestamp'])
resampled_chunks = combined_df.map_partitions(apply_smote, meta=meta)

In [None]:
# Combine the resampled chunks
resampled_df = dd.concat([resampled_chunks], axis=0)

In [None]:
test_x = test_x.drop('Timestamp', axis=1)

## split into 80-20 train and test for Evaluation

In [None]:
# split resampled data into train_x and y
train_x = resampled_df.drop('IncidentGrade', axis=1)
train_y = resampled_df['IncidentGrade']

In [None]:
# let's impute the categorical target variable into integer data type by imputation
mapping = {'FalsePositive': 0, 'TruePositive': 1, 'BenignPositive': 2}
train_y = train_y.map(mapping)

In [None]:
test_y = test_y.map(mapping)

In [None]:
# repart_train_x = train_x.repartition(partition_size='100MB')
# repart_train_y = train_y.repartition(partition_size='100MB')

In [None]:
# constant_columns = [col for col in train_x.columns if train_x[col].nunique().compute() == 1]
# constant_columns

In [None]:
# train_x_array = train_x.to_dask_array(lengths=True)
# train_y_array = train_y.to_dask_array(lengths=True)

In [None]:
# split x and y for train and test
x_train, x_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

## Model Evaluation

In [None]:
# from dask.distributed import Client

In [None]:
# client = Client(n_workers = 8, memory_limit='2GB')

In [None]:
models = {
    'Logistic Regression': ParallelPostFit(estimator=LogisticRegression()),
    'Random Forest': ParallelPostFit(estimator=RandomForestClassifier()),
    'Decision Tree': ParallelPostFit(estimator=DecisionTreeClassifier()),
    'XGBoost': ParallelPostFit(estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
}

In [None]:
# wrapped_models = {name: ParallelPostFit(model) for name, model in models.items()}

In [None]:
# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# for name, model in models.items():
#     print(f"Evaluating {name}...")
#     scores = []
#     for train_index, test_index in skf.split(train_x, train_y):
#         skf_x_train, skf_x_test = train_x.iloc[train_index], train_x.iloc[test_index]
#         skf_y_train, skf_y_test = train_y.iloc[train_index], train_y.iloc[test_index]
#         model.fit(skf_x_train, skf_y_train)
#         y_pred = model.predict(skf_x_test)
#         scores.append(accuracy_score(skf_y_test, y_pred))

#     print(f"{name} Accuracy score: {np.mean(scores):.4f}")

In [None]:
x_train = x_train.astype(np.float64)  # Convert x_train y_train to float64
y_train = y_train.astype(np.float64)

In [None]:
y_test = y_test.astype(np.float64)
x_test = x_test.astype(np.float64)

In [None]:
constant_columns = [col for col in x_train.columns if x_train[col].nunique().compute() == 1]
constant_columns

In [None]:
# 'EntityType_Blob'
x_train = x_train.drop(['EntityType_Blob'], axis=1)
x_test = x_test.drop(['EntityType_Blob'], axis=1)

In [None]:
x_train = x_train.to_dask_array(lengths=True)
x_test = x_test.to_dask_array(lengths=True)

In [None]:
y_train = y_train.to_dask_array(lengths=True)
y_test = y_test.to_dask_array(lengths=True)

In [None]:
for name, model in models.items():
    print(f"Evaluating {name}...")
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    print(f"{name} Accuracy score: {score}")

# client.close()

Evaluating Logistic Regression...
Logistic Regression Accuracy score: dask.array<mean_agg-aggregate, shape=(), dtype=float64, chunksize=(), chunktype=numpy.ndarray>
Evaluating Random Forest...
Random Forest Accuracy score: 0.8513486513486513
Evaluating Decision Tree...
Decision Tree Accuracy score: 0.8513486513486513
Evaluating XGBoost...
XGBoost Accuracy score: 0.881918081918082


In [None]:
lgmodel = ParallelPostFit(estimator=LogisticRegression())
lgmodel.fit(x_train, y_train)
acc = lgmodel.score(x_test, y_test)
print(f"Logistic Regression Accuracy score: {acc.compute()}")

Logistic Regression Accuracy score: 0.38741258741258744


## Hyper Parameter Tuning

In [None]:
from scipy.stats import randint

In [None]:
param_dist = {
    "max_depth": [1, 2, 3, None],
    "n_estimators": [100, 200, 300, 400, 500, 600],
    "learning_rate": [0.01, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4],
}

In [None]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

In [None]:
Grid_search = GridSearchCV(estimator=model, param_grid=param_dist, cv=5, scoring='accuracy')

Grid_search.fit(x_train, y_train)

print(f"Best parameters: {Grid_search.best_params_}")
print(f"Best score: {Grid_search.best_score_}")

Best parameters: {'learning_rate': 0.35, 'max_depth': None, 'n_estimators': 500}
Best score: 0.8720859207981269


In [None]:
from keras.models import Sequential
from keras.layers import Dense

In [None]:
nn_model = Sequential()
nn_model.add(Dense(128, input_dim=x_train.shape[1], activation='relu'))
nn_model.add(Dense(64, activation='relu'))
nn_model.add(Dense(32, activation='relu'))
nn_model.add(Dense(16, activation='relu'))
nn_model.add(Dense(1, activation='sigmoid'))

In [None]:
nn_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
nn_model.summary()

In [None]:
nn_model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test))

Epoch 1/10
[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 233ms/step - accuracy: 0.3197 - loss: 0.0000e+00 - val_accuracy: 0.3451 - val_loss: 0.0000e+00
Epoch 2/10
[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3011 - loss: 0.0000e+00 - val_accuracy: 0.2921 - val_loss: 0.0000e+00
Epoch 3/10
[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2899 - loss: 0.0000e+00 - val_accuracy: 0.2939 - val_loss: 0.0000e+00
Epoch 4/10
[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.2943 - loss: 0.0000e+00 - val_accuracy: 0.3057 - val_loss: 0.0000e+00
Epoch 5/10
[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.2960 - loss: 0.0000e+00 - val_accuracy: 0.3007 - val_loss: 0.0000e+00
Epoch 6/10
[1m614/614[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.2973 - loss: 0.0000e+00 - val_accuracy: 0.3035

<keras.src.callbacks.history.History at 0x7a47f8ccc0a0>