In [None]:
import zipfile

zip_ref = zipfile.ZipFile('/content/GUIDE_Train.csv.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

zip_ref = zipfile.ZipFile('/content/GUIDE_Test.csv.zip', 'r')
zip_ref.extractall('/content')
zip_ref.close()

In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
dtype_dict={'ActionGranular': 'object',
       'ActionGrouped': 'object',
       'AntispamDirection': 'object',
       'ResourceType': 'object',
       'Roles': 'object',
       'ThreatFamily': 'object'}

ddf_train = dd.read_csv('/content/GUIDE_Train.csv', dtype = dtype_dict)
ddf_test = dd.read_csv('/content/GUIDE_Test.csv', dtype = dtype_dict)

In [None]:
ddf_train.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,MitreTechniques,IncidentGrade,...,ResourceType,Roles,OSFamily,OSVersion,AntispamDirection,SuspicionLevel,LastVerdict,CountryCode,State,City
0,180388628218,0,612,123247,2024-06-04T06:05:15.000Z,7,6,InitialAccess,,TruePositive,...,,,5,66,,,,31,6,3
1,455266534868,88,326,210035,2024-06-14T03:01:25.000Z,58,43,Exfiltration,,FalsePositive,...,,,5,66,,,,242,1445,10630
2,1056561957389,809,58352,712507,2024-06-13T04:52:55.000Z,423,298,InitialAccess,T1189,FalsePositive,...,,,5,66,,Suspicious,Suspicious,242,1445,10630
3,1279900258736,92,32992,774301,2024-06-10T16:39:36.000Z,2,2,CommandAndControl,,BenignPositive,...,,,5,66,,Suspicious,Suspicious,242,1445,10630
4,214748368522,148,4359,188041,2024-06-15T01:08:07.000Z,9,74,Execution,,TruePositive,...,,,5,66,,,,242,1445,10630


In [None]:
ddf_test.head()

Unnamed: 0,Id,OrgId,IncidentId,AlertId,Timestamp,DetectorId,AlertTitle,Category,MitreTechniques,IncidentGrade,...,Roles,OSFamily,OSVersion,AntispamDirection,SuspicionLevel,LastVerdict,CountryCode,State,City,Usage
0,1245540519230,657,11767,87199,2024-06-04T22:56:27.000Z,524,563,LateralMovement,T1021;T1047;T1105;T1569.002,BenignPositive,...,,5,66,,Suspicious,Suspicious,242,1445,10630,Private
1,1400159342154,3,91158,632273,2024-06-03T12:58:26.000Z,2,2,CommandAndControl,,BenignPositive,...,,0,0,,Suspicious,Suspicious,242,1445,10630,Public
2,1279900255923,145,32247,131719,2024-06-08T03:20:49.000Z,2932,10807,LateralMovement,T1021;T1027.002;T1027.005;T1105,BenignPositive,...,,5,66,,Suspicious,Suspicious,242,1445,10630,Public
3,60129547292,222,15294,917686,2024-06-12T12:07:31.000Z,0,0,InitialAccess,T1078;T1078.004,FalsePositive,...,,5,66,,,,242,1445,10630,Public
4,515396080539,363,7615,5944,2024-06-06T17:42:05.000Z,27,18,Discovery,T1087;T1087.002,BenignPositive,...,Suspicious,5,66,,,,242,1445,10630,Public


In [None]:
ddf_train.dtypes

Unnamed: 0,0
Id,int64
OrgId,int64
IncidentId,int64
AlertId,int64
Timestamp,string[pyarrow]
DetectorId,int64
AlertTitle,int64
Category,string[pyarrow]
MitreTechniques,string[pyarrow]
IncidentGrade,string[pyarrow]


In [None]:
ddf_test.dtypes

Unnamed: 0,0
Id,int64
OrgId,int64
IncidentId,int64
AlertId,int64
Timestamp,string[pyarrow]
DetectorId,int64
AlertTitle,int64
Category,string[pyarrow]
MitreTechniques,string[pyarrow]
IncidentGrade,string[pyarrow]


In [None]:
ddf_train = ddf_train.drop_duplicates()
ddf_test = ddf_test.drop_duplicates()

In [None]:
ddf_train['Timestamp'] = dd.to_datetime(ddf_train['Timestamp'])
ddf_test['Timestamp'] = dd.to_datetime(ddf_test['Timestamp'])

ddf_train = ddf_train.astype({col: 'int32' for col in ddf_train.select_dtypes(include='int64').columns})

In [None]:
ddf_train.isna().mean().compute()

Unnamed: 0,0
Id,0.0
OrgId,0.0
IncidentId,0.0
AlertId,0.0
Timestamp,0.0
DetectorId,0.0
AlertTitle,0.0
Category,0.0
MitreTechniques,0.574601
IncidentGrade,0.005395


In [None]:
ddf_test.isna().mean().compute()

Unnamed: 0,0
Id,0.0
OrgId,0.0
IncidentId,0.0
AlertId,0.0
Timestamp,0.0
DetectorId,0.0
AlertTitle,0.0
Category,0.0
MitreTechniques,0.556198
IncidentGrade,0.0


In [None]:
ddf_train = ddf_train.dropna(subset=['IncidentGrade'])
ddf_test = ddf_test.dropna(subset=['IncidentGrade'])

In [None]:
# Dropping the columns with missing values morethan 90%

# missing_value_cols = [col for col in ddf.columns if ddf[col].isna().mean().compute() > 0.84]


# missing_value_cols = []
# for col in ddf.columns:
#   if ddf[col].isna().mean().compute() > 90:
#     missing_value_cols.append(col)


def missing_values(df):
    return df.isna().mean()

missing_values_ddf = ddf.map_partitions(missing_values)
missing_values_col = missing_values_ddf[missing_values_ddf > 0.90].index.compute()
missing_values_col = list(set(missing_values_col))
missing_values_col

In [None]:
missing_values_col = ['ResourceType', 'AntispamDirection', 'ThreatFamily', 'EmailClusterId',
                      'ActionGranular', 'ActionGrouped', 'Roles']

ddf_train = ddf_train.drop(columns=missing_values_col)
ddf_test = ddf_test.drop(columns=missing_values_col)

In [None]:
# len(ddf_train.columns) == len(ddf_test.columns)
set(ddf_test.columns) - set(ddf_train.columns)

{'Usage'}

In [None]:
ddf_test = ddf_test.drop(columns=['Usage'])

In [None]:
# Dropping the columns with same values morethan 90%

# same_value_cols = [col for col in df.columns if df[col].value_counts(normalize=True).max().compute() > 0.90]

same_value_cols = []
for col in ddf.columns:
  if ddf[col].value_counts(normalize=True).max().compute() > 0.90:
    print(col)
    same_value_cols.append(col)

# def same_values(df):
#     return df.value_counts(normalize=True).max()

# same_values_ddf = ddf.map_partitions(same_values)
# same_values_col = same_values_ddf[same_values_ddf > 0.90].index.compute()
# same_values_col = list(set(same_values_col))
# same_values_col

DeviceId
Url
RegistryValueName
RegistryValueData
ApplicationId
ApplicationName
OSFamily
OSVersion
SuspicionLevel
CountryCode
State
City


In [None]:
same_value_cols = ['DeviceId', 'Url', 'RegistryValueName', 'RegistryValueData', 'ApplicationId',
                   'ApplicationName', 'OSFamily', 'OSVersion', 'SuspicionLevel', 'CountryCode', 'State', 'City']

ddf_train = ddf_train.drop(columns=same_value_cols, errors='ignore')
ddf_test = ddf_test.drop(columns=same_value_cols, errors='ignore')

In [None]:
ddf_train.columns

Index(['Id', 'OrgId', 'IncidentId', 'AlertId', 'Timestamp', 'DetectorId',
       'AlertTitle', 'Category', 'MitreTechniques', 'IncidentGrade',
       'EntityType', 'EvidenceRole', 'Sha256', 'IpAddress', 'AccountSid',
       'AccountUpn', 'AccountObjectId', 'AccountName', 'DeviceName',
       'NetworkMessageId', 'RegistryKey', 'OAuthApplicationId', 'FileName',
       'FolderPath', 'ResourceIdName', 'LastVerdict'],
      dtype='object')

In [None]:
# Removing ID columns
Id_cols = ['Id', 'OrgId', 'IncidentId', 'AlertId', 'Sha256', 'NetworkMessageId']

ddf_train = ddf_train.drop(columns=Id_cols)
ddf_test = ddf_test.drop(columns=Id_cols)

In [None]:
df_train = ddf_train.compute()
df_test = ddf_test.compute()

In [None]:
X = df_train.drop(columns=['IncidentGrade'])
y = df_train['IncidentGrade']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
X_train['Year'] = X_train['Timestamp'].dt.year
X_test['Year'] = X_test['Timestamp'].dt.year

X_train['Month'] = X_train['Timestamp'].dt.month
X_test['Month'] = X_test['Timestamp'].dt.month

X_train['Day'] = X_train['Timestamp'].dt.day
X_test['Day'] = X_test['Timestamp'].dt.day

X_train['Hour'] = X_train['Timestamp'].dt.hour
X_test['Hour'] = X_test['Timestamp'].dt.hour

X_train['DayofWeek'] = X_train['Timestamp'].dt.dayofweek
X_test['DayofWeek'] = X_test['Timestamp'].dt.dayofweek

X_train['Business_hours'] = X_train['Hour'].between(9, 17).astype(int)
X_test['Business_hours'] = X_test['Hour'].between(9, 17).astype(int)

X_train['Time_of_day'] = X_train['Hour'].apply(lambda x: 'morning' if x >= 6 and x < 12 else ('afternoon' if x >= 12 and x < 19 else 'night'))
X_test['Time_of_day'] = X_test['Hour'].apply(lambda x: 'morning' if x >= 6 and x < 12 else ('afternoon' if x >= 12 and x < 19 else 'night'))

In [None]:
X_train = X_train.drop(columns=['Timestamp'])
X_test = X_test.drop(columns=['Timestamp'])

In [None]:
X_train.drop(columns=['MitreTechniques', 'LastVerdict'], inplace=True)
X_test.drop(columns=['MitreTechniques', 'LastVerdict'], inplace=True)

In [None]:
same_val_drop_cols = []

for col in X_train.columns:
  same_vals = X_train[col].value_counts(normalize=True).max()
  if same_vals > 0.89:
    same_val_drop_cols.append(col)
    print(f'{col} : {same_vals}' )

X_train = X_train.drop(columns=same_val_drop_cols)
X_test = X_test.drop(columns=same_val_drop_cols)

In [None]:
cat_list_entitytype = X_train['EntityType'].value_counts().head(12).index.tolist()
X_train['EntityType'] = X_train['EntityType'].apply(lambda x: x if x in cat_list_entitytype else 'other')

cat_list_category = X_train['Category'].value_counts().head(12).index.tolist()
X_train['Category'] = X_train['Category'].apply(lambda x: x if x in cat_list_category else 'other')

In [None]:
X_train['AlertTitle'].value_counts().head(30)

In [None]:
import pandas as pd
import numpy as np

def probability_encode(df, feature, target, smooth=10):

    target_classes = df[target].unique()
    encoded_features = pd.DataFrame(index=df.index)

    for target_class in target_classes:
        encoded_features[f"{feature}_{target_class}"] = pd.Series(index=df.index)

    for target_class in target_classes:
        category_counts = df.groupby(feature)[target].apply(lambda x: (x == target_class).sum())
        global_count = (df[target] == target_class).sum()
        smoothed_probs = (category_counts + smooth * (global_count / len(df))) / (df.groupby(feature)[target].count() + smooth)
        encoded_features[f"{feature}_{target_class}"] = df[feature].map(smoothed_probs).fillna(global_count / len(df))

    return encoded_features

In [None]:
features_to_encode = ['DetectorId', 'AlertTitle', 'IpAddress']

X_tr = pd.concat([X_train[features_to_encode], y_train], axis=1)
X_te = pd.concat([X_test[features_to_encode], y_test], axis=1)

In [None]:
X_tr.head()

In [None]:
X_tr = X_tr.reset_index(drop=True)

features_to_encode = ['DetectorId', 'AlertTitle', 'IpAddress']
target_variable = 'IncidentGrade'

encoded_X_tr = pd.DataFrame(index=X_tr.index)

for feature in features_to_encode:
    encoded_feature = probability_encode(X_tr, feature, target_variable, smooth=10)
    encoded_X_tr = pd.concat([encoded_X_tr, encoded_feature], axis=1)

X_tr = pd.concat([X_tr, encoded_X_tr], axis=1)
X_tr.head()

In [None]:
# X_prob_encoded = pd.concat([X_prob_encoded, X_tr], axis=1)
# X_prob_encoded = X_tr.drop(columns=features_to_encode)
# X_prob_encoded = X_prob_encoded.drop(columns=['IncidentGrade'])
# X_prob_encoded.head()

In [None]:
# features_to_encode = ['DetectorId', 'AlertTitle', 'IpAddress']

# X_prob_encoded = pd.concat([X_prob_encoded, X_train[features_to_encode]], axis=1)
# X_prob_encoded.head()

In [None]:
features_to_encode = ['DetectorId', 'AlertTitle', 'IpAddress']
target_variable = 'IncidentGrade'

mappings = {}
for feature in features_to_encode:
    mappings[feature] = {}
    for target_class in X_tr[target_variable].unique():
        mappings[feature][target_class] = X_tr[f"{feature}_{target_class}"].groupby(X_tr[feature]).mean().to_dict()


test_encoded_data = pd.DataFrame(index=X_test.index)
for feature in features_to_encode:
    for target_class in X_te[target_variable].unique():
        test_encoded_data[f"{feature}_{target_class}"] = X_te[feature].map(mappings[feature][target_class]).fillna(X_te[target_variable].value_counts(normalize=True)[target_class])

X_te = pd.concat([X_te, test_encoded_data], axis=1)
X_te.head()

In [None]:
X_tr = X_tr.reset_index(drop=True)
X_te = X_te.reset_index(drop=True)

In [None]:
X_train = X_train.drop(columns=features_to_encode).reset_index(drop=True)
X_test = X_test.drop(columns=features_to_encode).reset_index(drop=True)

In [None]:
X_tr_new = dd.from_pandas(X_tr.drop(columns=features_to_encode), npartitions=10)
X_te_new = dd.from_pandas(X_te.drop(columns=features_to_encode), npartitions=10)

In [None]:
dx_train = dd.from_pandas(X_train, npartitions=10)
dx_test = dd.from_pandas(X_test, npartitions=10)

In [None]:
dx_train = dd.concat([dx_train, X_tr_new], axis=1)
dx_test = dd.concat([dx_test, X_te_new], axis=1)

In [None]:
dx_train.head()

In [None]:
import dask_ml.ensemble as dask_ensemble
import dask_ml.metrics as dask_metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
ddx_train = dx_train.drop(columns=['IncidentGrade'])
ddx_test = dx_test.drop(columns=['IncidentGrade'])

In [None]:
# X_train = dx_train.compute()
# X_test = dx_test.compute()

In [None]:
# cat_list_IpAddress = X_train['IpAddress'].value_counts().head(10).index.tolist()
# X_train['IpAddress'] = X_train['IpAddress'].apply(lambda x: x if x in cat_list_IpAddress else 1000000)

cat_list_AccountSid = X_train['AccountSid'].value_counts().head(10).index.tolist()
X_train['AccountSid'] = X_train['AccountSid'].apply(lambda x: x if x in cat_list_AccountSid else 1000000)

cat_list_AccountUpn = X_train['AccountUpn'].value_counts().head(10).index.tolist()
X_train['AccountUpn'] = X_train['AccountUpn'].apply(lambda x: x if x in cat_list_AccountUpn else 1000000)

cat_list_AccountObjectId = X_train['AccountObjectId'].value_counts().head(10).index.tolist()
X_train['AccountObjectId'] = X_train['AccountObjectId'].apply(lambda x: x if x in cat_list_AccountObjectId else 1000000)

cat_list_AccountName = X_train['AccountName'].value_counts().head(10).index.tolist()
X_train['AccountName'] = X_train['AccountName'].apply(lambda x: x if x in cat_list_AccountName else 1000000)

# cat_list_AlertTitle = X_train['AlertTitle'].value_counts().head(10).index.tolist()
# X_train['AlertTitle'] = X_train['AlertTitle'].apply(lambda x: x if x in cat_list_AlertTitle else 1000000)

# cat_list_DetectorId = X_train['DetectorId'].value_counts().head(10).index.tolist()
# X_train['DetectorId'] = X_train['DetectorId'].apply(lambda x: x if x in cat_list_DetectorId else 1000000)

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ohe_cols = ['Category', 'EntityType', 'EvidenceRole', 'Time_of_day', 'IpAddress', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName']
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_cols)
    ]
)

X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
X_train.shape

(7099122, 104)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)

In [None]:
y_pred = dtc.predict(X_test)

In [None]:
df_train.head()

Unnamed: 0,Timestamp,AlertTitle,Category,MitreTechniques,IncidentGrade,EntityType,EvidenceRole,IpAddress,AccountSid,AccountUpn,AccountObjectId,AccountName,DeviceName,RegistryKey,FileName,FolderPath,ResourceIdName,LastVerdict
0,2024-06-04 06:05:15+00:00,6,InitialAccess,,TruePositive,Ip,Related,27,441377,673934,425863,453297,153085,1631,289573,117668,3586,
1,2024-06-14 03:01:25+00:00,43,Exfiltration,,FalsePositive,User,Impacted,360606,22406,23032,22795,24887,153085,1631,289573,117668,3586,
2,2024-06-13 04:52:55+00:00,298,InitialAccess,T1189,FalsePositive,Url,Related,360606,441377,673934,425863,453297,153085,1631,289573,117668,3586,Suspicious
3,2024-06-10 16:39:36+00:00,2,CommandAndControl,,BenignPositive,Url,Related,360606,441377,673934,425863,453297,153085,1631,289573,117668,3586,Suspicious
4,2024-06-15 01:08:07+00:00,74,Execution,,TruePositive,User,Impacted,360606,449,592,440,479,153085,1631,289573,117668,3586,


In [None]:
# df_train['AlertTitle'].value_counts().plot(kind='bar')

In [None]:
# Preprocessing datetime column

df_train['Year'] = df_train['Timestamp'].dt.year
df_test['Year'] = df_test['Timestamp'].dt.year

df_train['Month'] = df_train['Timestamp'].dt.month
df_test['Month'] = df_test['Timestamp'].dt.month

df_train['Day'] = df_train['Timestamp'].dt.day
df_test['Day'] = df_test['Timestamp'].dt.day

df_train['Hour'] = df_train['Timestamp'].dt.hour
df_test['Hour'] = df_test['Timestamp'].dt.hour

df_train['DayofWeek'] = df_train['Timestamp'].dt.dayofweek
df_test['DayofWeek'] = df_test['Timestamp'].dt.dayofweek

df_train['Business_hours'] = df_train['Hour'].between(9, 17).astype(int)
df_test['Business_hours'] = df_test['Hour'].between(9, 17).astype(int)

df_train['Time_of_day'] = df_train['Hour'].apply(lambda x: 'morning' if x >= 6 and x < 12 else ('afternoon' if x >= 12 and x < 19 else 'night'))
df_test['Time_of_day'] = df_test['Hour'].apply(lambda x: 'morning' if x >= 6 and x < 12 else ('afternoon' if x >= 12 and x < 19 else 'night'))



In [None]:
df_train = df_train.drop(columns=['Timestamp'])
df_test = df_test.drop(columns=['Timestamp'])

In [None]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9465497 entries, 0 to 257185
Data columns (total 24 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   AlertTitle       int32 
 1   Category         string
 2   MitreTechniques  string
 3   IncidentGrade    string
 4   EntityType       string
 5   EvidenceRole     string
 6   IpAddress        int32 
 7   AccountSid       int32 
 8   AccountUpn       int32 
 9   AccountObjectId  int32 
 10  AccountName      int32 
 11  DeviceName       int32 
 12  RegistryKey      int32 
 13  FileName         int32 
 14  FolderPath       int32 
 15  ResourceIdName   int32 
 16  LastVerdict      string
 17  Year             int32 
 18  Month            int32 
 19  Day              int32 
 20  Hour             int32 
 21  DayofWeek        int32 
 22  Business_hours   int64 
 23  Time_of_day      object
dtypes: int32(16), int64(1), object(1), string(6)
memory usage: 1.6+ GB


In [None]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4147992 entries, 0 to 259207
Data columns (total 24 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   AlertTitle       int64 
 1   Category         string
 2   MitreTechniques  string
 3   IncidentGrade    string
 4   EntityType       string
 5   EvidenceRole     string
 6   IpAddress        int64 
 7   AccountSid       int64 
 8   AccountUpn       int64 
 9   AccountObjectId  int64 
 10  AccountName      int64 
 11  DeviceName       int64 
 12  RegistryKey      int64 
 13  FileName         int64 
 14  FolderPath       int64 
 15  ResourceIdName   int64 
 16  LastVerdict      string
 17  Year             int32 
 18  Month            int32 
 19  Day              int32 
 20  Hour             int32 
 21  DayofWeek        int32 
 22  Business_hours   int64 
 23  Time_of_day      object
dtypes: int32(5), int64(12), object(1), string(6)
memory usage: 902.8+ MB


In [None]:
df_train.drop(columns=['MitreTechniques'], inplace=True)
df_test.drop(columns=['MitreTechniques'], inplace=True)

In [None]:
# df_train['LastVerdict'].fillna('missing', inplace=True)
# df_test['LastVerdict'].fillna('missing', inplace=True

In [None]:
# df_train['LastVerdict'] = df_train['LastVerdict'].replace(['DomainPII_50d8b4a941c26b89482c94ab324b5a274f9ced66',
#                                                'DomainPII_9207384283ce115db5a590dd9ca5de21e5e99df2'], 'missing')

In [None]:
df_train = df_train.drop(columns=['LastVerdict'])
df_test = df_test.drop(columns=['LastVerdict'])

In [None]:
df_train['EntityType'].value_counts()

Unnamed: 0_level_0,count
EntityType,Unnamed: 1_level_1
Ip,2181194
User,1886088
MailMessage,1173154
Machine,697646
File,688402
Url,682578
CloudLogonRequest,638565
Mailbox,479768
Process,345732
MailCluster,224684


In [None]:
df_train['Category'].value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
InitialAccess,4285115
Exfiltration,1550756
SuspiciousActivity,994992
CommandAndControl,825616
Impact,750849
CredentialAccess,300037
Execution,266791
Malware,143591
Discovery,128000
Persistence,72514


In [None]:
df_train['EvidenceRole'].value_counts()

Unnamed: 0_level_0,count
EvidenceRole,Unnamed: 1_level_1
Related,5208616
Impacted,4256881


In [None]:
cat_list_entitytype = df_train['EntityType'].value_counts().head(12).index.tolist()
df_train['EntityType'] = df_train['EntityType'].apply(lambda x: x if x in cat_list_entitytype else 'other')

cat_list_category = df_train['Category'].value_counts().head(12).index.tolist()
df_train['Category'] = df_train['Category'].apply(lambda x: x if x in cat_list_category else 'other')

In [None]:
same_val_drop_cols = []

for col in df_train.columns:
  same_vals = df_train[col].value_counts(normalize=True).max()
  if same_vals > 0.89:
    same_val_drop_cols.append(col)
    print(f'{col} : {same_vals}' )

DeviceName : 0.9265097226273485
RegistryKey : 0.9981486444927298
OAuthApplicationId : 0.9997258464082763
FileName : 0.8912949842992924
FolderPath : 0.9074921264039278
ResourceIdName : 0.9991436265840029
Year : 0.9999810892127482
Month : 0.9097363825692407


In [None]:
df_train = df_train.drop(columns=same_val_drop_cols)
df_test = df_test.drop(columns=same_val_drop_cols)

In [None]:
df_train.head()

Unnamed: 0,DetectorId,AlertTitle,Category,IncidentGrade,EntityType,EvidenceRole,IpAddress,AccountSid,AccountUpn,AccountObjectId,AccountName,Day,Hour,DayofWeek,Business_hours,Time_of_day
0,7,6,InitialAccess,TruePositive,Ip,Related,27,441377,673934,425863,453297,4,6,1,0,morning
1,58,43,Exfiltration,FalsePositive,User,Impacted,360606,22406,23032,22795,24887,14,3,4,0,night
2,423,298,InitialAccess,FalsePositive,Url,Related,360606,441377,673934,425863,453297,13,4,3,0,night
3,2,2,CommandAndControl,BenignPositive,Url,Related,360606,441377,673934,425863,453297,10,16,0,1,afternoon
4,9,74,Execution,TruePositive,User,Impacted,360606,449,592,440,479,15,1,5,0,night


In [None]:
df_train['AlertTitle'].value_counts().head(15)

Unnamed: 0_level_0,count
AlertTitle,Unnamed: 1_level_1
0,1331600
1,774539
2,596463
4,413241
3,411783
...,...
117,7279
105,7273
79,7201
98,7106


In [None]:
df_train['DetectorId'].value_counts().head(100)

Unnamed: 0_level_0,count
DetectorId,Unnamed: 1_level_1
0,1331600
1,774539
2,596463
3,490324
4,411783
...,...
91,10983
96,10537
101,10511
100,10123


In [None]:
df_train['IpAddress'].value_counts().head(12)

Unnamed: 0_level_0,count
IpAddress,Unnamed: 1_level_1
360606,7287824
0,12489
3,9868
1,9759
2,9246
4,7978
7,6974
5,6725
6,6681
8,5621


In [None]:
df_train['AccountSid'].value_counts().head(13)

Unnamed: 0_level_0,count
AccountSid,Unnamed: 1_level_1
441377,7260833
0,14469
1,10840
2,6017
3,5262
4,3770
5,3596
7,3144
6,2895
8,2832


In [None]:
df_train['AccountUpn'].value_counts().head(15)

Unnamed: 0_level_0,count
AccountUpn,Unnamed: 1_level_1
673934,6050893
0,14469
1,9444
2,8542
4,7800
3,7797
5,7585
6,7487
7,7431
8,7298


In [None]:
df_train['AccountObjectId'].value_counts().head(12)

Unnamed: 0_level_0,count
AccountObjectId,Unnamed: 1_level_1
425863,7284572
0,14469
1,10840
2,6017
3,3770
5,3144
4,2895
6,2832
9,2552
7,2341


In [None]:
df_train['AccountName'].value_counts().head(14)

Unnamed: 0_level_0,count
AccountName,Unnamed: 1_level_1
453297,7148638
0,14469
1,10840
2,6108
3,3771
4,3596
7,3144
6,3080
5,2895
8,2832


In [None]:
cat_list_AlertTitle = df_train['AlertTitle'].value_counts().head(12).index.tolist()
df_train['AlertTitle'] = df_train['AlertTitle'].apply(lambda x: x if x in cat_list_AlertTitle else 1000000)

cat_list_DetectorId = df_train['DetectorId'].value_counts().head(12).index.tolist()
df_train['DetectorId'] = df_train['DetectorId'].apply(lambda x: x if x in cat_list_DetectorId else 1000000)

cat_list_IpAddress = df_train['IpAddress'].value_counts().head(12).index.tolist()
df_train['IpAddress'] = df_train['IpAddress'].apply(lambda x: x if x in cat_list_IpAddress else 1000000)

cat_list_AccountSid = df_train['AccountSid'].value_counts().head(12).index.tolist()
df_train['AccountSid'] = df_train['AccountSid'].apply(lambda x: x if x in cat_list_AccountSid else 1000000)

cat_list_AccountUpn = df_train['AccountUpn'].value_counts().head(12).index.tolist()
df_train['AccountUpn'] = df_train['AccountUpn'].apply(lambda x: x if x in cat_list_AccountUpn else 1000000)

cat_list_AccountObjectId = df_train['AccountObjectId'].value_counts().head(12).index.tolist()
df_train['AccountObjectId'] = df_train['AccountObjectId'].apply(lambda x: x if x in cat_list_AccountObjectId else 1000000)

cat_list_AccountName = df_train['AccountName'].value_counts().head(12).index.tolist()
df_train['AccountName'] = df_train['AccountName'].apply(lambda x: x if x in cat_list_AccountName else 1000000)

In [None]:
# import pandas as pd
# import numpy as np

# def probability_encode(df, feature, target, smooth=10):

#     target_classes = df[target].unique()
#     encoded_features = pd.DataFrame(index=df.index)

#     for target_class in target_classes:
#         encoded_features[f"{feature}_{target_class}"] = pd.Series(index=df.index)

#     for target_class in target_classes:
#         category_counts = df.groupby(feature)[target].apply(lambda x: (x == target_class).sum())
#         global_count = (df[target] == target_class).sum()
#         smoothed_probs = (category_counts + smooth * (global_count / len(df))) / (df.groupby(feature)[target].count() + smooth)
#         encoded_features[f"{feature}_{target_class}"] = df[feature].map(smoothed_probs).fillna(global_count / len(df))

#     return encoded_features

In [None]:
# df_train = df_train.reset_index(drop=True)

# features_to_encode = ['DetectorId', 'AlertTitle', 'IpAddress', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName']
# target_variable = 'IncidentGrade'

# for feature in features_to_encode:
#     encoded_feature = probability_encode(df_train, feature, target_variable, smooth=10)
#     encoded_df_train = pd.concat([encoded_df_train, encoded_feature], axis=1)

# df_train = pd.concat([df_train, encoded_df_train], axis=1)
# df_train.to_csv('encoded_df_train.csv')
# df_train.head()


In [None]:
# df_train = pd.read_csv('/content/encoded_df_train.csv')
# df_train.head()

In [None]:
# features_to_encode = ['DetectorId', 'AlertTitle', 'IpAddress', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName']
# target_variable = 'IncidentGrade'

# mappings = {}
# for feature in features_to_encode:
#     mappings[feature] = {}
#     for target_class in df_train[target_variable].unique():
#         mappings[feature][target_class] = df_train[f"{feature}_{target_class}"].groupby(df_train[feature]).mean().to_dict()


# test_encoded_data = pd.DataFrame(index=df_test.index)
# for feature in features_to_encode:
#     for target_class in df_train[target_variable].unique():
#         test_encoded_data[f"{feature}_{target_class}"] = df_test[feature].map(mappings[feature][target_class]).fillna(df_train[target_variable].value_counts(normalize=True)[target_class])

# df_test = pd.concat([df_test, test_encoded_data], axis=1)

# print(df_test.head())

In [None]:
# df_train = df_train.drop(columns=features_to_encode)
# df_test = df_test.drop(columns=features_to_encode)

In [None]:
set(df_train.columns) == set(df_test.columns)

True

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
X_train = df_train.drop(columns=['IncidentGrade'])
y_train = df_train['IncidentGrade']

X_test = df_test.drop(columns=['IncidentGrade'])
y_test = df_test['IncidentGrade']

In [None]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

In [None]:
X_train.shape

(9465497, 15)

In [None]:
ohe_cols = ['Category', 'EntityType', 'EvidenceRole', 'Time_of_day', 'DetectorId', 'AlertTitle',
            'IpAddress', 'AccountSid', 'AccountUpn', 'AccountObjectId', 'AccountName']
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_cols)
    ]
)

In [None]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [None]:
X_train.shape

(9465497, 122)

In [None]:
import tensorflow
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization

In [None]:
model = Sequential()

model.add(Dense(128, activation='relu', input_dim=122))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.summary()

In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(X_train, y_train, epochs=20, batch_size=64, validation_data=(X_test, y_test))

In [None]:
ohe_cols = ['Category', 'EntityType', 'EvidenceRole', 'Time_of_day']
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='ignore'), ohe_cols)
    ]
)

In [None]:
pipeline_dt = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dtclassifier', DecisionTreeClassifier(random_state=42))
    ]
)

In [None]:
dtc = DecisionTreeClassifier(random_state=42)

In [None]:
dtc.fit(X_train, y_train)

In [None]:
y_pred_dt = dtc.predict(X_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred_dt)
print(f'Accuracy is : {accuracy}')
print(classification_report(y_test, y_pred_dt))

Accuracy is : 0.6414860491534217
              precision    recall  f1-score   support

           0       0.62      0.83      0.71   1752940
           1       0.63      0.24      0.34    902698
           2       0.69      0.66      0.67   1492354

    accuracy                           0.64   4147992
   macro avg       0.64      0.58      0.58   4147992
weighted avg       0.64      0.64      0.62   4147992



In [None]:
confusion_matrix(y_test, y_pred_dt)

array([[1462381,   90191,  200368],
       [ 443598,  214186,  244914],
       [ 471286,   36756,  984312]])

In [None]:
param_grid_dt = {
    'dtclassifier__n_estimators': [50, 100, 200],
    'dtclassifier__max_depth': [None, 10, 20, 30],
    'dtclassifier__min_samples_split': [2, 5, 10],
    'dtclassifier__min_samples_leaf': [1, 2, 4]

In [None]:
grid_search = GridSearchCV(estimator=pipeline_dt,
                           param_grid=param_grid_dt,
                           cv=5,
                           scoring='accuracy',
                           n_jobs=-1)

grid_search.fit(X_train, y_train)

In [None]:
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

y_pred_dt = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_dt)

print("Best Parameters:", best_params)
print("Best Model:", best_model)
print("Accuracy:", accuracy)

In [None]:
pipeline_gbc = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('gbclassifier', GradientBoostingClassifier(random_state=42))
    ]
)

pipeline_gbc.fit(X_train, y_train)
y_pred_gbc = pipeline_gbc.predict(X_test)

accuracy_gbc = accuracy_score(y_test, y_pred_gbc)
print(f'Accuracy is : {accuracy_gbc}')
print(classification_report(y_test, y_pred_gbc))
print(confusion_matrix(y_test, y_pred_gbc))