# Week 11 - Introduction to Modeling, part 2

In [1]:
import numpy as np
import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, GridSearchCV, RandomizedSearchCV, LeaveOneOut, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [2]:
# Import data
# Upload the dataset through the google.colab `files` helper; the recorded cell
# output shows the uploaded file being saved as BETH.csv.
# NOTE(review): `google.colab` is presumably only available inside Google Colab —
# confirm before running this cell in another environment.

from google.colab import files

# Keep the value returned by files.upload() in `upload`.
upload = files.upload()

Saving BETH.csv to BETH.csv


In [5]:
# Do not truncate wide frames: show every column when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Load the uploaded BETH event log into the notebook-wide frame `df`.
df = pd.read_csv("BETH.csv")

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,timestamp,processId,threadId,parentProcessId,userId,mountNamespace,processName,hostName,eventId,eventName,stackAddresses,argsNum,returnValue,args,sus,evil
0,1809.495787,381,7337,1,100,4026532231,close,ip-10-100-1-120,157,prctl,"[140662171848350, 11649800180280676]",5,0,"[{'name': 'option', 'type': 'int', 'value': 'P...",1,0
1,1809.495832,381,7337,1,100,4026532231,close,ip-10-100-1-120,3,close,[140662171777451],1,0,"[{'name': 'fd', 'type': 'int', 'value': 19}]",1,0
2,1809.495921,381,7337,1,100,4026532231,close,ip-10-100-1-120,1010,sched_process_exit,[],0,0,[],1,0
3,1894.139651,7347,7347,7341,0,4026531840,sh,ip-10-100-1-120,21,access,[],2,-2,"[{'name': 'pathname', 'type': 'const char*', '...",1,0
4,1894.142127,7347,7347,7341,0,4026531840,sh,ip-10-100-1-120,1005,security_file_open,"[139778263990104, 139778263906698]",4,0,"[{'name': 'pathname', 'type': 'const char*', '...",1,0


In [6]:
# Count the rows per class of the target column 'sus'.
df["sus"].value_counts()

Unnamed: 0_level_0,count
sus,Unnamed: 1_level_1
0,761875
1,1269


In [7]:
# Keep these features only: eventId, eventName, argsNum, returnValue
# Use 'sus' as Target variable

# Label-encode the eventName feature.
# Note: LabelEncoder is NOT one-hot encoding. It maps every distinct event name
# to a single integer code, stored in one new column, 'eventName_enc'.
# NOTE(review): the integer codes carry no real ordering, but a linear model will
# treat them as ordered numbers — consider one-hot encoding if that matters.

# Guard so the cell can be re-run safely: 'eventName' is dropped below, so a
# second run would otherwise raise a KeyError. Creating the encoder inside the
# guard also keeps the already-fitted `encoder` intact on a re-run.
if 'eventName' in df.columns:
    encoder = LabelEncoder()

    df['eventName_enc'] = encoder.fit_transform(df['eventName'])

    # The raw string column is no longer needed once the encoded column exists.
    df = df.drop(columns=['eventName'])

In [8]:
# Preview the first five rows of the current `df`.
df.head(n=5)

Unnamed: 0,timestamp,processId,threadId,parentProcessId,userId,mountNamespace,processName,hostName,eventId,stackAddresses,argsNum,returnValue,args,sus,evil,eventName_enc
0,1809.495787,381,7337,1,100,4026532231,close,ip-10-100-1-120,157,"[140662171848350, 11649800180280676]",5,0,"[{'name': 'option', 'type': 'int', 'value': 'P...",1,0,19
1,1809.495832,381,7337,1,100,4026532231,close,ip-10-100-1-120,3,[140662171777451],1,0,"[{'name': 'fd', 'type': 'int', 'value': 19}]",1,0,6
2,1809.495921,381,7337,1,100,4026532231,close,ip-10-100-1-120,1010,[],0,0,[],1,0,20
3,1894.139651,7347,7347,7341,0,4026531840,sh,ip-10-100-1-120,21,[],2,-2,"[{'name': 'pathname', 'type': 'const char*', '...",1,0,2
4,1894.142127,7347,7347,7341,0,4026531840,sh,ip-10-100-1-120,1005,"[139778263990104, 139778263906698]",4,0,"[{'name': 'pathname', 'type': 'const char*', '...",1,0,22


In [9]:
# Logistic regression baseline trained on a SMOTE-resampled training set.

# Feature matrix (four numeric columns) and the binary target 'sus'.
X = df[['eventId', 'eventName_enc', 'argsNum', 'returnValue']]
y = df['sus']

# Hold out 30% of the rows for testing; stratify=y keeps the 'sus' class ratio
# the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Fit the scaler on the training split only and reuse it on the test split, so
# no test-set statistics leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Apply SMOTE to the training data only, to increase the number of 'sus' cases.
# The test split is left untouched so the evaluation reflects the real class balance.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_train_scaled, y_train)


model_LR = LogisticRegression(max_iter=2000)
model_LR.fit(X_train_res, y_train_res)

y_pred = model_LR.predict(X_test_scaled)

# NOTE(review): 'sus' == 1 is a tiny fraction of the rows, so overall accuracy is
# dominated by class 0 — read the per-class precision/recall and the confusion
# matrix rather than the accuracy figure.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))




Accuracy: 0.8080316583968132
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.81      0.89    228563
           1       0.00      0.42      0.01       381

    accuracy                           0.81    228944
   macro avg       0.50      0.61      0.45    228944
weighted avg       1.00      0.81      0.89    228944

Confusion Matrix:
 [[184834  43729]
 [   221    160]]


In [10]:
# Share of each 'sus' class in the full target (fractions rather than counts).
class_proportions = y.value_counts(normalize=True)
print(class_proportions)

sus
0    0.998337
1    0.001663
Name: proportion, dtype: float64


In [11]:

# Random forest with balanced class weights, tuned with a randomized search.

# Same features, target and split settings as the logistic-regression cell, repeated
# here so this cell can be run on its own.
X = df[['eventId', 'eventName_enc', 'argsNum', 'returnValue']]
y = df['sus']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate hyperparameters. Each list currently holds a single value, so the
# search space contains exactly one combination.
param_grid = {
    'n_estimators': [100],
    'max_depth': [30],
    'max_features': [0.5],
    'bootstrap': [True]
}

# Random Forest with class weight handling imbalance
model_RF = RandomForestClassifier(random_state=42, class_weight='balanced')

# Number of distinct combinations in param_grid (product of the list lengths).
n_candidates = int(np.prod([len(values) for values in param_grid.values()]))

# Sample at most 5 candidates, but never ask for more than the grid contains:
# requesting n_iter larger than the grid makes scikit-learn emit a warning and
# fall back to the grid size anyway.
# NOTE(review): no `scoring` is passed, so the search uses the estimator's default
# score (accuracy for classifiers), which is a weak criterion for a target this
# imbalanced — consider scoring='f1' or 'average_precision' when widening the grid.
rand_search = RandomizedSearchCV(
    model_RF,
    param_distributions=param_grid,
    n_iter=min(5, n_candidates),
    cv=5,
    n_jobs=-1,
    random_state=42,
)

rand_search.fit(X_train_scaled, y_train)

# predict() uses the best estimator, refitted on the whole training split.
y_pred = rand_search.predict(X_test_scaled)

print("Best Parameters:", rand_search.best_params_)
print("Best Cross-Val Score:", rand_search.best_score_)
print("\nAccuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Results recorded from a previous run of this cell:
# Best Parameters: {'n_estimators': 100, 'max_features': 0.5, 'max_depth': 30, 'bootstrap': True}
# Best Cross-Val Score: 0.9368925496068888



Best Parameters: {'n_estimators': 100, 'max_features': 0.5, 'max_depth': 30, 'bootstrap': True}
Best Cross-Val Score: 0.9368925496068888

Accuracy: 0.930943811587113

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.93      0.96    228563
           1       0.01      0.62      0.03       381

    accuracy                           0.93    228944
   macro avg       0.51      0.78      0.50    228944
weighted avg       1.00      0.93      0.96    228944


Confusion Matrix:
 [[212898  15665]
 [   145    236]]
