### **TRAIN AND EVALUATE THE BEST MODEL**

In [None]:
# Importing required modules
import os
import pandas as pd
from joblib import load, dump
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

### **Training the best model on the whole train dataset (stratified 80% train / 20% validation split)**

In [2]:
# Read the cleaned training data from disk
train_df = pd.read_csv(r"C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Data\Processed\Train_DS_Cleaned.csv")

# 'IncidentGrade' is the target; every remaining column is used as a feature
y = train_df['IncidentGrade']
X = train_df.drop(columns='IncidentGrade')

# Hold out 20% for validation; stratifying on y keeps the class proportions
# the same in both parts, and the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Fit the label -> integer mapping on the training labels only,
# then reuse that same mapping for the validation labels
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

# Standardize the listed columns. The scaler statistics come from the training
# split only, so no information from the validation split leaks into them.
columns_to_scale = ['Hour', 'Day', 'DayOfWeek', 'Month', 'Year', 'Category', 'EntityType', 'AlertTitle']
scaler = StandardScaler().fit(X_train[columns_to_scale])
X_train[columns_to_scale] = scaler.transform(X_train[columns_to_scale])
X_val[columns_to_scale] = scaler.transform(X_val[columns_to_scale])

In [None]:
# Balance the classes by random under-sampling. Only the training split is
# resampled; the validation split is not passed to the sampler.
RUS = RandomUnderSampler(random_state=42)
X_train_RUS, y_train_RUS = RUS.fit_resample(X_train, y_train)

# Class counts before and after resampling
for labels in (y_train, y_train_RUS):
    print(pd.Series(labels).value_counts())

0    1579154
2    1287166
1     795600
Name: count, dtype: int64
0    795600
1    795600
2    795600
Name: count, dtype: int64


In [5]:
# Hyperparameters for the Random Forest Classifier
rf_params = dict(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features=None,   # consider every feature at each split
    max_depth=None,      # no explicit depth limit
    bootstrap=True,
    random_state=42,     # fixed seed for repeatable results
    n_jobs=-1,           # use all available CPU cores
)
hp_tuned_rf = RandomForestClassifier(**rf_params)

# Train on the under-sampled (class-balanced) training data
hp_tuned_rf.fit(X_train_RUS, y_train_RUS)

In [9]:
# Predict on the held-out validation split
y_pred = hp_tuned_rf.predict(X_val)

# Print each metric under its heading (sep="\n" puts the value on its own line)
print("Accuracy score:", accuracy_score(y_val, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_val, y_pred), sep="\n")
print(
    "\nClassification Report:",
    # le.classes_ supplies the original string labels for the report rows
    classification_report(y_val, y_pred, target_names=le.classes_),
    sep="\n",
)

Accuracy score:
0.9789181631493861

Confusion Matrix:
[[385958   5631   3200]
 [  1806 195171   1923]
 [  2299   4441 315051]]

Classification Report:
                precision    recall  f1-score   support

BenignPositive       0.99      0.98      0.98    394789
 FalsePositive       0.95      0.98      0.97    198900
  TruePositive       0.98      0.98      0.98    321791

      accuracy                           0.98    915480
     macro avg       0.97      0.98      0.98    915480
  weighted avg       0.98      0.98      0.98    915480



In [7]:
# Save the fitted model, LabelEncoder, & StandardScaler to disk so that they
# can be reloaded later without retraining
model_path = r"C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Models"

# joblib.dump does not create missing directories, so make sure the target
# folder exists first (no-op when it is already there)
os.makedirs(model_path, exist_ok=True)

dump(le, os.path.join(model_path, "LabelEncoder.pkl"))
dump(scaler, os.path.join(model_path, "StandardScaler.pkl"))
dump(hp_tuned_rf, os.path.join(model_path, "FinalModel.pkl"))

['C:\\Users\\spand\\Projects\\MICROSOFT_CYBERSECURITY\\Guvi---Microsoft-Cybersecurity\\Models\\FinalModel.pkl']

### **Evaluating the best model on the test dataset**

In [10]:
# Read the cleaned test data from disk
test_df = pd.read_csv(r"C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Data\Processed\Test_DS_Cleaned.csv")

# Restore the persisted model, LabelEncoder, & StandardScaler.
# NOTE: `model_path` is expected to be defined by the earlier cell that saved
# these artifacts.
best_rf, le, scaler = (
    load(os.path.join(model_path, filename))
    for filename in ("FinalModel.pkl", "LabelEncoder.pkl", "StandardScaler.pkl")
)

# 'IncidentGrade' is the target; every remaining column is used as a feature
y_test = test_df['IncidentGrade']
X_test = test_df.drop(columns='IncidentGrade')

# Convert the string labels to integers with the reloaded encoder
y_test_encoded = le.transform(y_test)

# Scale with the reloaded scaler: transform only, it is not refitted on test data
columns_to_scale = ['Hour', 'Day', 'DayOfWeek', 'Month', 'Year', 'Category', 'EntityType', 'AlertTitle']
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

# Predict on the test features
y_test_pred = best_rf.predict(X_test)

# Print each metric under its heading (sep="\n" puts the value on its own line)
print("Accuracy score:", accuracy_score(y_test_encoded, y_test_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test_encoded, y_test_pred), sep="\n")
print(
    "\nClassification Report:",
    classification_report(y_test_encoded, y_test_pred, target_names=le.classes_),
    sep="\n",
)

Accuracy score:
0.9446977141990391

Confusion Matrix:
[[1556707   48736   25499]
 [  38038  797927   32932]
 [  25496   46233 1351127]]

Classification Report:
                precision    recall  f1-score   support

BenignPositive       0.96      0.95      0.96   1630942
 FalsePositive       0.89      0.92      0.91    868897
  TruePositive       0.96      0.95      0.95   1422856

      accuracy                           0.94   3922695
     macro avg       0.94      0.94      0.94   3922695
  weighted avg       0.95      0.94      0.94   3922695

