### **TRAIN AND EVALUATE THE BEST MODEL**

In [None]:
# Importing required modules
import os
import pandas as pd
from joblib import load, dump
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report 

### **Training the best model on the whole train dataset**

In [None]:
# Load the cleaned training dataset
train_df = pd.read_csv(r"C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Data\Processed\Train_DS_Cleaned.csv")

# Define features and target variable
X = train_df.drop('IncidentGrade', axis=1)
y = train_df['IncidentGrade']

# Hold out 20% as a stratified validation set; the model below is fit on the remaining 80%
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# The split frames are row selections of X. Take explicit copies so the in-place
# column assignments below operate on independent frames (avoids pandas'
# SettingWithCopyWarning and any ambiguity about whether X is modified).
X_train = X_train.copy()
X_val = X_val.copy()

# Encode the target variable (fit on the training labels only, reuse the mapping for validation)
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

# Scale features that are in different ranges.
# The scaler is fit on the training split only and then applied to validation.
scaler = StandardScaler()
columns_to_scale = ['Hour', 'Day', 'DayOfWeek', 'Month', 'Year', 'Category', 'EntityType', 'AlertTitle']
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_val[columns_to_scale] = scaler.transform(X_val[columns_to_scale])

# Apply SMOTE to the training split only to address class imbalance;
# the validation split keeps its original class distribution.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize the Random Forest Classifier with the tuned hyperparameters
hp_tuned_rf = RandomForestClassifier(
    n_estimators=200,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features=None,
    max_depth=None,
    bootstrap=True,
    random_state=42,
)

# Fit the model on the resampled training data
hp_tuned_rf.fit(X_train_resampled, y_train_resampled)

# Evaluate the fitted model on the (un-resampled) validation data
y_pred = hp_tuned_rf.predict(X_val)

print("Accuracy score:")
print(accuracy_score(y_val, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))
print("Classification Report:")
print(classification_report(y_val, y_pred, target_names=le.classes_))

# Persist the fitted model, LabelEncoder, & StandardScaler for later evaluation.
# Create the output directory first so a missing folder does not discard the training run.
model_path = r"C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Models"
os.makedirs(model_path, exist_ok=True)
dump(le, os.path.join(model_path, "LabelEncoder.pkl"))
dump(scaler, os.path.join(model_path, "StandardScaler.pkl"))
dump(hp_tuned_rf, os.path.join(model_path, "FinalModel.pkl"))

### **Evaluating the best model on the test dataset**

In [None]:
# Load the test dataset
test_df = pd.read_csv(r"C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Data\Processed\Test_DS_Cleaned.csv")

# Directory holding the persisted artifacts. Defined here as well so this cell
# can be run without re-executing the (expensive) training cell.
model_path = r"C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Models"

# Load the saved model, LabelEncoder, & StandardScaler.
# The model file name must match the one written by the training cell ("FinalModel.pkl").
best_rf = load(os.path.join(model_path, "FinalModel.pkl"))
le = load(os.path.join(model_path, "LabelEncoder.pkl"))
scaler = load(os.path.join(model_path, "StandardScaler.pkl"))

# Separate features and target
X_test = test_df.drop('IncidentGrade', axis=1)
y_test = test_df['IncidentGrade']

# Encode target with the encoder fitted on the training labels
y_test_encoded = le.transform(y_test)

# Apply the same scaling used during training (transform only; the scaler is not refit)
columns_to_scale = ['Hour', 'Day', 'DayOfWeek', 'Month', 'Year', 'Category', 'EntityType', 'AlertTitle']
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

# Make predictions
y_test_pred = best_rf.predict(X_test)

# Evaluate
print("Accuracy score:")
print(accuracy_score(y_test_encoded, y_test_pred))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_encoded, y_test_pred))
print("\nClassification Report:")
print(classification_report(y_test_encoded, y_test_pred, target_names=le.classes_))