### **MODEL BUILDING AND EVALUATION**

In [27]:
# Importing required modules
import os
import warnings
warnings.filterwarnings('ignore')

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [28]:
# Loading train data
# The CSV location can be overridden with the TRAIN_DATA_PATH environment variable so
# the notebook is not tied to a single machine. When the variable is unset, the
# original local path is used, so existing runs behave exactly as before.
TRAIN_DATA_PATH = os.environ.get(
    'TRAIN_DATA_PATH',
    r'C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Data\Processed\Train_DS_Cleaned.csv',
)
train_data = pd.read_csv(TRAIN_DATA_PATH)
train_data

Unnamed: 0,OrgId,DetectorId,IncidentGrade,DeviceId,IpAddress,Url,AccountName,DeviceName,ApplicationName,Hour,...,Month,Year,GroupedCategory,GroupedEntityType,IsImpacted,GroupedAlertTitle,IsWeekend,IsBusinessHour,IsMajorState,IsOSVersion66
0,26,31,BenignPositive,98799,360606,160396,453297,153085,3421,5,...,6,2024,7,2,0,11,0,0,1,1
1,33,38,BenignPositive,98799,360606,160396,453297,3142,3421,10,...,6,2024,5,4,1,11,0,1,1,1
2,201,419,BenignPositive,98799,360606,160396,453297,4181,3421,3,...,6,2024,3,4,1,11,0,0,1,1
3,204,44,TruePositive,98799,360606,160396,453297,153085,3421,16,...,6,2024,10,2,0,11,1,1,1,1
4,54,102,BenignPositive,98799,360606,160396,453297,153085,3421,2,...,6,2024,4,2,1,11,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4577395,142,112,FalsePositive,98799,360606,160396,453297,153085,3421,10,...,6,2024,4,5,1,11,0,1,1,1
4577396,36,2,FalsePositive,98799,360606,160396,36982,153085,3421,19,...,6,2024,0,10,1,2,0,0,1,1
4577397,851,1,BenignPositive,98799,360606,160396,453297,153085,3421,4,...,6,2024,6,5,0,1,1,0,1,1
4577398,12,16,BenignPositive,98799,546,160396,453297,153085,3421,23,...,5,2024,5,3,0,11,0,0,1,1


In [29]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns,
# transposed so that each feature occupies one row.
train_data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
OrgId,4577400.0,173.088914,373.17596,0.0,9.0,42.0,161.0,6129.0
DetectorId,4577400.0,109.60995,425.205015,0.0,2.0,9.0,46.0,9522.0
DeviceId,4577400.0,95565.835028,16597.0932,0.0,98799.0,98799.0,98799.0,98799.0
IpAddress,4577400.0,282934.81178,143093.593241,0.0,360606.0,360606.0,360606.0,360606.0
Url,4577400.0,150618.664563,37110.491221,0.0,160396.0,160396.0,160396.0,160396.0
AccountName,4577400.0,358880.80281,173275.365765,0.0,453297.0,453297.0,453297.0,453297.0
DeviceName,4577400.0,142874.761923,36665.288239,0.0,153085.0,153085.0,153085.0,153085.0
ApplicationName,4577400.0,3339.761731,519.911393,0.0,3421.0,3421.0,3421.0,3421.0
Hour,4577400.0,12.153785,6.773679,0.0,6.0,13.0,18.0,23.0
Day,4577400.0,9.813268,6.229133,1.0,5.0,9.0,12.0,31.0


In [30]:
# Draw a reproducible 5% random subset of the rows so the experiments below run quickly
train_sample_df = train_data.sample(random_state=42, frac=0.05)

# Report the row/column counts before and after sampling
print(
    *(
        f"{caption} {frame.shape}"
        for caption, frame in (
            ("Original data shape:", train_data),
            ("Sample data shape:", train_sample_df),
        )
    ),
    sep="\n",
)

Original data shape: (4577400, 22)
Sample data shape: (228870, 22)


In [31]:
# Split the sampled frame into predictors (X) and the label column (y)
X = train_sample_df.drop(columns='IncidentGrade')
y = train_sample_df['IncidentGrade']

In [32]:
# Train-test split (70/30, stratified on the label so both parts keep its class proportions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Work on explicit copies: the frames returned by the split are subsets taken from X,
# and the column assignments below would otherwise be assignments onto a derived frame
# (pandas' SettingWithCopyWarning, which the global warnings filter in this notebook hides).
X_train = X_train.copy()
X_test = X_test.copy()

# Encoding the target variable: the encoder is fitted on the training labels only and
# then reused for the test labels so both share the same class -> integer mapping.
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# Scale features that are in different ranges.
# The scaler statistics come from the training split only and are then applied to the
# test split, so no test information reaches the fitted scaler.
# NOTE(review): the integer-coded columns with the largest ranges in the describe()
# output (e.g. IpAddress, AccountName, DeviceName) are not in this list and stay
# unscaled; that matters for distance/gradient based models such as KNN and logistic
# regression - confirm this is intended.
scaler = StandardScaler()
columns_to_scale = ['Hour', 'Day', 'DayOfWeek', 'Month', 'Year', 'GroupedCategory', 'GroupedEntityType', 'GroupedAlertTitle']
X_train[columns_to_scale] = scaler.fit_transform(X_train[columns_to_scale])
X_test[columns_to_scale] = scaler.transform(X_test[columns_to_scale])

In [33]:
# Confirm the split sizes: the two feature matrices first, then the two label arrays
print(*(part.shape for part in (X_train, X_test, y_train, y_test)), sep="\n")

(160209, 21)
(68661, 21)
(160209,)
(68661,)


In [34]:
def train_baseline_models(X_train, X_test, y_train, y_test):
    """Fit six untuned classifiers and score each one on the held-out split.

    The candidates are Logistic Regression, Decision Tree, Random Forest,
    K-Nearest Neighbors, Gradient Boosting and XGBoost. The first three are
    created with ``class_weight='balanced'``; the others use no class weighting.

    Parameters
    ----------
    X_train, y_train
        Features and labels every candidate is fitted on.
    X_test, y_test
        Features and labels used to compute the reported metrics.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys ``'model'`` (the fitted
        estimator), ``'predictions'``, ``'accuracy'``, ``'confusion_matrix'``
        and ``'classification_report'`` (the ``output_dict=True`` form).

    A banner, the confusion matrix and the text classification report are
    printed for every model as it finishes.
    """
    # Insertion order here is the order in which the models are trained and printed.
    candidates = {
        'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
        'Decision Tree': DecisionTreeClassifier(max_depth=10, random_state=42, class_weight='balanced'),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced'),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5, metric='minkowski', weights='distance', n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'XGBoost': XGBClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    }
    banner = "-" * 55

    outcome = {}
    for name, estimator in candidates.items():
        print(banner)
        print(f'Training Model: {name}')
        print(banner)

        # Fit on the training split, then predict the held-out split
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)

        matrix = confusion_matrix(y_test, predicted)

        # Keep everything needed for the later side-by-side comparison
        outcome[name] = dict(
            model=estimator,
            predictions=predicted,
            accuracy=accuracy_score(y_test, predicted),
            confusion_matrix=matrix,
            classification_report=classification_report(y_test, predicted, output_dict=True),
        )

        # Human-readable summary for this model
        print(f'\nConfusion Matrix: \n{matrix}')
        print('\nClassification Report:')
        print(classification_report(y_test, predicted))

    return outcome

# Train and evaluate every baseline model on the train/test split created above.
# NOTE: the name `results` is reassigned by the later SMOTE and tuning cells, so the
# comparison cells report whichever of those cells was executed most recently.
results = train_baseline_models(X_train, X_test, y_train, y_test)

-------------------------------------------------------
Training Model: Logistic Regression
-------------------------------------------------------

Confusion Matrix: 
[[15756  4330  9516]
 [ 5365  4552  5049]
 [ 8398  2738 12957]]

Classification Report:
              precision    recall  f1-score   support

           0       0.53      0.53      0.53     29602
           1       0.39      0.30      0.34     14966
           2       0.47      0.54      0.50     24093

    accuracy                           0.48     68661
   macro avg       0.47      0.46      0.46     68661
weighted avg       0.48      0.48      0.48     68661

-------------------------------------------------------
Training Model: Decision Tree
-------------------------------------------------------

Confusion Matrix: 
[[24624  3191  1787]
 [ 4559  9815   592]
 [ 6039  1983 16071]]

Classification Report:
              precision    recall  f1-score   support

           0       0.70      0.83      0.76     29602
    

In [36]:
def compare_results(results, label='Baseline'):
    """Tabulate headline metrics for several evaluated models and name the best one.

    Parameters
    ----------
    results : dict
        Maps a model name to a dict that holds at least ``'accuracy'`` and
        ``'classification_report'``; the report must be the ``output_dict=True``
        form, i.e. contain ``'macro avg'`` and ``'weighted avg'`` entries.
    label : str, default 'Baseline'
        Word used in the printed ``Best <label> Model`` line. The default keeps
        the original output; pass e.g. ``'SMOTE'`` or ``'Tuned'`` when comparing
        other experiment stages so the summary line is not mislabelled.

    Returns
    -------
    comparison_df : pandas.DataFrame
        One row per model with accuracy, macro precision/recall/F1 and weighted
        F1, rounded to four decimals and sorted by macro F1 (best first).
    best_model_name : str
        Name of the model with the highest (rounded) macro F1.

    Raises
    ------
    ValueError
        If ``results`` is empty, since there is nothing to rank.
    """
    if not results:
        # Without this guard an empty frame fails later with an opaque KeyError
        # on the 'F1-Score (Macro)' column.
        raise ValueError("results is empty: there are no models to compare")

    comparison_data = []
    for model_name, result in results.items():
        report = result['classification_report']
        comparison_data.append({
            'Model': model_name,
            'Accuracy': result['accuracy'],
            'Precision (Macro)': report['macro avg']['precision'],
            'Recall (Macro)': report['macro avg']['recall'],
            'F1-Score (Macro)': report['macro avg']['f1-score'],
            'F1-Score (Weighted)': report['weighted avg']['f1-score']
        })

    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.round(4).sort_values(by='F1-Score (Macro)', ascending=False)

    print(comparison_df.to_string(index=False))

    # Find best model based on F1-Score (Macro); the original index labels survive
    # the sort, so the label returned by idxmax is looked up with .loc.
    best_model_idx = comparison_df['F1-Score (Macro)'].idxmax()
    best_model_name = comparison_df.loc[best_model_idx, 'Model']
    best_f1_score = comparison_df.loc[best_model_idx, 'F1-Score (Macro)']

    print(f"\nBest {label} Model: {best_model_name}")
    print(f"Best F1-Score (Macro): {best_f1_score:.4f}")

    return comparison_df, best_model_name

In [37]:
# Compare results of baseline models: builds the metrics table (sorted by macro F1)
# and returns it together with the name of the top-ranked model.
comparison_df, best_model_name = compare_results(results)

              Model  Accuracy  Precision (Macro)  Recall (Macro)  F1-Score (Macro)  F1-Score (Weighted)
            XGBoost    0.9065             0.9072          0.8951            0.9005               0.9061
      Random Forest    0.8868             0.8882          0.8737            0.8800               0.8863
K-Nearest Neighbors    0.7927             0.7838          0.7781            0.7807               0.7923
  Gradient Boosting    0.7898             0.8296          0.7469            0.7676               0.7841
      Decision Tree    0.7356             0.7417          0.7182            0.7235               0.7355
Logistic Regression    0.4845             0.4654          0.4581            0.4592               0.4806

Best Baseline Model: XGBoost
Best F1-Score (Macro): 0.9005


In [38]:
# Define top 3 baseline models 
top3_models = {
    'XGBoost': XGBClassifier(n_jobs=-1, random_state=42, n_estimators=100),
    'Random Forest': RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42, n_estimators=100),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5, weights='distance', metric='minkowski', n_jobs=-1)
}

In [39]:
# Apply SMOTE only on training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(pd.Series(y_train).value_counts())
print(pd.Series(y_train_resampled).value_counts())

0    69073
2    56216
1    34920
Name: count, dtype: int64
2    69073
0    69073
1    69073
Name: count, dtype: int64


In [40]:
def train_top3_models_with_SMOTE(X_train_resampled, X_test, y_train_resampled, y_test, models=None):
    """Fit a set of models on resampled training data and score them on the test split.

    Parameters
    ----------
    X_train_resampled, y_train_resampled
        Training features and labels the models are fitted on (the notebook
        passes the SMOTE-resampled training split).
    X_test, y_test
        Held-out features and labels used for the reported metrics.
    models : dict, optional
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        When omitted, the notebook-level ``top3_models`` dict is used, which is
        the original behaviour. Passing the dict explicitly removes the hidden
        dependency on that global.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys ``'model'``,
        ``'predictions'``, ``'accuracy'``, ``'confusion_matrix'`` and
        ``'classification_report'`` (the ``output_dict=True`` form).

    Notes
    -----
    The estimators in ``models`` are fitted in place; the stored ``'model'``
    entries are those same objects. A banner, the confusion matrix and the text
    classification report are printed for every model.
    """
    if models is None:
        # Backward-compatible default: fall back to the notebook-level dict.
        models = top3_models

    results = {}

    # Train and evaluate each model
    for model_name, model in models.items():
        print("-" * 55)
        print(f'Training Model: {model_name} with SMOTE')
        print("-" * 55)

        model.fit(X_train_resampled, y_train_resampled)  # Fit on the resampled training data
        y_pred = model.predict(X_test)  # Predict the untouched test split

        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        # Store results
        results[model_name] = {
            'model': model,
            'predictions': y_pred,
            'accuracy': accuracy,
            'confusion_matrix': cm,
            'classification_report': report
        }

        # Print results
        print(f'\nConfusion Matrix: \n{cm}')
        print(f'\nClassification Report:')
        print(classification_report(y_test, y_pred))

    return results

# Re-train the top 3 models on the SMOTE-resampled training data and evaluate them on
# the original (non-resampled) test split. This overwrites the baseline `results`.
results = train_top3_models_with_SMOTE(X_train_resampled, X_test, y_train_resampled, y_test)

-------------------------------------------------------
Training Model: XGBoost with SMOTE
-------------------------------------------------------

Confusion Matrix: 
[[27508  1073  1021]
 [ 1599 12639   728]
 [ 1695   865 21533]]

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91     29602
           1       0.87      0.84      0.86     14966
           2       0.92      0.89      0.91     24093

    accuracy                           0.90     68661
   macro avg       0.89      0.89      0.89     68661
weighted avg       0.90      0.90      0.90     68661

-------------------------------------------------------
Training Model: Random Forest with SMOTE
-------------------------------------------------------

Confusion Matrix: 
[[27413  1023  1166]
 [ 1792 12351   823]
 [ 1965   829 21299]]

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.93      0.90     

In [41]:
# Compare results of top 3 models with SMOTE.
# NOTE: the summary line printed below still reads "Best Baseline Model" because that
# wording is fixed inside compare_results.
comparison_df, best_model_name = compare_results(results)

              Model  Accuracy  Precision (Macro)  Recall (Macro)  F1-Score (Macro)  F1-Score (Weighted)
            XGBoost    0.8983             0.8950          0.8892            0.8918               0.8982
      Random Forest    0.8893             0.8879          0.8785            0.8827               0.8890
K-Nearest Neighbors    0.7861             0.7722          0.7822            0.7760               0.7877

Best Baseline Model: XGBoost
Best F1-Score (Macro): 0.8918


In [25]:
# Common CV strategy: 5 stratified folds, shuffled with a fixed seed so every model
# is tuned on the same fold assignment.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter spaces, keyed by the same model names used in `top3_models`.
# Each inner dict maps an estimator parameter to the list of values to sample from.
param_distributions = {
    'XGBoost': {
        'n_estimators': [100, 200],
        'max_depth': [3, 6, 10],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        # NOTE(review): the target has three classes (labels 0/1/2); XGBoost documents
        # scale_pos_weight as balancing positive vs. negative weights, i.e. a binary
        # setting - confirm it has any effect for this multiclass problem.
        'scale_pos_weight': [1, 2, 3],
        'reg_alpha': [0, 0.01, 0.1, 1, 10, 100],
        'reg_lambda': [0.5, 0.7, 1, 1.3]
    },
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'max_features': ['sqrt', 'log2', None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    },
    'K-Nearest Neighbors': {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        # NOTE(review): the KNN estimator is created without `p`; with scikit-learn's
        # default p=2, 'minkowski' should be the same distance as 'euclidean', so
        # these two options are probably duplicates - confirm.
        'metric': ['euclidean', 'manhattan', 'minkowski']
    }
}

def tune_models(X_train_resampled, y_train_resampled, X_test, y_test,
                models=None, param_spaces=None, cv=None, n_iter=20):
    """Tune each model with a randomized search and score the winner on the test split.

    Parameters
    ----------
    X_train_resampled, y_train_resampled
        Training features and labels handed to the cross-validated search.
    X_test, y_test
        Held-out features and labels used for the reported metrics.
    models : dict, optional
        Maps a display name to an estimator. Defaults to the notebook-level
        ``top3_models`` (original behaviour).
    param_spaces : dict, optional
        Maps the same display names to a parameter space accepted by
        ``RandomizedSearchCV``. Defaults to the notebook-level
        ``param_distributions`` (original behaviour).
    cv : optional
        Cross-validation splitter. Defaults to the notebook-level
        ``cv_strategy`` (original behaviour).
    n_iter : int, default 20
        Number of parameter settings sampled per model.

    Returns
    -------
    dict
        Maps each model name to a dict with the keys ``'best_model'``,
        ``'best_params'``, ``'accuracy'``, ``'confusion_matrix'`` and
        ``'classification_report'`` (``output_dict=True`` form). For consistency
        with the other training functions in this notebook the dict also
        carries ``'model'`` (same object as ``'best_model'``) and
        ``'predictions'``.

    Raises
    ------
    KeyError
        If a model name has no entry in the parameter spaces.

    Notes
    -----
    Candidates are ranked by macro F1 (``scoring='f1_macro'``) and the search
    uses ``random_state=42``.

    NOTE(review): if the training data was oversampled before being passed in
    (as the parameter names suggest), every CV validation fold contains
    synthetic points derived from rows in the training folds, so the CV scores
    that drive model selection are likely optimistic. Resampling inside each
    fold (e.g. an imbalanced-learn ``Pipeline`` with the sampler as a step)
    avoids this - confirm before relying on the CV ranking.
    """
    # Resolve the notebook-level defaults at call time so explicit arguments win.
    if models is None:
        models = top3_models
    if param_spaces is None:
        param_spaces = param_distributions
    if cv is None:
        cv = cv_strategy

    results = {}

    for model_name, model in models.items():
        print("-" * 55)
        print(f"Tuning {model_name}")
        print("-" * 55)

        search = RandomizedSearchCV(
            estimator=model,
            param_distributions=param_spaces[model_name],
            n_iter=n_iter,
            scoring='f1_macro',
            cv=cv,
            n_jobs=-1,
            verbose=1,
            random_state=42
        )

        search.fit(X_train_resampled, y_train_resampled)

        # best_estimator_ is the winning configuration as exposed by the search object
        best_model = search.best_estimator_
        y_pred = best_model.predict(X_test)

        report = classification_report(y_test, y_pred, output_dict=True)
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)

        results[model_name] = {
            'best_model': best_model,
            'model': best_model,  # alias matching the key used by the other training functions
            'best_params': search.best_params_,
            'predictions': y_pred,
            'accuracy': accuracy,
            'confusion_matrix': cm,
            'classification_report': report
        }

        print(f"Best Params: {search.best_params_}")
        print(f'\nConfusion Matrix: \n{cm}')
        print(f'\nClassification Report:')
        print(classification_report(y_test, y_pred))

    return results

# Run the randomized hyperparameter search for the top 3 models on the SMOTE-resampled
# training data and evaluate each tuned model on the test split.
# This overwrites the previous `results`.
results = tune_models(X_train_resampled, y_train_resampled, X_test, y_test)

-------------------------------------------------------
Tuning XGBoost
-------------------------------------------------------
Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Params: {'subsample': 0.8, 'scale_pos_weight': 3, 'reg_lambda': 0.7, 'reg_alpha': 0.01, 'n_estimators': 100, 'max_depth': 10, 'learning_rate': 0.2, 'colsample_bytree': 1.0}

Confusion Matrix: 
[[27979   795   828]
 [ 1126 13181   659]
 [ 1196   718 22179]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93     29602
           1       0.90      0.88      0.89     14966
           2       0.94      0.92      0.93     24093

    accuracy                           0.92     68661
   macro avg       0.92      0.92      0.92     68661
weighted avg       0.92      0.92      0.92     68661

-------------------------------------------------------
Tuning Random Forest
-------------------------------------------------------
Fitting 5 f

In [26]:
# Compare results of hyperparameter-tuned models with SMOTE.
# NOTE: the summary line printed below still reads "Best Baseline Model" because that
# wording is fixed inside compare_results.
comparison_df, best_model_name = compare_results(results)

              Model  Accuracy  Precision (Macro)  Recall (Macro)  F1-Score (Macro)  F1-Score (Weighted)
      Random Forest    0.9428             0.9371          0.9393            0.9382               0.9429
            XGBoost    0.9225             0.9192          0.9155            0.9172               0.9224
K-Nearest Neighbors    0.7987             0.7851          0.7930            0.7885               0.7997

Best Baseline Model: Random Forest
Best F1-Score (Macro): 0.9382
