### **MODEL BUILDING AND EVALUATION**

In [47]:
# Importing required modules
import os
import warnings

import joblib
# import numpy as np
import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [48]:
# Loading train data
# The CSV location is machine-specific. Set the TRAIN_DATA_PATH environment
# variable to point at your own copy instead of editing this cell; the path
# below is only the fallback used when that variable is not set.
TRAIN_DATA_PATH = os.environ.get(
    'TRAIN_DATA_PATH',
    r'C:\Users\spand\Projects\MICROSOFT_CYBERSECURITY\Guvi---Microsoft-Cybersecurity\Data\Processed\Train_DS_Cleaned.csv',
)
train_data = pd.read_csv(TRAIN_DATA_PATH)
train_data

Unnamed: 0,IncidentGrade,Hour,Day,DayOfWeek,Month,Year,GroupedCategory,GroupedEntityType,IsImpacted,GroupedAlertTitle,IsWeekend,IsBusinessHour,IsMajorState,IsOSVersion66,OrgAttackFreq,DetectorIdsGrouped
0,0,5,6,3,6,2024,7,2,0,11,0,0,1,1,0,0
1,0,10,3,0,6,2024,5,4,1,11,0,1,1,1,0,0
2,0,3,13,3,6,2024,3,4,1,11,0,0,1,1,5,1
3,2,16,8,5,6,2024,10,2,0,11,1,1,1,1,5,0
4,0,2,9,6,6,2024,4,2,1,11,1,0,1,1,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4577395,1,10,10,0,6,2024,4,5,1,11,0,1,1,1,5,5
4577396,1,19,11,1,6,2024,0,10,1,2,0,0,1,1,0,0
4577397,0,4,9,6,6,2024,6,5,0,1,1,0,1,1,1,0
4577398,0,23,21,1,5,2024,5,3,0,11,0,0,1,1,0,0


In [49]:
# Work on a reproducible 5% random subset so the experiments below run quickly
sample_df = train_data.sample(
    random_state=42,
    frac=0.05,
)
print("Sample shape:", sample_df.shape)

Sample shape: (228870, 16)


In [50]:
# Target variable: the IncidentGrade column of the sample
y = sample_df.loc[:, 'IncidentGrade']
# Feature matrix: every remaining column of the sample
X = sample_df.drop(columns='IncidentGrade')

In [51]:
# Number of rows per IncidentGrade class in the sampled data
y.value_counts()

IncidentGrade
0    98675
2    80309
1    49886
Name: count, dtype: int64

In [52]:
# Hold out 20% of the sample for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
# Report the shape of each split, one per line
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(183096, 15)
(45774, 15)
(183096,)
(45774,)


In [None]:
# Standardize features for scale-sensitive models (Logistic Regression, KNN, etc).
# The scaler learns its statistics from the training split only and is then
# applied unchanged to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
def train_baseline_models(X_train, X_test, y_train, y_test):
    """
    Fit a fixed set of baseline classifiers and evaluate each on the test split.

    Logistic Regression and K-Nearest Neighbors are trained on standardized
    features; all other models use the features as passed in. The scaler is
    fitted inside this function on the ``X_train`` argument only and then
    applied to ``X_test``, so the function depends solely on its arguments
    and not on any notebook-level variables.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Target labels for the training and test splits.

    Returns
    -------
    dict
        Maps each successfully trained model name to a dict with the keys
        'model' (fitted estimator), 'predictions', 'accuracy',
        'confusion_matrix' and 'classification_report' (dict form).
        A model whose training or evaluation raises is reported on stdout
        and left out of the returned dict.
    """
    # Models that are trained on standardized features
    models_needing_scaling = {'Logistic Regression', 'K-Nearest Neighbors'}

    # Fit the scaler on the training split that was passed in, so that the
    # scaled matrices always correspond to this call's data.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define baseline models
    baseline_models = {
        'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=2000, solver='saga', random_state=42),
        'Decision Tree': DecisionTreeClassifier(class_weight='balanced', random_state=42, max_depth=10),
        'Random Forest': RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42, n_estimators=100),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5, weights='distance', metric='minkowski', n_jobs=-1),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
        'XGBoost': XGBClassifier(eval_metric='mlogloss',n_jobs=-1, random_state=42, n_estimators=100)
    }

    results = {}

    # Train and evaluate each model
    for model_name, model in baseline_models.items():
        print("-" * 55)
        print(f'Training Model: {model_name}')
        print("-" * 55)

        try:
            if model_name in models_needing_scaling:
                X_train_use, X_test_use = X_train_scaled, X_test_scaled
            else:
                X_train_use, X_test_use = X_train, X_test

            model.fit(X_train_use, y_train) # Fit the model
            y_pred = model.predict(X_test_use) # Predict

            accuracy = accuracy_score(y_test, y_pred)
            cm = confusion_matrix(y_test, y_pred)
            # Dict form is stored for later comparison; the text form is printed below
            report = classification_report(y_test, y_pred, output_dict=True)

            # Store results
            results[model_name] = {
                'model': model,
                'predictions': y_pred,
                'accuracy': accuracy,
                'confusion_matrix': cm,
                'classification_report': report
            }

            # Print results
            print(f'\nConfusion Matrix \n{cm}')
            print('\nClassification Report')
            print(classification_report(y_test, y_pred))

        except Exception as e:
            # Best-effort: one failing model must not abort the remaining ones
            print(f'Error training {model_name}: {str(e)}')
            continue

    return results

# Train all baseline models on the split created above and keep their per-model metrics
results = train_baseline_models(X_train, X_test, y_train, y_test)

-------------------------------------------------------
Training Model: Logistic Regression
-------------------------------------------------------

Confusion Matrix 
[[11154  4964  3617]
 [ 4278  2946  2753]
 [ 3026  2753 10283]]

Classification Report
              precision    recall  f1-score   support

           0       0.60      0.57      0.58     19735
           1       0.28      0.30      0.29      9977
           2       0.62      0.64      0.63     16062

    accuracy                           0.53     45774
   macro avg       0.50      0.50      0.50     45774
weighted avg       0.54      0.53      0.53     45774

-------------------------------------------------------
Training Model: Decision Tree
-------------------------------------------------------

Confusion Matrix 
[[13311  4764  1660]
 [ 2061  6840  1076]
 [ 2992  2666 10404]]

Classification Report
              precision    recall  f1-score   support

           0       0.72      0.67      0.70     19735
        

In [57]:
def compare_baseline_results(results):
    """
    Compare results from all baseline models

    Builds one row of summary metrics per model, prints the table sorted by
    macro F1-score (best first) and reports the best model.

    Parameters
    ----------
    results : dict
        Maps model name to a dict holding at least 'accuracy' and
        'classification_report'. The report must be a dict with
        'macro avg' and 'weighted avg' entries, each providing
        'precision', 'recall' and 'f1-score' as needed below.

    Returns
    -------
    tuple
        ``(comparison_df, best_model_name)``. ``comparison_df`` holds the
        metrics rounded to 4 decimals, sorted by 'F1-Score (Macro)'
        descending. When ``results`` is empty, ``comparison_df`` is an empty
        frame with the expected columns and ``best_model_name`` is ``None``.
    """
    metric_columns = [
        'Model',
        'Accuracy',
        'Precision (Macro)',
        'Recall (Macro)',
        'F1-Score (Macro)',
        'F1-Score (Weighted)',
    ]

    print("\n" + "=" * 105)
    print("BASELINE MODELS COMPARISON")
    print("=" * 105)

    comparison_data = []

    for model_name, result in results.items():
        report = result['classification_report']

        comparison_data.append({
            'Model': model_name,
            'Accuracy': result['accuracy'],
            'Precision (Macro)': report['macro avg']['precision'],
            'Recall (Macro)': report['macro avg']['recall'],
            'F1-Score (Macro)': report['macro avg']['f1-score'],
            'F1-Score (Weighted)': report['weighted avg']['f1-score']
        })

    # Passing the column list keeps the schema stable even with zero rows
    comparison_df = pd.DataFrame(comparison_data, columns=metric_columns)

    # Nothing to rank (e.g. every model failed during training): return early
    # instead of failing on the sort / idxmax below.
    if comparison_df.empty:
        print("No baseline results available to compare.")
        return comparison_df, None

    comparison_df = comparison_df.round(4).sort_values(by='F1-Score (Macro)', ascending=False)

    print(comparison_df.to_string(index=False))

    # Find best model based on F1-Score (Macro)
    best_model_idx = comparison_df['F1-Score (Macro)'].idxmax()
    best_model_name = comparison_df.loc[best_model_idx, 'Model']
    best_f1_score = comparison_df.loc[best_model_idx, 'F1-Score (Macro)']

    print(f"\nBest Baseline Model: {best_model_name}")
    print(f"Best F1-Score (Macro): {best_f1_score:.4f}")

    return comparison_df, best_model_name

# Compare results: print the metric table and keep both the table and the best model's name
comparison_df, best_model_name = compare_baseline_results(results)


BASELINE MODELS COMPARISON
              Model  Accuracy  Precision (Macro)  Recall (Macro)  F1-Score (Macro)  F1-Score (Weighted)
      Random Forest    0.7156             0.7010          0.7015            0.7011               0.7159
            XGBoost    0.7255             0.7381          0.6834            0.6970               0.7180
K-Nearest Neighbors    0.7107             0.7019          0.6848            0.6911               0.7082
      Decision Tree    0.6675             0.6653          0.6693            0.6585               0.6743
  Gradient Boosting    0.6624             0.7063          0.5900            0.5918               0.6339
Logistic Regression    0.5327             0.4994          0.5002            0.4994               0.5346

Best Baseline Model: Random Forest
Best F1-Score (Macro): 0.7011
