In [1]:
import pandas as pd 
import numpy as np
import os
from joblib import dump
import pickle as pkl
import pickle
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, roc_auc_score, classification_report, precision_recall_curve
from sklearn.metrics import make_scorer, fbeta_score,  mean_squared_error, r2_score, f1_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, precision_recall_curve, classification_report, confusion_matrix)
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, GridSearchCV, train_test_split, StratifiedShuffleSplit
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance

import xgboost as xgb
from xgboost import XGBClassifier, plot_importance

In [4]:
# Directory holding the pre-split train / eval / test CSV files. Defined once
# so the notebook can be pointed at another location with a single edit.
DATA_DIR = '/work/SarahHvidAndersen#6681/DataScience_project/data'

train_df = pd.read_csv(f'{DATA_DIR}/train_df.csv')
eval_df = pd.read_csv(f'{DATA_DIR}/eval_df.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test_df.csv')


# Hand-picked subset of feature columns.
features = ['Fwd IAT Total', 'Fwd Packet Length Max',
        'Bwd Packet Length Mean', 'Bwd Packet Length Std',
        'Bwd Packets Length Total', 'Bwd Packet Length Max',
        'Packet Length Max', 'Packet Length Mean', 'Packet Length Std',
        'Packet Length Variance', 'Avg Packet Size', 'Fwd Header Length',
        'Avg Fwd Segment Size', 'Avg Bwd Segment Size']


# Training split: every column except 'Label' and 'Attack' is a model input;
# 'Attack' is the target.
X_train = train_df.drop(['Label', 'Attack'], axis=1)
# Taken from train_df before any scaling, so this frame holds the raw values.
X_train_selected_feat = train_df[features]
y_train = train_df['Attack']

# Evaluation split
X_eval = eval_df.drop(['Label', 'Attack'], axis=1)
y_eval = eval_df['Attack']

# Test split
X_test = test_df.drop(['Label', 'Attack'], axis=1)
y_test = test_df['Attack']

# Initialize scaler
scaler = StandardScaler()

# Fit the scaler on the training features only and overwrite them with the
# scaled values (column labels and index are kept).
X_train[X_train.columns] = scaler.fit_transform(X_train)

# Apply the already-fitted scaler to eval and test, so no statistics from
# those splits enter the scaling.
X_eval[X_eval.columns] = scaler.transform(X_eval)
X_test[X_test.columns] = scaler.transform(X_test)

# summarizing gridsearch results

In [3]:
# functions to load pickled search objects and summarize results in a dataframe

def load_gridsearch(file_path):
    """Unpickle and return the object stored at ``file_path``.

    The file is opened in binary mode and handed to ``pickle.load``; whatever
    was pickled (e.g. a fitted GridSearchCV or RandomizedSearchCV) is returned
    as-is.
    """
    # NOTE: unpickling can execute arbitrary code - only load files you trust.
    with open(file_path, mode='rb') as handle:
        return pickle.load(handle)

def get_model_info(search):
    """Describe a fitted search object.

    Parameters
    ----------
    search : object
        A fitted search object exposing ``best_estimator_``
        (e.g. GridSearchCV or RandomizedSearchCV).

    Returns
    -------
    dict
        'Model Type'  : class name of ``search.best_estimator_``.
        'Search Type' : class name of ``search`` itself.
        'Features'    : number of features the best estimator was fitted on,
                        or None when the estimator records neither
                        ``feature_names_in_`` nor ``n_features_in_``.
    """
    best_estimator = search.best_estimator_
    model_info = {
        'Model Type': type(best_estimator).__name__,
        # Report the real class name instead of assuming that anything which
        # is not a GridSearchCV must be a RandomizedSearchCV.
        'Search Type': type(search).__name__,
    }

    # Prefer the recorded feature names; fall back to the plain feature count.
    # Never take len() of a placeholder string: that would silently report
    # the length of the message as the number of features.
    feature_names = getattr(best_estimator, 'feature_names_in_', None)
    if feature_names is not None:
        n_features = len(feature_names)
    else:
        n_features = getattr(best_estimator, 'n_features_in_', None)

    model_info['Features'] = n_features
    return model_info

def summarize_search(search, model_name):
    """Build a one-row summary dictionary for a fitted search object.

    The row combines the caller-supplied ``model_name`` with the search's
    ``best_params_``, ``best_score_`` and ``scorer_``, plus the
    'Features', 'Model Type' and 'Search Type' entries produced by
    ``get_model_info``.
    """
    info = get_model_info(search)

    row = {'Model': model_name}
    row['Best Parameters'] = search.best_params_
    row['Best Score'] = search.best_score_
    row['Scorer'] = search.scorer_
    # Copy the descriptive fields over in the column order used by the table.
    for key in ('Features', 'Model Type', 'Search Type'):
        row[key] = info[key]
    return row

def load_and_summarize_searches(file_paths, model_names):
    """Load each pickled search, summarise it, and return one DataFrame.

    ``file_paths`` and ``model_names`` are paired by position with ``zip``.
    A search that fails to load or summarise is reported with a printed
    message and left out of the result; the remaining searches are still
    processed.
    """
    rows = []
    for path, name in zip(file_paths, model_names):
        try:
            row = summarize_search(load_gridsearch(path), name)
        except Exception as err:
            # Best effort: report the failure and carry on with the next file.
            print(f"Error loading {name} from {path}: {err}")
        else:
            rows.append(row)

    return pd.DataFrame(rows)

In [4]:
# Summarise the saved searches. Each pickle path is written next to its
# display name, then the pairs are unzipped into the two parallel lists that
# load_and_summarize_searches expects.
file_paths, model_names = map(list, zip(
    ('../crossvalidation/RF_random_selected_feat.pkl',
     'Random Forest with selected Features (random)'),
    ('../crossvalidation/RF_gridsearch_selected_feat.pkl',
     'Random forest with selected features'),
    ('../crossvalidation/RF_random_absolute_feat.pkl',
     'Random Forest with all Features (random)'),
    ('../crossvalidation/XGB_random_selected_feat.pkl',
     'XGBoost with selected Features (random)'),
    ('../crossvalidation/XGB_gridsearch_selected_feat.pkl',
     'XGBoost with selected Features'),
    ('../crossvalidation/XGB_random_absolute_feat.pkl',
     'XGBoost with all Features (random)'),
))

summaries_df = load_and_summarize_searches(file_paths, model_names)
summaries_df

Unnamed: 0,Model,Best Parameters,Best Score,Scorer,Features,Model Type,Search Type
0,Random Forest with selected Features (random),"{'n_estimators': 200, 'min_samples_split': 5, ...",0.892216,"make_scorer(fbeta_score, response_method='pred...",14,RandomForestClassifier,RandomizedSearchCV
1,Random forest with selected features,"{'max_depth': 30, 'min_samples_leaf': 1, 'min_...",0.891121,"make_scorer(fbeta_score, response_method='pred...",14,RandomForestClassifier,GridSearchCV
2,Random Forest with all Features (random),"{'n_estimators': 100, 'min_samples_split': 2, ...",0.893596,"make_scorer(fbeta_score, response_method='pred...",77,RandomForestClassifier,RandomizedSearchCV
3,XGBoost with selected Features (random),"{'subsample': 0.7, 'n_estimators': 300, 'max_d...",0.898481,"make_scorer(fbeta_score, response_method='pred...",14,XGBClassifier,RandomizedSearchCV
4,XGBoost with selected Features,"{'colsample_bytree': 1.0, 'gamma': 0.5, 'learn...",0.895588,"make_scorer(fbeta_score, response_method='pred...",14,XGBClassifier,GridSearchCV
5,XGBoost with all Features (random),"{'subsample': 0.7, 'n_estimators': 300, 'max_d...",0.89986,"make_scorer(fbeta_score, response_method='pred...",77,XGBClassifier,RandomizedSearchCV


## random forest grid search

In [5]:
# Randomized hyper-parameter search for the random forest, scored with F2
# (author's note: about 11 minutes).

# F-beta scorer with beta=2 and average='binary'. The author notes beta=2
# could be too high and that F1.5 may be worth experimenting with.
f2_scorer = make_scorer(fbeta_score, beta=2, average='binary')

# Candidate values the randomized search samples combinations from.
random_param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, *range(10, 101, 10)],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Model and stratified 5-fold cross-validation.
rf = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5)

# 100 sampled candidates, each evaluated on all 5 folds, using every core.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_param_grid,
    n_iter=100,
    scoring=f2_scorer,
    cv=cv,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[CV] END max_depth=70, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  19.2s
[CV] END max_depth=70, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  20.0s
[CV] END max_depth=70, min_samples_leaf=4, min_samples_split=5, n_estimators=200; total time=  20.2s
[CV] END max_depth=60, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  25.0s
[CV] END max_depth=60, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  26.8s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   9.7s
[CV] END max_depth=60, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  30.9s
[CV] END max_depth=60, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  31.1s
[CV] END max_depth=60, min_samples_leaf=1, min_samples_split=5, n_estimators=300; total time=  31.7s
[CV] END max_depth=20, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total tim

#candidate values for the follow-up search: max_depth 40/70, n_estimators = 200, min_samples_split = 2-5, min_samples_leaf = 1-2
#using random search 1 = Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30}
#Best F2 Score: 0.8911214118776313
#Best Parameters: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 15, 'max_depth': 70}


gridsearch selected features:
#Best Parameters: {'max_depth': 40, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 110}
#Best F2 Score: 0.8911617330784429


#using random search 1 = Best Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30}
#Best F2 Score: 0.8911214118776313

#max_depth: 40 70 n_estimators = 200, min samples split = 2-5, min leaf = 1-2 

In [None]:
# Exhaustive grid search for the random forest, scored with F2
# (author's label: "all").

# F-beta scorer with beta=2 and average='binary'. The author notes beta=2
# could be too high and that F1.5 may be worth experimenting with.
f2_scorer = make_scorer(fbeta_score, beta=2, average='binary')

# Every combination of these values is evaluated.
param_grid = dict(
    n_estimators=[90, 100, 110],
    max_depth=[30, 40, 50, 60, 70],
    min_samples_split=[5, 8, 10],
    min_samples_leaf=[1, 2],
)

# Model and stratified 5-fold cross-validation.
rf = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5)

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=cv,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

## xgboost grid search

f1:
#Best Parameters: {'subsample': 0.7, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}
#Best F1 Score: 0.9187878860713816

#selected features
#Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0.5, 'learning_rate': 0.3, 'max_depth': 10, 'n_estimators': 350, 'subsample': 0.5}
#Best f2 Score: 0.8955876203296633

#f2 - all features
#Best Parameters: {'subsample': 0.7, 'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.01, 'gamma': 0, 'colsample_bytree': 1.0}
#Best f2 Score: 0.898480666436833
#identified values for the follow-up grid search: n_estimators 250/300/350, learning_rate 0.2/0.3, subsample 0.5, gamma 0.5/1, colsample_bytree 1, max_depth 10/15


In [8]:
# Randomized hyper-parameter search for XGBoost, scored with F2.

# F-beta scorer with beta=2 and average='binary'.
f2_scorer = make_scorer(fbeta_score, beta=2, average='binary')

# Stratified 5-fold cross-validation.
cv = StratifiedKFold(n_splits=5)

# Candidate values the randomized search samples combinations from.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 6, 10, 15],
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    subsample=[0.5, 0.7, 1.0],
    colsample_bytree=[0.5, 0.7, 1.0],
    gamma=[0, 0.1, 0.5, 1, 1.5, 2],
)

# NOTE(review): this rebinds `xgb`, which the import cell uses as the alias
# for the xgboost module (`import xgboost as xgb`). After this cell runs,
# `xgb` is the classifier, not the module.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# 100 sampled candidates, each evaluated on all 5 folds, using every core.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_grid,
    n_iter=100,
    scoring=f2_scorer,
    cv=cv,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# fit
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Exhaustive grid search for XGBoost, scored with F2 (author's label: "all").

# F-beta scorer with beta=2 and average='binary'. The author notes beta=2
# could be too high and that F1.5 may be worth experimenting with.
f2_scorer = make_scorer(fbeta_score, beta=2, average='binary')

# Stratified 5-fold cross-validation.
cv = StratifiedKFold(n_splits=5)

# Every combination of these values is evaluated.
param_grid = dict(
    n_estimators=[250, 300, 350],
    max_depth=[10, 15],
    learning_rate=[0.2, 0.3],
    subsample=[0.5],
    colsample_bytree=[0.7, 1.0],
    gamma=[0.5, 1, 1.5],
)

# NOTE(review): this rebinds `xgb`, which the import cell uses as the alias
# for the xgboost module (`import xgboost as xgb`). After this cell runs,
# `xgb` is the classifier, not the module.
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring=f2_scorer,
    cv=cv,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [None]:
print("Best Parameters:", grid_search.best_params_)
print("Best f2 Score:", grid_search.best_score_)
results = grid_search.cv_results_