In [2]:
import itertools
import random as random
from collections import Counter

import numpy as np
import pandas as pd
import scipy.stats as stats
import tensorflow as tf
from keras.activations import relu, sigmoid
from keras.layers import Input, Dense, Dropout
from keras.models import Model, Sequential
from keras.optimizers import Adam
from keras.regularizers import l2
from keras.wrappers.scikit_learn import KerasClassifier
from numpy import mean, std
from scipy.stats import sem, t
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras import layers, losses
from tensorflow.keras.losses import mse
from user_agents import parse

<b>Feature Engineering - Password Spray</b>

In [None]:
'''
Feature engineering for the detection of password spray attacks. Two families of features are built:
- Shifting time window features, computed separately on each split of the dataset.
- Features obtained by discretizing the 'createdDateTime', 'errorCode' and 'userAgent' columns.

    The following status codes (errorCode) are treated as failed login attempts:
     - 50053 - "The account is locked because the user tried to sign in too many times with an incorrect user ID or password. The user is blocked due to repeated sign-in attempts."
        / "Or, sign-in was blocked because it came from an IP address with malicious activity."
     - 50055 - "The password is expired. The user's password is expired, and therefore their login or session was ended."
     - 50126 - "Error validating credentials due to invalid username or password. The user didn't enter the right credentials"
'''

# Convert the sign-in timestamp of every split to a pandas datetime (in place, same split order as before)
for _split in (df_training, df_test, df_threshold, df_validation):
    _split['createdDateTime'] = pd.to_datetime(_split['createdDateTime'])


# Status error codes that count as a failed sign-in when building the shifting time window features
failure_codes = [50053, 50126, 50055]

# Sizes of the rolling time windows (pandas offset aliases). These strings are embedded in the
# generated column names and matched by substring later on, so they must not be renamed casually.
# NOTE(review): recent pandas releases deprecate the upper-case 'H' alias in favour of 'h' — confirm
# against the pinned pandas version before upgrading.
time_windows = ['30min', '2D', '60min','1D', '4H', '6H', '8H', '12H'] 

#######################################################################
##################Nominal/Datetime Features Binning####################
#######################################################################

# Every split receives the same derived columns; the frames are modified in place.
datasets = [
    df_training,
    df_test,
    df_threshold,
    df_validation
]

for dataset in datasets:

    # Discretize "createdDateTime": hour of day, and day of week shifted to start at 1
    # (pandas `dayofweek` is 0-based, hence the + 1)
    dataset['Hour'] = dataset['createdDateTime'].dt.hour
    dataset['Day'] = dataset['createdDateTime'].dt.dayofweek + 1

    # Parse each user-agent string ONCE and derive the three features from the parsed object
    # (the previous version re-parsed the same string for every derived column).
    parsed_agents = dataset['userAgent'].apply(parse)
    dataset['Browser'] = parsed_agents.apply(lambda agent: agent.browser.family)
    dataset['OS'] = parsed_agents.apply(lambda agent: agent.os.family)
    dataset['Mobile'] = parsed_agents.apply(lambda agent: 1 if agent.is_mobile else 0)

    # 1 when the error code is one of the failure codes, 0 otherwise
    dataset['failureCode'] = dataset['errorCode'].isin(failure_codes).astype('int64')

    # Human-readable bin per failure code; every code outside `failure_codes` is binned as success
    dataset.loc[dataset['errorCode'] == 50055, 'codeExplained'] = 'Password Expired'
    dataset.loc[dataset['errorCode'] == 50126, 'codeExplained'] = 'Invalid Username'
    dataset.loc[dataset['errorCode'] == 50053, 'codeExplained'] = 'Account locked'
    dataset.loc[~(dataset['errorCode'].isin(failure_codes)), 'codeExplained'] = 'Success Code'


##################################################################
##################Time Window Feature Creation####################
##################################################################

def track_attempts_per_group(df, groupby_col, time_windows, failure_codes, count_success=True):
    """Add rolling sign-in counters computed per value of `groupby_col`.

    For every entry of `time_windows` the following columns are added:
      * `failedAttemptsCountsPer{groupby_col}Last{window}` - failed attempts in the window
        (skipped when `groupby_col` is "Country");
      * when `count_success` is true: `successfulAttemptsCountsPer{groupby_col}Last{window}` and
        `{window}AttemptsDifferencePer{groupby_col}` (successful minus failed);
      * when `groupby_col` is "Country": `{window}DistinctCountriesForFailedAttempts`, the number of
        distinct countries among the failed attempts of the same `userId` (0 for non-failure rows).

    Parameters:
        df: frame with at least 'createdDateTime', 'errorCode', 'Country', 'userId' and `groupby_col`.
            It is not modified; a new frame is returned.
        groupby_col: column whose values define the groups.
        time_windows: iterable of pandas offset strings used as rolling window sizes.
        failure_codes: error codes counted as failed attempts.
        count_success: also emit the successful-attempt count and the difference column.

    Returns:
        A new DataFrame with 'createdDateTime' restored as a regular column and the features appended.

    Notes:
        Time-based rolling windows need 'createdDateTime' to be sorted. Results are aligned back on
        the timestamp index. NOTE(review): duplicated timestamps would make that alignment
        ambiguous — confirm the timestamps are unique or pre-aggregate.
    """
    df = df.set_index('createdDateTime')
    df['Country_aux'], _ = pd.factorize(df['Country'])

    def _count_failed(window_values):
        # raw=True hands over a plain ndarray of error codes
        return np.isin(window_values, failure_codes).sum()

    def _count_successful(window_values):
        return (~np.isin(window_values, failure_codes)).sum()

    def _rolling_count(time_window, counter):
        # Parenthesised chain: the original split this expression across lines without a
        # continuation, which is a syntax error.
        return (
            df.groupby(groupby_col, observed=True)['errorCode']
            .rolling(time_window)
            .apply(counter, raw=True)
            .reset_index(level=0, drop=True)
        )

    for time_window in time_windows:
        grouped_failed_attempts = None
        if groupby_col != "Country" or count_success:
            # Needed either as a feature or as the subtrahend of the difference column
            grouped_failed_attempts = _rolling_count(time_window, _count_failed)

        if groupby_col != "Country":
            df[f'failedAttemptsCountsPer{groupby_col}Last{time_window}'] = grouped_failed_attempts

        if count_success:
            # Count the number of successful attempts.
            grouped_successful_attempts = _rolling_count(time_window, _count_successful)
            df[f'successfulAttemptsCountsPer{groupby_col}Last{time_window}'] = grouped_successful_attempts

            # Difference between successful and failed attempts within the same group/window
            df[f'{time_window}AttemptsDifferencePer{groupby_col}'] = (
                grouped_successful_attempts - grouped_failed_attempts
            )

        # Count the number of distinct countries with failed attempts
        if groupby_col == "Country":
            feature_name = f'{time_window}DistinctCountriesForFailedAttempts'

            # Only sign-ins whose errorCode is a failure code take part in the count
            df_failures = df[df['errorCode'].isin(failure_codes)]
            # Group by userId and count distinct (factorized) countries within each rolling window
            unique_countries = (
                df_failures.groupby('userId')['Country_aux']
                .rolling(time_window)
                .apply(pd.Series.nunique)
                .reset_index(level=0, drop=True)
            )

            df[feature_name] = unique_countries
            # Plain reassignment instead of chained `fillna(..., inplace=True)`, which silently
            # does nothing when pandas Copy-on-Write is active.
            df[feature_name] = df[feature_name].fillna(0)

    df = df.drop(columns=['Country_aux'])
    df = df.reset_index()

    return df

def track_failed_attempts_global(df, time_windows, failure_codes):
    """Add, per time window, the number of failed attempts seen across the whole frame.

    One column named `failedAttemptsCountsLast{window}` is appended for every entry of
    `time_windows`. The input frame is not modified; 'createdDateTime' is used as the index
    during the computation and restored as a regular column in the returned frame.
    """
    indexed = df.set_index('createdDateTime')

    def _count_failures(window_values):
        # raw=True hands over a plain ndarray holding the error codes inside the window
        return np.isin(window_values, failure_codes).sum()

    for window in time_windows:
        rolling_codes = indexed['errorCode'].rolling(window)
        indexed[f'failedAttemptsCountsLast{window}'] = rolling_codes.apply(_count_failures, raw=True)

    return indexed.reset_index(level=0)

def distinct_users_failed_attempts(df, time_windows, failure_codes):
    """Add, per time window, the number of distinct users with a failed attempt in that window.

    For every entry of `time_windows` a column `{window}FailedAttemptsUserCount` is appended.
    Rows whose error code is a failure code receive the number of distinct `userId` values among
    the failed attempts inside the window ending at that row; all other rows receive 0.

    Parameters:
        df: frame with 'createdDateTime', 'userId' and 'errorCode' columns. The rolling window is
            taken on 'createdDateTime', which therefore has to be sorted. The frame is NOT
            modified (the previous version leaked a 'userId_aux' helper column and the new
            features into the caller's object).
        time_windows: iterable of pandas offset strings.
        failure_codes: error codes counted as failed attempts.

    Returns:
        A new DataFrame: the original columns followed by one feature column per window.
    """
    # Rolling aggregations need numeric input, so users are replaced by integer codes
    user_codes, _ = pd.factorize(df['userId'])
    failure_mask = df['errorCode'].isin(failure_codes)

    # Small working frame holding only what the rolling count needs, restricted to failed attempts
    failed_attempts = pd.DataFrame(
        {
            'createdDateTime': df.loc[failure_mask, 'createdDateTime'],
            'userId_aux': user_codes[failure_mask.to_numpy()],
        }
    )

    new_columns = {}
    for time_window in time_windows:
        feature_name = f'{time_window}FailedAttemptsUserCount'

        # Distinct users among the failed attempts of each window (parenthesised chain: the
        # original broke this expression across lines without a continuation)
        distinct_users = (
            failed_attempts
            .rolling(time_window, on='createdDateTime')['userId_aux']
            .apply(pd.Series.nunique)
        )

        # Align on the original row labels; rows without a failed attempt get 0
        new_columns[feature_name] = distinct_users.reindex(df.index).fillna(0)

    return df.assign(**new_columns)

def distinct_users_failed_attempts_per_ip(df, time_windows, failure_codes):
    """Add, per time window, the number of distinct users with failed attempts from the same IP.

    For every entry of `time_windows` a column `{window}FailedAttemptsUserCountPerIP` is appended.
    Failed-attempt rows receive the number of distinct `userId` values among the failed attempts
    coming from the same `ipAddress` inside the window ending at that row; other rows receive 0.

    Parameters:
        df: frame with 'createdDateTime', 'userId', 'errorCode' and 'ipAddress' columns, sorted by
            'createdDateTime' (required by the time-based rolling window). Not modified.
        time_windows: iterable of pandas offset strings.
        failure_codes: error codes counted as failed attempts.

    Returns:
        A new DataFrame with the same rows (in the same order) and the feature columns appended.
        As before, the result of the merge carries a fresh default index.
    """
    merge_keys = ['ipAddress', 'createdDateTime']

    # Rolling aggregations need numeric input, so users are replaced by integer codes
    user_codes, _ = pd.factorize(df['userId'])
    failure_mask = df['errorCode'].isin(failure_codes)

    # Working frame restricted to failed attempts, indexed by time for the rolling window
    failed_attempts = (
        df.loc[failure_mask, merge_keys]
        .assign(userId_aux=user_codes[failure_mask.to_numpy()])
        .set_index('createdDateTime')
    )

    for time_window in time_windows:
        feature_name = f'{time_window}FailedAttemptsUserCountPerIP'

        # Distinct users per (ipAddress, window). The IP is kept as a column so the counts can be
        # joined back to the rows they belong to.
        per_ip_counts = (
            failed_attempts
            .groupby('ipAddress', observed=True)['userId_aux']
            .rolling(time_window)
            .apply(pd.Series.nunique)
            .reset_index(name=feature_name)
            # Several failed attempts may share IP and timestamp; keep a single value per key so
            # the merge below cannot multiply rows. The last one is kept because it is computed
            # after every tied row has been seen — NOTE(review): confirm this tie-break.
            .drop_duplicates(subset=merge_keys, keep='last')
        )

        # Join on IP *and* timestamp. Joining on the timestamp alone attached a count to rows of
        # other IPs sharing that timestamp and duplicated those rows.
        df = pd.merge(df, per_ip_counts, on=merge_keys, how='left')

        # Rows without a failed attempt get 0 (plain reassignment, safe under Copy-on-Write)
        df[feature_name] = df[feature_name].fillna(0)

    return df

# List of columns needed to create the shifting time window features
required_columns = ['createdDateTime', 'userId', 'errorCode', 'ipAddress', 'appDisplayName', 'Country']

# Note that "Country" is the same as "location.countryOrRegion"
column_list = ['userId', 'ipAddress', 'appDisplayName', 'Country']


def _build_time_window_features(df_split):
    """Return the required columns of `df_split` enriched with every shifting time window feature.

    Each helper returns a NEW frame, so the result has to be threaded through and returned.
    The previous loop rebound its loop variable and therefore threw every feature away.
    NOTE(review): the helpers use time-based rolling windows, which need the split to be sorted
    by 'createdDateTime' — confirm the splits are sorted upstream.
    """
    df_aux = df_split[required_columns].copy()

    for group_col in column_list:
        print("Creating time window features based on the column: ", group_col)
        # Successful-attempt counts (and the difference column) are only requested for userId
        df_aux = track_attempts_per_group(df_aux, group_col, time_windows, failure_codes, (group_col == "userId"))

    df_aux = track_failed_attempts_global(df_aux, time_windows, failure_codes)
    df_aux = distinct_users_failed_attempts(df_aux, time_windows, failure_codes)
    df_aux = distinct_users_failed_attempts_per_ip(df_aux, time_windows, failure_codes)
    return df_aux


def _merge_time_window_features(df_split, df_aux):
    """Left-join the engineered features onto `df_split` without changing its number of rows."""
    # Sign-ins identical on every key column would otherwise be multiplied by the join.
    # NOTE(review): keeping the last of such rows assumes they carry equivalent feature values.
    unique_aux = df_aux.drop_duplicates(subset=required_columns, keep='last')
    return df_split.merge(unique_aux, on=required_columns, how='left', validate='many_to_one')


# Apply the functions defined to each split of the dataset
df_training_aux = _build_time_window_features(df_training)
df_test_aux = _build_time_window_features(df_test)
df_validation_aux = _build_time_window_features(df_validation)
df_threshold_aux = _build_time_window_features(df_threshold)

datasets_aux = [
    df_training_aux,
    df_test_aux,
    df_validation_aux,
    df_threshold_aux
]

# Merge the content of the auxiliary dataframes back to the original one
df_training = _merge_time_window_features(df_training, df_training_aux)
df_test = _merge_time_window_features(df_test, df_test_aux)
df_validation = _merge_time_window_features(df_validation, df_validation_aux)
df_threshold = _merge_time_window_features(df_threshold, df_threshold_aux)

In [7]:
'''
Code used for the normalization of numerical features.
'''

# Time window features: every column whose name contains one of the window sizes.
# NOTE(review): this is a substring match, so an unrelated column containing e.g. "1D" would be
# picked up as well — confirm no such column exists.
tw_list = [col for col in df_training.columns.tolist() if any(i in col for i in time_windows)]
assert tw_list, "No time window feature found in df_training - run the feature engineering cell first."

categorical_features = ['appDisplayName','userAgent', 'Country', 'authenticationStepResultDetail', 
                        'errorCode',"Browser", "OS", "codeExplained", 'Hour', 'Day'] 
boolean_features = ['Mobile', 'failureCode']

# List of engineered features
relevant_features = tw_list + categorical_features + boolean_features

# Filter each dataset to contain only relevant features for the anomaly detection task
df_training_copy = df_training[relevant_features].copy()
df_threshold_copy = df_threshold[relevant_features].copy()

# For the validation and test data also keep the ground truth labels for tuning and evaluation
df_test_copy = df_test[relevant_features + ['ActualAttack']].copy()
df_validation_copy = df_validation[relevant_features + ['ActualAttack']].copy()

############################################################################################
#################### Normalize numerical features in the datasets. #########################
############################################################################################

# Instantiate a MinMaxScaler object. It is fitted on the training set only, so the other splits
# are scaled with the training minima/maxima and may fall outside [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(df_training_copy[tw_list])

# Normalize the time window features of every split with the fitted scaler.
# (The previous version referenced an undefined `columns_list` for three of the splits.)
df_training_copy[tw_list] = scaler.transform(df_training_copy[tw_list])
df_test_copy[tw_list] = scaler.transform(df_test_copy[tw_list])
df_validation_copy[tw_list] = scaler.transform(df_validation_copy[tw_list])
df_threshold_copy[tw_list] = scaler.transform(df_threshold_copy[tw_list])

72

In [None]:
'''
Encoding the categorical features in the training dataset.
Values that were not seen during training are mapped to a dedicated "Unseen" category, which the
encoder learns from a dummy row appended to the data it is fitted on.
'''

# Features that receive the extra "Unseen" category (every categorical feature except
# "codeExplained", "Hour" and "Day"). They must stay the leading entries of `categorical_features`,
# in the same order, because the encoder categories are later paired with them by position.
unseen_handling_features = ['appDisplayName','userAgent', 'Country', 'authenticationStepResultDetail', 
                            'errorCode', "Browser", "OS"]

# Build the frame the encoder is fitted on from a scratch copy, so df_training itself keeps its
# rows, index and dtypes (the previous version cast and re-indexed df_training as a side effect).
encoder_fit_frame = df_training[categorical_features].copy()
encoder_fit_frame[unseen_handling_features] = encoder_fit_frame[unseen_handling_features].astype(str)

# Dummy row containing only the value "Unseen" for all features in "unseen_handling_features".
# The remaining categorical columns are NaN in that row, which is what produces the
# 'Hour_nan' / 'Day_nan' / 'codeExplained_nan' dummy columns removed after the join.
dummy_row = pd.DataFrame([['Unseen'] * len(unseen_handling_features)], columns=unseen_handling_features)
encoder_fit_frame = pd.concat([encoder_fit_frame, dummy_row], ignore_index=True)

# Creating OneHotEncoder instance on the training set
enc = OneHotEncoder(sparse_output=False)
enc_fit = enc.fit(encoder_fit_frame[categorical_features])

# The encoder learned string categories for these features, so the training copy has to be
# cast the same way before it is transformed (the other splits are cast when their unseen
# values are replaced).
df_training_copy[unseen_handling_features] = df_training_copy[unseen_handling_features].astype(str)

encoded_df_training = enc_fit.transform(df_training_copy[categorical_features])
features = enc_fit.get_feature_names_out(input_features=categorical_features)
# Reuse the index of the training copy so the later index-based join lines up row by row.
# (The previous version referenced an undefined `encoded_data_training` here.)
df_encoded_training = pd.DataFrame(encoded_df_training, columns=features, index=df_training_copy.index)

# Replaces values that the fitted encoder has never seen with the placeholder 'Unseen'
def handle_unseen_categories(df, encoder_categories):
    """Cast the "unseen-handling" features of `df` to str and mask unknown values with 'Unseen'.

    `encoder_categories` is the per-feature list of categories learned by the encoder. It is
    paired positionally with `unseen_handling_features` (the last entry is sliced off first and
    the pairing stops at the shorter of the two sequences), so the leading encoder features are
    expected to be exactly those features, in the same order.
    The frame is modified in place and also returned.
    """
    paired = zip(unseen_handling_features, encoder_categories[:-1])
    for column_name, known_values in paired:
        as_text = df[column_name].astype(str)
        is_known = as_text.isin(known_values)
        df[column_name] = as_text.where(is_known, 'Unseen')
    return df

#########################################################
############# For each split that is not training #######
#########################################################

def _encode_split(split_df):
    """Mask unseen categories of one split, one-hot encode it and wrap the result in a DataFrame."""
    masked = handle_unseen_categories(split_df, enc_fit.categories_)
    encoded_array = enc_fit.transform(masked[categorical_features])
    return masked, encoded_array, pd.DataFrame(encoded_array, columns=features)


# Same order as before: threshold, test, validation
df_threshold_copy, encoded_df_threshold, df_encoded_threshold = _encode_split(df_threshold_copy)

df_test_copy, encoded_df_test, df_encoded_test = _encode_split(df_test_copy)

df_validation_copy, encoded_df_validation, df_encoded_validation = _encode_split(df_validation_copy)


#############################################
##### Join with the original dataframe ######
#############################################

# Dummy columns that represent "nan" values; they are dropped together with the raw categorical
# columns, which are replaced by their one-hot encoding.
nan_dummy_columns = ['Hour_nan', 'Day_nan', 'codeExplained_nan']
columns_to_remove = nan_dummy_columns + categorical_features


def _join_encoded(split_df, encoded_df):
    """Attach the one-hot columns to a split and drop the raw categorical / "nan" dummy columns.

    The join is index based, so `split_df` and `encoded_df` must share the same index.
    """
    joined = split_df.join(encoded_df).drop(columns=categorical_features)
    # The "nan" dummies only exist when the encoder saw missing values; tolerate their absence
    return joined.drop(columns=nan_dummy_columns, errors='ignore').copy()


# Each split is built from ITS OWN frame. The previous version derived the validation and test
# frames from the threshold frame, replacing their data and losing the 'ActualAttack' labels.
df_training_copy = _join_encoded(df_training_copy, df_encoded_training)
df_threshold_copy = _join_encoded(df_threshold_copy, df_encoded_threshold)
df_validation_copy = _join_encoded(df_validation_copy, df_encoded_validation)
df_test_copy = _join_encoded(df_test_copy, df_encoded_test)

<b><font size="3">Password Spray: Modeling Phase</font></b>

In [None]:
'''
Feature Selection
'''

def _columns_containing(columns, *tokens):
    """Return, in their original order, the names in `columns` that contain any of `tokens`."""
    return [name for name in columns if any(token in name for token in tokens)]


# Shifting time-window features only (one size only)
feat_30_min_window = _columns_containing(tw_list, '30min') #Best performance on the IF model
feat_60_min_window = _columns_containing(tw_list, '60min')
feat_1D_min_window = _columns_containing(tw_list, '1D')
feat_2D_min_window = _columns_containing(tw_list, '2D')
feat_4H_min_window = _columns_containing(tw_list, '4H')
feat_6H_min_window = _columns_containing(tw_list, '6H')
feat_8H_min_window = _columns_containing(tw_list, '8H')
feat_12H_min_window = _columns_containing(tw_list, '12H')

# Shifting time-window features only (two sizes)
feat_12H_30min_window = _columns_containing(tw_list, '30min', '12H') # Best performance on the IF model
feat_8H_30min_window = _columns_containing(tw_list, '30min', '8H')
feat_1D_30min_window = _columns_containing(tw_list, '30min', '1D')
feat_2D_30min_window = _columns_containing(tw_list, '30min', '2D')

# Best performer + Country feature
feat_12H_30min_country_window = _columns_containing(df_training_copy.columns, '30min', '12H', 'Country_')

# Features extracted directly from the sign-in logs.
extracted_init = _columns_containing(
    df_training_copy.columns,
    'appDisplayName_', 'Country_', 'errorCode_', 'authenticationStepResultDetail_',
    'userAgent_', 'Hour_', 'Day_',
)

# Features extracted directly from the sign-in logs, but "userAgent" and "errorCode" are binned.
_binned_tokens = ('Browser_', 'OS_', 'authenticationStepResultDetail_', 'appDisplayName_',
                  'Country_', 'Hour_', 'Day_')
_binned_exact_names = ['failureCode', 'Mobile']
extracted_bin = [name for name in df_training_copy.columns
                 if name in _binned_exact_names or any(token in name for token in _binned_tokens)]

# List of non-correlated features.
# Selection criterion: keep as many 30-minute TW features as possible, because they are not
# strongly correlated with the shifting time window features built on larger windows.
# The resulting set is therefore larger than it would be if TW features with a size of
# 60 minutes or more had been preferred over the 30-minute ones.

non_correlated_features = [
    '30minAttemptsDifferencePeruserId',
    '1DFailedAttemptsUserCountPerIP',
    '60minDistinctCountriesForFailedAttempts',
    'failedAttemptsCountsLast1D',
    'failedAttemptsCountsLast30min',
    'failedAttemptsCountsLast12H',
    'failedAttemptsCountsPerappDisplayNameLast30min',
    'failedAttemptsCountsPerappDisplayNameLast1D',
    'failedAttemptsCountsPeripAddressLast1D',
    'successfulAttemptsCountsPeruserIdLast30min',
    'failedAttemptsCountsPeruserIdLast30min',
]

all_features = list(df_training_copy.columns)

# Name -> feature list; the insertion order below is the order in which the sets are evaluated
features_dictionary = dict([
    ("feat_30_min_window", feat_30_min_window),
    ("feat_60_min_window", feat_60_min_window),
    ("feat_1D_min_window", feat_1D_min_window),
    ("extracted_init", extracted_init),
    ("non_correlated_features", non_correlated_features),
    ("all_features", all_features),
    ("feat_2D_min_window", feat_2D_min_window),
    ("feat_4H_min_window", feat_4H_min_window),
    ("feat_6H_min_window", feat_6H_min_window),
    ("feat_8H_min_window", feat_8H_min_window),
    ("feat_12H_min_window", feat_12H_min_window),
    ("feat_12H_30min_window", feat_12H_30min_window),
    ("feat_8H_30min_window", feat_8H_30min_window),
    ("extracted_bin", extracted_bin),
    ("feat_12H_30min_country_window", feat_12H_30min_country_window),
])


<b>Modeling Autoencoder</b>

In [None]:
'''
Autoencoder Model initialization.
'''

# Seed every random number generator in use so that a fresh run is repeatable
np.random.seed(0)
tf.random.set_seed(0)
random.seed(0)

# Function creating the Autoencoder network
def build_autoencoder(drop_r, l2_reg, l_rate, input_dim, activation_func='relu'):
    """Build and compile a dense autoencoder trained with an MSE reconstruction loss.

    Parameters:
        drop_r: dropout rate applied before each encoder layer.
        l2_reg: kernel regularizer passed to the hidden Dense layers.
        l_rate: learning rate of the Adam optimizer.
        input_dim: number of input features (also the size of the reconstruction).
        activation_func: activation of the bottleneck layer. The body used to read an undefined
            global with this name; it is now an explicit keyword argument.
            NOTE(review): 'relu' matches the other hidden layers but is an assumption — confirm
            the activation that was used for the reported results.

    Returns:
        The compiled `tf.keras.Model` mapping an input vector to its reconstruction.
    """
    # Other hyperparameters about the network structure of the AE (chosen empirically)
    bottleneck_size = 4        # Controls the size of the encoding
    num_hidden_layers = 4      # Controls the number of hidden layers
    ratio = 0.5                # Controls the number of neurons in each layer

    # Input layer initialization
    layer_dims = [input_dim]
    input_layer = layers.Input(shape=(input_dim,))

    # Dense layer with sigmoid activation function
    x = layers.Dense(input_dim, activation="sigmoid")(input_layer)

    # Width of the hidden layers: each one shrinks the previous width by `ratio`.
    # A layer is never allowed to collapse to zero units (possible for very small inputs).
    for _ in range(num_hidden_layers // 2):
        layer_dims.append(max(1, int(layer_dims[-1] * ratio)))

    # Encoding layers
    encoder = tf.keras.Sequential([
      layers.Dropout(drop_r),
      layers.Dense(layer_dims[-2], activation='relu', kernel_regularizer=l2_reg),
      layers.Dropout(drop_r),
      layers.Dense(layer_dims[-1], activation='relu', kernel_regularizer=l2_reg),
      layers.Dropout(drop_r),
      layers.Dense(bottleneck_size, activation=activation_func, kernel_regularizer=l2_reg)])(x)

    # Decoding layers (mirror of the encoder, without dropout)
    decoder = tf.keras.Sequential([
          layers.Dense(layer_dims[-1], activation='relu', kernel_regularizer=l2_reg),
          layers.Dense(layer_dims[-2], activation='relu', kernel_regularizer=l2_reg),
          layers.Dense(input_dim, activation="sigmoid")])(encoder)

    # Create the autoencoder
    autoencoder = tf.keras.Model(inputs=input_layer, outputs=decoder)
    opt = Adam(learning_rate=l_rate)
    autoencoder.compile(optimizer=opt, loss='mse')

    return autoencoder

In [None]:
# Hyperparameter grid space
dropouts = [0.3, 0.5]
l2_regs = [1e-6, 5e-6, 1e-5, 5e-5]
learning_rates = [1e-5, 5e-5, 1e-4, 5e-4]
all_configs = list(itertools.product(dropouts, l2_regs, learning_rates))

# Number of times every configuration is trained from scratch; the metrics are averaged over
# these repetitions (this is an exhaustive grid with repeated runs, not a random search)
n_iterations = 50

# Other hyperparameters chosen empirically
epochs_num = 20
selected_batch_size = 64

# For computing the CI: t-distribution with (number of repetitions - 1) degrees of freedom
confidence_level = 0.95 
degrees_freedom = n_iterations - 1

# Ranked results of every feature set, keyed by the feature set name. The previous version
# overwrote `all_results` on each feature set, so only the last ranking survived the loop.
ae_results_by_feature_set = {}

# Iterate through all of the feature sets
for set_name, feature_set in features_dictionary.items():  
    
    X_train = df_training_copy[feature_set].copy()
    X_validation = df_validation_copy[feature_set].copy()
    y_validation = df_validation_copy['ActualAttack']
    X_threshold = df_threshold_copy[feature_set].copy()
    
    # Size of input space
    input_dimension = X_train.shape[1]
    
    # Results of the experiments for this feature set, one entry per configuration
    all_results = []
    
    # Select a hyperparameter configuration
    for config in all_configs:
        dr, l2_strength, l_r = config   # dropout, l2 regularization strength, learning rate
        
        # Lists to store performance metrics
        f1_scores, precisions, recalls, fprs = [], [], [], []
        
        # Take the average over n_iterations for every model configuration
        for iteration in range(n_iterations):

            l2_r = l2(l2_strength) # fresh regularizer object for every model

            # Build and train the model
            model = build_autoencoder(dr, l2_r, l_r, input_dimension)
            model.fit(X_train, X_train, shuffle=False, epochs=epochs_num, batch_size=selected_batch_size, verbose=0)

            # Compute the reconstruction error (per-sample MSE) of the threshold set
            X_threshold_predicted = model.predict(X_threshold)
            re_threshold = np.mean(np.power(X_threshold - X_threshold_predicted, 2), axis=1)

            # Use the 3 sigma rule to compute the threshold
            threshold = np.mean(re_threshold) + 3 * np.std(re_threshold)

            # Validation
            X_validation_predicted = model.predict(X_validation)
            re_validation = np.mean(np.power(X_validation - X_validation_predicted, 2), axis=1)
            
            # Flag a validation sample as anomalous when its error exceeds the threshold
            y_predicted_validation = [1 if re > threshold else 0 for re in re_validation]

            # Compute F1, precision, recall and fpr.
            # `labels=[0, 1]` keeps the matrix 2x2 even when only one class occurs, so the
            # four-way unpacking cannot fail (assumes 'ActualAttack' is encoded as 0/1).
            tn, fp, fn, tp = confusion_matrix(y_validation, y_predicted_validation, labels=[0, 1]).ravel()
            f1 = f1_score(y_validation, y_predicted_validation)
            precision = precision_score(y_validation, y_predicted_validation, zero_division=1)
            recall = recall_score(y_validation, y_predicted_validation)
            fpr = fp / (tn + fp)

            # Add the computed metric to their corresponding list
            f1_scores.append(f1)
            precisions.append(precision)
            recalls.append(recall)
            fprs.append(fpr)

            print(f"Iteration {iteration + 1}. F1 score: {f1}.")

        # After n_iterations, compute the average and the CI
        
        # Average for each metric
        avg_f1 = mean(f1_scores)
        avg_precision = mean(precisions)
        avg_recall = mean(recalls)
        avg_fpr = mean(fprs)

        # Calculate the standard error of the mean
        error_f1 = sem(f1_scores)
        error_precision = sem(precisions)
        error_recall = sem(recalls)
        error_fpr = sem(fprs)

        # Calculate the confidence intervals
        t_ci_f1 = t.interval(confidence_level, degrees_freedom, avg_f1, error_f1)
        t_ci_precision = t.interval(confidence_level, degrees_freedom, avg_precision, error_precision)
        t_ci_recall = t.interval(confidence_level, degrees_freedom, avg_recall, error_recall)
        t_ci_fpr = t.interval(confidence_level, degrees_freedom, avg_fpr, error_fpr)

        print(f"Average F1 Score: {avg_f1}, Confidence Interval t-Dist: {t_ci_f1}.")
        print(f"Average Precision: {avg_precision}, Confidence Interval t-Dist: {t_ci_precision}.")
        print(f"Average Recall: {avg_recall}, Confidence Interval t-Dist: {t_ci_recall}.")
        print(f"Average FPR: {avg_fpr}, Confidence Interval t-Dist: {t_ci_fpr}.")
        
        # Create dictionary with the results for each config
        config_result = {
                'dropout': dr,
                'l2_regularization': l2_strength,
                'learning_rate': l_r,
                'avg_f1': avg_f1,
                'avg_precision': avg_precision,
                'avg_recall': avg_recall,
                'avg_fpr': avg_fpr,
                't_ci_f1': t_ci_f1,
                't_ci_precision': t_ci_precision,
                't_ci_recall': t_ci_recall,
                't_ci_fpr': t_ci_fpr
            }

        all_results.append(config_result)

    # Best configuration first; keep the ranking of this feature set
    all_results = sorted(all_results, key=lambda x: x['avg_f1'], reverse=True)
    ae_results_by_feature_set[set_name] = all_results

<b>Modeling Isolation Forest</b>

In [None]:
# Hyperparameter grid space
parameter_grid = {
    'n_estimators': [20, 50, 100],
    'max_samples': [64, 128, 256],
    'max_features': [0.1, 0.2, 0.4, 0.6, 0.8, 1.0],
    'contamination': [0.1, 0.15, 0.2, 0.25, 0.3]
}
# Shared generator: every fit draws from it, so repetitions differ but a fresh run is repeatable
rng = np.random.RandomState(0)

# Number of times every configuration is fitted; the metrics are averaged over these
# repetitions (this is an exhaustive grid with repeated runs, not a random search)
n_iterations = 100

# For computing the CI. Defined here so the cell does not depend on values left over from the
# autoencoder cell (whose degrees of freedom matched 50 repetitions, not 100).
confidence_level = 0.95
degrees_freedom = n_iterations - 1

# Every configuration of the grid (identical for all feature sets)
all_configs = list(ParameterGrid(parameter_grid))

# Ranked results of every feature set, keyed by the feature set name. The previous version
# overwrote `all_results` on each feature set, so only the last ranking survived the loop.
if_results_by_feature_set = {}

# Iterate through all of the feature sets
for set_name, feature_set in features_dictionary.items(): 

    X_train = df_training_copy[feature_set].copy()
    # NOTE(review): `df_encoded_validation_sim_copy` and its 'Attack' label are not created in
    # the cells shown here (the autoencoder uses `df_validation_copy` / 'ActualAttack') —
    # confirm where this frame comes from and which label column it carries.
    X_validation = df_encoded_validation_sim_copy[feature_set].copy()
    y_validation = df_encoded_validation_sim_copy['Attack']

    # Results of the experiments for this feature set, one entry per configuration
    all_results = []
    
    for config in all_configs:
        f1_scores, precisions, recalls, fpr_scores = [], [], [], []
    
        for iteration in range(n_iterations):
            model = IsolationForest(**config, bootstrap=True, random_state=rng, n_jobs=-2)
            model.fit(X_train)

            # Isolation Forest anomaly labels
            y_predicted_validation = model.predict(X_validation)
            
            # Transform the predictions from -1 to 1 and from 1 to 0
            y_predicted_validation = (y_predicted_validation == -1).astype(int)
            
            # Compute F1, precision, recall and fpr.
            # `labels=[0, 1]` keeps the matrix 2x2 even when only one class occurs, so the
            # four-way unpacking cannot fail (assumes the label column is encoded as 0/1).
            f1 = f1_score(y_validation, y_predicted_validation)
            precision = precision_score(y_validation, y_predicted_validation, zero_division=1)
            recall = recall_score(y_validation, y_predicted_validation)
            tn, fp, fn, tp = confusion_matrix(y_validation, y_predicted_validation, labels=[0, 1]).ravel()
            fpr = fp / (tn + fp)
            
            f1_scores.append(f1)
            precisions.append(precision)
            recalls.append(recall)
            fpr_scores.append(fpr)

            print(f"Iteration {iteration + 1}. F1 score: {f1}")

        # After n_iterations, compute the average and the CI
        
        # Average for each metric (the FPR statistics previously read `fprs`, a list left over
        # from the autoencoder cell, instead of the values collected in this loop)
        avg_f1 = mean(f1_scores)
        avg_precision = mean(precisions)
        avg_recall = mean(recalls)
        avg_fpr = mean(fpr_scores)

        # Calculate the standard error of the mean
        error_f1 = sem(f1_scores)
        error_precision = sem(precisions)
        error_recall = sem(recalls)
        error_fpr = sem(fpr_scores)

        # Calculate the confidence intervals
        t_ci_f1 = t.interval(confidence_level, degrees_freedom, avg_f1, error_f1)
        t_ci_precision = t.interval(confidence_level, degrees_freedom, avg_precision, error_precision)
        t_ci_recall = t.interval(confidence_level, degrees_freedom, avg_recall, error_recall)
        t_ci_fpr = t.interval(confidence_level, degrees_freedom, avg_fpr, error_fpr)

        print(f"Average F1 Score: {avg_f1}, Confidence Interval t-Dist: {t_ci_f1}.")
        print(f"Average Precision: {avg_precision}, Confidence Interval t-Dist: {t_ci_precision}.")
        print(f"Average Recall: {avg_recall}, Confidence Interval t-Dist: {t_ci_recall}.")
        print(f"Average FPR: {avg_fpr}, Confidence Interval t-Dist: {t_ci_fpr}.")
        
        # Create dictionary with the results for each config
        config_result = {
            **config,
            'avg_f1': avg_f1,
            'avg_precision': avg_precision,
            'avg_recall': avg_recall,
            'avg_fpr': avg_fpr,
            't_ci_f1': t_ci_f1,
            't_ci_precision': t_ci_precision,
            't_ci_recall': t_ci_recall,
            't_ci_fpr': t_ci_fpr
        }
    
        all_results.append(config_result)
        
    # Best configuration first; keep the ranking of this feature set
    all_results = sorted(all_results, key=lambda x: x['avg_f1'], reverse=True)
    if_results_by_feature_set[set_name] = all_results

<b><font size="3">Password Spray: Evaluation Phase</font></b>

<b>Evaluation Autoencoder</b>

In [None]:
# Best Hyperparameter configuration
dr = 0.3
l2_r = l2(1e-6)
l_r = 1e-5

# Best feature set
feature_set = all_features

X_train = df_training_copy[feature_set].copy()
X_threshold = df_threshold_copy[feature_set].copy()
X_test = df_test_copy[feature_set].copy()
y_test = df_test_copy['ActualAttack']

input_dimension = X_train.shape[1]

# Build and train the model.
# `epochs_num` and `selected_batch_size` come from the hyperparameter-search cell, which has to
# be run first.
model = build_autoencoder(dr, l2_r, l_r, input_dimension)
history = model.fit(X_train, X_train, shuffle=False, epochs=epochs_num, batch_size=selected_batch_size, verbose=0)

# Compute the reconstruction error (per-sample MSE) of the threshold set
X_threshold_predicted = model.predict(X_threshold)
re_threshold = np.mean(np.power(X_threshold - X_threshold_predicted, 2), axis=1)

# Use the 3 sigma rule to compute the threshold
threshold = np.mean(re_threshold) + 3 * np.std(re_threshold)

# Assign anomaly label for each sample in X_test
X_test_predicted = model.predict(X_test)
re_test = np.mean(np.power(X_test - X_test_predicted, 2), axis=1)

y_test_predicted = [1 if e > threshold else 0 for e in re_test]

# Compute F1, precision, recall and fpr on the test dataset.
# (The metrics previously referenced an undefined `y_test_pred`.)
# `labels=[0, 1]` keeps the matrix 2x2 even when only one class occurs (assumes 0/1 labels).
f1_test = f1_score(y_test, y_test_predicted)
precision_test = precision_score(y_test, y_test_predicted, zero_division=1)
recall_test = recall_score(y_test, y_test_predicted)
tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_test_predicted, labels=[0, 1]).ravel()
fpr_test = fp_test / (tn_test + fp_test)

print(f"TP: {tp_test}")
print(f"FP: {fp_test}")
print(f"TN: {tn_test}")
print(f"FN: {fn_test}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-score: {f1_test:.4f}")
print(f"FPR: {fpr_test:.4f}")

<b>Evaluation Isolation Forest</b>

In [None]:
# Best feature set
feature_set = feat_12H_30min_country_window

# Best Hyperparameter configuration
param_grid = {
    'n_estimators': [100],
    'max_samples': [64],
    'max_features': [0.1],
    'contamination': [0.2]
}
rng = np.random.RandomState(0)

# One configuration only
all_configs = list(ParameterGrid(param_grid))

X_train = df_training_copy[feature_set].copy()
# NOTE(review): `df_encoded_test_sim_copy` is not created in the cells shown here (the
# autoencoder evaluation uses `df_test_copy`) — confirm where this frame comes from.
X_test = df_encoded_test_sim_copy[feature_set].copy()
y_test = df_encoded_test_sim_copy['ActualAttack']

model = IsolationForest(**all_configs[0], bootstrap=True, random_state=rng, n_jobs=-2)
model.fit(X_train)

# Isolation Forest anomaly labels
y_predicted_test = model.predict(X_test)

# Transform the predictions from -1 to 1 and from 1 to 0
y_predicted_test = (y_predicted_test == -1).astype(int)

# Compute F1, precision, recall and fpr on the test dataset.
# (The metrics previously referenced an undefined `y_test_2`; the labels are in `y_test`.)
# `labels=[0, 1]` keeps the matrix 2x2 even when only one class occurs (assumes 0/1 labels).
f1_test = f1_score(y_test, y_predicted_test)
precision_test = precision_score(y_test, y_predicted_test, zero_division=1)
recall_test = recall_score(y_test, y_predicted_test)
tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_predicted_test, labels=[0, 1]).ravel()
fpr_test = fp_test / (tn_test + fp_test)

print(f"TP: {tp_test}")
print(f"FP: {fp_test}")
print(f"TN: {tn_test}")
print(f"FN: {fn_test}")
print(f"Precision: {precision_test:.4f}")
print(f"Recall: {recall_test:.4f}")
print(f"F1-score: {f1_test:.4f}")
print(f"FPR: {fpr_test:.4f}")