In [2]:
%load_ext autoreload
%autoreload 2

import sys

# Make modules located in the parent directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it does not pile
# up duplicate '../' entries on sys.path.
if '../' not in sys.path:
    sys.path.append('../')

# Data loading and transformation
_(You can skip to the next step if you already have the parquet files in the `floor` directory.)_

The data is stored in a MongoDB database. The `loader.py` script will load the data from the database, apply the transformers and save the result in an optimised way to a set of _parquet_ files. The transformers are defined in the `transformers` directory.

The loader can be run from the command line, or it can be imported as a module and used in a notebook. It can either use cached data, which is stored in the `cache` directory once it has been loaded from the database, or load the data from the database directly. To control this behaviour, pass the `--cache-mode` (or `-c`) option to the CLI, or use the `cache_mode` argument of the loader's `run()` function. Possible values are `auto`, `force-local`, and `force-refresh`.

In [18]:
from loader import run

# Switch: set to True to invoke the loader. Per the original note, the loader
# does nothing when the parquet files have already been created.
run_loader = False
# Switch: set to True to force fetching and transforming the latest data from the DB.
force_refresh = False

if run_loader:
    # Pick the cache mode handed to the loader: a forced refresh when requested,
    # otherwise the loader's 'auto' mode.
    if force_refresh:
        mode = 'force-refresh'
    else:
        mode = 'auto'
    run(cache_mode=mode)

# Model training

The following cell configures the model training parameters.

In [3]:
from model.data_loader import make_train_test, make_train, basic_preprocessor_df, basic_preprocessor_table
import model.model_xgb as xgbm
import pickle
import os

# --- Output naming ---------------------------------------------------------
# Used as the file-name prefix when the trained model (and test data) is stored.
model_name = "my_lovely_model_1"

# --- Input data ------------------------------------------------------------
# Locations of the benign and malign (phishing) parquet data.
benign_parquet = "../floor/benign_cesnet2_intersect"
malign_parquet = "../floor/phishing"
# Sampling values handed to the data loader for each class.
# NOTE(review): presumably a fraction where 1.0 means "use everything" - confirm in model.data_loader.
benign_sample = 1.0
malign_sample = 1.0

# --- Execution options -----------------------------------------------------
# Passed through to the model helpers that accept a GPU switch.
use_gpu = False
# When True, a search for the optimal hyperparameters is performed.
find_optimal_params = False

# When True, the input data is split into train/test subsets;
# when False, all of the data is used for training.
split_train_test = False
# Test-size value handed to the train/test split (only used in split mode).
split_test_size = 0.3
# When True (and in split mode), the evaluation metrics are plotted after training.
plot_model_evaluation = False

# When True, the configured model is cross-validated over the whole dataset.
do_cross_validation = False
cross_validation_n_splits = 5

# Number of splits for the cross-validation used internally by the
# hyperparameter search (only relevant when find_optimal_params is True).
optimal_params_search_n_splits = 5

# Model hyperparameters handed to xgbm.make_model().
# The keys are XGBoost parameter names; the short descriptions below follow
# the XGBoost parameter documentation.
# NOTE(review): "tree_method" and "sampling_method" are set to GPU-oriented
# values here although `use_gpu` is False in this notebook, and the search
# grid uses "uniform" sampling - confirm how xgbm.make_model reconciles this.
xgboost_params = {
    "max_depth": 9,                        # maximum depth of a tree
    "eta": 0.15,                           # learning rate (step size shrinkage)
    "objective": "binary:logistic",        # binary classification, probability output
    "min_child_weight": 2.0,               # minimum sum of instance weight needed in a child
    "subsample": 0.6,                      # ratio of training rows sampled per boosting round
    "alpha": 0,                            # L1 regularisation term on weights
    "gamma": 0.1,                          # minimum loss reduction required to split further
    "lambda": 1.0,                         # L2 regularisation term on weights
    "max_delta_step": 0,                   # 0 means no constraint
    "grow_policy": "lossguide",            # split at nodes with the highest loss change
    "max_bin": 512,                        # maximum number of histogram bins per feature
    "tree_method": "gpu_hist",             # GPU histogram-based tree construction
    "sampling_method": "gradient_based",   # gradient-based row sampling
}

# Number of boosting rounds, i.e. the number of "trees".
xgboost_number_of_estimators = 290

# Candidate values explored by the hyperparameter search (only used when
# find_optimal_params is True). Each key maps to the list of values to try;
# as configured here, only "n_estimators" actually varies.
optimal_params_search_grid = {
    "max_depth":        [9],
    "min_child_weight": [2],
    "sampling_method":  ["uniform"],
    "subsample":        [0.6],
    "gamma":            [0.1],
    "grow_policy":      ["lossguide"],
    "max_bin":          [512],
    "n_estimators":     [280, 290, 300, 310],
    "lambda":           [1.0],
    "alpha":            [0.0],
}



In [None]:
# Load the benign/malign data and prepare the training inputs.
# In split mode a held-out test subset (X_test, y_test) is produced as well;
# otherwise X_test / y_test are NOT defined and the whole dataset is used for training.

if not split_train_test:
    X_all, y_all, benign_label, malign_label = make_train(
        benign_parquet, malign_parquet,
        basic_preprocessor_table, basic_preprocessor_df,
        benign_sample, malign_sample)
    # No split requested: train on everything.
    X_train, y_train = X_all, y_all
else:
    (X_all, y_all,
     X_train, X_test, y_train, y_test,
     benign_label, malign_label) = make_train_test(
        benign_parquet, malign_parquet,
        basic_preprocessor_table, basic_preprocessor_df,
        split_test_size, benign_sample, malign_sample)

In [21]:
# Optional hyperparameter search over `optimal_params_search_grid`.
# `scores` starts out as a placeholder message so that the following cell
# always has something to display, even when the search is switched off.

scores = "Hyperparameter search disabled"
if find_optimal_params:
    scores = xgbm.find_optimal_model(
        X_train,
        y_train,
        optimal_params_search_grid,
        use_gpu,
        optimal_params_search_n_splits,
    )

In [22]:
scores

'Hyperparameter search disabled'

In [23]:
# Instantiate the XGBoost model from the configured hyperparameters,
# the number of estimators and the GPU switch.

model = xgbm.make_model(
    xgboost_params,
    xgboost_number_of_estimators,
    use_gpu,
)

In [24]:
# Optional cross-validation of the configured model over the whole dataset.
# NOTE(review): the trailing 42 is presumably the random seed - confirm
# against xgbm.cross_validate_model.

if do_cross_validation:
    xgbm.cross_validate_model(
        model,
        X_all,
        y_all,
        cross_validation_n_splits,
        42,
    )

In [25]:
# Fit the model. Evaluation data is supplied only in split train/test mode,
# because the test subset does not exist otherwise.

if not split_train_test:
    model.fit(X_train, y_train, verbose=False)
else:
    # Track metrics on both the training and the held-out test subset.
    eval_set = [(X_train, y_train), (X_test, y_test)]
    model.fit(X_train, y_train, eval_set=eval_set, verbose=False)

In [26]:
# Plot the evaluation metrics collected during training.
# Only meaningful in split train/test mode, where an eval_set was passed to fit().

if split_train_test:
    if plot_model_evaluation:
        xgbm.plot_metrics(model.evals_result())

In [29]:
# Save the model (and, in split train/test mode, the held-out test data)

stored_models_dir = "../stored_models"
# exist_ok=True makes the call a no-op when the directory already exists,
# replacing the separate exists() check.
os.makedirs(stored_models_dir, exist_ok=True)

# Persist the trained model with pickle, named after `model_name`.
with open(f"{stored_models_dir}/{model_name}.model.pickle.dat", "wb") as target_file:
    pickle.dump(model, target_file)

if split_train_test:
    import pyarrow
    import pyarrow.parquet

    # Attach the labels on a copy instead of mutating X_test in place:
    # adding "_labels" to X_test itself would leak the label column into the
    # features if the training/evaluation cells are re-run after this cell.
    test_data_with_labels = X_test.assign(_labels=y_test)
    # noinspection PyArgumentList
    cache_table = pyarrow.Table.from_pandas(test_data_with_labels)
    pyarrow.parquet.write_table(cache_table, f"{stored_models_dir}/{model_name}.test_data.parquet")