# AdFlush Model
Test the AdFlush ONNX model, or train and test AdFlush with H2O.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import time

Select the dataset to use during the session

In [None]:
# Available datasets
testset="../dataset/AdFlush_test.csv"
GAN_testset="../dataset/GAN_mutated_AdFlush.csv"
gnirts_testset="../dataset/JS_obfuscated_gnirts.csv"
javascript_obfuscator_testset="../dataset/JS_obfuscated_javascript_obfuscator.csv"
wobfuscator_testset="../dataset/JS_obfuscated_wobfuscator.csv"

# Select dataset to import
DATASET=testset
# ISMUTATED is passed to metrics() later and enables the attack-success-rate output.
# NOTE(review): the GAN-mutated set leaves ISMUTATED False - confirm this is intended.
ISMUTATED=False
if DATASET in (testset, GAN_testset):
    # These CSVs are used as-is.
    data_df=pd.read_csv(DATASET, index_col=0)
elif DATASET in (gnirts_testset, javascript_obfuscator_testset, wobfuscator_testset):
    # Only these columns are taken from the obfuscated CSV; every other
    # column (including the label) comes from the matching test-set rows.
    js_features=['brackettodot', 'num_get_storage', 'num_set_storage',
        'num_get_cookie', 'num_requests_sent', 'ng_0_0_2', 'ng_0_15_15', 'ng_2_13_2',
        'ng_15_0_3', 'ng_15_0_15', 'ng_15_15_15', 'avg_ident',
        'avg_charperline']
    test_df=pd.read_csv(testset,index_col=0)
    mutate_df=pd.read_csv(DATASET,index_col=0)
    mutate_df=mutate_df[js_features].copy()
    # Keep only the test rows whose index appears in the obfuscated CSV.
    mutate_performed=mutate_df.index
    test_df=test_df.loc[mutate_performed].copy()
    # Overwrite the JavaScript features with their obfuscated values.
    test_df.loc[mutate_performed,js_features]=mutate_df.loc[mutate_performed,js_features]
    data_df=test_df.copy()
    ISMUTATED=True
else:
    # Without this guard an unrecognised path would leave data_df undefined
    # and fail on the next line with an unrelated NameError.
    raise ValueError(f"Unknown DATASET: {DATASET!r}")
label=data_df['label'].astype(int)

print("Using Dataset: ",DATASET)
data_df

In [None]:
def metrics(true, pred, _is_mutated):
    """Print binary-classification metrics for ``pred`` against ``true``.

    Prints accuracy, precision, recall, F1, the false negative rate and the
    false positive rate. When ``_is_mutated`` is truthy, it also prints the
    attack success rate, computed as the fraction of samples whose
    prediction differs from its label.

    Args:
        true: Ground-truth labels; expected to be 0/1 values.
        pred: Predicted labels, same length as ``true``.
        _is_mutated: Whether to print the attack success rate as well.

    Returns:
        None. All results are written to stdout.
    """
    print(f"Accuracy : {accuracy_score(true, pred)} ")
    print(f"Precision : {precision_score(true, pred)} ")
    print(f"Recall : {recall_score(true, pred)} ")
    print(f"F1 : {f1_score(true, pred)} ")

    # Pin the label order so the matrix is always 2x2. Without `labels`,
    # input containing a single class yields a 1x1 matrix and the 4-way
    # unpack below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(true, pred, labels=[0, 1]).ravel()

    # Calculate FNR (nan when there are no positive samples)
    positives = tp + fn
    fnr = fn / positives if positives else float('nan')
    print('False Negative Rate:', fnr)

    # Calculate FPR (nan when there are no negative samples)
    negatives = fp + tn
    fpr = fp / negatives if negatives else float('nan')
    print('False Positive Rate:', fpr)

    # ASR
    if _is_mutated:
        # Number of attacks
        total_attacks = len(true)
        # Number of successful attacks (misclassifications)
        successful_attacks = sum(true != pred)
        asr = successful_attacks / total_attacks
        print("Attack Success Rate: ",asr)

### 1. AdFlush_ONNX

In [None]:
import onnx
import onnxruntime as ort
import onnxmltools

In [None]:
print('Loading Model')
# Phase 1: load and validate the model. onnx.load is inside the try so that a
# missing or corrupt file is reported by the handler below.
try:
    model = onnx.load('../model/AdFlush.onnx')
    # Check that the IR is well formed
    print("Checking model integrity")
    onnx.checker.check_model(model)
    # Create an ONNX runtime session
    ort_session = ort.InferenceSession('../model/AdFlush.onnx')
except Exception as e:
    print("Error in loading model: ",e)
else:
    # Phase 2: inference and evaluation. Kept best-effort (errors are printed,
    # not raised) so the notebook can continue, but reported separately so an
    # inference or metrics failure is not mislabelled as a loading error.
    try:
        # Every column except the label is fed to the model as float32.
        input_data = data_df.drop('label', axis=1).values.astype('float32')
        input_name = ort_session.get_inputs()[0].name
        label_name = ort_session.get_outputs()[0].name

        # Run the inference session to get the prediction results
        print('Running Inference Session')
        start_time=time.time()
        pred = ort_session.run([label_name], {input_name: input_data})[0]
        print("Inference time elapsed: ", time.time()-start_time, "seconds for ", len(label), " samples.")
        metrics(label.astype(int),pred.astype(int),ISMUTATED)
        # NOTE(review): the score is computed from the first model output cast
        # to float; presumably these are hard labels, not probabilities - confirm.
        print("AUROC: ",roc_auc_score(label, pred.astype('float64')))
    except Exception as e:
        print("Error in running inference: ",e)

### 2. AdFlush_H2O_MOJO

In [None]:
import h2o
from h2o.automl import H2OAutoML
# Filled in by the optional training cell with the saved MOJO path;
# an empty string means no custom model has been trained in this session.
custom_path = ""

# Initialise H2O for this session.
# NOTE(review): nthreads and max_mem_size are hard-coded for a large machine;
# lower them when running on smaller hardware.
h2o.init(nthreads=12, max_mem_size="64g", enable_assertions=False)
h2o.no_progress()

#### 2-1. Making a new AdFlush with H2O AutoML (optional)

##### 2-1-1. Training custom AdFlush

In [None]:
# Training split from disk; the dataset selected earlier in the session is the test split.
train = pd.read_csv('../dataset/AdFlush_train.csv', index_col=0).reset_index(drop=True)
test = data_df.copy().reset_index(drop=True)  # Use selected Dataset to test

# Response column, with every remaining training column as a predictor.
y = 'label'
x = train.columns.tolist()
x.remove(y)

h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

# Convert the response to a factor column in both frames.
h2o_train[y] = h2o_train[y].asfactor()
h2o_test[y] = h2o_test[y].asfactor()

# Select runtime for exploration
MAXRUNTIME = 3600
aml = H2OAutoML(
    max_runtime_secs=MAXRUNTIME,
    max_models=None,
    exclude_algos=['XGBoost', 'StackedEnsemble'],
    nfolds=5,
)
# Run below code instead if you want to convert to ONNX format.
# aml = H2OAutoML(max_runtime_secs=100, max_models=None,include_algos=['GBM'], nfolds=5)

aml.train(x=x, y=y, training_frame=h2o_train, leaderboard_frame=h2o_test)

# Save the leader model as a MOJO and keep its path for the later cells.
custom_path = aml.leader.save_mojo('../model/AdFlush_custom')
custom_path

##### 2-1-2. Convert custom AdFlush to ONNX

In [None]:
## Only available with a GBM model.
## Converting H2O models to ONNX was found to fail on Windows because of automatic
## caching to ~AppData\temp, so running this cell on Linux is recommended.
onnx_model = onnxmltools.convert.convert_h2o(
    custom_path,
    target_opset=9,
)
onnxmltools.utils.save_model(
    onnx_model,
    '../model/AdFlush_custom/AdFlush_custom.onnx',
)

#### 2-2. Testing AdFlush with H2O AutoML

In [None]:
# MOJO archive to evaluate.
path = '../model/AdFlush_mojo.zip'

## Run code below to use custom AdFlush
# if custom_path!="":
#     path=custom_path
h2o_model = h2o.import_mojo(path)
h2o_test=h2o.H2OFrame(data_df)

start_time=time.time()
pred = h2o_model.predict(h2o_test)
# Keep only the `predict` column, as a plain Python list.
pred = pred.as_data_frame().predict.to_list()
print("Inference time elapsed: ", time.time()-start_time, "seconds for ", len(label), " samples.")
metrics(label, pred, ISMUTATED)
if ISMUTATED:
    # `pred` is a list here, so it has no .astype(); cast element-wise instead
    # (the previous pred.astype('float64') raised AttributeError).
    print("AUROC: ",roc_auc_score(label, [float(p) for p in pred]))

#### 2-3. Explainable values for AdFlush H2O AutoML
We can see some explanations of the processed AdFlush model using H2O AutoML,
including SHAP values.

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# imported model as the cell output.
h2o_model

In [None]:
# SHAP summary plot for the imported model, computed over the H2O test frame.
h2o_model.shap_summary_plot(h2o_test)

In [None]:
# Shut down the H2O cluster used by this session; later H2O cells need
# h2o.init() to be run again.
h2o.cluster().shutdown()