# AdFlush Model
Test AdFlush ONNX, or train and test AdFlush upon H2O

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
import time

Select the dataset to use during the session

In [3]:
# Available datasets
testset="../dataset/AdFlush_test.csv"
GAN_testset="../dataset/GAN_mutated_AdFlush.csv"
gnirts_testset="../dataset/JS_obfuscated_gnirts.csv"
javascript_obfuscator_testset="../dataset/JS_obfuscated_javascript_obfuscator.csv"
wobfuscator_testset="../dataset/JS_obfuscated_wobfuscator.csv"

# Select dataset to import
DATASET=testset
ISMUTATED=False
if DATASET==testset or DATASET==GAN_testset:
    data_df=pd.read_csv(DATASET, index_col=0)
elif DATASET==gnirts_testset or DATASET==javascript_obfuscator_testset or DATASET==wobfuscator_testset:
    js_features=['brackettodot', 'num_get_storage', 'num_set_storage',
        'num_get_cookie', 'num_requests_sent', 'ng_0_0_2', 'ng_0_15_15', 'ng_2_13_2',
        'ng_15_0_3', 'ng_15_0_15', 'ng_15_15_15', 'avg_ident',
        'avg_charperline']
    test_df=pd.read_csv(testset,index_col=0)
    mutate_df=pd.read_csv(DATASET,index_col=0)
    mutate_df=mutate_df[js_features].copy()
    mutate_performed=mutate_df.index
    test_df=test_df.loc[mutate_performed].copy()
    test_df.loc[mutate_performed,js_features]=mutate_df.loc[mutate_performed,js_features]
    data_df=test_df.copy()
    ISMUTATED=True
label=data_df['label'].astype(int)

print("Using Dataset: ",DATASET)
data_df

Using Dataset:  ../dataset/AdFlush_test.csv


Unnamed: 0,content_policy_type,url_length,brackettodot,is_third_party,keyword_char_present,num_get_storage,num_set_storage,num_get_cookie,num_requests_sent,req_url_33,...,fqdn_27,ng_0_0_2,ng_0_15_15,ng_2_13_2,ng_15_0_3,ng_15_0_15,ng_15_15_15,avg_ident,avg_charperline,label
622089,0.384715,72.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,-0.112972,...,-0.972673,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
273066,0.742155,51.0,1.151515,1.0,0.0,1.0,1.0,1.0,0.0,0.248519,...,-0.967004,0.027778,0.037037,0.00463,0.00463,0.101852,0.064815,6.053763,34.166668,1.0
764490,0.384715,723.0,0.000000,1.0,1.0,8.0,0.0,8.0,1.0,-0.388964,...,-0.998259,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0
76875,0.474811,70.0,0.000000,1.0,1.0,37.0,6.0,8.0,5.0,0.112937,...,-0.845820,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,3.000000,1.0
112686,0.384715,85.0,0.000000,0.0,0.0,1.0,0.0,1.0,0.0,0.002374,...,-0.826542,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805310,0.505220,141.0,0.000000,1.0,1.0,0.0,0.0,0.0,0.0,-0.326381,...,-0.772378,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0
797245,0.078971,66.0,0.000000,0.0,0.0,1.0,0.0,1.0,0.0,0.044739,...,-0.654221,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0
279262,0.505220,478.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,-0.514433,...,-0.746507,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0
6891,0.384715,93.0,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,0.013773,...,-0.833339,0.000000,0.000000,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,1.0


In [4]:
def metrics(true, pred, _is_mutated):
    print(f"Accuracy : {accuracy_score(true, pred)} ")
    print(f"Precision : {precision_score(true, pred)} ")
    print(f"Recall : {recall_score(true, pred)} ")
    print(f"F1 : {f1_score(true, pred)} ")
    # Number of attacks
    total_attacks = len(true)
    
    # Number of successful attacks (misclassifications)
    successful_attacks = sum(true != pred)
    tn, fp, fn, tp = confusion_matrix(true, pred).ravel()

    # Calculate FNR
    fnr = fn / (tp + fn)
    print('False Negative Rate:', fnr)

    # Calculate FPR
    fpr = fp / (fp + tn)
    print('False Positive Rate:', fpr)
    # ASR
    if _is_mutated:
        asr = successful_attacks / total_attacks
        print("Attack Success Rate: ",asr)

### 1. AdFlush_ONNX

In [1]:
import onnx
import onnxruntime as ort
import onnxmltools

In [None]:
print('Loading Model')
model = onnx.load('../model/AdFlush.onnx')
# Check that the IR is well formed
try:
    print("Checking model integrity")
    onnx.checker.check_model(model)
    # Create an ONNX runtime session
    ort_session = ort.InferenceSession('../model/AdFlush.onnx')
    input_data = data_df.drop('label', axis=1).values.astype('float32')
    input_name = ort_session.get_inputs()[0].name
    label_name = ort_session.get_outputs()[0].name

    # Run the inference session to get the prediction results
    print('Running Inference Session')
    start_time=time.time()
    pred = ort_session.run([label_name], {input_name: input_data})[0]
    print("Inference time elapsed: ", time.time()-start_time, "seconds for ", len(label), " samples.")
    metrics(label.astype(int),pred.astype(int),ISMUTATED)
    print("AUROC: ",roc_auc_score(label, pred.astype('float64')))
    
except Exception as e:
    print("Error in loading model: ",e)

### 2. AdFlush_H2O_MOJO

In [4]:
import h2o
from h2o.automl import H2OAutoML
h2o.init(nthreads = 12, max_mem_size = "64g", enable_assertions = False)
h2o.no_progress()
custom_path=""

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 17.0.7+8-LTS-224, mixed mode, sharing)
  Starting server from C:\Users\chaejin lim\Documents\GitHub\venv\adflush\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\CHAEJI~1\AppData\Local\Temp\tmpayejlq2i
  JVM stdout: C:\Users\CHAEJI~1\AppData\Local\Temp\tmpayejlq2i\h2o_chaejin_lim_started_from_python.out
  JVM stderr: C:\Users\CHAEJI~1\AppData\Local\Temp\tmpayejlq2i\h2o_chaejin_lim_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.44.0.1
H2O_cluster_version_age:,9 days
H2O_cluster_name:,H2O_from_python_chaejin_lim_9k9g8g
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,64 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


#### 2-1. Making a new AdFlush with H2O AutoML (optional)

##### 2-1-1. Training custom AdFlush

In [24]:
train=pd.read_csv('../dataset/AdFlush_train.csv',index_col=0)
test=data_df.copy() #Use selected Dataset to test
train.reset_index(inplace=True, drop=True)
test.reset_index(inplace=True, drop=True)

y = 'label'
x = list(train)
x.remove(y)

h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

h2o_train[y] = h2o_train[y].asfactor()
h2o_test[y] = h2o_test[y].asfactor()

# Select runtime for exploration
MAXRUNTIME=3600
aml = H2OAutoML(max_runtime_secs=MAXRUNTIME, max_models=None, exclude_algos=['XGBoost', 'StackedEnsemble'], nfolds=5)
# Run below code instead if you want to convert to ONNX format.
# aml = H2OAutoML(max_runtime_secs=100, max_models=None,include_algos=['GBM'], nfolds=5)

aml.train(x = x, y = y, training_frame = h2o_train, leaderboard_frame = h2o_test)
custom_path=aml.leader.save_mojo('../model/AdFlush_custom')
custom_path

##### 2-1-2. Convert custom AdFlush to ONNX

In [10]:
## Only available with GBM model
## We found that there is a problem in converting h2o to onnx format in windows due to automatically caching to ~AppData\temp, thus recommend to run this code in Linux
onnx_model = onnxmltools.convert.convert_h2o(custom_path, target_opset=9)
onnxmltools.utils.save_model(onnx_model, '../model/AdFlush_custom/AdFlush_custom.onnx')

NameError: name 'custom_path' is not defined

#### 2-2. Testing AdFlush with H20 AutoML

In [19]:
path = '../model/AdFlush_mojo.zip'

## Run code below to use custom AdFlush
# if custom_path!="":
#     path=custom_path
h2o_model = h2o.import_mojo(path)
h2o_test=h2o.H2OFrame(data_df)

start_time=time.time()
pred = h2o_model.predict(h2o_test)
pred = pred.as_data_frame().predict.to_list()
print("Inference time elapsed: ", time.time()-start_time, "seconds for ", len(label), " samples.")
metrics(label, pred, ISMUTATED)
if ISMUTATED:
    print("AUROC: ",roc_auc_score(label, pred.astype('float64')))

Inference time elapsed:  0.4647097587585449 seconds for  166032  samples.
Accuracy : 0.9395538209501783 
Precision : 0.9323462029306395 
Recall : 0.9149578259623075 
F1 : 0.9235701774426929 
False Negative Rate: 0.08504217403769257
False Positive Rate: 0.04410629617377881


#### 2-3.Explanable values for AdFlush H2O AutoML
We can see some explanations of the processed AdFlush model using H2O AutoML,
including SHAP values.

In [None]:
h2o_model

In [None]:
h2o_model.shap_summary_plot(h2o_test)

In [None]:
h2o.cluster().shutdown()