In [1]:
import pandas as pd
import numpy as np
import joblib
import onnx
import onnxruntime as ort
import torch
print(torch.cuda.is_available())
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split
from tabgan.sampler import OriginalGenerator, GANGenerator
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score
)
from sewar.full_ref import mse, rmse, psnr, uqi, ssim, ergas, scc, rase, sam, msssim, vifp
from art.estimators.classification import PyTorchClassifier
from art.metrics import clever

In [2]:
def metrics(true, pred):
    """Print classification scores followed by the attack success rate (ASR).

    Five values are printed, one per line, in this order: accuracy,
    precision, recall, F1 (computed by the scikit-learn scorers imported at
    the top of the notebook) and finally the ASR, i.e. the fraction of
    samples whose predicted label differs from the true label.

    Args:
        true: Ground-truth labels (list, ndarray, Series or single-column
            frame).
        pred: Predicted labels with the same number of samples as ``true``.

    Returns:
        None. The results are only written to stdout.
    """
    print(accuracy_score(true, pred))
    print(precision_score(true, pred))
    print(recall_score(true, pred))
    print(f1_score(true, pred))

    # Flatten both inputs before the element-wise comparison. Without this an
    # (n, 1) column compared with an (n,) vector broadcasts to an (n, n)
    # matrix, and two plain lists compare as a single bool.
    true_flat = np.asarray(true).ravel()
    pred_flat = np.asarray(pred).ravel()

    # Number of attacks
    total_attacks = true_flat.size

    # Number of successful attacks (misclassifications), counted vectorised
    successful_attacks = np.count_nonzero(true_flat != pred_flat)

    # ASR; NaN instead of a ZeroDivisionError when there are no samples
    asr = successful_attacks / total_attacks if total_attacks else float("nan")
    print(asr)

In [3]:
# Read the full feature table; the first CSV column becomes the index.
df = pd.read_csv('./dataset/all_df_350.csv', index_col=[0])

# Replace the content_policy_type column with its target-encoded values.
# NOTE(review): the encoder is fitted on every row before any train/test
# split happens, so label information reaches the held-out rows - confirm
# this is intended.
te = TargetEncoder()
encoded_policy_type = te.fit_transform(df['content_policy_type'], df['label'])
df['content_policy_type'] = encoded_policy_type

In [None]:
# Columns kept for the labelled frame, grouped by kind. The concatenation
# order below fixes the column order of label_df, which later cells rely on
# when slicing columns by position.
identifier_columns = ['visit_id', 'name']
fqdn_columns = [
    'fqdn_0', 'fqdn_1', 'fqdn_12', 'fqdn_14', 'fqdn_17', 'fqdn_23',
    'fqdn_24', 'fqdn_25', 'fqdn_26', 'fqdn_27', 'fqdn_4', 'fqdn_6',
]
request_columns = [
    'is_subdomain', 'is_third_party', 'keyword_char_present',
    'num_requests_sent', 'num_set_storage',
]
req_url_columns = [
    'req_url_121', 'req_url_135', 'req_url_179', 'req_url_18', 'req_url_21',
    'req_url_22', 'req_url_33', 'req_url_38', 'req_url_91',
]

selected_columns = (
    identifier_columns
    + ['content_policy_type']
    + fqdn_columns
    + request_columns
    + req_url_columns
    + ['label']
)
label_df = df[selected_columns]

In [None]:
# label_df.to_csv('/data/kiho/webtrack/feature_extracted/RFECV_27_0625.csv')

In [7]:
# Shuffle the labelled rows, then hold out 20% as the test split.
# The shuffle is seeded as well as the split: an unseeded shuffle changes the
# row order on every run, which makes random_state on train_test_split
# ineffective for reproducibility.
df_ = label_df.sample(frac=1, random_state=42)
train, test = train_test_split(df_, test_size=0.2, shuffle=True, random_state=42)

# Feature frames skip the first two columns and the trailing 'label' column
# (presumably visit_id / name when label_df comes from the selection cell).
train_df = train[train.columns[2:-1]]
target_df = pd.DataFrame(train['label'])
test_df = test[test.columns[2:-1]]

# Run the tabgan GANGenerator pipeline on the training features/targets;
# every argument value is unchanged from the notebook's configuration.
new_train3, new_target3 = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params={
        "metrics": "AUC", "max_depth": 2, "max_bin": 100,
        "learning_rate": 0.02, "random_state": 42, "n_estimators": 100,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params={"batch_size": 2000, "patience": 25, "epochs": 50},
).generate_data_pipe(
    train_df, target_df, test_df,
    deep_copy=True, only_adversarial=False, use_adversarial=True,
)

Fitting CTGAN transformers for each column:   0%|          | 0/28 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/50 [00:00<?, ?it/s]

## AdGraph

In [12]:
# Load AdGraph features and labels, discarding whatever index the CSVs carried.
data_df = pd.read_csv('./dataset/ad_features.csv', index_col=[0])
label_df = pd.read_csv('./dataset/ad_labelled.csv', index_col=[0])
data_df = data_df.reset_index(drop=True)
label_df = label_df.reset_index(drop=True)

# Attach the label to each feature row by (visit_id, name); 'label' ends up
# as the last column of the merged frame.
df_labelled = data_df.merge(label_df[['visit_id', 'name', 'label']], on=['visit_id', 'name'])

# Drop rows labelled "Error". The explicit .copy() makes the filtered frame
# independent, so the column assignment below neither triggers
# SettingWithCopyWarning nor depends on view/copy semantics.
df_labelled = df_labelled[df_labelled['label'] != "Error"].copy()

# Replace content_policy_type with its target-encoded values.
te = TargetEncoder()
df_labelled['content_policy_type'] = te.fit_transform(df_labelled['content_policy_type'], df_labelled.label)

In [13]:
# Skip the first two columns, shuffle, and hold out 20% for testing.
# The shuffle carries the same seed as the split; left unseeded it would make
# the split differ between runs despite random_state on train_test_split.
df_ = df_labelled[df_labelled.columns[2:]].sample(frac=1, random_state=42)
train, test = train_test_split(df_, test_size=0.2, shuffle=True, random_state=42)

# Everything except the last column ('label') is treated as a feature.
train_df = train[train.columns[:-1]]
target_df = pd.DataFrame(train['label'])
test_df = test[test.columns[:-1]]

In [14]:
# Generate data with tabgan's GANGenerator, spelling out every parameter.
# (OriginalGenerator() / GANGenerator() with defaults are the simpler
# alternatives that were tried via the same generate_data_pipe call.)
adversarial_model_settings = {
    "metrics": "AUC", "max_depth": 2, "max_bin": 100,
    "learning_rate": 0.02, "random_state": 42, "n_estimators": 100,
}
gan_training_settings = {"batch_size": 2000, "patience": 25, "epochs": 50}

gan_generator = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params=adversarial_model_settings,
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params=gan_training_settings,
)

# Returns the generated feature frame and its matching targets.
new_train3, new_target3 = gan_generator.generate_data_pipe(
    train_df, target_df, test_df,
    deep_copy=True, only_adversarial=False, use_adversarial=True,
)

Fitting CTGAN transformers for each column:   0%|          | 0/29 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/50 [00:00<?, ?it/s]

In [15]:
# Load the serialized AdGraph classifier.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
adgraph_model_path = './model/model_0.joblib'
adgraph_model = joblib.load(adgraph_model_path)

# NOTE(review): reset_index() without drop=True adds the old index as an
# extra leading column of the model input - confirm the model expects it.
adgraph_inputs = new_train3.reset_index()
tmp = adgraph_model.predict(adgraph_inputs)

In [16]:
# Score the AdGraph predictions (tmp) against the generated targets.
metrics(new_target3, tmp) # prints accuracy, precision, recall, F1, ASR (one per line)

0.732571794598057
0.7121826537884932
0.8972350373099669
0.7940700956347793
0.267428205401943


In [17]:
# ROC AUC of the AdGraph model on the generated samples, using column 1 of
# predict_proba as the score. The model input is built with reset_index(),
# the same way as for the hard predictions.
adgraph_proba_inputs = new_train3.reset_index()
adgraph_class_proba = adgraph_model.predict_proba(adgraph_proba_inputs)
y_pred_proba = adgraph_class_proba[:, 1]
roc_auc_score(new_target3, y_pred_proba)

0.8208126032623237

In [20]:
# Persist the generated AdGraph features side by side with their targets.
adgraph_mutated = pd.concat([new_train3, new_target3], axis=1)
adgraph_mutated.to_csv('./dataset/GAN_mutated_AdGraph.csv')

In [None]:
# mut_ad_df = pd.concat([new_train3, new_target3], axis=1)

In [None]:
# mut_ad_df.to_csv('/data/kiho/webtrack/feature_extracted/adgraph/mut_ad_df.csv')

## WebGraph

In [21]:
# Load WebGraph features and labels with a fresh 0..n-1 index on each frame.
data_df = pd.read_csv('./dataset/web_features.csv', index_col=[0]).reset_index(drop=True)
label_df = pd.read_csv('./dataset/web_labelled.csv', index_col=[0]).reset_index(drop=True)

# Join the label onto the feature rows by (visit_id, name) and discard rows
# whose label is the string "Error".
webgraph_labels = label_df[['visit_id', 'name', 'label']]
webgraph_merged = data_df.merge(webgraph_labels, on=['visit_id', 'name'])
df_labelled = webgraph_merged[webgraph_merged['label'] != "Error"]

# content_policy_type is left as-is in this section: the TargetEncoder step
# is disabled here.

In [29]:
# Skip the first two columns, shuffle, and hold out 20% for testing.
# Both the shuffle and the split are seeded (42, as in the earlier sections)
# so that the split is reproducible across kernel restarts.
df_ = df_labelled[df_labelled.columns[2:]].sample(frac=1, random_state=42)
train, test = train_test_split(df_, test_size=0.2, shuffle=True, random_state=42)

# Everything except the last column ('label') is treated as a feature.
train_df = train[train.columns[:-1]]
target_df = pd.DataFrame(train['label'])
test_df = test[test.columns[:-1]]

# Run the tabgan GANGenerator pipeline with every parameter spelled out;
# the argument values are unchanged.
new_train3, new_target3 = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params={
        "metrics": "AUC", "max_depth": 2, "max_bin": 100,
        "learning_rate": 0.02, "random_state": 42, "n_estimators": 100,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params={"batch_size": 2000, "patience": 25, "epochs": 50},
).generate_data_pipe(
    train_df, target_df, test_df,
    deep_copy=True, only_adversarial=False, use_adversarial=True,
)

Fitting CTGAN transformers for each column:   0%|          | 0/60 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/50 [00:00<?, ?it/s]

In [30]:
# new_train3.reset_index(inplace=True)

In [37]:
# Load the serialized WebGraph classifier.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files from a trusted source.
webgraph_model_path = './model/web_graph_model.joblib'
webgraph_model = joblib.load(webgraph_model_path)

# Predict on all generated feature columns (the index is not added here).
webgraph_inputs = new_train3[new_train3.columns[:]]
tmp = webgraph_model.predict(webgraph_inputs)

In [40]:
# Persist the generated WebGraph features side by side with their targets.
webgraph_mutated = pd.concat([new_train3, new_target3], axis=1)
webgraph_mutated.to_csv('./dataset/GAN_mutated_WebGraph.csv')

In [112]:
# Score the WebGraph predictions (tmp) against the generated targets.
metrics(new_target3, tmp) # prints accuracy, precision, recall, F1, ASR (one per line)

0.8946204509398586
0.8843035284907454
0.8723040432456286
0.8782628012884595
0.10537954906014134


In [113]:
# ROC AUC of the WebGraph model on the generated samples, using column 1 of
# predict_proba as the score.
webgraph_class_proba = webgraph_model.predict_proba(new_train3)
y_pred_proba = webgraph_class_proba[:, 1]
roc_auc_score(new_target3, y_pred_proba)

0.9612457986728018

## De-Adlock

In [122]:
# Load the De-Adlock feature table; the first CSV column becomes the index.
df_labelled = pd.read_csv('./RFECV_27_0525.csv', index_col=[0])

# Skip the first two columns, shuffle, and hold out 20% for testing.
# Both the shuffle and the split are seeded (42, as in the earlier sections)
# so that the split is reproducible across kernel restarts.
df_ = df_labelled[df_labelled.columns[2:]].sample(frac=1, random_state=42)
train, test = train_test_split(df_, test_size=0.2, shuffle=True, random_state=42)

# Everything except the last column is treated as a feature; the target is
# read from the 'label' column (assumed to be that last column - TODO confirm).
train_df = train[train.columns[:-1]]
target_df = pd.DataFrame(train['label'])
test_df = test[test.columns[:-1]]

# Run the tabgan GANGenerator pipeline with every parameter spelled out;
# the argument values are unchanged.
new_train3, new_target3 = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params={
        "metrics": "AUC", "max_depth": 2, "max_bin": 100,
        "learning_rate": 0.02, "random_state": 42, "n_estimators": 100,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params={"batch_size": 2000, "patience": 25, "epochs": 50},
).generate_data_pipe(
    train_df, target_df, test_df,
    deep_copy=True, only_adversarial=False, use_adversarial=True,
)

Fitting CTGAN transformers for each column:   0%|          | 0/28 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/50 [00:00<?, ?it/s]

In [123]:
onnx_model_file = './model/De-Adlock.onnx'

# Parse the model once just to verify that its IR is well formed.
model = onnx.load(onnx_model_file)
onnx.checker.check_model(model)

# The runtime session loads the same file independently of `model`.
ort_session = ort.InferenceSession(onnx_model_file)
input_name = ort_session.get_inputs()[0].name

# Feed the generated features as a float32 matrix and collect every output.
input_data = new_train3.values.astype('float32')
tmp = ort_session.run(None, {input_name: input_data})

In [124]:
# Binarise the generated targets at 0.5 to get the ground-truth vector.
generated_targets = np.array(new_target3)
true = np.where(generated_targets > 0.5, 1, 0)

# The session's first output, cast to int, is used as the predicted label.
onnx_labels = tmp[0].astype(int)
pred = pd.Series(onnx_labels, name='pred')

In [125]:
# Re-run the session requesting only its first output, by name.
label_name = ort_session.get_outputs()[0].name
pred_onx = ort_session.run([label_name], {input_name: input_data})[0]
metrics(true, pred) # prints accuracy, precision, recall, F1, ASR (one per line)
# NOTE(review): the AUC below is computed from the session's first output.
# If that output holds hard class labels rather than scores, the AUC is
# derived from labels only - confirm which output carries probabilities.
roc_auc_score(true, pred_onx.astype('float64'))

0.9478501868849958
0.9511280668289154
0.9170471841704718
0.9337767589034903
0.0521498131150042


0.9427560100640994

In [12]:
# Load the test-set feature table; the first CSV column becomes the index.
df_labelled = pd.read_csv('./dataset/testset.csv', index_col=[0])

# Skip the first two columns, shuffle, and hold out 20% for testing.
# Both the shuffle and the split are seeded (42, as in the earlier sections)
# so that the split is reproducible across kernel restarts.
df_ = df_labelled[df_labelled.columns[2:]].sample(frac=1, random_state=42)
train, test = train_test_split(df_, test_size=0.2, shuffle=True, random_state=42)

# Everything except the last column is treated as a feature; the target is
# read from the 'label' column (assumed to be that last column - TODO confirm).
train_df = train[train.columns[:-1]]
target_df = pd.DataFrame(train['label'])
test_df = test[test.columns[:-1]]

# Run the tabgan GANGenerator pipeline with every parameter spelled out;
# the argument values are unchanged.
new_train3, new_target3 = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params={
        "metrics": "AUC", "max_depth": 2, "max_bin": 100,
        "learning_rate": 0.02, "random_state": 42, "n_estimators": 100,
    },
    pregeneration_frac=2,
    only_generated_data=False,
    gan_params={"batch_size": 2000, "patience": 25, "epochs": 50},
).generate_data_pipe(
    train_df, target_df, test_df,
    deep_copy=True, only_adversarial=False, use_adversarial=True,
)

Fitting CTGAN transformers for each column:   0%|          | 0/28 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/50 [00:00<?, ?it/s]

In [183]:
# NOTE(review): this import sits mid-notebook rather than in the import cell
# at the top; consider moving it there so dependencies are visible up front.
import h2o
# Connect to a local H2O instance; per the output below, one is started
# when none is already running.
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.18" 2023-01-17; OpenJDK Runtime Environment (build 11.0.18+10-post-Ubuntu-0ubuntu118.04.1); OpenJDK 64-Bit Server VM (build 11.0.18+10-post-Ubuntu-0ubuntu118.04.1, mixed mode, sharing)
  Starting server from /data/kiho/mambaforge/envs/autocomplete/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpve10p792
  JVM stdout: /tmp/tmpve10p792/h2o_kiho_started_from_python.out
  JVM stderr: /tmp/tmpve10p792/h2o_kiho_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.2
H2O_cluster_version_age:,3 months and 15 days
H2O_cluster_name:,H2O_from_python_kiho_s70525
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,29.97 Gb
H2O_cluster_total_cores:,40
H2O_cluster_allowed_cores:,40


In [185]:
# Import the De-Adlock model from its MOJO archive into the H2O cluster.
path = './model/De-Adlock_mojo.zip'
h2o_model = h2o.import_mojo(path)

generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%


In [186]:
# Upload the generated features to H2O as an H2OFrame for scoring.
h2o_new_train = h2o.H2OFrame(new_train3)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [188]:
# Score the generated samples with the imported MOJO model.
pred = h2o_model.predict(h2o_new_train)

generic prediction progress: |███████████████████████████████████████████████████| (done) 100%


In [195]:
# Pull the `predict` column of the H2O result out as a plain Python list.
# NOTE(review): rebinding `pred` makes this cell non-rerunnable on its own -
# after one run `pred` is a list, which has no as_data_frame().
pred = pred.as_data_frame().predict.to_list()

In [200]:
# Rebuild the ground truth from the targets that belong to the samples just
# scored. `true` was last assigned in the ONNX section from an earlier
# GAN run, so on a top-to-bottom run it would not match `pred` here.
# Assumes new_target3 is numeric and binarises at 0.5 like the ONNX
# section - TODO confirm for this dataset.
true = np.where(np.array(new_target3) > 0.5, 1, 0)

metrics(true, pred) # prints accuracy, precision, recall, F1, ASR (one per line)
# AUC is computed from the values in H2O's `predict` column.
roc_auc_score(true, pred)

0.9619014761329752
0.9642924380294948
0.940219054029248
0.9521035999752153
0.038098523867024815


0.9583707912496463