In [489]:
# Data handling, model (de)serialization and inference runtimes.
import pandas as pd
import numpy as np
import joblib
import onnx
import onnxruntime as ort
import torch
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split
from tabgan.sampler import OriginalGenerator, GANGenerator
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix  # NOTE: already listed at the top of this import; the repeat is redundant
)
from sewar.full_ref import mse, rmse, psnr, uqi, ssim, ergas, scc, rase, sam, msssim, vifp
from art.estimators.classification import PyTorchClassifier
from art.metrics import clever
import json
from tqdm import tqdm
import h2o
# Attach to the H2O backend; it is needed later to import and score the MOJO model.
h2o.init()

True
Checking whether there is an H2O instance running at http://localhost:54321. connected.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,1 day 14 hours 11 mins
H2O_cluster_timezone:,Asia/Seoul
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.2
H2O_cluster_version_age:,3 months and 18 days
H2O_cluster_name:,H2O_from_python_kiho_evczt9
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,27.84 Gb
H2O_cluster_total_cores:,40
H2O_cluster_allowed_cores:,40


In [458]:
def metrics(true, pred):
    """Print binary-classification and attack-success metrics.

    Prints, one per line and in this order: accuracy, precision, recall, F1,
    the false negative rate, the false positive rate, and the attack success
    rate (ASR), i.e. the fraction of samples whose prediction differs from
    the true label (equivalently, 1 - accuracy).

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    pred : array-like
        Predicted labels, matched to ``true`` by position.

    Returns
    -------
    None
        Results are only printed.
    """
    print(accuracy_score(true, pred))
    print(precision_score(true, pred))
    print(recall_score(true, pred))
    print(f1_score(true, pred))

    # Compare by position on plain arrays. ``true != pred`` on the raw inputs
    # collapses to a single bool when both are lists and raises when both are
    # pandas Series carrying different indexes.
    true_arr = np.asarray(true)
    pred_arr = np.asarray(pred)

    # Number of attacks
    total_attacks = len(true_arr)

    # Number of successful attacks (misclassifications)
    successful_attacks = int(np.count_nonzero(true_arr != pred_arr))

    cm = confusion_matrix(true, pred)
    if cm.shape == (1, 1):
        # Only one class occurs in both inputs, so the matrix is 1x1 and
        # cannot be unpacked into four cells; pin the 0/1 label set instead.
        cm = confusion_matrix(true, pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Calculate FNR (undefined -> nan when there are no actual positives)
    actual_positives = tp + fn
    fnr = fn / actual_positives if actual_positives else float('nan')
    print('False Negative Rate:', fnr)

    # Calculate FPR (undefined -> nan when there are no actual negatives)
    actual_negatives = fp + tn
    fpr = fp / actual_negatives if actual_negatives else float('nan')
    print('False Positive Rate:', fpr)

    # ASR
    asr = successful_attacks / total_attacks
    print(asr)

### Content mutated

In [73]:
# Read the content-mutated feature and label tables; the stored index column
# is consumed by index_col and then discarded in favour of a fresh 0..n-1 index.
mut_data_df = pd.read_csv(
    './dataset/WebGraph/code/out/features.csv', index_col=[0]
).reset_index(drop=True)
mut_label_df = pd.read_csv(
    './dataset/WebGraph/code/out/labelled.csv', index_col=[0]
).reset_index(drop=True)

# Attach each row's label by (visit_id, name), then drop rows labelled "Error".
mut_df_labelled = pd.merge(
    mut_data_df,
    mut_label_df[['visit_id', 'name', 'label']],
    on=['visit_id', 'name'],
)
mut_df_labelled = mut_df_labelled.loc[mut_df_labelled['label'] != "Error"]
# content_policy_type is left as-is here; no encoder is fitted in this cell.

## AdGraph

In [424]:
# Read the AdGraph feature table and its label table, replacing the stored
# index column with a fresh 0..n-1 index.
data_df = pd.read_csv(
    './dataset/adgraph/ad_features.csv', index_col=[0]
).reset_index(drop=True)
label_df = pd.read_csv(
    './dataset/adgraph/ad_labelled.csv', index_col=[0]
).reset_index(drop=True)

# Join labels onto features by (visit_id, name); rows labelled "Error" are removed.
df_labelled = pd.merge(
    data_df,
    label_df[['visit_id', 'name', 'label']],
    on=['visit_id', 'name'],
)
df_labelled = df_labelled.loc[df_labelled['label'] != "Error"]

# Fit a target encoder for content_policy_type against the label. It is only
# fitted here; df_labelled itself is not transformed.
te = TargetEncoder()
te.fit(df_labelled['content_policy_type'], df_labelled['label'])

In [214]:
# Keep only the 20% hold-out part of the AdGraph frame; random_state fixes the split.
_, x_test = train_test_split(df_labelled, test_size=0.2, random_state=42)

In [215]:
# Apply the encoder fitted on the AdGraph frame to the mutated frame's content_policy_type.
# NOTE(review): the column is overwritten in place, so re-running this cell would
# pass already-encoded values back through the encoder.
mut_df_labelled.content_policy_type = te.transform(mut_df_labelled.content_policy_type)

In [216]:
# Mutated rows with label == 1, restricted to the columns of the AdGraph frame.
only_at = mut_df_labelled[mut_df_labelled.label == 1][df_labelled.columns]
# Alternative kept for reference: take every mutated row regardless of label.
# only_at = mut_df_labelled[df_labelled.columns]

In [224]:
# Every mutated row (both labels), restricted to the columns of the AdGraph frame.
mut_test_x = mut_df_labelled[df_labelled.columns]

In [231]:
# Stack the label-1 mutated rows on top of the hold-out rows whose label is 0.
test_df = pd.concat([only_at, x_test[x_test.label==0]], axis=0)

In [238]:
# Load the persisted AdGraph classifier and score the label-1 mutated rows.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
adgraph_model_path = "./model/adgraph/results/model_0.joblib"
adgraph_model = joblib.load(adgraph_model_path)
# columns[2:] skips the first two columns of the frame.
# NOTE(review): `label` was appended by the merge and is still inside columns[2:];
# the De-Adlock cell slices [2:-1] instead. Confirm this model expects it as an input.
tmp = adgraph_model.predict(only_at[only_at.columns[2:]])

In [239]:
# NOTE(review): the saved output below has five lines, while the current `metrics`
# also prints FNR and FPR; it appears to come from an earlier version of the helper.
metrics(only_at.label, tmp) # printed order: accuracy, precision, recall, F1, ..., ASR

0.007643976682468755
1.0
0.007643976682468755
0.01517197911039078
0.9923560233175313


## WebGraph

In [305]:
# Read the WebGraph feature and label tables, replacing the stored index
# column with a fresh 0..n-1 index.
data_df = pd.read_csv(
    './dataset/webgraph/web_features.csv', index_col=[0]
).reset_index(drop=True)
label_df = pd.read_csv(
    './dataset/webgraph/web_labelled.csv', index_col=[0]
).reset_index(drop=True)

# Join labels onto features by (visit_id, name); rows labelled "Error" are removed.
df_labelled = pd.merge(
    data_df,
    label_df[['visit_id', 'name', 'label']],
    on=['visit_id', 'name'],
)
df_labelled = df_labelled.loc[df_labelled['label'] != "Error"]
# No target encoder is fitted or applied in this section.

In [243]:
# Keep only the 20% hold-out part of the WebGraph frame; random_state fixes the split.
_, x_test = train_test_split(df_labelled, test_size=0.2, random_state=42)

In [251]:
# Mutated rows with label == 1, restricted to the columns of the WebGraph frame.
# NOTE(review): if the AdGraph section ran first, mut_df_labelled.content_policy_type
# already holds the AdGraph target encoding at this point. Confirm that is intended.
only_at = mut_df_labelled[mut_df_labelled.label == 1][df_labelled.columns]
# Alternative kept for reference: take every mutated row regardless of label.
# only_at = mut_df_labelled[df_labelled.columns]

In [245]:
# Stack the label-1 mutated rows on top of the hold-out rows whose label is 1.
# NOTE(review): the AdGraph and De-Adlock sections combine with `x_test.label==0`;
# the `==1` here looks inconsistent with them. Confirm which is intended.
test_df = pd.concat([only_at, x_test[x_test.label==1]], axis=0)

In [246]:
# Every mutated row (both labels), restricted to the columns of the WebGraph frame.
mut_test_x = mut_df_labelled[df_labelled.columns]

In [252]:
# Load the persisted WebGraph classifier and score every mutated row.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
webgraph_model_path = "./dataset/webgraph/results/model_0.joblib"
webgraph_model = joblib.load(webgraph_model_path)
# columns[2:] skips the first two columns of the frame.
# NOTE(review): `label` was appended by the merge and is still inside columns[2:];
# the De-Adlock cell slices [2:-1] instead. Confirm this model expects it as an input.
tmp = webgraph_model.predict(mut_test_x[mut_test_x.columns[2:]])

In [253]:
# NOTE(review): the saved output below has five lines, while the current `metrics`
# also prints FNR and FPR; it appears to come from an earlier version of the helper.
metrics(mut_test_x.label, tmp) # printed order: accuracy, precision, recall, F1, ..., ASR

0.642438422378773
0.5485512599523927
0.3031732710322771
0.39051612054986634
0.3575615776212269


## De-Adlock

In [490]:
# Load the De-Adlock test set, using the file's first column as the index.
# NOTE: this rebinds `df_labelled`, which the AdGraph and WebGraph sections also assign.
df_labelled = pd.read_csv('./dataset/testset.csv', index_col=[0])

In [375]:
# Re-read the content-mutated feature and label tables from disk, replacing
# the stored index column with a fresh 0..n-1 index.
mut_data_df = pd.read_csv(
    './dataset/WebGraph/code/out/features.csv', index_col=[0]
).reset_index(drop=True)
mut_label_df = pd.read_csv(
    './dataset/WebGraph/code/out/labelled.csv', index_col=[0]
).reset_index(drop=True)

# Attach each row's label by (visit_id, name), then drop rows labelled "Error".
mut_df_labelled = pd.merge(
    mut_data_df,
    mut_label_df[['visit_id', 'name', 'label']],
    on=['visit_id', 'name'],
)
mut_df_labelled = mut_df_labelled.loc[mut_df_labelled['label'] != "Error"]

In [260]:
# Load the FQDN and request-URL word-vector tables and append them as extra columns.
mut_fqdn = pd.read_csv('./dataset/wordvec/mut_fqdn.csv', index_col=0)
mut_req = pd.read_csv('./dataset/wordvec/mut_req.csv', index_col=0)
# NOTE(review): concat with axis=1 aligns rows by index label. mut_df_labelled was
# filtered on label != "Error" without re-indexing, so its index may have gaps;
# confirm the CSV indexes line up with it. This cell also rebinds mut_df_labelled,
# so re-running it appends the vector columns a second time.
mut_df_labelled = pd.concat([mut_df_labelled, mut_fqdn, mut_req], axis=1)

In [261]:
# Keep only the 20% hold-out part of the De-Adlock test set; random_state fixes the split.
_, x_test = train_test_split(df_labelled, test_size=0.2, random_state=42)
# Mutated rows with label == 1, restricted to the test set's columns.
only_at = mut_df_labelled[mut_df_labelled.label == 1][df_labelled.columns]
# Alternative kept for reference: take every mutated row regardless of label.
# only_at = mut_df_labelled[df_labelled.columns]
# Stack the label-1 mutated rows on top of the hold-out rows whose label is 0.
test_df = pd.concat([only_at, x_test[x_test.label==0]], axis=0)

In [264]:
# Import the exported De-Adlock MOJO into the H2O backend.
path = './model/De-Adlock_mojo.zip'
h2o_model = h2o.import_mojo(path)

# Model inputs: the test-set column layout minus its first two columns and its last one.
mut_test_x = h2o.H2OFrame(
    mut_df_labelled[mut_df_labelled[df_labelled.columns].columns[2:-1]]
)

# Score the frame and pull the `predict` column back as a plain Python list.
pred = h2o_model.predict(mut_test_x).as_data_frame().predict.to_list()

# Report the metrics against the integer-cast labels of the mutated frame.
metrics(mut_df_labelled.label.astype(int), pred)

generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%
0.9828251144135343
0.9869026958232096
0.9673826751650146
0.9770451994226936
0.01717488558646578


In [265]:
# ROC AUC of the predictions against the integer-cast labels.
# NOTE(review): `pred` holds the values of the prediction frame's `predict` column,
# not class probabilities, so this is the AUC of a single operating point.
# Consider scoring the positive-class probability instead.
roc_auc_score(mut_df_labelled.label.astype(int), pred)

0.9797929974764394

In [None]:
# Write (name, pred) pairs to CSV.
# NOTE(review): concat with axis=1 aligns on index label. pd.Series(pred) gets a
# fresh 0..n-1 index, while `pred` was computed from mut_df_labelled rows and `name`
# is taken from df_labelled; confirm the two line up row for row.
# NOTE(review): the output path is absolute and machine-specific.
pd.concat([df_labelled.name, pd.Series(pred)], axis =1, keys= ['name', 'pred']).to_csv('/data/kiho/webtrack/out/0628_all_df_pred.csv')

In [165]:
# Load the two JSON lookup tables used by the word-vector cell: each is indexed
# by single characters of a URL (reqdict) or of its domain (fqdndict).
# NOTE(review): both paths are absolute and machine-specific.
with open('/data/kiho/webtrack/feature_extracted/wordvec/fqdnwordvec2.json', 'r') as f:
    fqdndict=json.load(f)
with open('/data/kiho/webtrack/feature_extracted/wordvec/reqwordvec2.json','r') as f:
    reqdict=json.load(f)

In [201]:
def _mean_char_vector(text, char_vectors, dim):
    """Return the element-wise mean of the vectors of ``text``'s characters.

    ``char_vectors`` maps a single character to a length-``dim`` vector. A
    character missing from the table raises ``KeyError``. An empty ``text``
    yields a vector of NaN (0/0), as no average exists.
    """
    acc = np.zeros(dim)
    for ch in text:
        acc += char_vectors[ch]
    return acc / len(text)


mut_url = pd.DataFrame(mut_df_labelled.name)

# Convert the JSON lists to arrays once instead of on every character lookup.
_req_vectors = {ch: np.asarray(vec, dtype=float) for ch, vec in reqdict.items()}
_fqdn_vectors = {ch: np.asarray(vec, dtype=float) for ch, vec in fqdndict.items()}

req_df = []
fqdn_df = []
# Iterate over the URL values by position. Looking rows up with
# ``mut_url.name[i]`` is label-based and fails (or reads the wrong row) once
# the index is no longer 0..n-1, e.g. after the `label != "Error"` filter.
for requrl in tqdm(mut_url['name'].to_numpy()):
    # Third '/'-separated piece of the URL, i.e. the host of "scheme://host/...".
    domain = requrl.split('/')[2]

    req_df.append(_mean_char_vector(requrl, _req_vectors, 200))
    fqdn_df.append(_mean_char_vector(domain, _fqdn_vectors, 30))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 116682/116682 [04:05<00:00, 474.62it/s]


In [211]:
# Column labels: fqdn_0 .. fqdn_29 and req_url_0 .. req_url_199.
fqdn_name = ["fqdn_" + str(k) for k in range(0, 30)]
req_name = ["req_url_" + str(k) for k in range(0, 200)]

# Wrap the collected per-row vectors in labelled frames (the names fqdn_df and
# req_df are rebound from the raw collections to DataFrames here).
fqdn_df = pd.DataFrame(fqdn_df, columns=fqdn_name)
req_df = pd.DataFrame(req_df, columns=req_name)

# FQDN columns first, then request-URL columns, side by side.
word_vec_df = pd.concat([fqdn_df, req_df], axis=1)

In [27]:
# Display the frame.
# NOTE(review): `mut_df` is not assigned in any visible cell; it relies on
# leftover kernel state and will fail on a fresh Restart & Run All.
mut_df

Unnamed: 0,content_policy_type,fqdn_0,fqdn_1,fqdn_12,fqdn_14,fqdn_17,fqdn_23,fqdn_24,fqdn_25,fqdn_26,...,req_url_121,req_url_135,req_url_179,req_url_18,req_url_21,req_url_22,req_url_33,req_url_38,req_url_91,label
0,main_frame,-0.152640,-0.546892,-0.667190,0.013601,0.441756,-0.093347,0.060051,0.954488,-0.909166,...,-0.228745,-0.539392,-0.107055,-0.759763,0.432369,0.521323,0.405985,-0.040894,-0.523111,False
1,main_frame,-0.117671,-0.560468,-0.792520,-0.012433,0.535006,-0.034954,-0.084181,0.980041,-0.963042,...,-0.176061,-0.559774,-0.081364,-0.767314,0.417772,0.531816,0.389185,0.011260,-0.497677,False
2,script,0.364265,0.059967,0.055646,-0.042909,-0.404186,-0.303092,-0.216884,0.263311,-0.811278,...,-0.249772,-0.472135,-0.027587,-0.645954,0.473350,0.490576,0.282350,0.031252,-0.463946,False
3,script,0.342284,0.004442,-0.075580,0.081413,-0.227664,-0.235425,0.097586,0.512876,-0.736975,...,-0.222038,-0.393168,0.004918,-0.393204,0.169106,0.370731,0.159044,0.069194,-0.336326,False
4,script,0.342284,0.004442,-0.075580,0.081413,-0.227664,-0.235425,0.097586,0.512876,-0.736975,...,-0.222038,-0.393168,0.004918,-0.393204,0.169106,0.370731,0.159044,0.069194,-0.336326,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
116677,font,-0.170602,-0.619880,-0.405115,-0.052260,0.420568,-0.101938,0.335232,0.722515,-1.295664,...,-0.084049,-0.486129,-0.009968,-0.760929,0.173264,0.190228,-0.191214,0.170518,-0.087641,False
116678,sub_frame,0.073656,-0.440562,-0.409615,-0.140995,0.761098,0.063120,0.587713,0.804132,-0.939155,...,-0.159320,-0.415338,0.021846,-0.626516,0.266453,0.496677,0.308287,0.061006,-0.406189,False
116679,script,-0.194270,-0.662524,-0.429552,-0.088807,0.468143,-0.317437,0.430545,0.744713,-1.199056,...,-0.172181,-0.586788,0.011731,-0.674680,0.264336,0.392884,0.087102,0.104103,-0.425445,False
116680,script,-0.098370,-0.482271,-0.320844,0.031588,0.252629,-0.249348,0.449975,0.702837,-0.948058,...,-0.301018,-0.753296,-0.133408,-0.758383,0.351124,0.437582,0.326339,0.012052,-0.520431,False


In [26]:
# Display the frame most recently bound to mut_test_x. Several cells assign this
# name (pandas frames in the AdGraph/WebGraph sections, an H2OFrame in De-Adlock).
mut_test_x

content_policy_type,fqdn_0,fqdn_1,fqdn_12,fqdn_14,fqdn_17,fqdn_23,fqdn_24,fqdn_25,fqdn_26,fqdn_27,fqdn_4,fqdn_6,is_subdomain,is_third_party,keyword_char_present,num_requests_sent,num_set_storage,req_url_121,req_url_135,req_url_179,req_url_18,req_url_21,req_url_22,req_url_33,req_url_38,req_url_91
main_frame,-0.15264,-0.546892,-0.66719,0.0136007,0.441756,-0.0933466,0.0600509,0.954488,-0.909166,-0.770389,-0.159188,0.585305,1,0,0,2,0,-0.228745,-0.539392,-0.107055,-0.759763,0.432369,0.521323,0.405985,-0.0408941,-0.523111
main_frame,-0.117671,-0.560468,-0.79252,-0.0124325,0.535006,-0.0349536,-0.0841807,0.980041,-0.963042,-0.745699,-0.239322,0.637288,1,0,0,9,0,-0.176061,-0.559774,-0.0813635,-0.767314,0.417772,0.531816,0.389185,0.0112596,-0.497677
script,0.364265,0.0599665,0.0556457,-0.0429086,-0.404186,-0.303092,-0.216884,0.263311,-0.811278,-0.624617,-0.0689018,-0.17864,0,1,0,1,0,-0.249772,-0.472135,-0.0275868,-0.645954,0.47335,0.490576,0.28235,0.0312524,-0.463946
script,0.342284,0.00444213,-0.0755802,0.081413,-0.227664,-0.235425,0.0975857,0.512876,-0.736975,-0.555548,0.0979708,0.0768501,0,1,0,1,1,-0.222038,-0.393168,0.00491813,-0.393204,0.169106,0.370731,0.159044,0.0691944,-0.336326
script,0.342284,0.00444213,-0.0755802,0.081413,-0.227664,-0.235425,0.0975857,0.512876,-0.736975,-0.555548,0.0979708,0.0768501,0,1,0,1,1,-0.222038,-0.393168,0.00491813,-0.393204,0.169106,0.370731,0.159044,0.0691944,-0.336326
script,0.342284,0.00444213,-0.0755802,0.081413,-0.227664,-0.235425,0.0975857,0.512876,-0.736975,-0.555548,0.0979708,0.0768501,0,1,0,0,0,-0.222038,-0.393168,0.00491813,-0.393204,0.169106,0.370731,0.159044,0.0691944,-0.336326
script,0.342284,0.00444213,-0.0755802,0.081413,-0.227664,-0.235425,0.0975857,0.512876,-0.736975,-0.555548,0.0979708,0.0768501,0,1,0,0,0,-0.222038,-0.393168,0.00491813,-0.393204,0.169106,0.370731,0.159044,0.0691944,-0.336326
sub_frame,0.364265,0.0599665,0.0556457,-0.0429086,-0.404186,-0.303092,-0.216884,0.263311,-0.811278,-0.624617,-0.0689018,-0.17864,0,1,1,0,0,-0.0621938,-0.0774157,0.0444398,-0.0155474,-0.0277931,0.0638106,-0.128063,0.181248,-0.0870932
image,0.342284,0.00444213,-0.0755802,0.081413,-0.227664,-0.235425,0.0975857,0.512876,-0.736975,-0.555548,0.0979708,0.0768501,0,1,0,0,0,-0.239372,-0.434929,0.0176569,-0.64488,0.389464,0.452307,0.191497,0.0549185,-0.398228
stylesheet,-0.117671,-0.560468,-0.79252,-0.0124325,0.535006,-0.0349536,-0.0841807,0.980041,-0.963042,-0.745699,-0.239322,0.637288,1,0,0,0,0,-0.131715,-0.516956,-0.0151478,-0.693912,0.348919,0.49455,0.34061,0.0499303,-0.466844
