In [91]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import pymongo
import pyarrow as pa
import pyarrow.parquet as pq
from pymongoarrow.monkey import patch_all
# Apply pymongoarrow's monkey patch immediately after importing it, before the
# MongoDB client below is created.
patch_all()

# Settings object (presumably a project-local `config` module); this cell reads
# only MONGO_URI and MONGO_DB from it.
from config import Config

# Client and database handles built from the configured URI / database name.
# NOTE(review): neither `client` nor `db` is referenced again in the cells
# visible here -- confirm whether they are still needed.
client = pymongo.MongoClient(Config.MONGO_URI)
db = client[Config.MONGO_DB]

In [2]:
# Aggregate from the DB and save the parquet files (only on request).
from loader import run

# Leave as False to reuse the parquet files already on disk;
# flip to True to rebuild them from the DB.
need_to_rebuild_parquets = False

if need_to_rebuild_parquets:
    run()

In [3]:
# Load both parquet tables and merge them into one pandas frame.
from transformers.cast_timestamp import cast_timestamp
from transformers.drop_nontrain import drop_nontrain

phishing = pq.read_table('floor/phishing.parquet')
benign_u = pq.read_table('floor/benign.parquet')

# Both tables go through the same drop_nontrain transformer.
phishing, benign_u = drop_nontrain(phishing), drop_nontrain(benign_u)

# The parquet files do not store their columns in a consistent order, so the
# benign table is re-selected in the phishing column order before concatenation.
phishing_order = phishing.schema.names
benign = benign_u.select(phishing_order)

data = pa.concat_tables([phishing, benign])
df = cast_timestamp(data.to_pandas())

df

Unnamed: 0,label,domain_registration_date,domain_expiration_date,domain_last_changed_date,has_tls,chain_len,tls_version_score,cipher_score,root_crt_validity__len,root_crt_time_to_expire,...,countries_count,lat_stddev,lon_stddev,dns_A_count,dns_AAAA_count,dns_CNAME_count,dns_MX_count,dns_NS_count,dns_SOA_count,dns_TXT_count
0,misp:phishing,,,,True,4.0,2.0,0.0,9066.0,4129.0,...,1,0.0,0.0,4,0,0,0,0,0,0
1,misp:phishing,,,,False,,,,,,...,1,0.0,0.0,1,0,0,0,0,0,0
2,misp:phishing,,,,False,,,,,,...,0,0.0,0.0,0,0,0,0,0,0,0
3,misp:phishing,,,,False,,,,,,...,1,0.0,0.0,1,0,0,1,0,0,0
4,misp:phishing,,,,True,3.0,3.0,1.0,3583.0,2124.0,...,1,0.0,0.0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295348,benign:unknown,,,,True,4.0,3.0,2.0,9132.0,2124.0,...,1,0.0,0.0,1,0,1,0,2,1,0
295349,benign:unknown,,,,True,3.0,3.0,2.0,1349.0,570.0,...,1,0.0,0.0,2,0,1,0,0,0,0
295350,benign:unknown,,,,True,3.0,3.0,2.0,1349.0,570.0,...,1,0.0,0.0,2,2,1,0,0,0,0
295351,benign:unknown,,,,True,3.0,2.0,0.0,1349.0,570.0,...,1,0.0,0.0,1,0,0,0,0,0,0


In [25]:
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Encode the string labels as integers; `le` retains the mapping (le.classes_).
le = LabelEncoder()
labels = le.fit_transform(df['label'])
# Every column except the label is used as a feature.
features = df.drop(columns='label').copy()

# Stratified 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

# NOTE(review): a softmax objective with num_class=2 is used for what looks
# like a two-class problem (scoring below is binary 'f1') -- confirm whether
# "binary:logistic" was intended; switching would change the reported scores.
params = dict(
    max_depth=45,
    eta=0.2,
    objective="multi:softmax",
    num_class=2,
)
epochs = 20  # forwarded to XGBClassifier as n_estimators

model = XGBClassifier(n_estimators=epochs, **params)

# Optional 5-fold stratified cross-validation, run on the training split only.
do_cross_val = True
if do_cross_val:
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=7)
    results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='f1')
    print("F1: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

F1: 90.65% (0.26%)


In [27]:
# Fit on the training split. The trailing semicolon hides the estimator repr
# without assigning to `_`, which IPython reserves for the previous cell output.
model.fit(X_train, y_train);

In [28]:
# Predictions of the fitted model for the held-out test features.
predicted = model.predict(X_test)

In [29]:
from sklearn.metrics import f1_score

# F1 of the test-split predictions, printed as a raw float.
print(f1_score(y_true=y_test, y_pred=predicted))

0.9092017062766605


In [30]:
# get gain score
score = model.get_booster().get_score(importance_type='gain')
sorted_score = sorted(score.items(), key=lambda x: x[1], reverse=True)
sorted_score

[('version_2_used_cnt', 54.55470657348633),
 ('client_auth', 46.63755416870117),
 ('broken_chain', 14.715441703796387),
 ('has_tls', 13.631027221679688),
 ('CA_ratio', 9.030411720275879),
 ('root_crt_time_to_expire', 7.775990962982178),
 ('dns_SOA_count', 7.171017169952393),
 ('cipher_score', 6.978414058685303),
 ('domain_registration_date', 6.95945930480957),
 ('dns_TXT_count', 6.446043968200684),
 ('CA_count', 5.569526195526123),
 ('critical_extensions', 5.120163440704346),
 ('mean_cert_len', 4.9212188720703125),
 ('dns_CNAME_count', 3.6793980598449707),
 ('tls_version_score', 3.552593469619751),
 ('dns_A_count', 3.1915926933288574),
 ('subdomain_count', 2.8307034969329834),
 ('subdomain_length', 2.7968759536743164),
 ('dns_NS_count', 2.3137500286102295),
 ('leaf_crt_validity_len', 1.9326372146606445),
 ('chain_len', 1.8833298683166504),
 ('digit_count', 1.8228983879089355),
 ('root_crt_validity__len', 1.7836785316467285),
 ('ip_entropy', 1.7110414505004883),
 ('countries_count', 1.6

In [31]:
from sklearn.metrics import classification_report, confusion_matrix

# Raw confusion counts first, then the per-class report with four decimals.
print(confusion_matrix(y_true=y_test, y_pred=predicted))
print(classification_report(y_true=y_test, y_pred=predicted, digits=4))

[[76971   537]
 [ 1400  9698]]
              precision    recall  f1-score   support

           0     0.9821    0.9931    0.9876     77508
           1     0.9475    0.8739    0.9092     11098

    accuracy                         0.9781     88606
   macro avg     0.9648    0.9335    0.9484     88606
weighted avg     0.9778    0.9781    0.9778     88606

