In [17]:
# Enable IPython's autoreload extension in mode 2 so edits to imported project
# modules are picked up without restarting the kernel (see the IPython
# autoreload docs for the exact semantics of mode 2).
# Re-running this cell in a live kernel only prints an "already loaded" notice.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [18]:
import pymongo
import pyarrow as pa
import pyarrow.parquet as pq
from pymongoarrow.monkey import patch_all
# NOTE(review): presumably patches pymongo with Arrow-aware helper methods;
# confirm against the pymongoarrow documentation.
patch_all()

from config import Config

# Open a MongoDB client and select the project database, both taken from the
# local Config object. Neither `client` nor `db` is referenced by the cells
# visible in this notebook excerpt.
client = pymongo.MongoClient(Config.MONGO_URI)
db = client[Config.MONGO_DB]

In [19]:
# Aggregate + save: optionally regenerate the parquet inputs via loader.run().
# The call is skipped by default; the load cell below reads parquet files
# from the 'floor/' directory.
from loader import run
need_to_rebuild_parquets = False # set to True to rebuild parquet files from DB
if need_to_rebuild_parquets:
    run()

In [20]:
# Load both parquet dumps, keep only the training columns, and merge them into
# a single pandas DataFrame.
from transformers.cast_timestamp import cast_timestamp
from transformers.drop_nontrain import drop_nontrain

# Read each class from disk and pass it through the project's
# drop_nontrain() filter.
phishing = drop_nontrain(pq.read_table('floor/phishing.parquet'))
benign_u = drop_nontrain(pq.read_table('floor/benign.parquet'))

# The two files do not store their columns in the same order, so reorder the
# benign table to follow the phishing schema before concatenating.
phishing_order = phishing.schema.names
benign = benign_u.select(phishing_order)

# Stack phishing rows first, benign rows second, then move to pandas and run
# the project's timestamp cast on the resulting frame.
data = pa.concat_tables([phishing, benign])
df = cast_timestamp(data.to_pandas())

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integer class ids.
# NOTE(review): with the two labels shown in the df preview this should map
# 'benign:unknown' -> 0 and 'misp:phishing' -> 1; confirm via le.classes_.
le = LabelEncoder()
labels = le.fit_transform(df['label'])

# Every remaining column (including the date columns) is used as a feature.
features = df.drop(columns='label').copy()

# Seeded, stratified 80/20 hold-out split so both partitions keep the same
# class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=labels,
)

In [22]:
# Display the merged frame (pandas truncates the rendering to head/tail rows).
# NOTE(review): every visible row shows -9223372037 in the three date columns,
# which looks like a sentinel for missing timestamps - confirm in
# cast_timestamp whether that is intended as a model input.
df

Unnamed: 0,label,domain_registration_date,domain_expiration_date,domain_last_changed_date,tls_version_score,cipher_score,root_crt_validity__len,root_crt_time_to_expire,leaf_crt_validity_len,leaf_cert_time_to_live,mean_cert_len,broken_chain,expired_chain,total_extension_count,critical_extensions,have_policies,percentage_of_policies,unknown_usage,X_509_used_cnt,version_2_used_cnt,version_1_used_cnt,subject_count,server_auth,client_auth,CA_count,CA_ratio,name_length,ip_entropy,countries_count,lat_stddev,lon_stddev,dns_A_count,dns_AAAA_count,dns_CNAME_count,dns_MX_count,dns_NS_count,dns_SOA_count,dns_TXT_count
0,misp:phishing,-9223372037,-9223372037,-9223372037,2.0,0.0,9066.0,4129.0,345.0,329.0,5147.2,0.0,0.0,7.0,-1.0,4.0,1.0,0.0,2.0,2.0,0.0,8.0,2.0,2.0,2.0,0.5,34,2.5,1,0.0,0.0,4,0,0,0,0,0,0
1,misp:phishing,-9223372037,-9223372037,-9223372037,,,,,,,,,,,,,,,,,,,,,,,35,1.0,1,0.0,0.0,1,0,0,0,0,0,0
2,misp:phishing,-9223372037,-9223372037,-9223372037,,,,,,,,,,,,,,,,,,,,,,,17,0.0,0,0.0,0.0,0,0,0,0,0,0,0
3,misp:phishing,-9223372037,-9223372037,-9223372037,,,,,,,,,,,,,,,,,,,,,,,22,1.0,1,0.0,0.0,1,0,0,1,0,0,0
4,misp:phishing,-9223372037,-9223372037,-9223372037,3.0,1.0,3583.0,2124.0,396.0,357.0,2544.0,0.0,0.0,7.0,-1.0,3.0,1.0,0.0,1.0,0.0,2.0,2.0,2.0,2.0,1.0,0.3,29,1.0,1,0.0,0.0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295348,benign:unknown,-9223372037,-9223372037,-9223372037,3.0,2.0,9132.0,2124.0,366.0,327.0,4381.0,0.0,0.0,4.0,-1.0,3.0,0.8,0.0,2.0,0.0,1.0,2.0,2.0,2.0,2.0,0.5,23,1.0,1,0.0,0.0,1,0,1,0,2,1,0
295349,benign:unknown,-9223372037,-9223372037,-9223372037,3.0,2.0,1349.0,570.0,90.0,52.0,1092.3,0.0,0.0,7.0,-1.0,3.0,1.0,0.0,0.0,3.0,0.0,1.0,2.0,2.0,1.0,0.3,17,2.0,1,0.0,0.0,2,0,1,0,0,0,0
295350,benign:unknown,-9223372037,-9223372037,-9223372037,3.0,2.0,1349.0,570.0,90.0,72.0,1092.3,0.0,0.0,7.0,-1.0,3.0,1.0,0.0,0.0,3.0,0.0,8.0,2.0,2.0,1.0,0.3,13,1.5,1,0.0,0.0,2,2,1,0,0,0,0
295351,benign:unknown,-9223372037,-9223372037,-9223372037,2.0,0.0,1349.0,570.0,90.0,45.0,1092.3,0.0,0.0,7.0,-1.0,3.0,1.0,0.0,0.0,3.0,0.0,33.0,2.0,2.0,1.0,0.3,23,1.0,1,0.0,0.0,1,0,0,0,0,0,0


In [23]:
import xgboost as xgb
from xgboost import XGBClassifier

# Wrap both partitions in xgboost DMatrix containers.
# Neither `train` nor `test` is used by the cells visible in this excerpt;
# the model below is fitted through XGBClassifier instead.
train = xgb.DMatrix(X_train, label=y_train)
test = xgb.DMatrix(X_test, label=y_test)

# Hyper-parameter dictionary, presumably intended for xgboost's native
# training API. Nothing in the visible cells consumes `params` or `epochs`;
# the classifier below is built with XGBClassifier() defaults.
params = dict(
    max_depth=4,
    eta=0.2,
    objective="multi:softmax",
    num_class=2,
)

# NOTE(review): looks like the planned number of boosting rounds - confirm.
epochs = 10

In [24]:
# Train an XGBoost classifier with library-default hyper-parameters on the
# training partition.
clf = XGBClassifier()
clf.fit(X_train, y_train)

# fit() hands back the estimator itself, so `model` is simply a second name
# for the fitted `clf`.
model = clf

In [25]:
# Predict class ids for the held-out partition with the fitted classifier.
predicted = clf.predict(X_test)

In [26]:
from sklearn.metrics import f1_score
# F1 on the hold-out set with f1_score's default settings; the printed value
# matches the class-1 row of the classification report further down.
print(f1_score(y_test, predicted))

0.8678832116788321


In [27]:
# Rank features by their 'gain' importance as reported by the fitted booster.
score = clf.get_booster().get_score(importance_type='gain')

# Materialise the (feature, gain) pairs and order them from highest to lowest
# gain; the sort is stable, so ties keep the booster's original order.
sorted_score = list(score.items())
sorted_score.sort(key=lambda pair: pair[1], reverse=True)
sorted_score

[('version_2_used_cnt', 270.34716796875),
 ('dns_SOA_count', 246.24798583984375),
 ('cipher_score', 202.4058380126953),
 ('domain_registration_date', 136.02772521972656),
 ('dns_CNAME_count', 133.58033752441406),
 ('client_auth', 116.72305297851562),
 ('CA_ratio', 111.17245483398438),
 ('dns_A_count', 96.15673828125),
 ('dns_NS_count', 93.26383972167969),
 ('root_crt_time_to_expire', 80.3289566040039),
 ('tls_version_score', 78.25501251220703),
 ('countries_count', 72.16368103027344),
 ('root_crt_validity__len', 58.74394989013672),
 ('server_auth', 57.670616149902344),
 ('leaf_crt_validity_len', 55.677001953125),
 ('leaf_cert_time_to_live', 55.487403869628906),
 ('version_1_used_cnt', 51.5734977722168),
 ('dns_TXT_count', 44.27719497680664),
 ('name_length', 40.764041900634766),
 ('subject_count', 40.40646743774414),
 ('mean_cert_len', 39.73749542236328),
 ('broken_chain', 38.085350036621094),
 ('have_policies', 32.55231857299805),
 ('ip_entropy', 31.576173782348633),
 ('dns_AAAA_count

In [28]:
from sklearn.metrics import confusion_matrix, classification_report

# Print the confusion matrix (rows are true classes, as the row sums match the
# per-class support) followed by per-class precision/recall/F1 to 4 decimals.
print(
    confusion_matrix(y_test, predicted),
    classification_report(y_test, predicted, digits=4),
    sep='\n',
)

[[51316   357]
 [ 1453  5945]]
              precision    recall  f1-score   support

           0     0.9725    0.9931    0.9827     51673
           1     0.9434    0.8036    0.8679      7398

    accuracy                         0.9694     59071
   macro avg     0.9579    0.8983    0.9253     59071
weighted avg     0.9688    0.9694    0.9683     59071

