# Preprocessing

In [None]:
from __future__ import annotations

import asyncio

import aiohttp
import pandas as pd
from aioitertools.asyncio import gather

from classifiers.extractors.feature_extractor import extract_features_from_url


async def validate_urls(batch_size: int = 1000, concurrency: int = 500):
    """Fetch every pending, non-spam URL and append its features to dataset.csv.

    Candidate URLs are read from ``urls2.csv``; those whose ``url`` already
    appears in ``dataset.csv`` are skipped, as are rows flagged ``is_spam``.
    The remaining URLs are processed in batches and every finished batch is
    appended to ``dataset.csv`` immediately, so an interrupted run keeps the
    batches that were already written.

    Args:
        batch_size: Number of URLs fetched between two writes to disk.
        concurrency: Passed as ``limit`` to ``gather`` to cap how many
            fetches are in flight at once.
    """
    urls = pd.read_csv("urls2.csv")
    handled_urls = pd.read_csv("dataset.csv")
    # Skip URLs that a previous run already wrote to dataset.csv.
    urls = urls[~urls["url"].isin(handled_urls["url"])]
    # Element-wise comparison on purpose: keeps only rows where is_spam is False.
    urls = urls[urls["is_spam"] == False]  # noqa: E712

    for start in range(0, len(urls), batch_size):
        batch = urls.iloc[start: start + batch_size]
        tasks = [get_features(url) for url in batch["url"]]
        results = await gather(*tasks, limit=concurrency)
        # Rows are appended WITHOUT a header, so their column order must match
        # the header already in dataset.csv. get_features returns a shorter
        # dict for invalid URLs, which would otherwise make the column order
        # depend on which record happens to come first in the batch.
        dataset = pd.DataFrame(results).reindex(columns=handled_urls.columns)
        dataset.to_csv("dataset.csv", mode="a", header=False, index=False)


async def get_features(url) -> dict | None:
    """Download ``url`` and build its feature record.

    Args:
        url: The address to request.

    Returns:
        ``{"url": url, "status": "invalid"}`` when the request or reading the
        body raises. Otherwise a dict with ``url`` first, then the output of
        ``extract_features_from_url(url, html)``, then
        ``"status": "legitimate"``.
    """
    html = None
    async with aiohttp.ClientSession() as session:
        try:
            # The response context manager releases the connection even when
            # reading the body fails part-way through.
            async with session.get(url) as res:
                html = await res.text()
        except Exception:
            # Best effort: any failure while fetching marks the URL invalid.
            html = None
    if html is None:
        return {
            "url": url,
            "status": "invalid",
        }
    # NOTE(review): the HTTP status code is not inspected, so pages answering
    # with 4xx/5xx are still labelled "legitimate" - confirm this is intended.
    features = extract_features_from_url(url, html)
    return {
        "url": url
    } | features | {"status": "legitimate"}


# Run the URL validation pass defined above.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running in the thread, which is normally the case inside a Jupyter kernel;
# in that environment use `await validate_urls()` instead - confirm against
# the environment this cell is executed in.
asyncio.run(validate_urls())

## Add URL BERT classification

In [None]:
import asyncio

import aiohttp
import pandas as pd
from aioitertools.asyncio import gather

from classifiers.extractors.feature_extractor import extract_features_from_url

from transformers import BertTokenizerFast, BertForSequenceClassification, pipeline
import torch

# Checkpoint used to score raw URL strings.
model_name = "CrabInHoney/urlbert-tiny-v4-phishing-classifier"

# Use the GPU when torch can see one, otherwise stay on the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device('cuda') if cuda_available else torch.device('cpu')

tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)
model.to(device)

# return_all_scores=True makes the pipeline report a score for every label;
# the scoring loop further down indexes into that per-label list.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0 if cuda_available else -1,
    return_all_scores=True,
)


async def add_url_bert_classification(checkpoint_every: int = 100, report_every: int = 1000):
    """Score every URL with the BERT classifier and write the result to CSV.

    Reads ``urls_with_features.csv``, adds a ``url_bert_classification``
    column, and fills it row by row with the score the pipeline reports for
    the second label. When scoring a URL raises, the exception text is stored
    in that cell instead so the run keeps going. Progress is written to
    ``urls_with_features_and_url_classification.csv`` periodically and once
    more when the loop finishes.

    Args:
        checkpoint_every: Write the output file after this many processed rows.
        report_every: Print the remaining count whenever it is a multiple of
            this value.
    """
    output_path = "urls_with_features_and_url_classification.csv"
    urls = pd.read_csv("urls_with_features.csv")
    urls["url_bert_classification"] = None
    unhandled_urls = urls[urls["url_bert_classification"].isna()]

    total = len(unhandled_urls)
    for processed, (index, url) in enumerate(unhandled_urls["url"].items(), start=1):
        try:
            result = classifier(url)
            # Score of the second label in the pipeline output.
            # NOTE(review): presumably label 1 is "phishing" - verify against
            # the model card.
            urls.at[index, "url_bert_classification"] = result[0][1]['score']
        except Exception as e:
            # Keep the error text; non-numeric cells can be filtered out later.
            urls.at[index, "url_bert_classification"] = str(e)

        if processed % checkpoint_every == 0:
            urls.to_csv(output_path, index=False)

        # Each iteration fills exactly one cell, so the remaining count can be
        # derived without rescanning the whole column.
        remaining = total - processed
        if remaining % report_every == 0:
            print(f"{remaining} urls left to process")

    # Final save: without it, rows handled after the last checkpoint are lost.
    urls.to_csv(output_path, index=False)


# Run the BERT scoring pass defined above.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running in the thread, which is normally the case inside a Jupyter kernel;
# in that environment use `await add_url_bert_classification()` instead -
# confirm against the environment this cell is executed in.
asyncio.run(add_url_bert_classification())

# Tests - Compare the models

In [31]:
import pandas as pd

# Load the feature table that also carries the BERT score column.
# NOTE(review): this reads from data/, while the scoring cell writes
# urls_with_features_and_url_classification.csv to the working directory -
# confirm the file was moved or adjust one of the two paths.
df = pd.read_csv('data/urls_with_features_and_url_classification.csv')

In [32]:
# Preview the loaded frame; the display is truncated to the first and last rows.
df

Unnamed: 0,url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_qm,nb_eq,nb_underscore,nb_slash,...,external_favicon,links_in_tags,safe_anchor,empty_title,domain_in_title,domain_with_copyright,external_brand_logo,suspicious_js_patterns,status,url_bert_classification
0,https://www.omnihotels.com/,18.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,160.0,7.0,0.0,0.0,0.0,0.0,0.0,legitimate,0.7218989133834839
1,https://www.out.com/,11.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,...,1.0,196.0,2.0,0.0,0.0,0.0,0.0,0.0,legitimate,0.0030340133234858513
2,https://get4click.ru/,12.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,47.0,1.0,0.0,1.0,0.0,1.0,0.0,legitimate,0.34836170077323914
3,https://www.iovox.com/,13.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,108.0,3.0,0.0,0.0,0.0,1.0,0.0,legitimate,0.046591758728027344
4,https://www.bergfex.at/,14.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,265.0,11.0,0.0,0.0,0.0,0.0,0.0,legitimate,0.00636637257412076
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
80818,http://webmailadmin0.myfreesites.net/,29.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,3.0,...,1.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,phishing,
80819,http://www.ezblox.site/free/jennifer111/helpdesk,15.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,5.0,...,1.0,16.0,0.0,0.0,0.0,0.0,0.0,0.0,phishing,
80820,http://www.formbuddy.com/cgi-bin/formdisp.pl?u...,17.0,0.0,3.0,1.0,0.0,1.0,2.0,0.0,4.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,phishing,
80821,http://www.habbocreditosparati.blogspot.com/,36.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,3.0,...,1.0,26.0,1.0,0.0,0.0,0.0,1.0,0.0,phishing,


In [34]:
# Encode the label as 0 (legitimate) / 1 (phishing). Any other status value
# becomes NaN. Not idempotent: re-running this cell maps the 0/1 values to NaN
# as well, so reload df before running it again.
df['status'] = df['status'].map({"legitimate": 0, "phishing": 1})

In [35]:
# Convert url_bert_classification to numbers; entries that cannot be parsed
# (e.g. stored error messages) become NaN instead of raising.
df['url_bert_classification'] = pd.to_numeric(df['url_bert_classification'], errors='coerce')

In [36]:
# Keep only rows that carry a BERT score (NaN marks missing or unparsable ones).
df = df.dropna(subset=['url_bert_classification'])

In [37]:
# Class balance after cleaning: (rows, columns) for label 0, then label 1.
for label in (0, 1):
    print(df[df["status"] == label].shape)

(20202, 35)
(40334, 35)


In [38]:
# Keep every legitimate row and draw a reproducible phishing sample one third
# that size.
legit_mask = df['status'] == 0
df_legit = df[legit_mask]
phishing_sample_size = len(df_legit) // 3
df_phishing = df[df['status'] == 1].sample(n=phishing_sample_size, random_state=42)

In [39]:
# Rebuild df from the two class subsets (legitimate rows first, then the
# phishing sample). This overwrites the full frame loaded earlier.
df = pd.concat([df_legit, df_phishing])

In [40]:
# Pairwise correlation of the numeric columns. df still holds the string
# `url` column, so numeric_only must be explicit: relying on the default
# triggers a deprecation warning on pandas 1.5 and raises on pandas 2.x.
corr = df.corr(numeric_only=True)

  corr = df.corr()


In [41]:
# Correlation of every column with the label, strongest positive first.
corr["status"].sort_values(ascending=False)

status                     1.000000
url_bert_classification    0.791174
prefix_suffix              0.443140
ratio_digits_host          0.332810
length_hostname            0.276582
phish_hints                0.247354
empty_title                0.246290
shortening_service         0.154767
domain_in_title            0.093028
login_form                 0.091833
suspicious_js_patterns     0.080412
external_favicon           0.074727
ratio_extHyperlinks        0.047504
longest_words_raw          0.042537
nb_at                      0.018024
ratio_digits_url           0.011291
ip                         0.010554
shortest_word_path        -0.005251
nb_qm                     -0.024255
nb_eq                     -0.030974
nb_dots                   -0.042966
domain_with_copyright     -0.067485
nb_underscore             -0.077794
safe_anchor               -0.079972
domain_in_brand           -0.109779
links_in_tags             -0.166525
nb_hyperlinks             -0.196447
longest_word_path         -0

In [42]:
# List the columns that remain as model inputs once the label and the raw URL
# are dropped.
df.drop(["status", "url"], axis=1).columns

Index(['length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_qm',
       'nb_eq', 'nb_underscore', 'nb_slash', 'nb_www', 'ratio_digits_url',
       'ratio_digits_host', 'prefix_suffix', 'shortening_service',
       'length_words_raw', 'shortest_word_path', 'longest_words_raw',
       'longest_word_path', 'phish_hints', 'domain_in_brand', 'suspecious_tld',
       'nb_hyperlinks', 'ratio_extHyperlinks', 'login_form',
       'external_favicon', 'links_in_tags', 'safe_anchor', 'empty_title',
       'domain_in_title', 'domain_with_copyright', 'external_brand_logo',
       'suspicious_js_patterns', 'url_bert_classification'],
      dtype='object')

In [43]:
from sklearn.model_selection import train_test_split

# Everything except the label and the raw URL string is a model input.
X = df.drop(columns=["status", "url"])
# Replace the column names with positional ones (f0, f1, ...). No feature is
# removed here.
# NOTE(review): presumably done for the later ONNX export - confirm.
X.columns = [f'f{position}' for position in range(X.shape[1])]
y = df["status"]

# Reproducible 80/20 split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

# Search spaces, one per model family.
rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
)

adb_param_grid = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 1],
)

xgb_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)


def _accuracy_search(estimator, param_grid):
    """Build a 3-fold, accuracy-scored grid search for ``estimator``."""
    return GridSearchCV(estimator, param_grid, cv=3, scoring='accuracy')


rf_search = _accuracy_search(RandomForestClassifier(random_state=42), rf_param_grid)
adb_search = _accuracy_search(AdaBoostClassifier(random_state=42), adb_param_grid)
xgb_search = _accuracy_search(
    XGBClassifier(eval_metric='logloss', random_state=42), xgb_param_grid
)

# Fit in the same order as the searches were declared.
for search in (rf_search, adb_search, xgb_search):
    search.fit(X_train, y_train)

# Winning estimator of each search, refit on the full training split.
best_rf = rf_search.best_estimator_
best_adb = adb_search.best_estimator_
best_xgb = xgb_search.best_estimator_

print("Best RF params:", rf_search.best_params_)
print("Best AdaBoost params:", adb_search.best_params_)
print("Best XGBoost params:", xgb_search.best_params_)

Best RF params: {'max_depth': 20, 'n_estimators': 100}
Best AdaBoost params: {'learning_rate': 1, 'n_estimators': 150}
Best XGBoost params: {'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200}


We chose to keep going with XGBoost for the reasons in the paper.

In [47]:
from sklearn.metrics import classification_report

# Evaluate the tuned XGBoost model on the held-out split and print per-class
# precision, recall and F1.
y_pred = best_xgb.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4051
           1       0.98      0.96      0.97      1337

    accuracy                           0.98      5388
   macro avg       0.98      0.97      0.98      5388
weighted avg       0.98      0.98      0.98      5388



In [45]:
from skl2onnx import update_registered_converter
import numpy
from sklearn.datasets import load_iris, load_diabetes, make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier, XGBRegressor, DMatrix, train as train_xgb
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, to_onnx, update_registered_converter
from skl2onnx.common.shape_calculator import (
    calculate_linear_classifier_output_shapes,
    calculate_linear_regressor_output_shapes,
)
from onnxmltools.convert.xgboost.operator_converters.XGBoost import convert_xgboost
from onnxmltools.convert import convert_xgboost as convert_xgboost_booster

# Register the onnxmltools XGBoost converter with skl2onnx so that
# convert_sklearn can handle XGBClassifier instances.
xgb_converter_options = {"nocl": [True, False], "zipmap": [True, False, "columns"]}

update_registered_converter(
    XGBClassifier,
    "XGBoostXGBClassifier",
    calculate_linear_classifier_output_shapes,
    convert_xgboost,
    options=xgb_converter_options,
)

In [46]:
# Width of the ONNX input tensor. Taken from the training matrix instead of a
# hard-coded 33, so the export stays correct if the feature set changes.
n_features = X_train.shape[1]

model_onnx = convert_sklearn(
    best_xgb,
    "pipeline_xgboost",
    # None as the first dimension leaves the number of rows unspecified.
    [("input", FloatTensorType([None, n_features]))],
    target_opset={"": 12, "ai.onnx.ml": 2},
    options={'zipmap': False}
)

# Serialize the converted model to disk.
with open("xgb.onnx", "wb") as f:
    f.write(model_onnx.SerializeToString())