In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from url_features import featurize_urls


In [2]:
# Tiny hand-labelled sample: the first three URLs carry label 0, the last
# three carry label 1 (the class later reported as "phishing_probability").
urls = [
    # label 0
    "https://google.com",
    "https://github.com/login",
    "https://stackoverflow.com/questions",
    # label 1
    "http://secure-login.paypal.com.verify-user.ru/login",
    "http://account-update-free-gift.xyz/verify",
    "http://login-update-bank-security-alert.com/auth",
]
labels = [0, 0, 0, 1, 1, 1]

# One row of features per URL, with the target attached as the last column.
df = pd.DataFrame(featurize_urls(urls)).assign(label=labels)

# Split the frame into the feature matrix and the target vector.
X, y = df.drop(columns="label"), df["label"]

df


Unnamed: 0,url_length,hostname_length,path_length,query_length,num_dots,num_hyphens,num_at,num_slashes,num_digits,digit_ratio,has_ip,has_https,hostname_entropy,label
0,18,10,0,0,1,0,0,2,0,0.0,0,1,2.646439,0
1,24,10,6,0,1,0,0,3,0,0.0,0,1,3.321928,0
2,35,17,10,0,1,0,0,3,0,0.0,0,1,3.690117,0
3,51,38,6,0,4,2,0,3,0,0.0,0,0,4.017536,1
4,42,28,7,0,1,3,0,3,0,0.0,0,0,4.012188,1
5,48,36,5,0,1,4,0,3,0,0.0,0,0,4.162573,1


In [3]:
# Hold out 40% of the rows for evaluation, stratified on the label; the fixed
# seed keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)

# Standardise the features, then fit a logistic regression on the scaled
# values. `fit` returns the pipeline itself, so it can be chained here.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=1000)),
    ]
).fit(X_train, y_train)

# Score the held-out rows and print the per-class report followed by the
# confusion matrix.
y_pred = pipeline.predict(X_test)
print(
    classification_report(y_test, y_pred),
    confusion_matrix(y_test, y_pred),
    sep="\n",
)

              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.50      1.00      0.67         1

    accuracy                           0.67         3
   macro avg       0.75      0.75      0.67         3
weighted avg       0.83      0.67      0.67         3

[[1 1]
 [0 1]]


In [4]:
# Keep only column 1 of predict_proba, i.e. the probability assigned to the
# second class (label 1 in this notebook).
y_pred_probs = pipeline.predict_proba(X_test)[:, 1]

# Line each held-out URL up with its predicted probability and true label.
# The index labels of X_test are used as positions into `urls`, which relies
# on the feature frame having kept its default 0..n-1 index.
pd.DataFrame(
    {
        "url": [urls[row] for row in X_test.index],
        "phishing_probability": y_pred_probs,
        "label": y_test.to_numpy(),
    }
)


Unnamed: 0,url,phishing_probability,label
0,https://stackoverflow.com/questions,0.739421,0
1,https://google.com,0.006458,0
2,http://login-update-bank-security-alert.com/auth,0.907584,1


In [5]:
# Pull the fitted classifier out of the pipeline to inspect its weights.
clf = pipeline.named_steps.clf

# One coefficient per feature column. The classifier sits behind the scaler,
# so these weights apply to the standardised features.
importance = pd.Series(clf.coef_[0], index=X.columns)

# Rank by magnitude (largest first) while keeping each coefficient's sign.
importance = importance.sort_values(key=lambda s: s.abs(), ascending=False)

importance


has_https          -0.362570
hostname_entropy    0.362429
num_hyphens         0.349111
url_length          0.336081
hostname_length     0.332300
path_length         0.198521
num_dots            0.164049
query_length        0.000000
num_at              0.000000
num_digits          0.000000
num_slashes         0.000000
has_ip              0.000000
digit_ratio         0.000000
dtype: float64