In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

from url_features import featurize_urls


In [None]:
# Load the labelled URL dataset. The CSV carries two target-like columns,
# "label" and "result": the original "label" column is discarded and "result"
# (the column compared against 0/1 when the classes are separated) takes over
# the "label" name.
# Written as a single non-mutating chain instead of `rename(..., inplace=True)`.
# The former mid-cell `df.head()` was removed: only the last expression of a
# cell is displayed, so it never rendered anything.
df = (
    pd.read_csv("data/labelled_urls.csv")
    .drop(columns=["label"])
    .rename(columns={"result": "label"})
)

# Rebalance the classes: every phishing row (label 1) is kept, while the benign
# rows (label 0) are randomly subsampled to 1/20 of the phishing count.
is_benign = df["label"] == 0
is_phishing = df["label"] == 1
df_benign = df.loc[is_benign]
df_phishing = df.loc[is_phishing]

ratio = 1/20  # benign rows kept per phishing row
n_phish = len(df_phishing)
df_ben_sampled = df_benign.sample(n=int(n_phish * ratio), random_state=42)

# Stack both classes, shuffle the rows with a fixed seed, and renumber the
# index from 0 so row labels equal row positions.
combined = pd.concat([df_ben_sampled, df_phishing])
df_final = combined.sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Pull the targets and the raw URL strings out of the resampled frame as plain
# Python lists.
labels = df_final["label"].tolist()
urls = df_final["url"].tolist()

# Build the feature table from the project-local featurizer, then attach the
# target by position.
# NOTE(review): assumes featurize_urls returns one record per URL, in input
# order -- its output schema is not visible from here; confirm in url_features.
features = pd.DataFrame(featurize_urls(urls))
features["label"] = labels

# Model inputs: every column except the target and the "path_length" feature.
excluded_columns = ["label", "path_length"]
X = features.drop(columns=excluded_columns)
y = features["label"]

In [None]:
# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability.
# The split is stratified on `y`: the dataset is built with a strong class
# imbalance (benign rows are subsampled to 1/20 of the phishing count), and an
# unstratified split lets the minority share drift between train and test,
# which makes the per-class metrics noisier than they need to be.
# Note: stratification requires at least two rows of every class.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Model: standardise every feature, then fit a logistic regression whose class
# weights are balanced to offset the skewed label distribution.
scaling_step = ("scaler", StandardScaler())
classifier_step = ("clf", LogisticRegression(max_iter=1000, class_weight="balanced"))
pipeline = Pipeline([scaling_step, classifier_step])
pipeline.fit(X_train, y_train)

# Score the held-out rows: per-class precision/recall first, then the raw
# confusion counts.
y_pred = pipeline.predict(X_test)
for summarise in (classification_report, confusion_matrix):
    print(summarise(y_test, y_pred))

In [None]:
# Second predict_proba column = probability of the second class, which is the
# phishing label (1) when the targets are 0/1.
y_pred_probs = pipeline.predict_proba(X_test)[:, 1]

# X_test keeps the row labels of the feature table; they are used here as
# positions into `urls` to recover the original URL of every held-out row.
test_urls = [urls[i] for i in X_test.index]

# Side-by-side view of URL, predicted phishing probability and true label.
inspection = pd.DataFrame({
    "url": test_urls,
    "phishing_probability": y_pred_probs,
    "label": y_test.values,
})
inspection

In [None]:
# Rank the features by the magnitude of their logistic-regression weight.
# The classifier sits behind a StandardScaler, so the weights apply to
# standardised inputs rather than raw feature values.
clf = pipeline.named_steps["clf"]

# Label the first row of coefficients with the training column names, then
# order by absolute value (largest first) while keeping the signed weight.
weights = pd.Series(clf.coef_[0], index=X.columns)
importance = weights.sort_values(key=abs, ascending=False)

importance


In [None]:
# Spot check: score a single URL that is known to be benign.
stackoverflow_url = "https://stackoverflow.com/questions/12345/how-to-use-train-test-split"

# Featurise the one URL and select exactly the training columns, in training
# order, so the frame matches what the pipeline was fitted on.
so_features = pd.DataFrame(featurize_urls([stackoverflow_url]))[X.columns]

# Probability of the phishing class for the only row.
so_prob = pipeline.predict_proba(so_features)[0][1]
so_prob  # ~0.16 -> too high for a known benign URL
