In [2]:
import json
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

from url_features import featurize_urls


In [None]:
# Load the labelled URL dataset and normalise it in a single chain:
#   1. discard the file's existing "label" column,
#   2. promote the "result" column to be the new "label",
#   3. shuffle the rows with a fixed seed so the order is reproducible,
#   4. renumber the index 0..n-1 after shuffling.
# errors="raise" makes a missing "result" column fail right here with a
# KeyError instead of surfacing later at df["label"].
# NOTE(review): a bare `df.head()` used to sit in the middle of this cell; only
# a cell's last expression is rendered, so it displayed nothing and was
# removed. Put `df.head()` in its own cell if a preview is wanted.
df = (
    pd.read_csv("data/labelled_urls.csv")
    .drop(columns=["label"])
    .rename(columns={"result": "label"}, errors="raise")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Plain Python lists, row-aligned with each other, for the featurisation step.
urls = df["url"].tolist()
labels = df["label"].tolist()

In [None]:
# Build the feature table from the URLs and attach the labels as a "label"
# column (both come from the same shuffled frame, so rows line up).
features = pd.DataFrame(featurize_urls(urls)).assign(label=labels)

# Target vector, and the design matrix with the target and the
# "path_length" feature left out.
y = features["label"]
X = features.drop(columns=["label", "path_length"])

In [None]:
# Two-step model: standardise every feature, then a class-balanced logistic
# regression. The step names "scaler" and "clf" are looked up again when the
# fitted parameters are exported, so they must stay as they are.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(class_weight="balanced", max_iter=1000)),
    ]
)

# Fit on X / y; as the cell's last expression this also renders the fitted pipeline.
pipeline.fit(X, y)


In [None]:
# Export the fitted scaler and classifier parameters as plain JSON.

# Pull the fitted steps back out of the pipeline by their step names.
scaler = pipeline.named_steps["scaler"]
clf = pipeline.named_steps["clf"]

# Column names of X, in the order the model was trained on.
feature_names = list(X.columns)

export = {
    "model": "logistic_regression",
    "features": feature_names,
    "scaler": {
        "mean": scaler.mean_.tolist(),
        "scale": scaler.scale_.tolist(),
    },
    # NOTE(review): coef_[0] / intercept_[0] take only the first row, which
    # presumes a binary label — confirm against the dataset.
    "weights": clf.coef_[0].tolist(),
    # Convert the NumPy scalar to a builtin float, matching the .tolist()
    # calls above. json only accepts np.float64 by accident (it subclasses
    # float); an np.float32 intercept would raise TypeError in json.dump.
    "bias": float(clf.intercept_[0]),
    # NOTE(review): hard-coded thresholds, presumably probability cut-offs
    # applied by whatever consumes this file — confirm.
    "policy": {
        "allow_threshold": 0.1,
        "block_threshold": 0.9,
    },
}

with open("lexical_model.json", "w") as f:
    json.dump(export, f, indent=2)