In [1]:
!pip install skl2onnx onnx onnxruntime

Collecting skl2onnx
  Downloading skl2onnx-1.19.1-py3-none-any.whl.metadata (3.8 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading skl2onnx-1.19.1-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading onnxruntime-1.22.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (16.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.5/16.5 MB[0m [31m92.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0

In [2]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("samahsadiq/benign-and-malicious-urls")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/benign-and-malicious-urls


In [3]:
%load_ext cuml.accel
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import hstack

[2025-08-18 12:09:14.582] [CUML] [info] cuML: Installed accelerator for sklearn.


2025-08-18 12:09:31.040237: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755518971.435439      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755518971.547038      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[2025-08-18 12:09:58.740] [CUML] [info] cuML: Installed accelerator for umap.
[2025-08-18 12:09:58.858] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-08-18 12:09:58.858] [CUML] [info] cuML: Successfully initialized accelerator.


In [4]:
df=pd.read_csv("/kaggle/input/benign-and-malicious-urls/balanced_urls.csv")
df.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [5]:
tfidf = TfidfVectorizer(analyzer='char', ngram_range=(3,5), max_features=5000)
X_tfidf = tfidf.fit_transform(df['url'])

In [6]:
df["https"]=df["url"].str.contains("https").astype(int)
df["length"]=df["url"].str.len()
df["num_digits"]=df["url"].str.count(r'\d')
df["dots"]=df["url"].str.count(".")
df["at_symbol"] = df["url"].str.contains("@").astype(int)
df["hyphen_count"] = df["url"].str.count("-")

X_features = df[["https","length", "num_digits", "dots", "at_symbol", "hyphen_count"]]
X_combined = hstack([X_tfidf, X_features])
y = df["result"]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [8]:
model=LogisticRegression(solver='saga',max_iter=3000)

In [9]:
model.fit(X_train, y_train)

In [10]:
y_pred=model.predict(X_test)

In [11]:
from sklearn.metrics import f1_score
print("Accuracy:", f1_score(y_test, y_pred))

Accuracy: 0.9973689408604276


In [12]:
from sklearn.metrics import classification_report, confusion_matrix

y_pred = model.predict(X_test)
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Confusion Matrix:
 [[63053    82]
 [  251 63116]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     63135
           1       1.00      1.00      1.00     63367

    accuracy                           1.00    126502
   macro avg       1.00      1.00      1.00    126502
weighted avg       1.00      1.00      1.00    126502



In [13]:
import json

# Convert vocab keys to str and values to int
vocab_clean = {str(k): int(v) for k, v in tfidf.vocabulary_.items()}

# Save cleaned vocab
with open("vocab.json", "w") as f:
    json.dump(vocab_clean, f)

# Convert IDF to Python list of floats
idf_clean = [float(x) for x in tfidf.idf_]

# Save IDF
with open("idf.json", "w") as f:
    json.dump(idf_clean, f)


In [14]:
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# your trained model
# e.g., model = sklearn.pipeline.Pipeline(...)

initial_type = [('input', FloatTensorType([None, 5006]))]

# Disable zipmap to remove the unsupported dictionary output
onnx_model = convert_sklearn(
    model,
    initial_types=initial_type,
    options={id(model): {'zipmap': False}}
)

with open("logreg_url_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

In [15]:
import shutil

shutil.move("vocab.json", "/kaggle/working/vocab.json")
shutil.move("idf.json", "/kaggle/working/idf.json")
shutil.move("logreg_url_model.onnx", "/kaggle/working/logreg_url_model.onnx") 

'/kaggle/working/logreg_url_model.onnx'