In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/benign-and-malicious-urls/balanced_urls.csv


In [2]:
!python --version

Python 3.11.13


In [3]:
%load_ext cuml.accel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import cross_val_score

import joblib

[2025-08-26 07:16:04.741] [CUML] [info] cuML: Installed accelerator for sklearn.


2025-08-26 07:16:19.177699: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1756192579.379767      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1756192579.443614      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[2025-08-26 07:16:37.945] [CUML] [info] cuML: Installed accelerator for umap.
[2025-08-26 07:16:38.019] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-08-26 07:16:38.019] [CUML] [info] cuML: Successfully initialized accelerator.


In [4]:
df=pd.read_csv("/kaggle/input/benign-and-malicious-urls/balanced_urls.csv")
df.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [5]:
X=df[["url"]]
y=df["result"]

In [6]:
# Simplified IPv4: numbers separated by dots
ipv4_simple = r"\b\d{1,3}(?:\.\d{1,3}){3}\b"

# Simplified IPv6: hex digits + colons (not full validation)
ipv6_simple = r"\b[0-9A-Fa-f:]{2,}\b"

ip_pattern_simple = f"(?:{ipv4_simple}|{ipv6_simple})"

def add_url_features(X):
    df_temp = pd.DataFrame({"url": X})

    # Basic features
    df_temp["https"] = df_temp["url"].str.contains("https").astype(int)
    df_temp["length"] = df_temp["url"].str.len()
    df_temp["num_digits"] = df_temp["url"].str.count(r"\d")
    df_temp["subdomain_count"] = df_temp["url"].str.count(r"\.") - 1
    df_temp["path_length"] = df_temp["url"].str.split("/", n=3).str[-1].str.len()
    df_temp["ip_in_url"] = df_temp["url"].str.contains(ip_pattern_simple, regex=True).astype(int)

    # Special characters
    df_temp["dots"] = df_temp["url"].str.count(r"\.")
    df_temp["at_count"] = df_temp["url"].str.count("@")
    df_temp["question_count"] = df_temp["url"].str.count(r"\?")
    df_temp["hyphen_count"] = df_temp["url"].str.count("-")
    df_temp["equal_count"] = df_temp["url"].str.count("=")
    df_temp["hash_count"] = df_temp["url"].str.count("#")
    df_temp["percent_count"] = df_temp["url"].str.count("%")
    df_temp["plus_count"] = df_temp["url"].str.count(r"\+")
    df_temp["dollar_count"] = df_temp["url"].str.count(r"\$")
    df_temp["exclaim_count"] = df_temp["url"].str.count("!")
    df_temp["star_count"] = df_temp["url"].str.count(r"\*")
    df_temp["comma_count"] = df_temp["url"].str.count(",")
    df_temp["param_count"] = df_temp["url"].str.count("&")

    return df_temp.drop(columns=["url"])

In [7]:
feature_transformer = FunctionTransformer(add_url_features, validate=False)

In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", HashingVectorizer(analyzer="char",ngram_range=(3,3),n_features=400,alternate_sign=False,binary=True,dtype=np.float32), "url"),
        ("custom", feature_transformer, "url")
    ]
)

In [9]:
pipeline = Pipeline([
    ("features", preprocessor),
    ("clf", LogisticRegression(max_iter=2000))
])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
# scores = cross_val_score(pipeline, X, y, cv=5)
# print("CV accuracy:", scores.mean(), scores.std())

WITH HashingVectorizer : 99.XX ||| WITHOUT HashingVectorizer : 97.XX

In [12]:
pipeline.fit(X_train, y_train)



In [13]:
# --- Predict ---
y_pred = pipeline.predict(X_test)

In [14]:
# --- Evaluate ---
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9960474933202638

Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00     63251
           1       1.00      0.99      1.00     63251

    accuracy                           1.00    126502
   macro avg       1.00      1.00      1.00    126502
weighted avg       1.00      1.00      1.00    126502



In [15]:
joblib.dump(pipeline, "url_model.pkl")
print("Got Model")

Got Model


In [16]:
pipeline = joblib.load("url_model.pkl")

df = pd.DataFrame({"url": ["http://badwebsite.com/login"]})
print(pipeline.predict(df))

[1]


In [17]:
import sys

# Check size on disk
import os
print("Model size (MB):", os.path.getsize("url_model.pkl") / (1024*1024))

# Check object size in memory
print("Object size (bytes):", sys.getsizeof(pipeline))


Model size (MB): 0.0070972442626953125
Object size (bytes): 56
