In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/benign-and-malicious-urls/balanced_urls.csv


In [2]:
%load_ext cuml.accel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import joblib

[2025-08-18 12:40:09.855] [CUML] [info] cuML: Installed accelerator for sklearn.


2025-08-18 12:40:26.774859: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755520827.126764      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755520827.228790      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


[2025-08-18 12:40:53.321] [CUML] [info] cuML: Installed accelerator for umap.
[2025-08-18 12:40:53.430] [CUML] [info] cuML: Installed accelerator for hdbscan.
[2025-08-18 12:40:53.430] [CUML] [info] cuML: Successfully initialized accelerator.


In [3]:
df=pd.read_csv("/kaggle/input/benign-and-malicious-urls/balanced_urls.csv")
df.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [4]:
X=df[["url"]]
y=df["result"]

In [5]:
def add_url_features(X):
    df_temp = pd.DataFrame({"url": X})
    df_temp["https"] = df_temp["url"].str.contains("https").astype(int)
    df_temp["length"] = df_temp["url"].str.len()
    df_temp["num_digits"] = df_temp["url"].str.count(r'\d')
    df_temp["dots"] = df_temp["url"].str.count(r'\.')
    df_temp["at_symbol"] = df_temp["url"].str.contains("@").astype(int)
    df_temp["hyphen_count"] = df_temp["url"].str.count("-")
    return df_temp[["https","length","num_digits","dots","at_symbol","hyphen_count"]]

In [6]:
feature_transformer = FunctionTransformer(add_url_features, validate=False)

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ("tfidf", TfidfVectorizer(analyzer="char", ngram_range=(3,5), max_features=5000), "url"),
        ("custom", feature_transformer, "url")
    ]
)

In [8]:
pipeline = Pipeline([
    ("features", preprocessor),
    ("clf", LogisticRegression(max_iter=2000))
])

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
pipeline.fit(X_train, y_train)



In [11]:
# --- Predict ---
y_pred = pipeline.predict(X_test)

In [12]:
# --- Evaluate ---
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9974703957249688

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     63251
           1       1.00      1.00      1.00     63251

    accuracy                           1.00    126502
   macro avg       1.00      1.00      1.00    126502
weighted avg       1.00      1.00      1.00    126502



In [13]:
joblib.dump(pipeline, "url_model.pkl")

['url_model.pkl']

In [14]:
pipeline = joblib.load("url_model.pkl")

# New URL
df = pd.DataFrame({"url": ["http://badwebsite.com/login"]})
print(pipeline.predict(df))   # uses saved TF-IDF vocab + model

[1]
