In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, file_name) for file_name in file_names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/benign-and-malicious-urls/balanced_urls.csv


In [2]:
# Show the version of the `python` executable found on the shell PATH.
!python --version

Python 3.11.13


In [3]:
# Report the versions of the numeric / ML libraries this notebook relies on.
import numpy
import sklearn

for label, module in (("numpy:", numpy), ("sklearn:", sklearn)):
    print(label, module.__version__)


numpy: 1.26.4
sklearn: 1.2.2


In [4]:
# Dump every installed package with its version via the shell `pip freeze` command.
!pip freeze

absl-py==1.4.0
accelerate==1.8.1
aiofiles==22.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.12.13
aiosignal==1.3.2
aiosqlite==0.21.0
alabaster==1.0.0
albucore==0.0.24
albumentations==2.0.8
ale-py==0.11.1
alembic==1.16.2
altair==5.5.0
annotated-types==0.7.0
annoy==1.17.3
ansicolors==1.1.8
antlr4-python3-runtime==4.9.3
anyio==4.9.0
argon2-cffi==25.1.0
argon2-cffi-bindings==21.2.0
args==0.1.0
array_record==0.7.2
arrow==1.3.0
arviz==0.21.0
astropy==7.1.0
astropy-iers-data==0.2025.6.23.0.39.50
asttokens==3.0.0
astunparse==1.6.3
atpublic==5.1
attrs==25.3.0
audioread==3.0.1
autograd==1.8.0
babel==2.17.0
backcall==0.2.0
backports.tarfile==1.2.0
bayesian-optimization==3.0.0
beartype==0.21.0
beautifulsoup4==4.13.4
betterproto==2.0.0b6
bigframes==2.8.0
bigquery-magics==0.9.0
bleach==6.2.0
blinker==1.9.0
blis==1.3.0
blobfile==3.0.0
blosc2==3.5.0
bokeh==3.7.3
Boruta==0.4.3
boto3==1.39.1
botocore==1.39.1
Bottleneck==1.4.2
-e git+https://github.com/SohierDane/BigQuery_Helper@8615a7f6c1663e7f2d48aa2b32c2dbcb

In [5]:
#%load_ext cuml.accel
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

import joblib

In [6]:
# Read the URL dataset from the read-only Kaggle input mount and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer="/kaggle/input/benign-and-malicious-urls/balanced_urls.csv",
)
df.head()

Unnamed: 0,url,label,result
0,https://www.google.com,benign,0
1,https://www.youtube.com,benign,0
2,https://www.facebook.com,benign,0
3,https://www.baidu.com,benign,0
4,https://www.wikipedia.org,benign,0


In [7]:
# Features: a one-column frame holding the raw URL text (kept 2-D so the
# ColumnTransformer can select the column by name).
X = df.loc[:, ["url"]]
# Target: the numeric label column.
y = df.loc[:, "result"]

In [8]:
def add_url_features(X):
    """Derive simple hand-crafted numeric features from raw URL strings.

    Parameters
    ----------
    X : 1-D array-like of str
        URL strings, e.g. the pandas Series selected from a ``"url"`` column.
        Missing values (``None``/``NaN``) are treated as empty strings.

    Returns
    -------
    pandas.DataFrame
        One row per input URL (same index as ``X`` when it is a Series) with
        integer columns, in this order:

        - ``https``: 1 if the URL starts with the ``https://`` scheme
          (case-insensitive), else 0.
        - ``length``: number of characters in the URL.
        - ``num_digits``: number of digit characters.
        - ``dots``: number of ``.`` characters.
        - ``at_symbol``: 1 if the URL contains ``@``, else 0.
        - ``hyphen_count``: number of ``-`` characters.

    Notes
    -----
    A fitted model depends on these exact feature definitions; refit any
    pipeline whenever they change.
    """
    # Normalise to a string Series so the .str accessors never see NaN
    # (NaN would make the .astype(int) casts below raise).
    urls = pd.Series(X).fillna("").astype(str)

    return pd.DataFrame(
        {
            # Anchor on the scheme: a plain substring test would also flag
            # "http://host/https-login" as HTTPS.
            "https": urls.str.lower().str.startswith("https://").astype(int),
            "length": urls.str.len().astype(int),
            "num_digits": urls.str.count(r"\d").astype(int),
            "dots": urls.str.count(r"\.").astype(int),
            # "@" is meant literally, so skip regex interpretation.
            "at_symbol": urls.str.contains("@", regex=False).astype(int),
            "hyphen_count": urls.str.count("-").astype(int),
        },
        index=urls.index,
    )

In [9]:
# Wrap the feature function as a scikit-learn transformer so it can be used as
# a pipeline step; input validation is left off so the raw strings pass through.
feature_transformer = FunctionTransformer(func=add_url_features, validate=False)

In [10]:
# Two views of the same "url" column, concatenated side by side:
#   * "tfidf"  - character 3- to 5-gram TF-IDF weights, capped at 5000 terms
#   * "custom" - the hand-crafted count/flag features
# NOTE(review): the custom features are not scaled (e.g. `length` is a raw
# character count) while TF-IDF values are small; consider scaling them if the
# classifier is slow to converge - TODO confirm.
preprocessor = ColumnTransformer([
    (
        "tfidf",
        TfidfVectorizer(max_features=5000, ngram_range=(3, 5), analyzer="char"),
        "url",
    ),
    ("custom", feature_transformer, "url"),
])

In [11]:
# End-to-end model: feature extraction followed by a logistic-regression
# classifier (iteration cap raised to 2000).
pipeline = Pipeline(
    steps=[
        ("features", preprocessor),
        ("clf", LogisticRegression(max_iter=2000)),
    ]
)

In [12]:
# Hold out 20% of the rows for evaluation; stratify on the label so both
# splits keep the same class ratio, and fix the seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [13]:
# Fit the feature extractors and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [14]:
# --- Predict ---
# Score the held-out split with the fitted pipeline.
y_pred = pipeline.predict(X=X_test)

In [15]:
# --- Evaluate ---
# NOTE(review): a near-perfect score deserves a sanity check, e.g. for
# duplicate URLs shared between the train and test splits - TODO confirm.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nClassification Report:\n", report)

Accuracy: 0.9974545856982499

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     63251
           1       1.00      1.00      1.00     63251

    accuracy                           1.00    126502
   macro avg       1.00      1.00      1.00    126502
weighted avg       1.00      1.00      1.00    126502



In [16]:
# Persist the whole fitted pipeline (feature extractors + classifier) as one file.
# NOTE(review): the pickle stores `add_url_features` by reference, so whatever
# process loads this file must define/import a function with that name first -
# confirm before deploying the artifact elsewhere.
joblib.dump(value=pipeline, filename="url_model.pkl")
print("ok")

ok


In [17]:
# Reload the persisted pipeline to confirm the saved artifact works end to end.
# NOTE: joblib.load unpickles arbitrary Python objects - only load files you
# created yourself or otherwise trust.
loaded_pipeline = joblib.load("url_model.pkl")

# New URL(s) to score. A separate name is used so the training frame `df` is
# not overwritten, which would break any re-run of the earlier cells that read it.
new_urls = pd.DataFrame({"url": ["http://badwebsite.com/login"]})
print(loaded_pipeline.predict(new_urls))   # uses saved TF-IDF vocab + model

[1]
