In [1]:
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse

# Load your combined dataset
df = pd.read_csv("../master_dataset/MASTER_URL_DATASET.csv")

print("Before cleaning:", df.shape)

# ---------- 1. Remove duplicates ----------
df.drop_duplicates(subset=["url"], inplace=True)

# ---------- 2. Remove empty / missing URLs ----------
df = df[df["url"].notna()]
df = df[df["url"].str.strip() != ""]

# ---------- 3. Normalize HTTP/HTTPS ----------
def normalize_url(u):
    if not u.startswith("http"):
        return "http://" + u
    return u

df["url"] = df["url"].apply(normalize_url)

# ---------- 4. Remove malformed URLs ----------
def is_valid_url(url):
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except:
        return False

df = df[df["url"].apply(is_valid_url)]

# ---------- 5. Remove extremely long URLs (rare, usually garbage) ----------
df = df[df["url"].str.len() < 250]

# ---------- 6. Filter labels ----------
df = df[df["label"].isin([0, 1])]

# ---------- 7. Reset index ----------
df.reset_index(drop=True, inplace=True)

print("After cleaning:", df.shape)

# Save cleaned dataset
df.to_csv("../master_dataset/MASTER_URL_DATASET_CLEANED.csv", index=False)

print("CLEANED DATASET SAVED!")


Before cleaning: (1030335, 2)
After cleaning: (1030334, 2)
CLEANED DATASET SAVED!


In [2]:
import pandas as pd
import numpy as np
import tldextract
import re
from urllib.parse import urlparse

df = pd.read_csv("../master_dataset/MASTER_URL_DATASET_CLEANED.csv")

print("Loaded:", df.shape)

# -----------------------------------------
# 1. BASIC LEXICAL FEATURES
# -----------------------------------------

df["url_length"] = df["url"].apply(len)
df["num_digits"] = df["url"].apply(lambda x: sum(c.isdigit() for c in x))
df["num_letters"] = df["url"].apply(lambda x: sum(c.isalpha() for c in x))
df["num_special"] = df["url"].apply(lambda x: sum(not c.isalnum() for c in x))

special_chars = ['.', '-', '_', '@', '?', '=', '&', '%', '/', ':', '~', '#']

for ch in special_chars:
    df[f"count_{ch.replace('.', 'dot')}"] = df["url"].apply(lambda x: x.count(ch))

df["has_https"] = df["url"].apply(lambda x: 1 if x.startswith("https") else 0)

# -----------------------------------------
# 2. DOMAIN FEATURES
# -----------------------------------------

def extract_domain(url):
    domain = tldextract.extract(url)
    return domain.subdomain, domain.domain, domain.suffix

df["subdomain"], df["maindomain"], df["tld"] = zip(*df["url"].apply(extract_domain))

df["subdomain_length"] = df["subdomain"].apply(len)
df["maindomain_length"] = df["maindomain"].apply(len)
df["num_subdomain_dots"] = df["subdomain"].apply(lambda x: x.count('.'))

# Suspicious TLDs
suspicious_tlds = ['tk','ml','ga','cf','gq','xyz','zip','link','buzz','rest']

df["tld_suspicious"] = df["tld"].apply(lambda x: 1 if x in suspicious_tlds else 0)

# -----------------------------------------
# 3. ENTROPY (measures randomness)
# -----------------------------------------

from collections import Counter
def entropy(string):
    p, lns = Counter(string), float(len(string))
    return -sum( count/lns * np.log2(count/lns) for count in p.values() )

df["entropy"] = df["url"].apply(entropy)

# -----------------------------------------
# 4. KEYWORD FEATURES (real phishing indicators)
# -----------------------------------------

keywords = [
    "login","secure","update","verify","account","bank","payment",
    "signin","checkout","admin","password","billing","support",
    "helpdesk","dropbox","office","microsoft","apple","google"
]

for kw in keywords:
    df[f"kw_{kw}"] = df["url"].apply(lambda x: 1 if kw in x.lower() else 0)

df["num_keywords"] = df[[f"kw_{kw}" for kw in keywords]].sum(axis=1)

# -----------------------------------------
# 5. URL COMPLEXITY FEATURES
# -----------------------------------------

df["num_slashes"] = df["url"].apply(lambda x: x.count('/'))
df["num_params"] = df["url"].apply(lambda x: x.count('&'))
df["num_fragments"] = df["url"].apply(lambda x: x.count('#'))
df["num_question"] = df["url"].apply(lambda x: x.count('?'))
df["num_equal"] = df["url"].apply(lambda x: x.count('='))

# -----------------------------------------
# SAVE
# -----------------------------------------

df.to_csv("../master_dataset/MASTER_WITH_FEATURES.csv", index=False)
print("FEATURE EXTRACTION COMPLETE!")


Loaded: (1030334, 2)




FEATURE EXTRACTION COMPLETE!


In [7]:
import tensorflow as tf
print(tf.__version__)


2.17.0


In [5]:
pip install tensorflow


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [14]:
!pip uninstall tensorflow -y
!pip uninstall keras -y




In [7]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("../master_dataset/MASTER_WITH_FEATURES.csv")
print("Loaded feature dataset:", df.shape)

# -------------------------------------------
# FEATURE SELECTION
# -------------------------------------------

ignore_cols = ["url", "subdomain", "maindomain", "tld"]
features = [c for c in df.columns if c not in ignore_cols + ["label"]]

X = df[features]
y = df["label"]

# -------------------------------------------
# TRAIN-TEST SPLIT
# -------------------------------------------

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape)

# -------------------------------------------
# MODEL A: LOGISTIC REGRESSION
# -------------------------------------------

from sklearn.linear_model import LogisticRegression

log_reg = LogisticRegression(max_iter=2000)
log_reg.fit(X_train, y_train)
pred_lr = log_reg.predict(X_test)

acc_lr = accuracy_score(y_test, pred_lr)
f1_lr = f1_score(y_test, pred_lr)

print("\nLogistic Regression Accuracy:", acc_lr)
print("Logistic Regression F1 Score:", f1_lr)

# -------------------------------------------
# MODEL B: XGBOOST
# -------------------------------------------

from xgboost import XGBClassifier

xgb = XGBClassifier(
    n_estimators=500,
    max_depth=7,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.7,
    tree_method="hist"
)

xgb.fit(X_train, y_train)
pred_xgb = xgb.predict(X_test)

acc_xgb = accuracy_score(y_test, pred_xgb)
f1_xgb = f1_score(y_test, pred_xgb)

print("\nXGBoost Accuracy:", acc_xgb)
print("XGBoost F1 Score:", f1_xgb)

# -------------------------------------------
# MODEL C: 1D CNN ON RAW URL CHARACTERS
# -------------------------------------------

# Convert URL strings to padded sequences of integers
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPool1D, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

url_texts = df["url"].astype(str).tolist()

tokenizer = Tokenizer(char_level=True)
tokenizer.fit_on_texts(url_texts)

sequences = tokenizer.texts_to_sequences(url_texts)
max_len = 200
X_seq = pad_sequences(sequences, maxlen=max_len)

# Train-test split for CNN
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(
    X_seq, y, test_size=0.2, random_state=42, stratify=y
)

# CNN MODEL
cnn = Sequential([
    Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=32, input_length=max_len),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPool1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

cnn.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

cnn.fit(X_seq_train, y_seq_train, epochs=2, batch_size=256, validation_split=0.1)

pred_cnn = (cnn.predict(X_seq_test) > 0.5).astype(int)

acc_cnn = accuracy_score(y_seq_test, pred_cnn)
f1_cnn = f1_score(y_seq_test, pred_cnn)

print("\nCNN Accuracy:", acc_cnn)
print("CNN F1 Score:", f1_cnn)

# -------------------------------------------
# SAVE MODELS
# -------------------------------------------

import joblib

joblib.dump(log_reg, "logistic_model.pkl")
joblib.dump(xgb, "xgboost_model.pkl")
cnn.save("cnn_model.keras")  # Save in native Keras format
joblib.dump(tokenizer, "cnn_tokenizer.pkl")

print("\nModels Saved Successfully!")


  df = pd.read_csv("../master_dataset/MASTER_WITH_FEATURES.csv")


Loaded feature dataset: (1030334, 52)
Train: (824267, 47) Test: (206067, 47)

Logistic Regression Accuracy: 0.9999514720940277
Logistic Regression F1 Score: 0.9991751897063675

XGBoost Accuracy: 0.9999611776752222
XGBoost F1 Score: 0.9993402605970642




Epoch 1/2
[1m2898/2898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m73s[0m 24ms/step - accuracy: 0.9986 - loss: 0.0056 - val_accuracy: 1.0000 - val_loss: 2.0003e-04
Epoch 2/2
[1m2898/2898[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 24ms/step - accuracy: 1.0000 - loss: 5.2448e-04 - val_accuracy: 1.0000 - val_loss: 1.5560e-04
[1m6440/6440[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 3ms/step

CNN Accuracy: 0.9999514720940277
CNN F1 Score: 0.9991751897063675

Models Saved Successfully!


In [11]:
# Assume X_train is your training dataframe (47 features)
import joblib

feature_columns = X_train.columns.tolist()
joblib.dump(feature_columns, "models/feature_columns.pkl")
print("Feature columns saved:", len(feature_columns))


Feature columns saved: 47


In [25]:
from feature_extractor import extract_features_for_single_url
from model_loader import predict_soft_voting
import numpy as np

url = "http://paypal-login-verification-secure-update.com"
fv = extract_features_for_single_url(url).values.astype(np.float32)
fv = fv.reshape(1, -1)

prob = predict_soft_voting(fv)
print("Phishing probability:", prob)


Phishing probability: 0.5362428920406972




In [15]:
fv = extract_features_for_single_url(url).values
print("Feature vector shape:", fv.shape)


Feature vector shape: (1, 47)


In [17]:
X_train.shape


(824267, 47)

In [13]:
from model_loader import predict_soft_voting

# Example
url = "http://paypal-login-verification-secure-update.com"

# numeric_features_vector: compute features exactly like you did in MASTER_WITH_FEATURES.csv
numeric_features_vector = [...]  # list of 47 values

result = predict_soft_voting(url, numeric_features_vector)
print(result)




TypeError: float() argument must be a string or a real number, not 'ellipsis'

In [21]:
from model_loader import predict_soft_voting
from feature_extractor import extract_features  # your existing function

urls = [
    "https://google.com",
    "http://paypal-login-verification-secure-update.com",
    "https://openai.com",
]

for url in urls:
    f = extract_features(url)  # extract features for this URL
    prob, score = predict_soft_voting(f)
    print(f"{url} → {prob:.4f} → {score}")


ValueError: Must pass 2-d input. shape=(1, 1, 57)

In [23]:
import pandas as pd
import joblib

df = pd.read_csv("../master_dataset/MASTER_WITH_FEATURES.csv")  # adjust path if needed
feature_columns = joblib.load("models/feature_columns.pkl")  # the same columns used for training

# Keep only the required features
X = df[feature_columns]
y = df["label"]  # make sure this column exists: 1=phishing, 0=safe
print(X.shape, y.shape)


  df = pd.read_csv("../master_dataset/MASTER_WITH_FEATURES.csv")  # adjust path if needed


(1030334, 47) (1030334,)


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [27]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Test accuracy
print("LogReg test accuracy:", logreg.score(X_test, y_test))


LogReg test accuracy: 0.9999514720940277


In [29]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=5,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)

print("XGB test accuracy:", xgb_model.score(X_test, y_test))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGB test accuracy: 0.9999611776752222


In [31]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

X_train_cnn = np.array(X_train)
X_test_cnn = np.array(X_test)

model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_cnn.shape[1],)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train_cnn, y_train, epochs=20, batch_size=32, validation_split=0.1)

print("CNN test accuracy:", model.evaluate(X_test_cnn, y_test))


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m23183/23183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 3ms/step - accuracy: 0.9992 - loss: 0.0043 - val_accuracy: 0.9999 - val_loss: 5.6431e-04
Epoch 2/20
[1m23183/23183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 3ms/step - accuracy: 0.9998 - loss: 0.0012 - val_accuracy: 1.0000 - val_loss: 4.1381e-04
Epoch 3/20
[1m23183/23183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 3ms/step - accuracy: 0.9999 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 4.7994e-04
Epoch 4/20
[1m23183/23183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 2ms/step - accuracy: 0.9999 - loss: 0.0011 - val_accuracy: 0.9999 - val_loss: 5.5675e-04
Epoch 5/20
[1m23183/23183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 2ms/step - accuracy: 0.9999 - loss: 9.2173e-04 - val_accuracy: 1.0000 - val_loss: 5.1586e-04
Epoch 6/20
[1m23183/23183[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 2ms/step - accuracy: 0.9999 - loss: 0.0011 - val_accuracy: 

In [33]:
import joblib
logreg_path = "models/logistic_model.pkl"
xgb_path = "models/xgboost_model.pkl"
cnn_path = "models/cnn_model.keras"

joblib.dump(logreg, logreg_path)
joblib.dump(xgb_model, xgb_path)
model.save(cnn_path)
