In [3]:
import re
import numpy as np
import tldextract
import whois
import requests
import joblib
import onnx
import onnxruntime as ort
import pandas as pd
from datetime import datetime
from urllib.parse import urlparse
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx import convert_sklearn, update_registered_converter
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm

In [None]:
# # Load dataset
# df1 = pd.read_csv(r'D:\Wayne\Arish proj\malicious_phish.csv')

# # Extract URLs
# urls2 = df1['url'].tolist()

# # Convert labels using NumPy vectorized operations
# malicious_types = {'phishing', 'defacement', 'malware'}
# labels2 = np.where(df1['type'].isin(malicious_types), 1, 0)


In [3]:
# Load dataset
df2 = pd.read_csv(r'D:\Wayne\Arish proj\PhiUSIIL_Phishing_URL_Dataset.csv')

# Extract URLs
urls3 = df2['URL'].tolist()
labels3 = df2['label']
labels3 = 1 - labels3


In [5]:
print(labels3[1], urls3[1])

0 https://www.uni-mainz.de


In [6]:
# Load dataset
df3 = pd.read_csv(r'D:\Wayne\Arish proj\top-1m.csv')
# Extract URLs
df_set = set(df3['URL'].iloc[:100000])
df3['URL'] = 'https://' + df3['URL']
urls4 = df3['URL'].tolist()
labels4 = [0] * len(urls4)

In [7]:
print("myntra.com" in df_set)

True


In [8]:
urls4[100]

'https://samsung.com'

In [9]:
# Suspicious words that commonly appear in phishing URLs
SUSPICIOUS_WORDS = ["login", "bank", "secure", "account", "update", "password", "verification", "paypal", "ebay", "confirm"]

# Load PhishTank blacklist
def load_phishtank():
    try:
        phish_df = pd.read_csv("/content/verified_online.csv")
        return set(phish_df['url'].tolist())
    except:
        return set()

PHISHTANK_BLACKLIST = load_phishtank()

In [10]:
global df_set
def extract_features(url):
    """ Extracts multiple features from a URL for phishing detection. """
    features = []
    parsed_url = urlparse(url)
    extracted = tldextract.extract(url)
    
    domain = extracted.domain
    suffix = extracted.suffix
    path = parsed_url.path
    trusted_domains = df_set

    # Domain Reputation Check (Avoids false positives for known safe domains)
    # trusted_domains = {"chatgpt.com", "myntra.com", "amazon.com", "notion.so"}
    if f"{domain}.{suffix}" in trusted_domains:
        return np.zeros(20)  # Return a safe default

    # Lexical Features
    features.append(len(url))  # URL Length
    features.append(url.count('.'))  # Count of '.'
    features.append(url.count('/'))  # Count of '/'
    features.append(url.count('-'))  # Count of '-'
    features.append(url.count('@'))  # Count of '@'
    features.append(url.count('?'))  # Count of '?'
    features.append(url.count('&'))  # Count of '&'
    features.append(1 if "https" in url.lower() else 0)  # HTTPS presence
    features.append(sum(1 for word in SUSPICIOUS_WORDS if word in url.lower()))  # Suspicious word count

    # Domain-based Features
    features.append(len(domain))  # Domain Length
    features.append(sum(c.isdigit() for c in domain))  # Numeric characters in domain
    features.append(1 if re.search(r"[0-9]{3,}", domain) else 0)  # Long digit sequences

    # WHOIS-based Features
    try:
        domain_info = whois.whois(domain)
        creation_date = domain_info.creation_date
        expiration_date = domain_info.expiration_date
        
        if isinstance(creation_date, list):
            creation_date = creation_date[0]
        if isinstance(expiration_date, list):
            expiration_date = expiration_date[0]

        age_days = (datetime.now() - creation_date).days if creation_date else -1
        expiry_days = (expiration_date - datetime.now()).days if expiration_date else -1
    except:
        age_days = -1
        expiry_days = -1

    features.append(age_days)  # Domain Age
    features.append(expiry_days)  # Domain Expiry

    # IP Address Feature
    features.append(1 if re.match(r"^\d{1,3}(\.\d{1,3}){3}$", domain) else 0)  # If domain is an IP address

    # Path-based Features (🔹 Reducing False Positives for Long URLs)
    features.append(len(path))  # Path Length
    features.append(min(sum(1 for c in path if c.isdigit()), 5))  # Numbers in Path (Limit to 5)
    features.append(min(path.count('/'), 6))  # Path Depth (Limit to 6)
    features.append(min(path.count('-'), 4))  # Hyphens in Path (Limit to 4)

    # Blacklist Features
    features.append(1 if url in PHISHTANK_BLACKLIST else 0)  # PhishTank Match

    return np.array(features)


In [11]:
print("myntra.com" in df_set)
extract_features("https://www.myntra.com")

True


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

In [12]:
# Load dataset
df1 = pd.read_csv(r"D:\Wayne\Arish proj\another.csv")

# Extract URLs
urls = df1['url'].tolist()

# Assign phishing labels efficiently
# labels = np.ones(len(urls), dtype=int)

labels = [1] * len(urls)  # Phishing labels 

In [None]:
print(len(urls),len(labels))
# print(len(urls2),len(labels2))
print(len(urls3),len(labels3))
print(len(urls4),len(labels4))

59481 59481
651191 651191
235795 235795
1000000 1000000


In [14]:
labels = np.array(labels, dtype=int)
labels4 = np.array(labels4, dtype=int) 

In [None]:
# Keep URLs as lists (more efficient for strings)
# urls.extend(urls2)  # List concatenation is efficient for strings
urls.extend(urls3)
urls.extend(urls4)
# Convert labels to NumPy array and concatenate
labels = np.hstack((labels, labels3, labels4))
#labels = np.hstack((labels2, labels3, labels4))

In [15]:
print(labels[0])

1


In [16]:
len(labels), len(urls)

(1946467, 1946467)

In [17]:
X = np.array([extract_features(url) for url in urls])
y = labels

In [18]:
print((labels==1).sum())
print((labels==0).sum())

383514
1562953


In [19]:
print(X[0])

[41.  3.  3.  0.  0.  0.  0.  1.  0.  7.  0.  0. -1. -1.  0. 11.  0.  1.
  0.  0.]


In [68]:
# Train-test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [71]:
X_test.shape

(389294, 20)

In [72]:
print(X_train)

[[20.  1.  2. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [23.  2.  2. ...  0.  0.  0.]
 ...
 [14.  2.  0. ...  0.  0.  0.]
 [25.  3.  2. ...  2.  0.  0.]
 [28.  1.  3. ...  1.  0.  0.]]


In [73]:
# Train LightGBM Model
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=40, max_depth=8, n_estimators=200, random_state=42)
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 306728, number of negative: 1250445
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.125554 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 716
[LightGBM] [Info] Number of data points in the train set: 1557173, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.196977 -> initscore=-1.405293
[LightGBM] [Info] Start training from score -1.405293


In [74]:
def predict_url_lgb(url):
    features = extract_features(url).reshape(1, -1)
    print(f"Extracted features shape: {features.shape}")
    prediction = lgb_model.predict(features)[0]
    return "Phishing" if prediction == 1 else "Safe"

# Example Prediction
test_url = "https://www.myntra.com/"
print(predict_url_lgb(test_url))

Extracted features shape: (1, 20)
Safe




In [77]:
joblib.dump(lgb_model, "lightgbm_model.pkl")
print("✅ LightGBM Model Trained!")



✅ LightGBM Model Trained!


In [None]:
def predict_url_rf(url):
    features = extract_features(url).reshape(1, -1)
    print(f"Extracted features shape: {features.shape}")
    prediction = lgb_model.predict(features)[0]
    return "Phishing" if prediction == 1 else "Safe"  

In [78]:
from skl2onnx import update_registered_converter
from skl2onnx.common.shape_calculator import calculate_linear_classifier_output_shapes
from onnxmltools.convert.lightgbm.operator_converters.LightGbm import convert_lightgbm
import lightgbm as lgb
import numpy as np
from skl2onnx.common.data_types import FloatTensorType
from onnxmltools.convert import convert_sklearn

# Register LightGBM with skl2onnx
update_registered_converter(
    lgb.LGBMClassifier,
    "LightGBMClassifier",
    calculate_linear_classifier_output_shapes,
    convert_lightgbm,
    options={"nocl": [True, False]},
)

# Sample dataset
X_train = np.random.rand(100, 10)
y_train = np.random.randint(0, 2, size=(100,))

# Train LightGBM model
lgb_model = lgb.LGBMClassifier()
lgb_model.fit(X_train, y_train)

# Convert to ONNX format with explicit opset
initial_type = [("input", FloatTensorType([None, 20]))]
onnx_model = convert_sklearn(
    lgb_model,
    initial_types=initial_type,
    target_opset={"ai.onnx.ml": 3},  # 👈 **Fix: Explicitly specify target opset**
)

# Save ONNX model
with open("lightgbm_model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

print("✅ LightGBM successfully converted to ONNX!")



[LightGBM] [Info] Number of positive: 52, number of negative: 48
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008747 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 350
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.520000 -> initscore=0.080043
[LightGBM] [Info] Start training from score 0.080043
✅ LightGBM successfully converted to ONNX!


In [79]:
import onnx

onnx_model = onnx.load("lightgbm_model.onnx")
onnx.checker.check_model(onnx_model)

print("✅ ONNX model is valid!")


✅ ONNX model is valid!


In [80]:
import onnxruntime as ort

# Load ONNX model
onnx_session = ort.InferenceSession("lightgbm_model.onnx")

def predict_url_onnx(url):
    features = extract_features(url).astype(np.float32).reshape(1, -1)
    #features = extract_features(url).astype(np.float32).reshape(1, -1)
    print(f"Extracted features shape: {features.shape}")  # Debug print
    input_name = onnx_session.get_inputs()[0].name
    expected_shape = onnx_session.get_inputs()[0].shape
    (f"ONNX model expects shape: {expected_shape}")  # Debug print
    prediction = onnx_session.run(None, {input_name: features})  # Run inference
    # input_name = onnx_session.get_inputs()[0].name
    # prediction = onnx_session.run(None, {input_name: features})
    return "Phishing" if prediction[0][0] == 1 else "Safe"

# Example Prediction
test_url = "http://paypal-secure-login.com"
print(f"🔍 Prediction for {test_url} (ONNX): {predict_url_onnx(test_url)}")

Extracted features shape: (1, 20)
🔍 Prediction for http://paypal-secure-login.com (ONNX): Phishing
