In [3]:
import pandas as pd
import numpy as np
import re
import joblib
from urllib.parse import urlparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import lightgbm as lgb

# ---------------------------
# 1. Feature extraction functions
# ---------------------------
def extract_url_features(url):
    parsed = urlparse(url)
    features = {
        "url_length": len(url),
        "hostname_length": len(parsed.netloc),
        "path_length": len(parsed.path),
        "num_digits": sum(c.isdigit() for c in url),
        "num_special_chars": sum(c in ['-', '_', '?', '=', '&', '%'] for c in url),
        "has_ip": 1 if re.search(r'\d+\.\d+\.\d+\.\d+', url) else 0,
        "https": 1 if parsed.scheme == "https" else 0,
        "num_subdirs": url.count('/'),
        "num_query_params": url.count('='),
        "suspicious_words": 1 if re.search(r"(login|verify|update|bank|free|account|secure)", url.lower()) else 0
    }
    return list(features.values())

feature_names = [
    "url_length", "hostname_length", "path_length", "num_digits",
    "num_special_chars", "has_ip", "https", "num_subdirs",
    "num_query_params", "suspicious_words"
]

# ---------------------------
# 2. Load dataset
# ---------------------------
data = pd.read_csv("/kaggle/input/phisingds/phishing_dataset.csv")  # CSV with columns: url,label
X_urls = data["url"]
y = data["label"]

# ---------------------------
# 3. TF-IDF vectorization
# ---------------------------
tfidf = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), max_features=3000)
X_tfidf = tfidf.fit_transform(X_urls)

# ---------------------------
# 4. Extract statistical features
# ---------------------------
stat_features = np.array([extract_url_features(u) for u in X_urls])

# ---------------------------
# 5. Combine features
# ---------------------------
from scipy.sparse import hstack
X_combined = hstack([X_tfidf, stat_features])

# ---------------------------
# 6. Train-test split
# ---------------------------
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# ---------------------------
# 7. Train LightGBM
# ---------------------------
lgbm_model = lgb.LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=-1,
    random_state=42
)
lgbm_model.fit(X_train, y_train)

# ---------------------------
# 8. Evaluate
# ---------------------------
y_pred = lgbm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# ---------------------------
# 9. Save model & vectorizer
# ---------------------------
joblib.dump(lgbm_model, "lightgbm_url_model.pkl")
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

print("✅ Model & vectorizer saved.")

# ---------------------------
# 10. Prediction function
# ---------------------------
def predict_url(url):
    # Load model & vectorizer
    model = joblib.load("lightgbm_url_model.pkl")
    tfidf_vec = joblib.load("tfidf_vectorizer.pkl")
    
    # Extract features
    tfidf_features = tfidf_vec.transform([url])
    stat_features = np.array([extract_url_features(url)])
    
    # Combine
    combined_features = hstack([tfidf_features, stat_features])
    
    # Predict
    pred = model.predict(combined_features)[0]
    prob = model.predict_proba(combined_features)[0][1]
    
    return {"prediction": int(pred), "probability_of_phishing": float(prob)}

# ---------------------------
# 11. Example prediction
# ---------------------------
test_url = "https://www.bankofamerica.com/"
print(predict_url(test_url))


[LightGBM] [Info] Number of positive: 29789, number of negative: 29071
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.854939 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 678894
[LightGBM] [Info] Number of data points in the train set: 58860, number of used features: 3010
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.506099 -> initscore=0.024398
[LightGBM] [Info] Start training from score 0.024398
Accuracy: 0.9785932721712538
              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7329
           1       0.98      0.98      0.98      7386

    accuracy                           0.98     14715
   macro avg       0.98      0.98      0.98     14715
weighted avg       0.98      0.98      0.98     14715

✅ Model & vectorizer saved.
{'prediction': 0, 'probability_of_phishing': 0.018951235

