In [None]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
from ucimlrepo import fetch_ucirepo

# Download the dataset registered under UCI id 967.
phiusiil_phishing_url_website = fetch_ucirepo(id=967)

# Keep the feature table and the target table for later use.
X, y = (
    phiusiil_phishing_url_website.data.features,
    phiusiil_phishing_url_website.data.targets,
)

# Show the dataset-level metadata first, then the per-variable description.
print(phiusiil_phishing_url_website.metadata)
print(phiusiil_phishing_url_website.variables)


{'uci_id': 967, 'name': 'PhiUSIIL Phishing URL (Website)', 'repository_url': 'https://archive.ics.uci.edu/dataset/967/phiusiil+phishing+url+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/967/data.csv', 'abstract': 'PhiUSIIL Phishing URL Dataset is a substantial dataset comprising 134,850 legitimate and 100,945 phishing URLs. Most of the URLs we analyzed, while constructing the dataset, are the latest URLs. Features are extracted from the source code of the webpage and URL. Features such as CharContinuationRate, URLTitleMatchScore, URLCharProb, and TLDLegitimateProb are derived from existing features.', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Tabular'], 'num_instances': 235795, 'num_features': 54, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['label'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2024, 'last_updated': 'Sun May 12 

In [None]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import Binarizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import pandas as pd

# 1. Download the PhiUSIIL Phishing URL dataset (UCI id 967)
phiusiil_phishing_url_website = fetch_ucirepo(id=967)

# 2. Pull out the feature table and the target
X = phiusiil_phishing_url_website.data.features
y = phiusiil_phishing_url_website.data.targets

# A DataFrame target is reduced to its first column so that y is a Series
if isinstance(y, pd.DataFrame):
    y = y.iloc[:, 0]

# 3. (Optional) print `.metadata` / `.variables` of the fetched object to explore it

# 4. Keep only the numeric columns; the rest cannot be binarized
X_numeric = X.select_dtypes(include=["number"])

# 5. Binarize the numeric features (Binarizer with its default settings) for BernoulliNB
binarizer = Binarizer()
X_bin = binarizer.fit_transform(X_numeric)

# 6. Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_bin,
    y,
    test_size=0.3,
    random_state=42,
)

# 7. Create and fit the Bernoulli Naive Bayes model in one expression
bnb = BernoulliNB().fit(X_train, y_train)

# 8. Predict labels for the held-out rows
y_pred = bnb.predict(X_test)

# 9. Report confusion matrix, per-class metrics and overall accuracy
print("🔹 Confusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")
print("\n🔹 Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("🔹 Accuracy Score:", accuracy_score(y_test, y_pred))



🔹 Confusion Matrix:
[[29278   873]
 [   77 40511]]

🔹 Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     30151
           1       0.98      1.00      0.99     40588

    accuracy                           0.99     70739
   macro avg       0.99      0.98      0.99     70739
weighted avg       0.99      0.99      0.99     70739

🔹 Accuracy Score: 0.986570350160449


In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import unicodedata
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import BernoulliNB
from difflib import SequenceMatcher

# Step 1: Download the dataset and keep the raw feature table
phiusiil = fetch_ucirepo(id=967)
X_raw = phiusiil.data.features

# A DataFrame of targets is reduced to its first column; any other type is
# used unchanged.
if isinstance(phiusiil.data.targets, pd.DataFrame):
    y = phiusiil.data.targets.iloc[:, 0]
else:
    y = phiusiil.data.targets

# Step 2: Feature Engineering from URL
def extract_features(url):
    """Compute hand-crafted lexical features for a single URL.

    Args:
        url: The URL as a string.

    Returns:
        dict: Feature name -> value, in this order:
            url_length        number of characters in the URL
            has_https         1 if the scheme is exactly 'https', else 0
            num_dots          count of '.' in the lower-cased URL
            num_hyphens       count of '-' in the lower-cased URL
            num_slashes       count of '/' in the lower-cased URL
            num_digits        count of digit characters in the lower-cased URL
            has_ip            1 if the host is an IPv4/IPv6 address literal, else 0
            homograph_google  SequenceMatcher ratio of the netloc vs 'google.com'
            homograph_paypal  SequenceMatcher ratio of the netloc vs 'paypal.com'
            homograph_amazon  SequenceMatcher ratio of the netloc vs 'amazon.com'
    """
    # Imported locally so the function does not depend on another cell's imports.
    import ipaddress

    parsed = urlparse(url)
    hostname = parsed.netloc.lower()
    full = url.lower()

    def looks_like(target, url):
        return SequenceMatcher(None, target, url).ratio()

    def is_ip_literal(host):
        # ip_address() raises ValueError for anything that is not a valid
        # IPv4/IPv6 literal (including the empty string).
        try:
            ipaddress.ip_address(host)
        except ValueError:
            return False
        return True

    features = {
        'url_length': len(full),
        'has_https': int(parsed.scheme == 'https'),
        'num_dots': full.count('.'),
        'num_hyphens': full.count('-'),
        'num_slashes': full.count('/'),
        'num_digits': sum(c.isdigit() for c in full),
        # The previous check only asked whether the first hostname label
        # contained a digit, so hosts such as 'web3.example.com' were flagged.
        # parsed.hostname drops any userinfo/port and is None without a netloc.
        'has_ip': int(is_ip_literal(parsed.hostname or '')),
        'homograph_google': looks_like('google.com', hostname),
        'homograph_paypal': looks_like('paypal.com', hostname),
        'homograph_amazon': looks_like('amazon.com', hostname)
    }
    return features

# Apply to dataset: one feature dict per URL, assembled into a DataFrame in one go
X = pd.DataFrame([extract_features(url) for url in X_raw['URL']])

# Step 3: Train-test split (20% held out; fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Incremental Learning
scaler = StandardScaler()
# random_state pins the estimator's internal randomness so that re-running the
# notebook reproduces the same model and metrics.
clf = SGDClassifier(loss='log_loss', max_iter=1000, random_state=42)

# Split row *positions* into 10 contiguous chunks and slice with .iloc.
# Calling np.array_split directly on a DataFrame/Series goes through pandas'
# deprecated `swapaxes` path (the source of the stray warning lines in the
# previously recorded output); slicing by position yields the same batches.
batch_positions = np.array_split(np.arange(len(X_train)), 10)
X_train_batches = [X_train.iloc[pos] for pos in batch_positions]
y_train_batches = [y_train.iloc[pos] for pos in batch_positions]

# Scale first batch and fit. The scaler's statistics come from this first
# batch only; every later batch and the test set are transformed with them.
# partial_fit needs the full set of classes on its first call.
X0 = scaler.fit_transform(X_train_batches[0])
clf.partial_fit(X0, y_train_batches[0], classes=np.unique(y_train))

# Incremental updates with the remaining batches
for i in range(1, len(X_train_batches)):
    Xi = scaler.transform(X_train_batches[i])
    clf.partial_fit(Xi, y_train_batches[i])

# Final Evaluation on the held-out rows, scaled with the same scaler
X_test_scaled = scaler.transform(X_test)
y_pred = clf.predict(X_test_scaled)

# Results
print("\n🔹 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n🔹 Classification Report:")
print(classification_report(y_test, y_pred))

print("\n🔹 Accuracy Score:", accuracy_score(y_test, y_pred))



🔹 Confusion Matrix:
[[19837   287]
 [    7 27028]]

🔹 Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     20124
           1       0.99      1.00      0.99     27035

    accuracy                           0.99     47159
   macro avg       0.99      0.99      0.99     47159
weighted avg       0.99      0.99      0.99     47159


🔹 Accuracy Score: 0.9937657711147395


  return bound(*args, **kwds)
  return bound(*args, **kwds)


In [None]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import unicodedata
from difflib import SequenceMatcher
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# Step 1: Download the dataset and keep the raw feature table
phiusiil = fetch_ucirepo(id=967)
X_raw = phiusiil.data.features

# A DataFrame of targets is reduced to its first column; any other type is
# used unchanged.
if isinstance(phiusiil.data.targets, pd.DataFrame):
    y = phiusiil.data.targets.iloc[:, 0]
else:
    y = phiusiil.data.targets

# Step 2: Similarity & Feature Functions
def looks_like(target, url):
    """Return the difflib similarity ratio between ``target`` and ``url``.

    Args:
        target: Reference string to compare against (first sequence).
        url: String being scored (second sequence).

    Returns:
        float: ``SequenceMatcher.ratio()`` for the pair; 1.0 for identical
        strings, 0.0 when they share no characters.
    """
    matcher = SequenceMatcher(None, target, url)
    return matcher.ratio()

def contains_zero_width(url):
    """Tell whether ``url`` contains U+200B, U+200C or U+200D.

    Args:
        url: String to scan.

    Returns:
        bool: True if at least one of the three characters occurs in ``url``.
    """
    for marker in ('\u200b', '\u200c', '\u200d'):
        if marker in url:
            return True
    return False

def is_punycode(url):
    """Tell whether the URL's network location contains the 'xn--' marker.

    The network location is lower-cased before the check, so hosts written
    as 'XN--...' are detected as well. Only the netloc is inspected; the
    path and query are ignored.

    Args:
        url: The URL as a string.

    Returns:
        bool: True if 'xn--' occurs in the lower-cased netloc.
    """
    return 'xn--' in urlparse(url).netloc.lower()

def extract_features(url):
    """Compute hand-crafted lexical and homograph features for a single URL.

    Args:
        url: The URL as a string.

    Returns:
        dict: Feature name -> value, in this order:
            url_length          number of characters in the URL
            has_https           1 if the scheme is exactly 'https', else 0
            num_dots            count of '.' in the lower-cased URL
            num_hyphens         count of '-' in the lower-cased URL
            num_slashes         count of '/' in the lower-cased URL
            num_digits          count of digit characters in the lower-cased URL
            has_ip              1 if the host is an IPv4/IPv6 address literal, else 0
            homograph_google    looks_like('google.com', netloc)
            homograph_paypal    looks_like('paypal.com', netloc)
            homograph_amazon    looks_like('amazon.com', netloc)
            has_zero_width      1 if contains_zero_width() is true for the URL
            is_punycode         1 if is_punycode() is true for the URL
            homograph_combined  largest of the three homograph scores
    """
    # Imported locally so the function does not depend on another cell's imports.
    import ipaddress

    parsed = urlparse(url)
    hostname = parsed.netloc.lower()
    full = url.lower()

    # Score each brand once and reuse the values for 'homograph_combined'
    # instead of running every comparison twice.
    sim_google = looks_like('google.com', hostname)
    sim_paypal = looks_like('paypal.com', hostname)
    sim_amazon = looks_like('amazon.com', hostname)

    # The previous check only asked whether the first hostname label contained
    # a digit, so hosts such as 'web3.example.com' were flagged as IPs.
    # parsed.hostname drops any userinfo/port and is None without a netloc;
    # ip_address() raises ValueError for anything that is not an IP literal.
    try:
        ipaddress.ip_address(parsed.hostname or '')
        host_is_ip = 1
    except ValueError:
        host_is_ip = 0

    features = {
        'url_length': len(full),
        'has_https': int(parsed.scheme == 'https'),
        'num_dots': full.count('.'),
        'num_hyphens': full.count('-'),
        'num_slashes': full.count('/'),
        'num_digits': sum(c.isdigit() for c in full),
        'has_ip': host_is_ip,
        'homograph_google': sim_google,
        'homograph_paypal': sim_paypal,
        'homograph_amazon': sim_amazon,
        'has_zero_width': int(contains_zero_width(full)),
        'is_punycode': int(is_punycode(url)),
        'homograph_combined': max(sim_google, sim_paypal, sim_amazon)
    }
    return features

# Step 3: Extract Features (one feature dict per URL, assembled into a DataFrame)
X = pd.DataFrame([extract_features(url) for url in X_raw['URL']])

# Step 4: Train-Test Split (20% held out; fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Model Training with XGBoost
# `use_label_encoder` is no longer passed: the installed XGBoost reports it as
# an unused parameter (see the warning in the previously recorded output).
clf = XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, eval_metric='logloss')
clf.fit(X_train, y_train)

# Step 6: Predictions and Evaluation on the held-out rows
y_pred = clf.predict(X_test)

print("\n\U0001F4D9 Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\n\U0001F4D9 Classification Report:")
print(classification_report(y_test, y_pred))

print("\n\U0001F4D9 Accuracy Score:", accuracy_score(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.




📙 Confusion Matrix:
[[19947   177]
 [   11 27024]]

📙 Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     20124
           1       0.99      1.00      1.00     27035

    accuracy                           1.00     47159
   macro avg       1.00      1.00      1.00     47159
weighted avg       1.00      1.00      1.00     47159


📙 Accuracy Score: 0.9960134862910579


In [None]:
import pickle

In [None]:

# Serialize the trained model bound to `clf` into a pickle file.
# Only load this file back from a trusted location: unpickling executes code.
model_filename = 'xgb_phishing_model.pkl'
with open(model_filename, 'wb') as f:
    pickle.Pickler(f).dump(clf)
