In [2]:
!pip install tld

Collecting tld
  Downloading tld-0.13.1-py2.py3-none-any.whl.metadata (10 kB)
Downloading tld-0.13.1-py2.py3-none-any.whl (274 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/274.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.7/274.7 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tld
Successfully installed tld-0.13.1


In [5]:
import joblib
import pandas as pd
import re
from urllib.parse import urlparse
from tld import get_tld

# Load the saved model and feature names.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts that come from a trusted source.
# NOTE(review): the absolute /content/... paths presumably assume a Colab
# runtime — confirm before running elsewhere.
model = joblib.load('/content/url_model.joblib')
# model_features is used by extract_features to select and order the columns
# of the feature frame passed to the model.
model_features = joblib.load('/content/model_features.joblib')

# Define all feature extraction functions you used before
def process_tld(url):
    """Return the network location of ``url`` as parsed by ``tld.get_tld``.

    ``get_tld`` is called with ``fix_protocol=True`` and
    ``fail_silently=False``, so failures surface as exceptions that are
    handled here.

    Args:
        url: The URL string to parse.

    Returns:
        The ``netloc`` attribute of the parsed URL, or ``None`` if ``get_tld``
        (or the attribute access on its result) raises an ordinary exception.
    """
    try:
        res = get_tld(url, as_object=True, fail_silently=False, fix_protocol=True)
        return res.parsed_url.netloc
    except Exception:
        # Best-effort lookup: any parsing failure maps to None. Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so a long batch run can still be interrupted.
        return None

def abnormal_url(url):
    """Return 1 if the URL's parsed hostname appears verbatim in ``url``, else 0.

    The hostname is compared as a literal substring rather than compiled as a
    regular expression, so characters such as ``+``, ``(`` or ``*`` in the
    host can neither mis-match nor raise ``re.error``.

    Args:
        url: The URL string to inspect.

    Returns:
        1 when ``urlparse(url).hostname`` is found in ``url``; 0 when it is
        not found or when the URL has no hostname (e.g. no ``//`` authority).
    """
    hostname = urlparse(url).hostname
    if hostname is None:
        # No hostname was parsed; do not search for the literal text "None".
        return 0
    # urlparse lower-cases the hostname, so a URL whose host contains
    # upper-case letters yields 0 here (the check is case-sensitive).
    return 1 if hostname in url else 0

def httpSecure(url):
    """Flag whether ``url`` has the ``https`` scheme.

    Returns:
        1 if the scheme parsed by ``urlparse`` equals ``'https'``, otherwise 0.
    """
    scheme = urlparse(url).scheme
    if scheme == 'https':
        return 1
    return 0

def digit_count(url):
    """Count the characters of ``url`` for which ``str.isdigit()`` is true."""
    total = 0
    for ch in url:
        if ch.isdigit():
            total += 1
    return total

def letter_count(url):
    """Count the characters of ``url`` for which ``str.isalpha()`` is true."""
    letters = [ch for ch in url if ch.isalpha()]
    return len(letters)

def Shortining_Service(url):
    """Return 1 if ``url`` contains a known URL-shortener pattern, else 0.

    The pattern is searched anywhere in the string, not only in the hostname.
    """
    # TODO: add all the patterns that were used when the model was trained.
    # NOTE(review): the search is unanchored, so e.g. 't.co' also matches
    # inside 'microsoft.com'. Presumably training used the same matching;
    # confirm before tightening it, to avoid train/serve feature skew.
    if re.search(r'bit\.ly|goo\.gl|tinyurl|ow\.ly|t\.co', url) is None:
        return 0
    return 1

def having_ip_address(url):
    """Return 1 if ``url`` contains a dotted-quad digit sequence, else 0.

    The pattern matches four dot-separated groups of 1-3 digits anywhere in
    the string; it does not check that each group is in the 0-255 range.
    """
    # NOTE(review): dotted version numbers in a path (e.g. '/v1.2.3.4/') also
    # match. Presumably the model was trained with this same pattern —
    # confirm before changing it.
    dotted_quad = re.search(r'(([0-9]{1,3}\.){3}[0-9]{1,3})', url)
    return int(dotted_quad is not None)

# Extract features from a single new URL (must match model features exactly)
def extract_features(url):
    """Build the one-row feature frame for ``url`` in the model's column order.

    Every occurrence of ``www.`` is removed from the URL first; character
    counts and the helper-based flags are then computed on the stripped
    string.

    Args:
        url: The URL string to featurize.

    Returns:
        A single-row ``pandas.DataFrame`` whose columns are exactly
        ``model_features``, in that order. Columns listed in
        ``model_features`` that are not computed here are filled with 0, and
        a ``RuntimeWarning`` naming them is emitted.
    """
    import warnings  # local import: keeps this block self-contained

    url = re.sub(r'www\.', '', url)  # remove www.
    data = {
        'url_len': len(url),
        '@': url.count('@'),
        '?': url.count('?'),
        '-': url.count('-'),
        '=': url.count('='),
        '.': url.count('.'),
        '#': url.count('#'),
        '%': url.count('%'),
        '+': url.count('+'),
        '$': url.count('$'),
        '!': url.count('!'),
        '*': url.count('*'),
        ',': url.count(','),
        '//': url.count('//'),
        'abnormal_url': abnormal_url(url),
        'https': httpSecure(url),
        'digits': digit_count(url),
        'letters': letter_count(url),
        'Shortining_Service': Shortining_Service(url),
        'having_ip_address': having_ip_address(url)
    }

    # A feature the model expects but that is not computed here would
    # otherwise be zero-filled silently, which hides train/serve mismatches
    # and can skew every prediction. Keep the zero-fill, but make it visible.
    missing = [col for col in model_features if col not in data]
    if missing:
        warnings.warn(
            f"extract_features: model expects features that are not computed "
            f"and will be filled with 0: {missing}",
            RuntimeWarning,
            stacklevel=2,
        )

    # reindex selects the model's columns in order and zero-fills absent ones.
    return pd.DataFrame([data]).reindex(columns=list(model_features), fill_value=0)

# Example new URL
new_url = "https://bit.ly/phishing_test"

# Prepare feature vector: a one-row DataFrame with columns ordered as model_features
X_new = extract_features(new_url)

# Predict using the loaded model; [0] takes the prediction for the single row
prediction = model.predict(X_new)[0]

# Map your numeric label back to text.
# NOTE(review): this mapping presumably mirrors the label encoding used at
# training time — confirm against the training notebook.
labels = {0: "benign", 1: "defacement", 2: "phishing", 3: "malware"}

# The lookup raises KeyError if the model emits a label that is not a key of `labels`.
print(f"Prediction for URL '{new_url}': {labels[prediction]}")


Prediction for URL 'https://bit.ly/phishing_test': phishing


In [6]:
# Hand-written URLs for a quick smoke test of the loaded model.
# NOTE(review): the trailing comments are presumably the author's informal
# expectations, not ground-truth labels — confirm before using them to score.
test_urls = [
    "http://example.com",                            # benign
    "https://www.google.com",                        # benign, secure https
    "http://bit.ly/abc123",                          # shortened URL, possibly malicious
    "http://192.168.1.1/login",                      # URL with IP address
    "https://secure-login-paypal.com.verify-info.xyz",  # suspicious phishing style URL
    "http://free-money-now.com/?click=true",        # potentially phishing
    "https://tinyurl.com/xyz789",                    # shortened URL
    "http://bank-secure-update.com/login?user=abc", # suspicious phishing URL
    "https://github.com",                            # benign
    "http://malicious-site.com/malware.exe",        # possible malware URL
]


In [7]:
# Featurize and classify each test URL one at a time, printing the mapped label.
# NOTE(review): the recorded output labels example.com, google.com and
# github.com as phishing — verify that the extracted features match the ones
# the model was trained on before trusting these predictions.
for url in test_urls:
    features = extract_features(url)
    # [0] takes the prediction for the single-row feature frame
    pred = model.predict(features)[0]
    print(f"URL: {url}\nPrediction: {labels[pred]}\n")

URL: http://example.com
Prediction: phishing

URL: https://www.google.com
Prediction: phishing

URL: http://bit.ly/abc123
Prediction: phishing

URL: http://192.168.1.1/login
Prediction: malware

URL: https://secure-login-paypal.com.verify-info.xyz
Prediction: phishing

URL: http://free-money-now.com/?click=true
Prediction: defacement

URL: https://tinyurl.com/xyz789
Prediction: phishing

URL: http://bank-secure-update.com/login?user=abc
Prediction: defacement

URL: https://github.com
Prediction: phishing

URL: http://malicious-site.com/malware.exe
Prediction: defacement

