In [22]:
# Step 1: Imports
import pandas as pd
import numpy as np
import re
from urllib.parse import urlparse
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

# Step 2: Load Dataset
# Location of the labelled phishing/legitimate URL dataset (CSV).
DATASET_PATH = "/content/Phishing_Legitimate_full.csv"
df = pd.read_csv(DATASET_PATH)

# Step 3: Define phishing keywords
# Substrings whose presence in a URL is turned into a binary `kw_<word>` feature.
phishing_keywords = [
    'login', 'verify', 'account', 'update',
    'secure', 'free', 'click', 'paypal',
]

# Step 4: Define features
# Column names fed to the model: the fixed dataset columns first, followed by
# one `kw_<word>` flag per phishing keyword (same order as phishing_keywords).
url_features = [
    'URLLength',
    'DomainLength',
    'IsDomainIP',
    'NoOfDegitsInURL',
    'DegitRatioInURL',
    'NoOfEqualsInURL',
    'NoOfQMarkInURL',
    'NoOfAmpersandInURL',
    'NoOfOtherSpecialCharsInURL',
    'SpacialCharRatioInURL',
    'IsHTTPS',
    'NoOfLettersInURL',
    'LetterRatioInURL',
    'Bank',
    'Pay',
    'Crypto',
]
url_features.extend('kw_' + word for word in phishing_keywords)

# Step 5: Add keyword features
def add_keyword_features(df, keywords=None):
    """Add one binary ``kw_<keyword>`` column per keyword to *df*.

    A flag is 1 when the lower-cased value of the ``URL`` column contains the
    keyword as a literal substring, otherwise 0.

    Args:
        df: DataFrame with a ``URL`` column. It is modified in place.
        keywords: Iterable of lower-case substrings to look for. Defaults to
            the module-level ``phishing_keywords`` list.

    Returns:
        The same DataFrame object, with the new columns added.
    """
    if keywords is None:
        keywords = phishing_keywords

    # Lower-case once instead of once per keyword.
    lowered_urls = df['URL'].str.lower()
    for kw in keywords:
        # regex=False: keywords are plain substrings, so characters such as
        # '.' must not be interpreted as regex metacharacters.
        # na=False: a missing URL counts as "keyword absent"; without it the
        # result contains NaN and the integer cast below raises.
        df[f'kw_{kw}'] = lowered_urls.str.contains(kw, regex=False, na=False).astype(int)
    return df

# Attach the kw_* flag columns so every name in url_features exists in df.
df = add_keyword_features(df)

# Step 6: Prepare data
# X holds the model inputs, y the target column; 20% of the rows are held out
# for evaluation with a fixed seed so the split is reproducible.
X, y = df[url_features], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 7: Train model with class balance
model = RandomForestClassifier(class_weight='balanced', random_state=42)
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("✅ Model Accuracy:", accuracy)

# Step 8: Feature extraction for URL
def extract_features_from_url(url, keywords=None):
    """Compute the model's input features for a single URL string.

    Args:
        url: The URL to analyse.
        keywords: Iterable of lower-case substrings that each produce a
            ``kw_<keyword>`` flag. Defaults to the module-level
            ``phishing_keywords`` list.

    Returns:
        dict mapping feature name to value: character counts and ratios over
        the whole URL, domain length, IP/HTTPS flags, the ``Bank``/``Pay``/
        ``Crypto`` substring flags and one ``kw_<keyword>`` flag per keyword.
        All ratio features are 0.0 for an empty URL.
    """
    if keywords is None:
        keywords = phishing_keywords

    parsed = urlparse(url)
    # Without a scheme urlparse leaves netloc empty and puts everything in path.
    domain = parsed.netloc if parsed.netloc else parsed.path
    # netloc may carry "user:pass@" and ":port"; the IP test must only look at
    # the bare host, otherwise "192.168.0.1:8080" is never recognised as an IP.
    host = parsed.hostname or domain

    total_chars = len(url)
    lowered = url.lower()  # computed once for all substring checks

    def ratio(count):
        # Guard against division by zero on an empty URL.
        return count / total_chars if total_chars else 0.0

    digit_count = sum(c.isdigit() for c in url)
    letter_count = sum(c.isalpha() for c in url)
    equals_count = url.count('=')
    qmark_count = url.count('?')
    ampersand_count = url.count('&')
    # Everything that is not a letter, digit or underscore.
    special_count = len(re.findall(r'[^\w]', url))

    features = {
        'URLLength': total_chars,
        'DomainLength': len(domain),
        'IsDomainIP': 1 if re.fullmatch(r'\d{1,3}(\.\d{1,3}){3}', host) else 0,
        'NoOfDegitsInURL': digit_count,
        'DegitRatioInURL': ratio(digit_count),
        'NoOfEqualsInURL': equals_count,
        'NoOfQMarkInURL': qmark_count,
        'NoOfAmpersandInURL': ampersand_count,
        # "Other" = special characters not already counted individually above.
        'NoOfOtherSpecialCharsInURL': special_count - equals_count - qmark_count - ampersand_count,
        'SpacialCharRatioInURL': ratio(special_count),
        'IsHTTPS': 1 if parsed.scheme == 'https' else 0,
        'NoOfLettersInURL': letter_count,
        'LetterRatioInURL': ratio(letter_count),
        'Bank': 1 if 'bank' in lowered else 0,
        'Pay': 1 if 'pay' in lowered else 0,
        'Crypto': 1 if 'crypto' in lowered else 0,
    }

    # Add keyword flags
    for kw in keywords:
        features[f'kw_{kw}'] = 1 if kw in lowered else 0

    return features

# Step 9: Check if a URL is suspicious (based on manual rules)
def is_suspicious(url, keywords=None, suspicious_tlds=('.xyz', '.top', '.biz', '.tk')):
    """Apply hand-written heuristics to decide whether a URL looks suspicious.

    A URL is flagged when any of the following holds:
      * at least two of the keywords occur anywhere in the URL,
      * the host name ends with one of ``suspicious_tlds``,
      * the host name contains more than two dots.

    The TLD and dot checks look only at the host name, so a path, query
    string, port or trailing slash does not hide a suspicious TLD and dots in
    file names or version numbers are not mistaken for subdomains.

    Args:
        url: The URL to inspect; surrounding whitespace and letter case are
            ignored.
        keywords: Iterable of lower-case substrings to count. Defaults to the
            module-level ``phishing_keywords`` list.
        suspicious_tlds: Host-name suffixes (including the leading dot) that
            are treated as suspicious.

    Returns:
        True if the URL matches any heuristic, otherwise False.
    """
    if keywords is None:
        keywords = phishing_keywords

    url = url.strip().lower()

    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # No scheme given: re-parse so the leading part is read as the host.
            parsed = urlparse('//' + url)
        host = parsed.hostname or ''
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced IPv6 brackets).
        host = ''
    if not host:
        # Could not isolate a host; fall back to checking the whole string.
        host = url

    tld_suspicious = host.endswith(tuple(suspicious_tlds))
    keyword_count = sum(kw in url for kw in keywords)
    too_many_subdomains = host.count('.') > 2
    return keyword_count >= 2 or tld_suspicious or too_many_subdomains

# Step 10: Predict with user input
# Interactive loop: read a URL, classify it with the trained model combined
# with the manual heuristics, and print the verdict until the user types 'exit'.
while True:
    try:
        raw_input_url = input("\nEnter a URL (or type 'exit' to quit): ")
    except EOFError:
        # stdin was closed (e.g. non-interactive run); stop instead of crashing.
        break

    # Stray whitespace would otherwise be counted in the length and
    # special-character features and echoed back in the result line.
    input_url = raw_input_url.strip()
    if input_url.lower() == 'exit':
        break

    # URL schemes are case-insensitive, so accept e.g. "HTTP://" as well.
    if not re.match(r'^https?://', input_url, flags=re.IGNORECASE):
        print("❌ Invalid URL format. Please start with http:// or https://")
        continue

    try:
        features = extract_features_from_url(input_url)
        input_df = pd.DataFrame([features])
        # Put the columns in the exact order the model was trained on.
        input_df = input_df[X_train.columns]

        prediction = model.predict(input_df)[0]
        suspicious = is_suspicious(input_url)

        # Either signal alone is enough to report the URL as phishing.
        if prediction == 1 or suspicious:
            print(f"🔎 Prediction for '{input_url}': 🚨 Phishing Website")
        else:
            print(f"🔎 Prediction for '{input_url}': ✅ Legitimate Website")
    except Exception as e:
        # Top-level boundary of the interactive session: report and keep going.
        print("❌ Error:", e)


✅ Model Accuracy: 0.9972645730401408

Enter a URL (or type 'exit' to quit): adsf
❌ Invalid URL format. Please start with http:// or https://

Enter a URL (or type 'exit' to quit): https://www.google.com    
🔎 Prediction for 'https://www.google.com    ': ✅ Legitimate Website

Enter a URL (or type 'exit' to quit): http://verify-account-secure-login.biz
🔎 Prediction for 'http://verify-account-secure-login.biz': 🚨 Phishing Website

Enter a URL (or type 'exit' to quit): http://free-login-paypal-update.xyz
🔎 Prediction for 'http://free-login-paypal-update.xyz': 🚨 Phishing Website

Enter a URL (or type 'exit' to quit): https://monkeytype.com/
🔎 Prediction for 'https://monkeytype.com/': ✅ Legitimate Website

Enter a URL (or type 'exit' to quit): https://spendssmart.netlify.app/#/
🔎 Prediction for 'https://spendssmart.netlify.app/#/': ✅ Legitimate Website

Enter a URL (or type 'exit' to quit): http://secure-chase-banking-login.page
🔎 Prediction for 'http://secure-chase-banking-login.page': 🚨 Ph