In [None]:
pip install pandas numpy scikit-learn tldextract joblib

In [None]:
import os
import re
from collections import Counter
from urllib.parse import urlparse

import joblib
import numpy as np
import pandas as pd
import tldextract
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Path of the malicious_phish.csv dataset. Set the MALICIOUS_PHISH_CSV environment
# variable to point at your own copy; when it is unset the original author's local
# path is used, so existing setups keep working unchanged.
DATA_PATH = os.environ.get(
    "MALICIOUS_PHISH_CSV",
    r"C:\Users\froze\Downloads\archive (7)\malicious_phish.csv",
)
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Shuffle all rows; the fixed seed makes the resulting order reproducible.
df = df.sample(frac=1, random_state=42)

In [None]:
# Display how many rows carry each distinct value of the 'type' column.
df['type'].value_counts()

In [None]:
# Binary target: 0 where 'type' equals 'benign', 1 for every other value.
df['label'] = df['type'].ne('benign').astype('int64')
df.head()

In [None]:
# Preview the first rows of df.
df.head()


In [None]:
import re
import math
import pandas as pd
import tldextract
from urllib.parse import urlparse


def url_entropy(url):
    """Return the Shannon entropy, in bits per character, of ``url``.

    Args:
        url: The string to measure. Any falsy value (``""`` or ``None``) yields 0.

    Returns:
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of every
        distinct character in ``url``; 0 for empty input.
    """
    if not url:
        return 0
    total = len(url)
    # Counter tallies every character in one pass (the earlier version rescanned
    # the whole string once per distinct character) and iterates in first-seen
    # order, so the floating-point summation order no longer depends on set
    # ordering, which varies with string-hash randomisation between processes.
    counts = Counter(url)
    return -sum((n / total) * math.log2(n / total) for n in counts.values())

def extract_features(url):
    """Build the lexical feature dictionary for a single URL.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        dict whose keys, in insertion order, are: ``url_length``,
        ``domain_length``, ``tld``, ``has_ip``, ``num_dots``, ``num_hyphens``,
        ``num_digits``, ``num_subdomains``, ``has_https``, ``has_at``,
        ``has_redirect``, ``entropy``, ``num_suspicious_keywords`` and
        ``num_special_chars``. ``tld`` is the suffix string reported by
        tldextract; every other value is numeric.
    """
    parsed_url = urlparse(url)
    extracted = tldextract.extract(url)

    # urlparse only populates `netloc` when the URL has a scheme (or starts with
    # "//"). For scheme-less input the host is the text before the first "/",
    # so fall back to that; otherwise has_ip could never be 1 for such URLs.
    host = parsed_url.netloc or url.split('/', 1)[0]

    # Lower-case once instead of once per keyword.
    url_lower = url.lower()

    suspicious_keywords = [
        'login', 'bank', 'update', 'secure', 'verify',
        'account', 'paypal', 'amazon', 'malicious', 'phish'
    ]

    features = {
        'url_length': len(url),
        'domain_length': len(extracted.domain),
        'tld': extracted.suffix,
        # re.match anchors at the start only: the host must begin with a dotted quad.
        'has_ip': 1 if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', host) else 0,
        'num_dots': url.count('.'),
        'num_hyphens': url.count('-'),
        'num_digits': sum(c.isdigit() for c in url),
        'num_subdomains': len(extracted.subdomain.split('.')) if extracted.subdomain else 0,
        'has_https': 1 if parsed_url.scheme == 'https' else 0,
        'has_at': 1 if '@' in url else 0,
        # A "//" inside the path component (not the one following the scheme).
        'has_redirect': 1 if '//' in parsed_url.path else 0,
    }

    features['entropy'] = url_entropy(url)

    # Number of distinct keywords that occur as substrings (case-insensitive).
    features['num_suspicious_keywords'] = sum(1 for kw in suspicious_keywords if kw in url_lower)

    features['num_special_chars'] = sum(1 for c in url if c in '?=&%')

    return features


# Renumber the rows 0..n-1 (dropping the old index) so that df['label'] aligns
# with df_features, which is built from a plain list and gets a default index.
df = df.reset_index(drop=True)

def normalize_url(url):
    """Return ``url`` without its scheme and without a leading ``www.``.

    The result is host + path, followed by ``;params`` and ``?query`` when those
    parts are present. The fragment (``#...``) is not included.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The normalized URL string.
    """
    parsed = urlparse(url)

    # For scheme-less input urlparse leaves netloc empty and the host sits at the
    # start of `path`, so host + path is the right prefix in both cases.
    normalized = parsed.netloc + parsed.path

    # Drop only a leading "www." label. str.replace would also delete "www."
    # from the middle of a host (e.g. "awww.example.com" -> "aexample.com").
    if normalized.startswith('www.'):
        normalized = normalized[len('www.'):]

    # Put back the separators urlparse consumed; without them "login?fake=1"
    # collapsed to "loginfake=1" and the '?' was lost to later feature counts.
    if parsed.params:
        normalized += ';' + parsed.params
    if parsed.query:
        normalized += '?' + parsed.query

    return normalized

# Replace every raw URL with its normalized form.
df['url'] = df['url'].map(normalize_url)

# One feature dict per URL, collected into a frame, plus the target column.
feature_list = [extract_features(u) for u in df['url']]
df_features = pd.DataFrame(feature_list)
df_features['label'] = df['label']

# Keep the 20 most frequent TLDs, fold the rest into 'other', then one-hot encode.
top_tlds = df_features['tld'].value_counts().index[:20]
is_top_tld = df_features['tld'].isin(top_tlds)
df_features['tld'] = df_features['tld'].where(is_top_tld, 'other')
df_features = pd.get_dummies(df_features, columns=['tld'])

df_features.head()


In [None]:
# Separate the target column from the predictor columns.
y = df_features['label']
X = df_features.drop(columns=['label'])



In [None]:
# Hold out 20% of the rows for testing; the seed fixes which rows those are.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 100-tree random forest with class weights balanced; fit() returns the
# estimator itself, so `model` is the fitted forest.
model = RandomForestClassifier(
    class_weight='balanced', n_estimators=100, random_state=42
).fit(X_train, y_train)

model

In [None]:
# Predict the held-out rows and report the fraction classified correctly.
y_pred = model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)


In [None]:
# Print scikit-learn's classification report for the test-set predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Score the model on the same rows it was fitted on (training-set score).
model.score(X_train, y_train)

In [None]:
# Save the training column names, in order, so inference can rebuild the same layout.
expected_columns = list(X.columns)
joblib.dump(expected_columns, filename='expected_columns.pkl')

In [None]:
# Write the fitted classifier to URL_detection_model.pkl.
joblib.dump(model, 'URL_detection_model.pkl')

In [None]:
# Save the TLDs that were kept as their own one-hot columns during training.
joblib.dump(top_tlds, filename='top_tlds.pkl')

In [None]:
# Load the model, the kept-TLD list and the column order back from disk.
# These are pickle-based files: only load artifacts from a source you trust.
model, top_tlds, expected_columns = (
    joblib.load(artifact)
    for artifact in ('URL_detection_model.pkl', 'top_tlds.pkl', 'expected_columns.pkl')
)


In [None]:
# Display the loaded list of expected feature columns.
expected_columns

In [None]:
# Displays the repr of the extract_features function object; has no other effect.
extract_features

In [None]:
def custom_url(url):
    """Classify a single URL with the trained model.

    The URL is passed through ``normalize_url`` before feature extraction, the
    same preprocessing the training URLs received, and the resulting one-row
    frame is aligned to ``expected_columns``.

    Args:
        url: URL string to classify.

    Returns:
        Tuple ``(label, prob_benign, prob_malicious)`` where ``label`` is
        ``"Benign"`` or ``"Malicious"`` and the two probabilities are columns
        0 and 1 of ``model.predict_proba`` for this row.
    """
    # Training features were computed on normalized URLs. Extracting them from
    # the raw string instead changes values such as url_length, num_dots and
    # has_https, so the model would be scored on inputs unlike its training data.
    custom_features = extract_features(normalize_url(url))
    custom_df = pd.DataFrame([custom_features])

    # Same TLD bucketing and one-hot encoding as at training time.
    custom_df['tld'] = custom_df['tld'].apply(lambda x: x if x in top_tlds else 'other')
    custom_df = pd.get_dummies(custom_df, columns=['tld'])

    # Align to the training layout: dummy columns absent from this single row
    # are added as 0 and the column order is made identical.
    custom_df = custom_df.reindex(columns=expected_columns, fill_value=0)

    prediction = model.predict(custom_df)
    probability = model.predict_proba(custom_df)

    label = "Benign" if prediction[0] == 0 else "Malicious"
    prob_benign = probability[0][0]
    prob_malicious = probability[0][1]

    return label, prob_benign, prob_malicious


  
    

In [None]:
# Score the (reloaded) model on the training rows.
model.score(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the held-out split, compute the three summaries, then print them.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("Classification Report:\n", report_text)
print("Confusion Matrix:\n", confusion)


In [None]:
# Smoke test: run a hand-picked list of URLs through custom_url and print the verdicts.
urls_to_test = [
    "https://google.com",
    "https://www.wikipedia.org",
    "https://github.com",
    "https://www.khanacademy.org",
    "https://www.stackoverflow.com",
    "https://www.microsoft.com/en-us",
    "https://www.nytimes.com",
    "https://www.researchgate.net",
    "https://www.bbc.com/news",
    "https://www.coursera.org",
    "http://example-malicious-site.com/login?fake=1",
    "http://192.168.1.1/malware.exe",
    "http://update-banking-info.xyz",
    "http://secure-paypal-login.com/verify",
    "http://amazon-login-security-update.net",
    "http://free-gift-card-reward.click",
    "http://bankofamerica.verify-user.info/login",
    "http://phishing-site.ru/account/update",
    "http://cheap-luxury-products.cn/paypal",
    "http://darkweb-marketplace.onion",
]

for url in urls_to_test:
    label, prob_benign, prob_malicious = custom_url(url)
    # One print call, four lines; the trailing "\n" leaves a blank line between URLs.
    print(
        f"URL: {url}",
        f"Prediction: {label}",
        f"Probability (Benign): {prob_benign:.2f}",
        f"Probability (Malicious): {prob_malicious:.2f}\n",
        sep="\n",
    )