In [2]:
pip install pandas numpy scikit-learn tldextract joblib

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from urllib.parse import urlparse
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import tldextract 
import joblib  

In [4]:
df=pd.read_csv(r"C:\Users\froze\Downloads\archive (7)\malicious_phish.csv")
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


In [5]:
df = df.sample(n=len(df), random_state=42)

In [6]:
df['type'].value_counts()

type
benign        428103
defacement     96457
phishing       94111
malware        32520
Name: count, dtype: int64

In [7]:
df['label']= df['type'].apply(lambda x :0 if x=='benign' else 1)
df.head()

Unnamed: 0,url,type,label
536448,http://37.49.226.178/deusbins/deus.sh4,malware,1
40630,medical-dictionary.thefreedictionary.com/Galt+...,benign,0
630496,www.jscape.com/sshfactory/,phishing,1
426724,http://www.wsnc.org.au/component/jcalpro/view/983,defacement,1
184034,virtualtourist.com/travel/North_America/Canada...,benign,0


In [8]:
df.head()


Unnamed: 0,url,type,label
536448,http://37.49.226.178/deusbins/deus.sh4,malware,1
40630,medical-dictionary.thefreedictionary.com/Galt+...,benign,0
630496,www.jscape.com/sshfactory/,phishing,1
426724,http://www.wsnc.org.au/component/jcalpro/view/983,defacement,1
184034,virtualtourist.com/travel/North_America/Canada...,benign,0


In [9]:
import re
import math
import pandas as pd
import tldextract
from urllib.parse import urlparse


def url_entropy(url):
    """Calculate Shannon entropy of the URL."""
    if not url:
        return 0
    freq = {c: url.count(c) for c in set(url)}
    return -sum((f / len(url)) * math.log2(f / len(url)) for f in freq.values())

def extract_features(url):
    parsed_url = urlparse(url)
    extracted = tldextract.extract(url)
    
    features = {
        'url_length': len(url),
        'domain_length': len(extracted.domain),
        'tld': extracted.suffix,
        'has_ip': 1 if re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', parsed_url.netloc) else 0,
        'num_dots': url.count('.'),
        'num_hyphens': url.count('-'),
        'num_digits': sum(c.isdigit() for c in url),
        'num_subdomains': len(extracted.subdomain.split('.')) if extracted.subdomain else 0,
        'has_https': 1 if parsed_url.scheme == 'https' else 0,
        'has_at': 1 if '@' in url else 0,
        'has_redirect': 1 if '//' in parsed_url.path else 0,
    }
    
  
    features['entropy'] = url_entropy(url)
    
    suspicious_keywords = [
        'login', 'bank', 'update', 'secure', 'verify', 
        'account', 'paypal', 'amazon', 'malicious', 'phish'
    ]
    features['num_suspicious_keywords'] = sum(1 for kw in suspicious_keywords if kw in url.lower())
    
    features['num_special_chars'] = sum(1 for c in url if c in '?=&%')

    return features


df = df.reset_index(drop=True)

def normalize_url(url):
    parsed = urlparse(url)
    netloc = parsed.netloc.replace('www.', '')  # Remove 'www.'
    return netloc + parsed.path + parsed.params + parsed.query  # Rebuild without scheme

df['url'] = df['url'].apply(normalize_url)


feature_list = df['url'].apply(extract_features).tolist()
df_features = pd.DataFrame(feature_list)
df_features['label'] = df['label']  


top_tlds = df_features['tld'].value_counts().index[:20]
df_features['tld'] = df_features['tld'].apply(lambda x: x if x in top_tlds else 'other')
df_features = pd.get_dummies(df_features, columns=['tld'])

df_features.head()


Unnamed: 0,url_length,domain_length,has_ip,num_dots,num_hyphens,num_digits,num_subdomains,has_https,has_at,has_redirect,...,tld_gov,tld_info,tld_it,tld_jp,tld_net,tld_nl,tld_org,tld_other,tld_pl,tld_ru
0,31,13,0,4,0,11,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False
1,54,17,0,2,1,0,1,0,0,0,...,False,False,False,False,False,False,False,False,False,False
2,26,6,0,2,0,0,1,0,0,0,...,False,False,False,False,False,False,False,False,False,False
3,38,4,0,2,0,3,0,0,0,0,...,False,False,False,False,False,False,False,True,False,False
4,121,14,0,2,5,7,0,0,0,0,...,False,False,False,False,False,False,False,False,False,False


In [10]:
X = df_features.drop('label', axis=1)
y=df_features['label']



In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model= RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')

model.fit(X_train,y_train)

In [12]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.9006979476193767


In [13]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.93      0.93     85489
           1       0.87      0.84      0.85     44750

    accuracy                           0.90    130239
   macro avg       0.89      0.89      0.89    130239
weighted avg       0.90      0.90      0.90    130239



In [14]:
model.score(X_train, y_train)

0.984363242678788

In [15]:
expected_columns = X.columns.tolist()
expected_columns
joblib.dump(expected_columns, 'expected_columns.pkl')

['expected_columns.pkl']

In [16]:
joblib.dump(model, 'URL_detection_model.pkl')

['URL_detection_model.pkl']

In [17]:
top_tlds
joblib.dump(top_tlds, 'top_tlds.pkl')

['top_tlds.pkl']

In [18]:
model=joblib.load('URL_detection_model.pkl')
top_tlds = joblib.load('top_tlds.pkl')
expected_columns = joblib.load('expected_columns.pkl')


In [19]:
expected_columns

['url_length',
 'domain_length',
 'has_ip',
 'num_dots',
 'num_hyphens',
 'num_digits',
 'num_subdomains',
 'has_https',
 'has_at',
 'has_redirect',
 'entropy',
 'num_suspicious_keywords',
 'num_special_chars',
 'tld_',
 'tld_ca',
 'tld_co.uk',
 'tld_com',
 'tld_com.au',
 'tld_com.br',
 'tld_de',
 'tld_edu',
 'tld_es',
 'tld_eu',
 'tld_fr',
 'tld_gov',
 'tld_info',
 'tld_it',
 'tld_jp',
 'tld_net',
 'tld_nl',
 'tld_org',
 'tld_other',
 'tld_pl',
 'tld_ru']

In [20]:
extract_features

<function __main__.extract_features(url)>

In [21]:
def custom_url(url):
    custom_features=extract_features(url)
    custom_df=pd.DataFrame([custom_features])

    custom_df['tld']=custom_df['tld'].apply(lambda x: x if x in top_tlds else 'other')
    custom_df=pd.get_dummies(custom_df,columns=['tld'])


    for col in expected_columns:
        if col not in custom_df.columns:
            custom_df[col]=0
    custom_df = custom_df[expected_columns]

    prediction=model.predict(custom_df)
    probability=model.predict_proba(custom_df)

    label = "Benign" if prediction[0] == 0 else "Malicious"
    prob_benign = probability[0][0]
    prob_malicious = probability[0][1]

    return label, prob_benign, prob_malicious


  
    

In [22]:
model.score(X_train,y_train)

0.984363242678788

In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict on test data
y_pred = model.predict(X_test)

# Accuracy
print("Accuracy:", accuracy_score(y_test, y_pred))

# Classification report
print("Classification Report:\n", classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9006979476193767
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.93     85489
           1       0.87      0.84      0.85     44750

    accuracy                           0.90    130239
   macro avg       0.89      0.89      0.89    130239
weighted avg       0.90      0.90      0.90    130239

Confusion Matrix:
 [[79894  5595]
 [ 7338 37412]]


In [24]:
#testing cell
urls_to_test = [
    "https://google.com",
    "https://www.wikipedia.org",
    "https://github.com",
    "https://www.khanacademy.org",
    "https://www.stackoverflow.com",
    "https://www.microsoft.com/en-us",
    "https://www.nytimes.com",
    "https://www.researchgate.net",
    "https://www.bbc.com/news",
    "https://www.coursera.org",
    "http://example-malicious-site.com/login?fake=1",
    "http://192.168.1.1/malware.exe",
    "http://update-banking-info.xyz",
    "http://secure-paypal-login.com/verify",
    "http://amazon-login-security-update.net",
    "http://free-gift-card-reward.click",
    "http://bankofamerica.verify-user.info/login",
    "http://phishing-site.ru/account/update",
    "http://cheap-luxury-products.cn/paypal",
    "http://darkweb-marketplace.onion"
]

    
for url in urls_to_test:
    label, prob_benign, prob_malicious = custom_url(url)
    print(f"URL: {url}")
    print(f"Prediction: {label}")
    print(f"Probability (Benign): {prob_benign:.2f}")
    print(f"Probability (Malicious): {prob_malicious:.2f}\n")

URL: https://google.com
Prediction: Benign
Probability (Benign): 0.78
Probability (Malicious): 0.22

URL: https://www.wikipedia.org
Prediction: Benign
Probability (Benign): 0.59
Probability (Malicious): 0.41

URL: https://github.com
Prediction: Benign
Probability (Benign): 1.00
Probability (Malicious): 0.00

URL: https://www.khanacademy.org
Prediction: Benign
Probability (Benign): 0.59
Probability (Malicious): 0.41

URL: https://www.stackoverflow.com
Prediction: Benign
Probability (Benign): 0.52
Probability (Malicious): 0.48

URL: https://www.microsoft.com/en-us
Prediction: Benign
Probability (Benign): 0.87
Probability (Malicious): 0.13

URL: https://www.nytimes.com
Prediction: Malicious
Probability (Benign): 0.15
Probability (Malicious): 0.85

URL: https://www.researchgate.net
Prediction: Malicious
Probability (Benign): 0.10
Probability (Malicious): 0.90

URL: https://www.bbc.com/news
Prediction: Benign
Probability (Benign): 0.75
Probability (Malicious): 0.25

URL: https://www.courser