In [58]:
import numpy as np
import pandas as pd
import hashlib

from sklearn.linear_model import Lasso
from scipy.optimize import lsq_linear
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [59]:
urls = pd.read_csv('PhiUSIIL_Phishing_URL_Dataset.csv')
urls.head()

Unnamed: 0,FILENAME,URL,URLLength,Domain,DomainLength,IsDomainIP,TLD,URLSimilarityIndex,CharContinuationRate,TLDLegitimateProb,...,Pay,Crypto,HasCopyrightInfo,NoOfImage,NoOfCSS,NoOfJS,NoOfSelfRef,NoOfEmptyRef,NoOfExternalRef,label
0,521848.txt,https://www.southbankmosaics.com,31,www.southbankmosaics.com,24,0,com,100.0,1.0,0.522907,...,0,0,1,34,20,28,119,0,124,1
1,31372.txt,https://www.uni-mainz.de,23,www.uni-mainz.de,16,0,de,100.0,0.666667,0.03265,...,0,0,1,50,9,8,39,0,217,1
2,597387.txt,https://www.voicefmradio.co.uk,29,www.voicefmradio.co.uk,22,0,uk,100.0,0.866667,0.028555,...,0,0,1,10,2,7,42,2,5,1
3,554095.txt,https://www.sfnmjournal.com,26,www.sfnmjournal.com,19,0,com,100.0,1.0,0.522907,...,1,1,1,3,27,15,22,1,31,1
4,151578.txt,https://www.rewildingargentina.org,33,www.rewildingargentina.org,26,0,org,100.0,1.0,0.079963,...,1,0,1,244,15,34,72,1,85,1


## Bloom Setup 

In [135]:
NUM_BLOOMBITS = 1438
NUM_HASHES = 10

def hash_domain(url, num_bloombits, num_hashes):
    bits = np.zeros(num_bloombits, dtype=bool)
    
    for i in range(num_hashes):
        salted_input = f"{i}:{url}".encode()  # salt by prefixing i
        hash_digest = hashlib.sha256(salted_input).hexdigest()
        index = int(hash_digest, 16) % num_bloombits
        bits[index] = True

    return bits

def create_candidate_set(urls, num_bloombits, num_hashes):
    """
    Create a candidate set of URLs with their hashed indices.
    """
    candidate_set = []
    for url in urls:
        bits = hash_domain(url, num_bloombits, num_hashes)
        candidate_set.append(bits)
    return np.array(candidate_set)

# Get 100 URLS that have label set to 0
urls_mal = urls[urls['label'] == 0].sample(100, random_state=42)
urls_mal = np.array(urls_mal['URL'])

candidate_set = create_candidate_set(urls_mal, NUM_BLOOMBITS, NUM_HASHES)
print("Candidate Set Shape:", candidate_set.shape)

Candidate Set Shape: (100, 1438)


## RAPPOR - View Creation

In [213]:
def prr_view(url, num_bloombits, num_hashes, f):
    bits = hash_domain(url, num_bloombits, num_hashes)

    # Randomly keep bits with probability f
    rand_mask = np.random.rand(num_bloombits)
    keep_mask = rand_mask < 1 - f
    random_bits = np.random.rand(num_bloombits) < 0.5

    prr = np.where(keep_mask, bits, random_bits)
    return prr

def irr_view(prr, p, q):
    rand = np.random.rand(len(prr))

    # If prr is True (1), keep with probability q; if False (0), set with probability p
    irr = np.where(
        prr,
        rand < q,
        rand < p
    )

    return irr

In [225]:
prob_p = 0.5
prob_q = 0.75
prob_f = 0.5

# Select 10 urls with label set to 1
normal = urls[urls['label'] == 1].sample(10, random_state=42)
normal = normal['URL'].tolist()

# Select 1 url  from urls_mal
random_index = np.random.choice(len(urls_mal), 1, replace=False)
phishing = urls_mal[random_index].tolist()

query_set = normal + phishing

In [226]:
query_set

['https://www.atelierozmoz.be',
 'https://www.diemon.com',
 'https://www.wausauschools.org',
 'https://www.paademode.com',
 'https://www.boxturtles.com',
 'https://www.mmstadium.com',
 'https://www.brswimwear.com',
 'https://www.leathercouncil.org',
 'https://www.historync.org',
 'https://www.toshin.com',
 'https://leszek.arekhasnik.pl/add/email@example.com']

In [227]:
prr_views = []
for url in query_set:
    prr = prr_view(url, NUM_BLOOMBITS, NUM_HASHES, prob_f)
    prr_views.append(prr)

prr_views = np.array(prr_views)
print("PRR Views Shape:", prr_views.shape) 

PRR Views Shape: (11, 1438)


In [228]:
num_view = 1000

irr_matrix = []
for n in range(num_view):
    for prr in prr_views:
        irr = irr_view(prr, prob_p, prob_q)
        irr_matrix.append(irr)

irr_matrix = np.array(irr_matrix)
print(irr_matrix.shape)

(11000, 1438)


LASSO

In [229]:
y = irr_matrix.mean(axis=0)
n_samples = candidate_set.shape[0]
lambda_lasso = 0.01

print("y shape:", y.shape)
print("Candidate set shape:", candidate_set.shape)
alpha = lambda_lasso / n_samples

model = make_pipeline(
    StandardScaler(),
    Lasso(
        alpha=alpha,
        fit_intercept=True,
        positive=False,
        tol=1e-4,
        max_iter=2000
    )
)

try:
    model.fit(candidate_set.T, y)
    # Extract Lasso from pipeline to get coefficients
    lasso_coef = model.named_steps['lasso'].coef_
    reconstructed_freq = np.maximum(lasso_coef, 0)
    nonzero = np.sum(reconstructed_freq > 0)
    print(f"Used LASSO: {nonzero} non-zero coefficients")
    if nonzero == 0:
        raise Exception("Fallback to lsqnonneg")
except:
    res = lsq_linear(candidate_set.T, y, bounds=(0, np.inf))
    reconstructed_freq = res.x
    print(f"Used lsqnonneg: {np.sum(reconstructed_freq > 0)} non-zero coefficients")


y shape: (1438,)
Candidate set shape: (100, 1438)
Used LASSO: 49 non-zero coefficients


In [230]:
threshold = 0.001
detection = np.where(reconstructed_freq > threshold)

urls_detected = urls_mal[detection]
urls_detected

array(['http://www.soeme.com',
       'https://leszek.arekhasnik.pl/add/email@example.com',
       'https://service-106870.weeblysite.com/',
       'https://www-info-orico.kilnpay.com/',
       'http://mail.pemulihan11297.3gp.work.gd/',
       'http://www.moveonedegree.com',
       'https://gateway.ipfs.io/ipfs/bafybeihbg3efsd2hiqwl3xba37vfj5vcd23zwy747mcz6nkc7piu3gecpq/dropmebox22_ofic4.html',
       'https://optusnet-login.com/au/webmail/index.html',
       'https://authsecuritecle.web.app/',
       'https://login-cliente-rapido2023.com/luiza/home.php?userid=&uri=yc4kgyxpyiiys6i9h2e5vzj2n/mgjfs1d73alcee5ug=',
       'http://nilstiers.com',
       'https://accessible0-document-morroco-3jl58.ondigitalocean.app/',
       'http://www.radiolevi.ro',
       'https://ksheami.com/?article301=&_=/spip.php%232ul6dol2gmegls4woqvi8s0qdn05rgzpwjtdrzhh1pba',
       'http://www.myanmarnewsrecent.com'], dtype=object)

## Metric

In [231]:
# Check if phishing URL is detected
detected = 0
not_detected = 0
total_phishing = len(phishing)

for url in phishing:

    if url in urls_detected:
        detected += 1
    else:
        not_detected += 1
    
print(f"Detected: {detected}, Not Detected: {not_detected}, Total: {total_phishing} (Accuracy: {detected / total_phishing:.2f})")

# Check how many phishing urls are detected that are not in the candidate set
in_candidate = 0
for url in urls_mal:
    if url in urls_detected:
        in_candidate += 1

print(f"Phishing URLs in candidate set: {in_candidate} out of {len(urls_mal)}")

Detected: 1, Not Detected: 0, Total: 1 (Accuracy: 1.00)
Phishing URLs in candidate set: 15 out of 100
