In [1]:
import numpy as np
import tensorflow as tf 
from keras.models import load_model

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten,Subtract,Reshape
from keras.preprocessing import image
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D,Conv2D,MaxPooling2D,Input,Lambda,GlobalMaxPooling2D
from keras.regularizers import l2
from keras import backend as K
from keras.applications.vgg16 import VGG16
from skimage.io import imsave

from matplotlib.pyplot import imread
from skimage.transform import rescale, resize
import os

2025-01-18 11:16:55.725388: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-18 11:16:57.831198: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-01-18 11:16:57.831418: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


# Load precomputed embeddings

In [2]:
# Directory holding the precomputed .npy files, and root of the image dataset
output_dir = '../../../notebooks/'
dataset_path = '../../../data/processed/smallerSampleDataset/'

# Whitelist (legit) training embeddings and their labels
train_emb_name = 'whitelist_emb.npy'
train_emb_labels_name = 'whitelist_labels.npy'

# Phishing embeddings, their labels and the saved train / test split indices
phish_emb_name = 'phishing_emb.npy'
phish_emb_labels_name = 'phishing_labels.npy'
phish_train_idx_name = 'train_idx.npy'
phish_test_idx_name = 'test_idx.npy'

# Optional precomputed embeddings of an attacked phishing test set.
# Set use_attack to 1 to evaluate on these instead of the clean test split.
phish_emb_test_attack = 'X_phish_test_noise_gamma.npy'
use_attack = 0

# Load everything from output_dir (legit set, phishing set, then split indices)
X_legit_train = np.load(f'{output_dir}{train_emb_name}')
y_legit_train = np.load(f'{output_dir}{train_emb_labels_name}')

X_phish = np.load(f'{output_dir}{phish_emb_name}')
y_phish = np.load(f'{output_dir}{phish_emb_labels_name}')

phish_test_idx = np.load(f'{output_dir}{phish_test_idx_name}')
phish_train_idx = np.load(f'{output_dir}{phish_train_idx_name}')

# Select the rows of the phishing set that belong to each split
X_phish_test, y_phish_test = X_phish[phish_test_idx,:], y_phish[phish_test_idx,:]
X_phish_train, y_phish_train = X_phish[phish_train_idx,:], y_phish[phish_train_idx,:]

# Replace the test embeddings with the attacked ones when requested;
# the test labels (y_phish_test) are left as they are.
if use_attack == 1:
    X_phish_test = np.load(f'{output_dir}{phish_emb_test_attack}')
    print(f'Test on: {phish_emb_test_attack}')


In [3]:
# Get file names of each example 
def read_file_names(data_path,file_name):
    """Collect the file names of every example listed under ``data_path``.

    ``data_path + file_name`` is read as a text file containing one target
    directory name per line. Each directory ``data_path + <line>`` is listed
    and its entries are appended in sorted order, one target after another.

    Args:
        data_path: Directory prefix. It is concatenated directly with the
            other path parts, so it must end with a path separator.
        file_name: Name of the targets list file located in ``data_path``.

    Returns:
        A flat list of file names (bare names, not paths), ordered by target
        (in the order of the list file) and alphabetically within a target.

    Raises:
        OSError: If the list file or one of the target directories cannot be
            opened / listed.
    """
    # The context manager guarantees the handle is closed, even on error.
    with open(data_path+file_name, "r") as targets_file:
        targets_list = targets_file.read().splitlines()

    file_names_list = []
    for target in targets_list:
        file_names_list.extend(sorted(os.listdir(data_path + target)))
    return file_names_list

# File names of every legit / phishing example, target by target
# (presumably the same order in which the embeddings were computed - TODO confirm)
legit_file_names = read_file_names(dataset_path+'trusted_list/','targets.txt')
phish_file_names = read_file_names(dataset_path+'phishing/','targets.txt')

# Pick the phishing file names that belong to the train and test splits
phish_train_file_names = [phish_file_names[example_idx] for example_idx in phish_train_idx]
phish_test_file_names = [phish_file_names[example_idx] for example_idx in phish_test_idx]

def get_label_from_name(name):
    """Return the integer target label encoded in a file name.

    The text before the first underscore is taken, every ``'T'`` in it is
    removed and the remainder is parsed with ``int``
    (e.g. ``'T7_8.png'`` -> ``7``).
    """
    prefix, _, _ = name.partition('_')
    return int(prefix.replace('T',''))

In [4]:
# Squared L2 distance, normalised by the embedding dimension
def compute_distance_pair(layer1,layer2):
    """Return the squared L2 distance between two embeddings, divided by their dimension.

    The normaliser is taken from the inputs themselves (size of the last
    axis of the difference) instead of from the notebook-global
    ``X_phish_train``, so the function no longer depends on hidden state.
    For embeddings that are rows of ``X_phish_train`` / same-width matrices
    the result is unchanged.

    Args:
        layer1: numpy array holding the first embedding.
        layer2: numpy array holding the second embedding; must broadcast
            against ``layer1``.

    Returns:
        ``sum((layer1 - layer2) ** 2) / dim`` as a numpy scalar, where
        ``dim`` is the length of the last axis of the difference.
    """
    # atleast_1d keeps scalar inputs working (dimension 1)
    diff = np.atleast_1d(layer1 - layer2)
    l2_diff = np.sum(diff**2) / diff.shape[-1]
    return l2_diff

# Pairwise distance between query image and training
def compute_all_distances(test_matrix, train_matrix=None):
    """Compute the distance from every test embedding to every training embedding.

    The distance is the squared L2 difference divided by the embedding
    dimension (number of columns of the training matrix).

    Args:
        test_matrix: 2-D numpy array, one query embedding per row.
        train_matrix: Optional 2-D numpy array, one training embedding per
            row. When omitted, the notebook globals ``X_phish_train`` and
            ``X_legit_train`` are concatenated in that order, so column
            indices below ``X_phish_train.shape[0]`` refer to phishing
            training examples and the remaining ones to legit examples.

    Returns:
        Float array of shape ``(test_matrix.shape[0], n_train)`` where entry
        ``[i, j]`` is the distance between test row ``i`` and training row
        ``j``. Values match the former element-by-element loop up to
        floating-point summation order.
    """
    if train_matrix is None:
        train_matrix = np.concatenate((X_phish_train,X_legit_train))

    # Size the result from the matrix actually used rather than from the index array
    train_size, emb_dim = train_matrix.shape[0], train_matrix.shape[1]
    pairwise_distance = np.zeros([test_matrix.shape[0],train_size])

    # One vectorised pass per test row keeps the temporary at (n_train, dim)
    for i in range(test_matrix.shape[0]):
        diff = test_matrix[i,:] - train_matrix
        pairwise_distance[i,:] = np.sum(diff**2, axis=1) / emb_dim
    return pairwise_distance
# Distance matrix: one row per X_phish_test embedding, one column per training
# embedding (phishing training examples first, then legit ones).
pairwise_distance = compute_all_distances(X_phish_test)

# Find Smallest n distances
def find_min_distances(distances,n):
    """Return the positions and values of the ``n`` smallest entries.

    Args:
        distances: 1-D numpy array of distances.
        n: Number of smallest entries to keep.

    Returns:
        ``(idx, values)``: the indices of the ``n`` smallest distances in
        ascending order of distance, and the distances at those indices.
    """
    ascending_order = distances.argsort()
    nearest_idx = ascending_order[:n]
    return nearest_idx, distances[nearest_idx]

# Find names of examples with min distance
def find_names_min_distances(idx,values):
    """Translate training-set indices into file names and printable strings.

    Indices below ``X_phish_train.shape[0]`` are looked up in
    ``phish_train_file_names``; larger ones are shifted by that amount and
    looked up in ``legit_file_names``.

    Args:
        idx: 1-D numpy array of indices into the concatenated training set.
        values: Distances belonging to ``idx``, position by position.

    Returns:
        ``(names_min_distance, only_names, distances)`` where
        ``names_min_distance`` is a comma-separated string of names prefixed
        with ``'Phish: '`` or ``'Legit: '``, ``only_names`` is the list of
        bare file names, and ``distances`` is the comma-separated string of
        the distance values.
    """
    num_phish = X_phish_train.shape[0]
    labelled_names = []
    only_names = []
    distance_strings = []
    for position in range(idx.shape[0]):
        train_index = idx[position]
        if train_index < num_phish:
            source, file_name = 'Phish: ', phish_train_file_names[train_index]
        else:
            # Legit examples come after the phishing ones in the training set
            source, file_name = 'Legit: ', legit_file_names[train_index - num_phish]
        labelled_names.append(source + file_name)
        only_names.append(file_name)
        distance_strings.append(str(values[position]))
    return ','.join(labelled_names), only_names, ','.join(distance_strings)

# Find same-category website (a match is correct if it lands in the same category, e.g. microsoft and outlook)
# Human-readable names, kept for reference only; the code below works on the label indices.
# (presumably listed in the same order as the index tables - verify against the dataset)
parents_targets = ['microsoft','apple','google','alibaba']
sub_targets_names = [['ms_outlook','ms_office','ms_bing','ms_onedrive','ms_skype'],['itunes','icloud'],['google_drive'],['aliexpress']]

# Label indices of the parent brands and, position by position, of their sub-brands
parents_targets_idx = [90,12,65,4]
sub_targets = [[150,152,151,149,148],[153,154],[147],[5]]

def check_if_same_category(img_label1,img_label2):
    """Tell whether two labels belong to the same parent / sub-brand group.

    Args:
        img_label1: Integer label of the query example.
        img_label2: Integer label of the candidate match.

    Returns:
        1 when ``img_label1`` is a parent and ``img_label2`` is one of its
        sub-brands, or when ``img_label1`` is a sub-brand and ``img_label2``
        is a sibling sub-brand or the parent of that group; 0 otherwise.
        A parent compared with itself yields 0 - callers test label equality
        separately.
    """
    # Query is a parent brand: only its own sub-brands count as a match
    if img_label1 in parents_targets_idx:
        children = sub_targets[parents_targets_idx.index(img_label1)]
        return int(img_label2 in children)

    # Query is a sub-brand: siblings and the parent count as a match.
    # Iterating over every group also covers the last one, which the
    # previous hand-written chain of branches skipped.
    for parent_idx, children in zip(parents_targets_idx, sub_targets):
        if img_label1 in children:
            return int(img_label2 in children or img_label2 == parent_idx)
    return 0

# Find if target is in the top closest n distances
def check_if_target_in_top(test_file_name,only_names):
    """Look for the test example's target among the closest training examples.

    A candidate counts as a hit when its label equals the test label or when
    ``check_if_same_category`` reports both labels as the same category.
    Prints a separator, the test file name and ``'found'`` on a hit.

    Args:
        test_file_name: File name of the test example.
        only_names: File names of the closest training examples, nearest first.

    Returns:
        ``(found, idx)``: ``found`` is 1 on a hit and 0 otherwise; ``idx`` is
        the 1-based position of the first hit in ``only_names`` (0 if none).
    """
    test_label = get_label_from_name(test_file_name)
    print('***')
    print('Test example: '+test_file_name)
    for rank, candidate_name in enumerate(only_names, start=1):
        candidate_label = get_label_from_name(candidate_name)
        if candidate_label == test_label or check_if_same_category(test_label,candidate_label) == 1:
            print('found')
            return 1, rank
    return 0, 0

# Compute correct matches

In [11]:
n = 1 #Top-1 match
correct = 0
num_test_examples = phish_test_idx.shape[0]

for i in range(num_test_examples):
    # Closest n training examples for the i-th test embedding
    distances_to_train = pairwise_distance[i,:]
    idx,values = find_min_distances(np.ravel(distances_to_train),n)
    names_min_distance,only_names,min_distances = find_names_min_distances(idx,values)

    # Check whether any of them has the right target, then show the matches
    found,found_idx = check_if_target_in_top(phish_test_file_names[i],only_names)
    print(names_min_distance)

    correct += 1 if found == 1 else 0

# Fraction of test examples whose target was among the top-n matches
print(f"Correct match percentage: {correct/num_test_examples}")

***
Test example: T7_8.png
Legit: T4_137.png
***
Test example: T4_6.png
found
Phish: T4_12.png
***
Test example: T4_34.png
found
Phish: T4_31.png
***
Test example: T12_36.png
Phish: T30_13.png
***
Test example: T4_23.png
found
Phish: T4_43.png
***
Test example: T4_24.png
found
Phish: T4_51.png
***
Test example: T30_2.jpg
Phish: T4_41.jpg
***
Test example: T7_19.png
found
Legit: T7_9.png
***
Test example: T30_19.png
Legit: T4_76.png
***
Test example: T7_20.png
found
Legit: T7_8.png
***
Test example: T12_18.png
found
Phish: T12_11.png
***
Test example: T4_49.png
found
Phish: T4_17.png
***
Test example: T30_27.png
found
Phish: T30_1.jpg
***
Test example: T30_10.png
found
Phish: T30_23.png
***
Test example: T12_23.png
Legit: T0_26.png
***
Test example: T12_34.png
Phish: T30_20.png
***
Test example: T12_35.png
found
Phish: T12_11.png
***
Test example: T30_28.png
found
Phish: T30_24.png
***
Test example: T7_17.png
Phish: T30_26.png
***
Test example: T30_21.png
Legit: T0_54.png
***
Test examp