In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from tools.config import PROCESSED_DATA_DIR, INTERIM_DATA_DIR
import logging
import numpy as np
from argparse import ArgumentParser

In [3]:
%tb
parser = ArgumentParser(add_help=False)
# Dataset parameters
parser.add_argument('--dataset-path', type=str, default=PROCESSED_DATA_DIR / 'smallerSampleDataset')
parser.add_argument('--reshape-size', default=[224, 224, 3])
parser.add_argument('--phishing-test-size', default=0.4)
parser.add_argument('--num-targets', type=int, default=5)
parser.add_argument('--legit-imgs-num', default=420)
parser.add_argument('--phish-imgs-num', default=160)
# Model parameters
parser.add_argument('--input-shape', default=[224, 224, 3])
parser.add_argument('--margin', type=float, default=2.2)
parser.add_argument('--new-conv-params', default=[5, 5, 512])
# Training parameters
parser.add_argument('--start-lr', type=float, default=2e-5)  # 0.00002
parser.add_argument('--output-dir', type=str, default=INTERIM_DATA_DIR / 'smallerSampleDataset')
parser.add_argument('--saved-model-name', type=str, default='model')  # from first training
parser.add_argument('--new-saved-model-name', type=str, default='model2')
parser.add_argument('--save-interval', type=int, default=2000)
parser.add_argument('--batch-size', type=int, default=32)
parser.add_argument('--n-iter', type=int, default=50000)
parser.add_argument('--lr-interval', type=int, default=250)
# hard examples training
parser.add_argument('--num-sets', type=int, default=100)
parser.add_argument('--iter-per-set', type=int, default=8)
# parser.add_argument('--n_iter', type=int, default=30)


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger()

No traceback available to show.


In [4]:
args, _ = parser.parse_known_args()

In [5]:
# Define paths for saved .npy files
imgs_train_path = args.output_dir / 'all_imgs_train.npy'
labels_train_path = args.output_dir / 'all_labels_train.npy'
file_names_train_path = args.output_dir / 'all_file_names_train.npy'

imgs_test_path = args.output_dir / 'all_imgs_test.npy'
labels_test_path = args.output_dir / 'all_labels_test.npy'
file_names_test_path = args.output_dir / 'all_file_names_test.npy'

# Initialize variables
all_imgs_train, all_labels_train, all_file_names_train = None, None, None
all_imgs_test, all_labels_test, all_file_names_test = None, None, None
data_path_trusted = args.dataset_path / 'trusted_list'
data_path_phish = args.dataset_path / 'phishing'

# Check if all .npy files exist
if (imgs_train_path.exists() and labels_train_path.exists() and file_names_train_path.exists() and
        imgs_test_path.exists() and labels_test_path.exists() and file_names_test_path.exists()):
    logger.info('Loading pre-saved data')

    # Load pre-saved data
    all_imgs_train = np.load(imgs_train_path)
    all_labels_train = np.load(labels_train_path)
    all_file_names_train = np.load(file_names_train_path)

    all_imgs_test = np.load(imgs_test_path)
    all_labels_test = np.load(labels_test_path)
    all_file_names_test = np.load(file_names_test_path)

2025-01-20 00:49:43,376 INFO Loading pre-saved data


In [6]:
logger.info('Images loaded')

X_train_legit = all_imgs_train
y_train_legit = all_labels_train
# Load the same train/split in phase 1
phish_test_idx = np.load(args.output_dir / 'test_idx.npy')
phish_train_idx = np.load(args.output_dir / 'train_idx.npy')

X_test_phish = all_imgs_test[phish_test_idx, :]
y_test_phish = all_labels_test[phish_test_idx, :]

X_train_phish = all_imgs_test[phish_train_idx, :]
y_train_phish = all_labels_test[phish_train_idx, :]

2025-01-20 00:49:46,354 INFO Images loaded


In [7]:
from trainer_phase2 import all_targets_start_end, HardSubsetSampling, prepare_model, start_end_each_target
from triplet_sampling import TargetHelper, get_batch

2025-01-20 00:49:49.205397: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-01-20 00:49:50.004199: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2025-01-20 00:49:50.004284: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [8]:
labels_start_end_train_legit = all_targets_start_end(args.num_targets, y_train_legit)
labels_start_end_train_legit

array([[  0.,  69.],
       [ 70., 217.],
       [218., 262.],
       [263., 365.],
       [366., 419.]])

In [9]:
hard_subset_sampling = HardSubsetSampling()

In [10]:
hard_subset_sampling.find_fixed_set_idx(
            labels_start_end_train_legit=labels_start_end_train_legit, num_target=args.num_targets)

array([ 19., 176., 237., 295., 376.])

In [18]:
from keras.models import load_model
from keras import backend as K

def custom_loss(margin):
        def loss(y_true, y_pred):
            loss_value = K.maximum(y_true, margin + y_pred)
            loss_value = K.mean(loss_value, axis=0)
            return loss_value

        return loss

full_model = load_model(args.output_dir / f"{args.saved_model_name}.h5",
                        custom_objects={'loss':custom_loss(args.margin)})
from keras import optimizers
optimizer = optimizers.Adam(lr=args.start_lr)
full_model.compile(loss=custom_loss(args.margin), optimizer=optimizer)

  super().__init__(name, **kwargs)


In [19]:
targetHelper = TargetHelper(data_path_phish)
full_model = prepare_model(args)

In [20]:
fixed_set_idx = hard_subset_sampling.find_fixed_set_idx(
            labels_start_end_train_legit=labels_start_end_train_legit, num_target=args.num_targets)
fixed_set = X_train_legit[fixed_set_idx.astype(int), :, :, :]
fixed_set

array([[[[0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         ...,
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432]],

        [[0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         ...,
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432]],

        [[0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         ...,
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432],
         [0.96078432, 0.96078432, 0.96078432]],

        ...,

        [[0.5956226 , 0.37993631, 0.47013235],
         [0.60131592, 0.38562965, 0.47582564]

In [21]:
n=1
X_train = np.concatenate([X_train_legit, X_train_phish])
y_train = np.concatenate([y_train_legit, y_train_phish])

X_train_new = np.zeros(
        [args.num_targets * 2 * n, X_train_legit.shape[1], X_train_legit.shape[2], X_train_legit.shape[3]])
y_train_new = np.zeros([args.num_targets * 2 * n, 1])



model = full_model.layers[3]

In [22]:
X_train_legit_last_layer, X_train_phish_last_layer, fixed_set_last_layer = hard_subset_sampling.predict_all_imgs(model,
                                                                                                X_train_legit=X_train_legit,
                                                                                                X_train_phish=X_train_phish,
                                                                                                fixed_set=fixed_set)
pairwise_distance = hard_subset_sampling.compute_all_distances(fixed_set_last_layer, X_train_legit_last_layer,
                                                X_train_phish_last_layer)
n = 1
all_idx = hard_subset_sampling.find_index_for_all_set(y_train, pairwise_distance, n)
X_train_new, y_train_new = hard_subset_sampling.find_next_training_set(X_train=X_train, y_train=y_train,
                                                        X_train_new=X_train_new, y_train_new=y_train_new,
                                                        all_idx=all_idx, n=n)

2025-01-20 00:54:34.770357: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8904




In [23]:
X_train_new

array([[[[0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         ...,
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647]],

        [[0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         ...,
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647]],

        [[0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         ...,
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647],
         [0.94117647, 0.94117647, 0.94117647]],

        ...,

        [[1.        , 1.        , 1.        ],
         [1.        , 1.        , 1.        ]

In [25]:
X_train_new, y_train_new = hard_subset_sampling.order_random_array(X_train_new, y_train_new, args.num_targets)
X_train_new
# return X_train_new, y_train_new, labels_start_end_train

array([[[[0.93093348, 0.41289198, 0.17092551],
         [0.93100047, 0.41288337, 0.16915455],
         [0.93077058, 0.41236159, 0.16316701],
         ...,
         [0.31989855, 0.04898285, 0.15878677],
         [0.3163943 , 0.04573404, 0.15553796],
         [0.31438249, 0.04379344, 0.15359737]],

        [[0.9260658 , 0.4048256 , 0.17114389],
         [0.92565703, 0.40387419, 0.1685442 ],
         [0.92742646, 0.40359151, 0.16311271],
         ...,
         [0.32421362, 0.05086722, 0.16067247],
         [0.32120204, 0.04984411, 0.15964808],
         [0.31864384, 0.04783837, 0.15764229]],

        [[0.92534286, 0.39789936, 0.17013651],
         [0.92472804, 0.39708644, 0.16873963],
         [0.92521638, 0.39669284, 0.16636996],
         ...,
         [0.32692111, 0.05209617, 0.16207136],
         [0.3247852 , 0.05118802, 0.16103345],
         [0.32407674, 0.05082122, 0.16062546]],

        ...,

        [[1.        , 1.        , 1.        ],
         [1.        , 1.        , 1.        ]

In [31]:
y_train_new

array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.]])

In [29]:
def Xstart_end_each_target(num_target, labels):
    prev_target = 0
    start_end_each_target = np.zeros((num_target, 2))
    start_end_each_target[0, 0] = 0
    count_target = 0
    for i in range(1, labels.shape[0]):
        if not labels[i] == prev_target:
            start_end_each_target[count_target, 1] = i - 1
            count_target = count_target + 1
            start_end_each_target[count_target, 0] = i
            prev_target = prev_target + 1
    start_end_each_target[num_target - 1, 1] = labels.shape[0] - 1
    return start_end_each_target

In [30]:
labels_start_end_train = Xstart_end_each_target(args.num_targets, y_train_new)
labels_start_end_train

array([[0., 8.],
       [9., 0.],
       [0., 0.],
       [0., 0.],
       [0., 9.]])

In [36]:
get_batch(
    targetHelper=targetHelper,
    X_train_legit=X_train_legit,
    X_train_new=X_train_new,
    labels_start_end_train=labels_start_end_train,
    batch_size=args.batch_size,
    train_fixed_set=fixed_set,
    num_targets=args.num_targets
)

NameError: name 'labels_start_end_train' is not defined