In [1]:
import os
import numpy as np

def load_data(folder):
    data = []
    labels = []
    for subfolder in os.listdir(folder):
        subfolder_path = os.path.join(folder, subfolder)
        if os.path.isdir(subfolder_path):
            for file in os.listdir(subfolder_path):
                file_path = os.path.join(subfolder_path, file)
                if file.endswith('.npy'):
                    # Load numpy array
                    array = np.load(file_path).flatten()
                    data.append(array)
                    # Label phishing as 1, benign as 0
                    label = 1 if (folder.find('phishing') != -1) else 0
                    labels.append(label)
    return np.array(data), np.array(labels)

# Đường dẫn đến thư mục chứa dữ liệu
phishing_path = 'VisualPhish/phishing_features'
benign_path = 'VisualPhish/trusted_list_features'

# Load dữ liệu
phishing_data, phishing_labels = load_data(phishing_path)
benign_data, benign_labels = load_data(benign_path)

# Gộp dữ liệu và labels
X = np.concatenate((phishing_data, benign_data), axis=0)
y = np.concatenate((phishing_labels, benign_labels), axis=0)


In [5]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential([
    Dense(1024, activation='relu', input_shape=(512,)),
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


In [6]:
from sklearn.model_selection import train_test_split

# Phân chia dữ liệu
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Huấn luyện mô hình
model.fit(X_train, y_train, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f79273b7ac0>

In [7]:
# Đánh giá hiệu suất
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Accuracy: {accuracy*100:.2f}%")


Accuracy: 92.19%


In [9]:
print(len(X_train))
print(len(X_test))

8446
2112


In [11]:
cnt1Train = 0
cnt0Train = 0
for y in y_train:
  if y == 1:
    cnt1Train+=1
  else:
    cnt0Train+=1

print(cnt1Train)
print(cnt1Train / len(y_train))
print(cnt0Train)
print(cnt0Train / len(y_train))


949
0.11236088089036231
7497
0.8876391191096377


In [12]:
cnt1Test = 0
cnt0Test = 0
for y in y_test:
  if y == 1:
    cnt1Test+=1
  else:
    cnt0Test+=1

print(cnt1Test)
print(cnt1Test / len(y_test))
print(cnt0Test)
print(cnt0Test / len(y_test))


246
0.11647727272727272
1866
0.8835227272727273


In [13]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

# Dự đoán trên tập kiểm tra
y_pred = model.predict(X_test)
y_pred = [1 if y > 0.5 else 0 for y in y_pred]  # Chuyển đổi xác suất thành nhãn dự đoán

# Tính toán các chỉ số đánh giá
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1-Score: {f1:.2f}")
print(f"AUC: {auc:.2f}")


Precision: 0.72
Recall: 0.53
F1-Score: 0.61
AUC: 0.75
