In [3]:
import torch
import torch.nn as nn
import joblib
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# === Device ===
# Query CUDA availability once and reuse the answer for both the device
# choice and the log lines.
has_cuda = torch.cuda.is_available()
device = torch.device("cuda") if has_cuda else torch.device("cpu")
print("✅ CUDA available:", has_cuda)
if has_cuda:
    print("🚀 GPU name:", torch.cuda.get_device_name(0))

# === Load data ===
# Normalise both text columns (lower-case, surrounding whitespace removed),
# then turn the textual label into 0 (benign) / 1 (dga).
df = pd.read_csv("dataset(1).csv")
domain_text = df['domain'].astype(str)
df['domain'] = domain_text.str.lower().str.strip()
label_text = df['label'].str.lower().str.strip()
# Labels other than 'benign' / 'dga' are not in the mapping and become NaN.
df['label'] = label_text.map({'benign': 0, 'dga': 1})

# === Encode domain ===
# Character vocabulary built from every domain in the dataset. Indices start
# at 1 (in sorted character order) because 0 is reserved for padding.
all_chars = {ch for name in df['domain'].tolist() for ch in name}
char2idx = dict(zip(sorted(all_chars), range(1, len(all_chars) + 1)))
# Fixed sequence length used by encode(): longer domains are cut, shorter
# ones are zero-padded.
max_len = 30

def encode(domain):
    """Turn a domain string into a list of exactly ``max_len`` character ids.

    The domain is lower-cased and stripped, truncated to ``max_len``
    characters, and each character is looked up in ``char2idx``; characters
    missing from the vocabulary map to 0, the same id used for padding.
    Shorter domains are right-padded with 0.
    """
    cleaned = domain.lower().strip()
    ids = [char2idx.get(ch, 0) for ch in cleaned[:max_len]]
    # A non-positive repeat count yields an empty list, so domains of
    # max_len characters or more receive no padding.
    padding = [0] * (max_len - len(cleaned))
    return ids + padding

# === Extract the hand-crafted feature matrix ===
# NOTE(review): the original header (and the X_24 name) say "24 dimensions",
# but the list below names 27 columns: f1..f18, f21..f22 and the seven
# f19_* / f20_* columns — confirm which count is intended.
feature_cols = [
    *(f"f{i}" for i in range(1, 19)),
    *(f"f{i}" for i in range(21, 23)),
    "f19_1", "f19_2",
    "f20_count", "f20_1", "f20_2", "f20_3", "f20_4",
]
feature_frame = df[feature_cols].fillna(0)  # missing feature values become 0
X_24 = feature_frame.values
y = df['label'].values
domains = df['domain'].tolist()
# Integer-encoded character sequence for every domain, in the same row order.
X_seq = list(map(encode, domains))

# === Train-test split ===
# 80 % / 20 % split with a fixed seed. The four parallel collections are
# passed to a single call so they are split together.
(
    X_seq_train, X_seq_test,
    X_24_train, X_24_test,
    y_train, y_test,
    domains_train, domains_test,
) = train_test_split(X_seq, X_24, y, domains, test_size=0.2, random_state=42)

# === LSTM model ===
class Attention(nn.Module):
    """Collapse dim 1 of the input into a single vector via learned weights.

    A linear layer assigns one scalar score to every position along dim 1;
    the scores are soft-maxed over that dim and used as weights for a sum of
    the input vectors.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # Scoring layer: hidden_size features -> 1 score per position.
        self.attn = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return the attention-weighted sum of ``x`` over dim 1."""
        scores = self.attn(x)
        weights = scores.softmax(dim=1)  # weights along dim 1 sum to 1
        return (weights * x).sum(dim=1)

class DomainBinaryClassifier(nn.Module):
    """Character-level binary classifier: embedding -> LSTM -> attention -> sigmoid.

    Args:
        vocab_size: number of embedding rows (index 0 is the padding index).
        embedding_dim: size of each character embedding.
        hidden_size: LSTM hidden size, also the attention / classifier input size.

    Submodule attribute names and their creation order are kept as they are
    because a saved ``state_dict`` is keyed by those names.
    """

    def __init__(self, vocab_size, embedding_dim=64, hidden_size=64):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.encoder = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
        )
        self.attention = Attention(hidden_size)
        self.fc = nn.Linear(in_features=hidden_size, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid output per input row.

        ``x`` is a tensor of character ids fed to the embedding layer
        (assumed shape ``(batch, seq_len)`` — TODO confirm against callers).
        """
        embedded = self.embedding(x)
        hidden_states, _final_state = self.encoder(embedded)
        pooled = self.attention(hidden_states)
        prob = self.sigmoid(self.fc(pooled))
        # Drop the trailing size-1 dim produced by the final linear layer.
        return prob.squeeze(1)

# === Load the trained LSTM model ===
# NOTE(security): torch.load and joblib.load both unpickle their input; only
# point them at model files from a trusted source.
# vocab_size is len(char2idx) + 1 because index 0 is reserved for padding.
model = DomainBinaryClassifier(vocab_size=len(char2idx) + 1)
model = model.to(device)
lstm_weights = torch.load("lstm-adamw.pt", map_location=device)
model.load_state_dict(lstm_weights)
model.eval()

# === Load the RF model ===
rf = joblib.load("rf-model.joblib")

# === Prediction functions ===
def predict_lstm(domain_list, batch_size=64):
    """Run the LSTM model over ``domain_list`` and return its outputs.

    Domains are encoded with ``encode`` and pushed through ``model`` in
    chunks of ``batch_size`` with gradient tracking disabled.

    Returns:
        A NumPy array with one model output per input domain, in input order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for start in range(0, len(domain_list), batch_size):
            chunk = domain_list[start:start + batch_size]
            ids = torch.tensor(list(map(encode, chunk)), dtype=torch.long)
            batch_probs = model(ids.to(device))
            # Move results to host memory before accumulating them.
            collected.extend(batch_probs.cpu().numpy())
    return np.array(collected)

def predict_rf(X):
    """Return column 1 of the RF model's ``predict_proba`` output for ``X``.

    Column 1 is presumably the probability of label 1 (dga) — verify against
    the class order the model was trained with.
    """
    class_probs = rf.predict_proba(X)
    return class_probs[:, 1]

def predict_stacked(domains, X_24, alpha=0.9):
    """Blend the LSTM and RF scores with a fixed weight.

    Args:
        domains: domain strings passed to ``predict_lstm``.
        X_24: feature matrix passed to ``predict_rf`` (rows parallel to ``domains``).
        alpha: weight of the LSTM score; the RF score gets ``1 - alpha``.

    Returns:
        ``(labels, scores)`` where ``scores`` is the weighted average and
        ``labels`` is 1 wherever the score exceeds 0.5, else 0.
    """
    lstm_part = alpha * predict_lstm(domains)
    rf_part = (1 - alpha) * predict_rf(X_24)
    blended = lstm_part + rf_part
    labels = (blended > 0.5).astype(int)
    return labels, blended

# === Predict ===
# p_lstm / p_rf are kept separately for the per-model report below.
# NOTE: predict_stacked runs both models again internally.
p_lstm = predict_lstm(domains_test)
p_rf = predict_rf(X_24_test)
y_pred, y_prob = predict_stacked(domains_test, X_24_test)

# === Print 10 representative results ===
print(f"{'Domain':30} | {'True':^5} | {'LSTM':^5} | {'RF':^5} | {'Stack':^5} | {'Score':^7}")
print("-" * 75)
# Slice each column to its first 10 entries, then walk them in lockstep.
sample_rows = zip(
    domains_test[:10], y_test[:10], p_lstm[:10], p_rf[:10], y_pred[:10], y_prob[:10]
)
for d, yt, pl, pr, yhat, prob in sample_rows:
    print(f"{d:30} |  {int(yt):^3}  |  {int(pl>0.5):^3}  |  {int(pr>0.5):^3}  |  {yhat:^3}  | {prob:.4f}")

# === Evaluate the LSTM, the RF and the stacked model ===
def _binary_metrics(y_true, y_hat):
    """Return ``(accuracy, precision, recall, f1, fpr, cm)`` for 0/1 labels.

    ``cm`` is the 2x2 confusion matrix. ``labels=[0, 1]`` pins its shape so
    that unpacking ``ravel()`` into four values also works when only one
    class occurs in ``y_true`` / ``y_hat``. The false-positive rate is
    ``fp / (fp + tn)`` and falls back to 0.0 when there are no negatives.
    """
    matrix = confusion_matrix(y_true, y_hat, labels=[0, 1])
    true_neg, false_pos, _false_neg, _true_pos = matrix.ravel()
    negatives = false_pos + true_neg
    false_pos_rate = false_pos / negatives if negatives != 0 else 0.0
    return (
        accuracy_score(y_true, y_hat),
        precision_score(y_true, y_hat),
        recall_score(y_true, y_hat),
        f1_score(y_true, y_hat),
        false_pos_rate,
        matrix,
    )


# --- LSTM only (0.5 decision threshold) ---
y_pred_lstm = (p_lstm > 0.5).astype(int)
acc_lstm, precision_lstm, recall_lstm, f1_lstm, fpr_lstm, _cm_lstm = _binary_metrics(
    y_test, y_pred_lstm)

# --- RF only (0.5 decision threshold) ---
y_pred_rf = (p_rf > 0.5).astype(int)
acc_rf, precision_rf, recall_rf, f1_rf, fpr_rf, _cm_rf = _binary_metrics(
    y_test, y_pred_rf)

# --- Stacked LSTM + RF ---
acc, precision, recall, f1, fpr, cm = _binary_metrics(y_test, y_pred)
# Keep the stacked model's confusion-matrix cells available by name.
tn, fp, fn, tp = cm.ravel()

# === Print the comparison of the 3 models ===
_model_headers = ("LSTM", "RF", "LSTM + RF")
_column_widths = (8, 8, 10)
_metric_rows = [
    ('✅ Accuracy', (acc_lstm, acc_rf, acc)),
    ('🎯 Precision', (precision_lstm, precision_rf, precision)),
    ('🔁 Recall (TPR)', (recall_lstm, recall_rf, recall)),
    ('💎 F1-score', (f1_lstm, f1_rf, f1)),
    ('🚨 False Positive Rate (FPR)', (fpr_lstm, fpr_rf, fpr)),
]
# Size the label column to the longest label (never below the former 25) so
# that no row pushes its values out of the column grid.
_label_width = max(25, *(len(_name) for _name, _vals in _metric_rows))
_header_line = f"{'Metric':<{_label_width}} | " + " | ".join(
    f"{_head:^{_width}}" for _head, _width in zip(_model_headers, _column_widths))

print("\n📊 So sánh các mô hình:")
print(_header_line)
print("-" * len(_header_line))
for _name, _vals in _metric_rows:
    # Centre every value in a cell as wide as its column header.
    _cells = " | ".join(
        f"{_val:^{_width}.4f}" for _val, _width in zip(_vals, _column_widths))
    print(f"{_name:<{_label_width}} | {_cells}")

✅ CUDA available: True
🚀 GPU name: NVIDIA GeForce GTX 1650




Domain                         | True  | LSTM  |  RF   | Stack |  Score 
---------------------------------------------------------------------------
mkyzanarianaqh.com             |   1   |   1   |   1   |   1   | 1.0000
list-manage1.com               |   0   |   0   |   0   |   0   | 0.0000
houstonbbs.com                 |   0   |   0   |   0   |   0   | 0.0068
fvbbpbdjicv.cn                 |   1   |   1   |   1   |   1   | 0.9997
yfmtyfgndoypa.in               |   1   |   1   |   1   |   1   | 0.9999
csgasikathrinezad.com          |   1   |   1   |   1   |   1   | 1.0000
seversknet.ru                  |   0   |   0   |   0   |   0   | 0.0271
bero-host.de                   |   0   |   0   |   0   |   0   | 0.0002
fbsaidablyhoosieraw.com        |   1   |   1   |   1   |   1   | 0.9998
tesoro.es                      |   0   |   0   |   0   |   0   | 0.0001

📊 So sánh các mô hình:
Metric                    |   LSTM   |    RF    | LSTM + RF 
----------------------------------------------