In [2]:
# 1. Cài đặt thư viện nếu cần (chạy trên Colab)
# !pip install tensorflow pandas scikit-learn
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# 2. Đọc dữ liệu
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/malicious_phish.csv")  # Đường dẫn CSV

# 3. Mã hóa nhãn
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['type'])
num_classes = len(label_encoder.classes_)

# 4. Tách tập train/test
X_train, X_test, y_train, y_test = train_test_split(
    df['url'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)

# 5. Tokenize theo ký tự
tokenizer = Tokenizer(char_level=True, lower=True)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# 6. Padding chuỗi URL
max_len = 200
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# 7. One-hot encode nhãn
y_train_cat = to_categorical(y_train, num_classes=num_classes)
y_test_cat = to_categorical(y_test, num_classes=num_classes)

# 8. Xây mô hình CNN
vocab_size = len(tokenizer.word_index) + 1

model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=64),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# 9. Huấn luyện
history = model.fit(X_train_pad, y_train_cat, epochs=5, batch_size=64,
                    validation_data=(X_test_pad, y_test_cat))

# 10. Đánh giá
y_pred = model.predict(X_test_pad)
y_pred_classes = y_pred.argmax(axis=1)

print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_classes, target_names=label_encoder.classes_))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Epoch 1/5
[1m8140/8140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m810s[0m 99ms/step - accuracy: 0.8927 - loss: 0.3128 - val_accuracy: 0.9641 - val_loss: 0.1182
Epoch 2/5
[1m8140/8140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m802s[0m 92ms/step - accuracy: 0.9510 - loss: 0.1562 - val_accuracy: 0.9685 - val_loss: 0.0995
Epoch 3/5
[1m8140/8140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m833s[0m 96ms/step - accuracy: 0.9563 - loss: 0.1411 - val_accuracy: 0.9711 - val_loss: 0.0920
Epoch 4/5
[1m8140/8140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m785s[0m 96ms/step - accuracy: 0.9582 - loss: 0.1342 - val_accuracy: 0.9732 - val_loss: 0.0866
Epoch 5/5
[1m8140/8140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m755s[0m 93ms/step - accuracy: 0.9596 - loss: 0.1300 - val_accuracy: 0.9719 - val_loss: 0.0877
[1m4070/4070[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 14ms/step

Classification Report:

              precision    recall  f1-score   support

      benig