In [1]:
import pandas as pd
import os
import logging
from sklearn.model_selection import train_test_split
from transformers import pipeline, AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# Logging setup: module-level logger, root logging configured at INFO level.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Filesystem locations: input dataset directory and output directories for the
# saved model and tokenizer.
ROOT_DIR = "../custom_datasets"
MODEL_DIR = "./saved_model"
TOKENIZER_DIR = "./saved_tokenizer"


In [2]:
# Check whether TensorFlow can see any GPU devices.
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("GPU를 찾을 수 없습니다. CPU로 실행합니다.")
else:
    try:
        # Turn on memory growth for every detected GPU before reporting the count.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU 사용 가능: {len(gpus)} GPUs")
    except RuntimeError as err:
        # Report the configuration error instead of aborting the notebook.
        print(err)


GPU를 찾을 수 없습니다. CPU로 실행합니다.


In [3]:
def load_and_prepare_data(root_dir, n_samples=10000):
    """Load the phishing/benign URL files and build a balanced train/validation split.

    Reads ``combined_phishing_data.txt`` and ``combined_safe_data.txt`` from
    ``root_dir`` (tab-separated, no header row, columns ``label`` and ``URL``),
    draws the same number of rows from each file, remaps the labels
    (``-1`` -> ``1``, ``+1`` -> ``0``), shuffles the combined rows and splits
    them 80/20 with a fixed random seed.

    Args:
        root_dir: Directory containing the two data files.
        n_samples: Number of rows to draw from each file. If either file holds
            fewer rows than this, both draws are reduced to the size of the
            smaller file so the two classes stay balanced.

    Returns:
        Tuple ``(train_texts, val_texts, train_labels, val_labels)`` of lists.

    Raises:
        ValueError: If a label value other than ``-1`` / ``+1`` is present in
            the sampled rows.
    """
    logger.info("피싱 데이터 로드 중...")
    phishing_df = pd.read_csv(
        os.path.join(root_dir, 'combined_phishing_data.txt'),
        header=None, delimiter='\t', names=['label', 'URL']
    )
    logger.info(f"피싱 데이터 로드 완료: {len(phishing_df)}개 샘플")

    benign_df = pd.read_csv(
        os.path.join(root_dir, 'combined_safe_data.txt'),
        header=None, delimiter='\t', names=['label', 'URL']
    )
    logger.info(f"정상 데이터 로드 완료: {len(benign_df)}개 샘플")

    # DataFrame.sample raises ValueError when n exceeds the number of rows.
    # Cap both draws at the smaller file so the classes remain balanced.
    per_class = min(n_samples, len(phishing_df), len(benign_df))
    if per_class < n_samples:
        logger.warning(
            f"요청한 샘플 수({n_samples})가 데이터 크기를 초과하여 "
            f"클래스당 {per_class}개로 줄입니다."
        )

    phishing_df = phishing_df.sample(n=per_class, random_state=42)
    benign_df = benign_df.sample(n=per_class, random_state=42)
    df = pd.concat([phishing_df, benign_df], ignore_index=True)

    label_mapping = {-1: 1, +1: 0}
    mapped_labels = df['label'].map(label_mapping)

    # Series.map yields NaN for values missing from the dict; fail loudly
    # instead of handing NaN labels to the training step.
    unmapped = mapped_labels.isna()
    if unmapped.any():
        unknown_values = df.loc[unmapped, 'label'].unique().tolist()
        raise ValueError(f"매핑할 수 없는 레이블 값이 있습니다: {unknown_values}")

    df['label'] = mapped_labels
    df = df.sample(frac=1, random_state=42).reset_index(drop=True)

    texts = df['URL'].tolist()
    labels = df['label'].tolist()

    train_texts, val_texts, train_labels, val_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )
    return train_texts, val_texts, train_labels, val_labels


In [4]:
def train_model(train_texts, train_labels, val_texts, val_labels, epochs=3, batch_size=16, model=None,
                learning_rate=3e-5, checkpoint="ealvaradob/bert-finetuned-phishing"):
    """Fine-tune a sequence-classification model on URL texts and save the result.

    Tokenizes both splits, wraps them in ``tf.data`` datasets, trains with Adam,
    prints the validation loss/accuracy, and writes the model to ``MODEL_DIR``
    and the tokenizer to ``TOKENIZER_DIR``.

    Args:
        train_texts: Training input strings.
        train_labels: Labels aligned with ``train_texts``.
        val_texts: Validation input strings.
        val_labels: Labels aligned with ``val_texts``.
        epochs: Number of training epochs.
        batch_size: Batch size for both the training and validation datasets.
        model: Optional already-loaded model to continue training; when
            ``None`` the model is loaded from ``checkpoint``.
        learning_rate: Learning rate passed to the Adam optimizer.
        checkpoint: Hugging Face checkpoint id used for the tokenizer and,
            when ``model`` is ``None``, for the model weights.

    Returns:
        Tuple ``(model, tokenizer)`` after training.
    """
    # One checkpoint id feeds both the tokenizer and the model so they cannot drift apart.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    val_encodings = tokenizer(val_texts, truncation=True, padding=True)

    train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), train_labels))
    val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), val_labels))

    if model is None:
        # from_pt=True: load the weights from the PyTorch version of the checkpoint.
        model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, from_pt=True)

    optimizer = Adam(learning_rate=learning_rate)
    # NOTE(review): no `loss` is passed to compile(); this presumably relies on the
    # model's built-in loss -- confirm for the installed transformers version.
    model.compile(optimizer=optimizer, metrics=['accuracy'])

    # Shuffle over the full training set with a fixed seed, then batch.
    train_dataset = train_dataset.shuffle(len(train_dataset), seed=42).batch(batch_size)
    val_dataset = val_dataset.batch(batch_size)

    model.fit(train_dataset, validation_data=val_dataset, epochs=epochs)

    loss, accuracy = model.evaluate(val_dataset)
    print(f"검증 세트 손실: {loss:.4f}")
    print(f"검증 세트 정확도: {accuracy * 100:.2f}%")

    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(TOKENIZER_DIR)

    return model, tokenizer


In [None]:
# Build the train/validation split from the dataset directory, then fine-tune
# the classifier with the default epochs and batch size.
train_texts, val_texts, train_labels, val_labels = load_and_prepare_data(root_dir=ROOT_DIR)
model, tokenizer = train_model(
    train_texts=train_texts,
    train_labels=train_labels,
    val_texts=val_texts,
    val_labels=val_labels,
)


INFO:__main__:피싱 데이터 로드 중...
INFO:__main__:피싱 데이터 로드 완료: 93104개 샘플
INFO:__main__:정상 데이터 로드 완료: 1005715개 샘플
  pt_state_dict.update(torch.load(pt_path, map_location="cpu"))
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


Epoch 1/3


In [None]:
pip install tensorflow-gpu
