<a href="https://colab.research.google.com/github/Onedory/DDos-detection-LLM/blob/main/5_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive (Colab-only API); the data files read
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import pickle
from tqdm import tqdm
from collections import Counter

# 1. Load the saved texts and labels from Google Drive.
DATA_DIR = '/content/drive/MyDrive/LLM/data'


def load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code embedded in the
    file, so only use it on files you produced yourself.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Same files, same load order as before; only the repeated open/load
# boilerplate and the repeated directory prefix were factored out.
train_texts = load_pickle(f'{DATA_DIR}/train_texts.pkl')
val_texts = load_pickle(f'{DATA_DIR}/val_texts.pkl')
train_labels = load_pickle(f'{DATA_DIR}/train_labels.pkl')
val_labels = load_pickle(f'{DATA_DIR}/val_labels.pkl')

print("train_texts, val_texts, train_labels, val_labels loaded successfully!")

train_texts, val_texts, train_labels, val_labels loaded successfully!


In [None]:
# Restore the saved encodings for both splits (presumably tokenizer output
# produced by an earlier notebook -- verify against the producer).
# NOTE: pickle can run arbitrary code on load; only read trusted files.
with open('/content/drive/MyDrive/LLM/data/train_encodings.pkl', 'rb') as train_enc_file:
    train_encodings = pickle.load(train_enc_file)

with open('/content/drive/MyDrive/LLM/data/val_encodings.pkl', 'rb') as val_enc_file:
    val_encodings = pickle.load(val_enc_file)


In [None]:
# 4. DDoSDataset class definition
class DDoSDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs pre-computed encodings with labels.

    Args:
        encodings: Mapping whose ``.items()`` yields ``(name, values)`` pairs;
            ``values[idx]`` is the feature of sample ``idx``.
        labels: Indexable collection of labels; its length defines the
            dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor.

        ``torch.tensor(existing_tensor)`` emits a copy-construct UserWarning
        on every call, so tensors are copied with ``clone().detach()``
        instead; anything else goes through ``torch.tensor`` as before.
        """
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for sample ``idx``, including ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label collection."""
        return len(self.labels)

# 5. Wrap each split's encodings and labels in a DDoSDataset.
train_dataset = DDoSDataset(encodings=train_encodings, labels=train_labels)
val_dataset = DDoSDataset(encodings=val_encodings, labels=val_labels)


# 6. Load pretrained bert-base-uncased with a 2-class classification head.
#    No dropout-related argument is passed here, so the checkpoint's
#    configuration is used as-is.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# 7. Pick the GPU when one is available, otherwise the CPU, and move the model there.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# 5.5 Inverse-frequency class weights to account for the class imbalance
#     (label 0 = BENIGN, label 1 = DDoS).
#     These only affect training if the loss function is actually given
#     `class_weights`; the default loss of a stock Trainer does not read them.
label_counts = Counter(train_labels)
total_count = len(train_labels)
benign_weight, ddos_weight = (total_count / label_counts[cls] for cls in (0, 1))
class_weights = torch.tensor([benign_weight, ddos_weight]).to(device)

from transformers import Trainer, TrainingArguments

# 8. Training configuration (TrainingArguments); external experiment
#    tracking such as wandb is disabled via report_to="none".
#
# NOTE(review): the recorded run below reports eval_runtime of about 4,000 s
# for one pass over the validation set, and training ended at step 2500
# (epoch ~0.018). Evaluating every 500 steps therefore dominates wall-clock
# time -- consider a larger eval_steps or a validation subset.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- TODO confirm against the installed version.
training_args = TrainingArguments(
    # --- outputs and logging ---
    output_dir='/content/drive/MyDrive/LLM/model',  # checkpoint directory (on Drive)
    logging_dir='./logs',                           # log directory
    logging_steps=100,                              # log every 100 steps
    report_to="none",                               # no wandb / external reporting
    # --- run length and batch sizes ---
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # --- optimisation ---
    learning_rate=2e-5,
    lr_scheduler_type="cosine",                     # cosine learning-rate schedule
    warmup_steps=500,
    weight_decay=0.01,
    # --- evaluation and checkpointing (same strategy and interval for both) ---
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,                             # keep at most 3 checkpoints
    # --- best-model selection ---
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

# 9. Early stopping: stop when the monitored validation metric fails to
#    improve for one consecutive evaluation.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)


class WeightedTrainer(Trainer):
    """Trainer whose loss is class-weighted cross-entropy.

    The stock ``Trainer`` uses the loss returned by the model, which is
    unweighted, so the ``class_weights`` computed earlier in this notebook
    were never applied. This subclass feeds them into the loss.

    Args:
        class_weights: Optional 1-D tensor with one weight per class. When
            ``None`` the loss is plain (unweighted) cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return weighted cross-entropy over the model's logits.

        ``**kwargs`` absorbs extra arguments that some transformers versions
        pass (e.g. ``num_items_in_batch``); they are not used here.
        """
        labels = inputs["labels"]
        # Run the forward pass without labels so the model does not also
        # compute its own unweighted loss; `inputs` itself is left unmodified.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}
        outputs = model(**model_inputs)
        logits = outputs["logits"] if isinstance(outputs, dict) else outputs[0]

        weight = self.class_weights
        if weight is not None:
            # Match the logits' device and dtype (relevant on GPU / mixed precision).
            weight = weight.to(device=logits.device, dtype=logits.dtype)

        loss = torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)),
            labels.view(-1),
            weight=weight,
        )
        return (loss, outputs) if return_outputs else loss


# 10. Trainer setup. `trainer` is still a Trainer (subclass) instance, so
#     trainer.train() / trainer.evaluate() are called exactly as before.
#     Note that reported losses (including eval_loss, which drives early
#     stopping and best-model selection) are now the weighted loss and are not
#     directly comparable with values from an unweighted run.
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],  # apply early stopping
    class_weights=class_weights,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train과 Validation 데이터셋의 레이블 개수 확인
from collections import Counter

# Label counts for the training split
train_label_counts = Counter(train_labels)

# Label counts for the validation split
val_label_counts = Counter(val_labels)

# Label counts for the whole dataset. Adding the two Counters avoids building
# a temporary concatenation of every label, and stays correct when the labels
# are stored as arrays (where `+` would add element-wise instead of joining).
total_label_counts = train_label_counts + val_label_counts

# One report line per split (label 0 = BENIGN, label 1 = DDoS).
for split_name, counts in (
    ("Train", train_label_counts),
    ("Validation", val_label_counts),
    ("Total", total_label_counts),
):
    print(f"{split_name} Dataset - BENIGN: {counts[0]}, DDoS: {counts[1]}")


Train Dataset - BENIGN: 1818809, DDoS: 445785
Validation Dataset - BENIGN: 454288, DDoS: 111861
Total Dataset - BENIGN: 2273097, DDoS: 557646


In [None]:
# 11. Start training. As the last expression in the cell, the returned
#     TrainOutput is displayed as the cell output.
trainer.train()

Step,Training Loss,Validation Loss
500,0.18,0.115819
1000,0.0803,0.090338
1500,0.0524,0.058911
2000,0.0442,0.03932
2500,0.0308,0.044201


TrainOutput(global_step=2500, training_loss=0.12463585767745972, metrics={'train_runtime': 20999.4102, 'train_samples_per_second': 323.523, 'train_steps_per_second': 20.22, 'total_flos': 2631110553600000.0, 'train_loss': 0.12463585767745972, 'epoch': 0.017663101075329594})

In [None]:
# 12. Final evaluation on the eval dataset given to the Trainer; the returned
#     metrics dict is displayed as the cell output.
trainer.evaluate()

{'eval_loss': 0.039320237934589386,
 'eval_runtime': 4024.8686,
 'eval_samples_per_second': 140.663,
 'eval_steps_per_second': 8.792,
 'epoch': 0.017663101075329594}