In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import pickle
from tqdm import tqdm
from collections import Counter

# 1. Google Drive에서 저장된 텍스트 데이터와 레이블 불러오기
with open('/content/drive/MyDrive/LLM/data/train_texts.pkl', 'rb') as f:
    train_texts = pickle.load(f)

with open('/content/drive/MyDrive/LLM/data/val_texts.pkl', 'rb') as f:
    val_texts = pickle.load(f)

with open('/content/drive/MyDrive/LLM/data/train_labels.pkl', 'rb') as f:
    train_labels = pickle.load(f)

with open('/content/drive/MyDrive/LLM/data/val_labels.pkl', 'rb') as f:
    val_labels = pickle.load(f)

print("train_texts, val_texts, train_labels, val_labels loaded successfully!")


train_texts, val_texts, train_labels, val_labels loaded successfully!


In [4]:
# 2. RoBERTa 토크나이저 로드
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 3. 데이터 배치로 나누기 및 토큰화 진행 (tqdm을 사용하여 진행 상황 표시)
batch_size = 5000  # 한 번에 처리할 텍스트 데이터 개수

# 진행 상황을 확인하면서 학습 데이터 토큰화
train_encodings = {'input_ids': [], 'attention_mask': []}
for i in tqdm(range(0, len(train_texts), batch_size), desc="Tokenizing train data", unit="batch"):
    batch_texts = train_texts[i:i+batch_size]
    batch_encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=128)

    # 배치별로 나온 결과를 전체 결과에 추가
    train_encodings['input_ids'].extend(batch_encodings['input_ids'])
    train_encodings['attention_mask'].extend(batch_encodings['attention_mask'])

# 검증 데이터셋도 동일한 방식으로 처리
val_encodings = {'input_ids': [], 'attention_mask': []}
for i in tqdm(range(0, len(val_texts), batch_size), desc="Tokenizing val data", unit="batch"):
    batch_texts = val_texts[i:i+batch_size]
    batch_encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=128)

    # 배치별로 나온 결과를 전체 결과에 추가
    val_encodings['input_ids'].extend(batch_encodings['input_ids'])
    val_encodings['attention_mask'].extend(batch_encodings['attention_mask'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Tokenizing train data: 100%|██████████| 453/453 [1:43:50<00:00, 13.75s/batch]
Tokenizing val data: 100%|██████████| 114/114 [25:55<00:00, 13.65s/batch]


In [7]:
# 4. DDoSDataset 클래스 정의
class DDoSDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 5. DDoSDataset 생성
train_dataset = DDoSDataset(train_encodings, train_labels)
val_dataset = DDoSDataset(val_encodings, val_labels)

# 6. RoBERTa 모델 불러오기 (Dropout 확률 설정)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# 7. GPU 설정
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# 5.5 가중치 설정 #클래스간 비율불균형 고려
label_counts = Counter(train_labels)
total_count = len(train_labels)
benign_weight = total_count / label_counts[0]  # BENIGN에 대한 가중치
ddos_weight = total_count / label_counts[1]    # DDoS에 대한 가중치
class_weights = torch.tensor([benign_weight, ddos_weight]).to(device)



# 8. 학습 설정 (TrainingArguments)
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LLM/roberta_model',   # 모델 저장 경로
    num_train_epochs=2,                                      # 에폭 수
    per_device_train_batch_size=16,                          # 배치 크기
    per_device_eval_batch_size=16,                           # 검증 배치 크기
    warmup_steps=500,                                        # 워밍업 단계
    weight_decay=0.01,                                       # 가중치 감소 (정규화)
    logging_dir='./logs',                                    # 로그 저장 경로
    logging_steps=100,                                       # 로깅 간격
    eval_strategy="steps",                                   # 스텝마다 검증
    save_strategy="steps",                                   # 스텝마다 모델 저장
    eval_steps=500,                                          # 검증 스텝 간격
    save_steps=500,                                          # 모델 저장 스텝 간격
    learning_rate=5e-5,                                      # 학습률 (낮게 조정)
    load_best_model_at_end=True,                             # 가장 좋은 모델 불러오기
    metric_for_best_model="eval_loss",                       # 손실을 기준으로 베스트 모델 선택
    save_total_limit=3,                                      # 저장할 모델의 수 제한
    report_to=["none"],                                      # tqdm으로 진행 상황을 표시하기 위해 다른 로깅 비활성화
)

# 9. Early Stopping 설정 (검증 손실 개선이 없으면 조기 종료)
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# 10. Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],  # Early Stopping 적용
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 11. 학습 시작
trainer.train()

Step,Training Loss,Validation Loss
500,0.1373,0.408889
1000,0.091,0.06734
1500,0.3678,0.496741


TrainOutput(global_step=1500, training_loss=0.2013672154744466, metrics={'train_runtime': 12912.7965, 'train_samples_per_second': 350.752, 'train_steps_per_second': 21.922, 'total_flos': 1578666332160000.0, 'train_loss': 0.2013672154744466, 'epoch': 0.010597860645197757})

In [9]:
# 7. 최종 평가
trainer.evaluate()

{'eval_loss': 0.06733975559473038,
 'eval_runtime': 4115.18,
 'eval_samples_per_second': 137.576,
 'eval_steps_per_second': 8.599,
 'epoch': 0.010597860645197757}