In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import pickle
from tqdm import tqdm
from collections import Counter

# 1. Google Drive에서 저장된 텍스트 데이터와 레이블 불러오기
with open('/content/drive/MyDrive/LLM/data/train_texts.pkl', 'rb') as f:
    train_texts = pickle.load(f)

with open('/content/drive/MyDrive/LLM/data/val_texts.pkl', 'rb') as f:
    val_texts = pickle.load(f)

with open('/content/drive/MyDrive/LLM/data/train_labels.pkl', 'rb') as f:
    train_labels = pickle.load(f)

with open('/content/drive/MyDrive/LLM/data/val_labels.pkl', 'rb') as f:
    val_labels = pickle.load(f)

print("train_texts, val_texts, train_labels, val_labels loaded successfully!")


train_texts, val_texts, train_labels, val_labels loaded successfully!


In [3]:
from transformers import XLNetTokenizer
from tqdm import tqdm

# XLNet 토크나이저 로드
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
batch_size = 5000  # 한 번에 처리할 텍스트 데이터 개수

# 학습 데이터 배치 단위로 토크나이징
train_encodings = {'input_ids': [], 'attention_mask': []}
for i in tqdm(range(0, len(train_texts), batch_size), desc="Tokenizing train data", unit="batch"):
    batch_texts = train_texts[i:i + batch_size]
    batch_encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=128)

    # 배치별로 나온 결과를 전체 결과에 추가
    train_encodings['input_ids'].extend(batch_encodings['input_ids'])
    train_encodings['attention_mask'].extend(batch_encodings['attention_mask'])

# 검증 데이터셋도 동일한 방식으로 토크나이징
val_encodings = {'input_ids': [], 'attention_mask': []}
for i in tqdm(range(0, len(val_texts), batch_size), desc="Tokenizing val data", unit="batch"):
    batch_texts = val_texts[i:i + batch_size]
    batch_encodings = tokenizer(batch_texts, truncation=True, padding=True, max_length=128)

    # 배치별로 나온 결과를 전체 결과에 추가
    val_encodings['input_ids'].extend(batch_encodings['input_ids'])
    val_encodings['attention_mask'].extend(batch_encodings['attention_mask'])


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Tokenizing train data: 100%|██████████| 453/453 [2:02:36<00:00, 16.24s/batch]
Tokenizing val data: 100%|██████████| 114/114 [31:35<00:00, 16.63s/batch]


In [4]:
from transformers import XLNetTokenizer, XLNetForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback

# 데이터셋 클래스 정의
class DDoSDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# DDoSDataset 생성
train_dataset = DDoSDataset(train_encodings, train_labels)
val_dataset = DDoSDataset(val_encodings, val_labels)


# XLNet 모델 초기화
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=2)

# GPU 설정
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# 가중치 설정 (클래스 불균형 고려)
from collections import Counter

label_counts = Counter(train_labels)
total_count = len(train_labels)
benign_weight = total_count / label_counts[0]  # BENIGN에 대한 가중치
ddos_weight = total_count / label_counts[1]    # DDoS에 대한 가중치
class_weights = torch.tensor([benign_weight, ddos_weight]).to(device)

# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LLM/xlnet_model',    # 모델 저장 경로
    num_train_epochs=2,                                     # 에폭 수
    per_device_train_batch_size=16,                         # 배치 크기
    per_device_eval_batch_size=16,                          # 검증 배치 크기
    warmup_steps=500,                                       # 워밍업 단계
    weight_decay=0.01,                                      # 가중치 감소 (정규화)
    logging_dir='./logs',                                   # 로그 저장 경로
    logging_steps=100,                                      # 로깅 간격
    evaluation_strategy="steps",                            # 스텝마다 검증
    save_strategy="steps",                                  # 스텝마다 모델 저장
    eval_steps=500,                                         # 검증 스텝 간격
    save_steps=500,                                         # 모델 저장 스텝 간격
    learning_rate=5e-5,                                     # 학습률 (낮게 조정)
    load_best_model_at_end=True,                            # 가장 좋은 모델 불러오기
    metric_for_best_model="eval_loss",                      # 손실을 기준으로 베스트 모델 선택
    save_total_limit=3,                                     # 저장할 모델의 수 제한
    report_to=["none"],                                     # tqdm으로 진행 상황을 표시하기 위해 다른 로깅 비활성화
)

# Trainer 설정 - 가중치 적용
from transformers import Trainer, EarlyStoppingCallback

# WeightedTrainer 클래스에서 compute_loss 메서드 수정
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # 가중치 적용한 손실 계산
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        # 모든 텐서를 연속적으로 변환
        for param in model.parameters():
            param.data = param.data.contiguous()

        return (loss, outputs) if return_outputs else loss

# Early Stopping 설정
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# WeightedTrainer 초기화
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],  # Early Stopping 적용
)

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:

# 학습 및 평가
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

Step,Training Loss,Validation Loss
500,0.5842,0.214457
1000,0.2201,0.399096


{'eval_loss': 0.2144566774368286, 'eval_runtime': 6372.2992, 'eval_samples_per_second': 88.845, 'eval_steps_per_second': 5.553, 'epoch': 0.007065240430131838}
