In [1]:
# Mount Google Drive at /content/drive so the files under MyDrive
# (pickled dataset splits, model checkpoints) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers torch




In [5]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments
import torch
import pickle

# Load the pickled train / validation splits from Google Drive.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load files whose origin is trusted.
def _read_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


train_texts = _read_pickle('/content/drive/MyDrive/LLM/data/train_texts.pkl')
train_labels = _read_pickle('/content/drive/MyDrive/LLM/data/train_labels.pkl')
val_texts = _read_pickle('/content/drive/MyDrive/LLM/data/val_texts.pkl')
val_labels = _read_pickle('/content/drive/MyDrive/LLM/data/val_labels.pkl')

# Load the GPT-2 tokenizer and reuse its end-of-sequence token as the padding
# token so that batches can be padded.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



In [6]:
# Tokenize both splits with identical settings: truncation and padding enabled,
# with sequences capped at 128 tokens.
tokenize_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **tokenize_options)
val_encodings = tokenizer(val_texts, **tokenize_options)


In [7]:

# Dataset class definition
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to a sequence indexable by sample
            position (``encodings[field][idx]``).
        labels: Sequence of labels indexable by sample position.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of samples, taken from the length of ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        Every field of ``encodings`` is converted to a tensor, and the label is
        added under the ``"labels"`` key.
        """
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a CustomDataset.
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CustomDataset(encodings=val_encodings, labels=val_labels)

In [10]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, GPT2ForSequenceClassification
import torch
from collections import Counter

# 1. Model initialisation: GPT-2 with a 2-label sequence-classification head
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
# Use the EOS token id as the model's padding id.
model.config.pad_token_id = model.config.eos_token_id


# 2. Device selection: use the GPU when one is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# 3. Class weights (to account for class imbalance): inverse class frequency
label_counts = Counter(train_labels)
total_count = len(train_labels)
# A Counter returns 0 for keys it has never seen, so a class that is absent
# from train_labels would otherwise surface as an opaque ZeroDivisionError.
missing_classes = [class_id for class_id in (0, 1) if label_counts[class_id] == 0]
if missing_classes:
    raise ValueError(
        f"train_labels contains no samples for class id(s) {missing_classes}; "
        "cannot compute inverse-frequency class weights"
    )
benign_weight = total_count / label_counts[0]  # weight for BENIGN (class 0)
ddos_weight = total_count / label_counts[1]    # weight for DDoS (class 1)
class_weights = torch.tensor([benign_weight, ddos_weight]).to(device)


# 4. TrainingArguments
# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The notebook installs transformers unpinned, so pick whichever keyword the
# installed TrainingArguments dataclass actually declares.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in getattr(TrainingArguments, "__dataclass_fields__", {})
    else "evaluation_strategy"
)

# NOTE(review): the recorded run reports eval_runtime of about 4648 s for a single
# evaluation pass; evaluating every 500 training steps is therefore very
# expensive on this dataset. Consider a larger eval_steps/save_steps or a
# smaller validation subset. Values are left unchanged here.
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LLM/gpt2_model',    # where checkpoints are saved
    num_train_epochs=2,                                    # number of epochs
    per_device_train_batch_size=16,                        # training batch size
    per_device_eval_batch_size=16,                         # evaluation batch size
    warmup_steps=500,                                      # warm-up steps
    weight_decay=0.01,                                     # weight decay (regularisation)
    logging_dir='./logs',                                  # log directory
    logging_steps=100,                                     # logging interval
    save_strategy="steps",                                 # save a checkpoint every save_steps
    eval_steps=500,                                        # evaluation interval in steps
    save_steps=500,                                        # checkpoint interval in steps
    learning_rate=5e-5,                                    # learning rate (tuned low)
    load_best_model_at_end=True,                           # reload the best checkpoint after training
    metric_for_best_model="eval_loss",                     # select the best model by validation loss
    save_total_limit=3,                                    # maximum number of checkpoints kept
    report_to=["none"],                                    # disable other logging integrations; progress is shown via tqdm
    **{_eval_strategy_key: "steps"},                       # evaluate every eval_steps
)

# 5. Trainer subclass that applies the class weights to the loss
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted by the module-level ``class_weights``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Batch mapping passed to the model; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: Accepted because newer ``Trainer`` versions pass
                it as a keyword argument; not used by this loss.
            **kwargs: Ignored; absorbs any further keyword arguments a
                ``Trainer`` version may pass.

        Returns:
            The loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Weighted loss; keep the weights on the same device as the logits.
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))

        return (loss, outputs) if return_outputs else loss

# 6. Early stopping: stop after one evaluation without improvement.
# NOTE(review): with patience=1, the recorded run appears to have stopped at
# step 2500 (epoch of about 0.018) after a single rise in validation loss;
# confirm that this patience is intended.
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)
trainer_callbacks = [early_stopping]

# 7. Build the WeightedTrainer
trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=trainer_callbacks,  # apply early stopping
)


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Fine-tune the model; evaluation, checkpointing and early stopping follow the
# TrainingArguments and callbacks the trainer was built with.
trainer.train()
# Evaluate on the trainer's eval_dataset and print the resulting metrics.
# The recorded eval_loss equals the step-2000 validation loss, the lowest one logged.
eval_results = trainer.evaluate()
print(eval_results)


Step,Training Loss,Validation Loss
500,0.2031,0.269587
1000,0.0781,0.144659
1500,0.1232,0.093479


Step,Training Loss,Validation Loss
500,0.2031,0.269587
1000,0.0781,0.144659
1500,0.1232,0.093479
2000,0.0835,0.07025
2500,0.0789,0.077242


{'eval_loss': 0.070250004529953, 'eval_runtime': 4648.2064, 'eval_samples_per_second': 121.799, 'eval_steps_per_second': 7.613, 'epoch': 0.017663101075329594}
