In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
import torch
import pickle
from tqdm import tqdm
from collections import Counter

# Google Drive에서 데이터 로드
with open('/content/drive/MyDrive/LLM/data/train_texts.pkl', 'rb') as f:
    train_texts = pickle.load(f)
with open('/content/drive/MyDrive/LLM/data/train_labels.pkl', 'rb') as f:
    train_labels = pickle.load(f)
with open('/content/drive/MyDrive/LLM/data/val_texts.pkl', 'rb') as f:
    val_texts = pickle.load(f)
with open('/content/drive/MyDrive/LLM/data/val_labels.pkl', 'rb') as f:
    val_labels = pickle.load(f)

In [4]:
# T5 토크나이저 로드 및 데이터 토크나이징
tokenizer = T5Tokenizer.from_pretrained("t5-small")
batch_size = 5000  # 배치 단위로 토크나이징

train_encodings = {'input_ids': [], 'attention_mask': [], 'labels': []}
for i in tqdm(range(0, len(train_texts), batch_size), desc="Tokenizing train data", unit="batch"):
    batch_texts = [f"classify: {text}" for text in train_texts[i:i + batch_size]]
    batch_labels = [str(label) for label in train_labels[i:i + batch_size]]

    inputs = tokenizer(batch_texts, truncation=True, padding=True, max_length=128)
    outputs = tokenizer(batch_labels, truncation=True, padding=True, max_length=2)

    train_encodings['input_ids'].extend(inputs['input_ids'])
    train_encodings['attention_mask'].extend(inputs['attention_mask'])
    train_encodings['labels'].extend(outputs['input_ids'])

val_encodings = {'input_ids': [], 'attention_mask': [], 'labels': []}
for i in tqdm(range(0, len(val_texts), batch_size), desc="Tokenizing val data", unit="batch"):
    batch_texts = [f"classify: {text}" for text in val_texts[i:i + batch_size]]
    batch_labels = [str(label) for label in val_labels[i:i + batch_size]]

    inputs = tokenizer(batch_texts, truncation=True, padding=True, max_length=128)
    outputs = tokenizer(batch_labels, truncation=True, padding=True, max_length=2)

    val_encodings['input_ids'].extend(inputs['input_ids'])
    val_encodings['attention_mask'].extend(inputs['attention_mask'])
    val_encodings['labels'].extend(outputs['input_ids'])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Tokenizing train data: 100%|██████████| 453/453 [1:53:59<00:00, 15.10s/batch]
Tokenizing val data: 100%|██████████| 114/114 [31:32<00:00, 16.60s/batch]


In [5]:
# 데이터셋 클래스 정의
class DDoSDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

train_dataset = DDoSDataset(train_encodings)
val_dataset = DDoSDataset(val_encodings)

# T5 모델 초기화
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# GPU 설정
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# TrainingArguments 설정
training_args = TrainingArguments(
    output_dir='/content/drive/MyDrive/LLM/t5_model',    # 모델 저장 경로
    num_train_epochs=2,                                  # 에폭 수
    per_device_train_batch_size=16,                      # 배치 크기
    per_device_eval_batch_size=16,                       # 검증 배치 크기
    warmup_steps=500,                                    # 워밍업 단계
    weight_decay=0.01,                                   # 가중치 감소 (정규화)
    logging_dir='./logs',                                # 로그 저장 경로
    logging_steps=100,                                   # 로깅 간격
    evaluation_strategy="steps",                         # 스텝마다 검증
    save_strategy="steps",                               # 스텝마다 모델 저장
    eval_steps=500,                                      # 검증 스텝 간격
    save_steps=500,                                      # 모델 저장 스텝 간격
    learning_rate=5e-5,                                  # 학습률 (낮게 조정)
    load_best_model_at_end=True,                         # 가장 좋은 모델 불러오기
    metric_for_best_model="eval_loss",                   # 손실을 기준으로 베스트 모델 선택
    save_total_limit=3,                                  # 저장할 모델의 수 제한
    report_to=["none"],                                  # tqdm으로 진행 상황을 표시하기 위해 다른 로깅 비활성화
)

# Early Stopping 설정
early_stopping = EarlyStoppingCallback(early_stopping_patience=1)

# Trainer 설정
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],  # Early Stopping 적용
)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]



In [6]:

# 학습 및 평가
trainer.train()
eval_results = trainer.evaluate()
print(eval_results)

Step,Training Loss,Validation Loss
500,0.1747,0.095988
1000,0.0512,0.061084
1500,0.0458,0.069456


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'eval_loss': 0.061083611100912094, 'eval_runtime': 1341.8428, 'eval_samples_per_second': 421.919, 'eval_steps_per_second': 26.37, 'epoch': 0.010597860645197757}
