<a href="https://colab.research.google.com/github/Onedory/DDos-detection-LLM/blob/main/10_1_%ED%86%A0%ED%81%AC%EB%82%98%EC%9D%B4%EC%A0%80_%EC%A0%80%EC%9E%A5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import pickle
from tqdm import tqdm
from transformers import BertTokenizer, RobertaTokenizer, T5Tokenizer
from concurrent.futures import ThreadPoolExecutor
from huggingface_hub import login

# 1. Hugging Face Hub login (optional: only needed if you want to authenticate)
# login("your_huggingface_token")

# 2. Load the previously saved datasets
data_dir = '/content/drive/MyDrive/LLM/data'  # directory holding the pickled datasets
train_texts_path = os.path.join(data_dir, 'train_texts.pkl')
val_texts_path = os.path.join(data_dir, 'val_texts.pkl')

with open(train_texts_path, 'rb') as f:
    train_texts = pickle.load(f)

with open(val_texts_path, 'rb') as f:
    val_texts = pickle.load(f)

print(f"Loaded {len(train_texts)} training texts and {len(val_texts)} validation texts.")

# 3. Configure the output directory for the tokenized encodings
save_dir = '/content/drive/MyDrive/LLM/tokenizing'
os.makedirs(save_dir, exist_ok=True)

# 4. Load the tokenizers
# `token=False` is the current spelling of the deprecated `use_auth_token=False`
# (the old keyword triggers a FutureWarning in recent transformers releases).
# Both mean "do not send a stored Hugging Face token"; these are public checkpoints.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", token=False)
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base", token=False)
t5_tokenizer = T5Tokenizer.from_pretrained("t5-small", legacy=False, token=False)


# 5. Tokenization function (processes a single text)
def tokenize_text(tokenizer, text, max_length=128):
    """Tokenize one text to a fixed length and drop the leading dimension.

    Args:
        tokenizer: Callable tokenizer that accepts ``truncation``, ``padding``,
            ``max_length`` and ``return_tensors`` keyword arguments and returns
            a mapping whose values support ``.squeeze(0)``.
        text: The text to encode.
        max_length: Length the sequence is truncated/padded to. Defaults to
            128, the value that used to be hard-coded here.

    Returns:
        A dict with the same keys as the tokenizer output, each value
        squeezed along dimension 0.
    """
    encoding = tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )
    # Dimension reduction: squeeze dimension 0 of every returned value
    # (presumably the size-1 batch axis produced for a single text).
    return {key: val.squeeze(0) for key, val in encoding.items()}

# 6. Parallel tokenization function
def parallel_tokenize_data(tokenizer, texts, desc, num_workers=4):
    """Tokenize every text on a thread pool while showing a progress bar.

    Args:
        tokenizer: Tokenizer forwarded to ``tokenize_text`` for each text.
        texts: Sized collection of texts (``len(texts)`` sets the bar total).
        desc: Label shown on the tqdm progress bar.
        num_workers: Number of worker threads in the pool.

    Returns:
        A list with one ``tokenize_text`` result per input text, in input order.
    """
    def encode_one(text):
        return tokenize_text(tokenizer, text)

    with ThreadPoolExecutor(max_workers=num_workers) as pool:
        progress = tqdm(
            pool.map(encode_one, texts),
            desc=desc,
            unit="texts",
            total=len(texts),
        )
        # Draining the progress iterator collects results in submission order.
        tokenized = list(progress)
    return tokenized

# 8. Function that saves tokenization results
def save_encodings(encodings, filename):
    """Pickle ``encodings`` to ``filename`` inside the module-level ``save_dir``.

    Args:
        encodings: Object to serialize (here, the list of tokenized texts).
        filename: File name, joined onto ``save_dir``.
    """
    target_path = os.path.join(save_dir, filename)
    with open(target_path, 'wb') as out_file:
        pickle.dump(encodings, out_file)

# 7. Tokenize the training and validation data, then save the results

# --- BERT ---
train_encodings_bert = parallel_tokenize_data(
    tokenizer=bert_tokenizer,
    texts=train_texts,
    desc="Tokenizing BERT Train",
    num_workers=8,
)
val_encodings_bert = parallel_tokenize_data(
    tokenizer=bert_tokenizer,
    texts=val_texts,
    desc="Tokenizing BERT Validation",
    num_workers=8,
)
save_encodings(encodings=train_encodings_bert, filename="train_encodings_bert.pkl")
save_encodings(encodings=val_encodings_bert, filename="val_encodings_bert.pkl")

# --- RoBERTa ---
train_encodings_roberta = parallel_tokenize_data(
    tokenizer=roberta_tokenizer,
    texts=train_texts,
    desc="Tokenizing RoBERTa Train",
    num_workers=8,
)
val_encodings_roberta = parallel_tokenize_data(
    tokenizer=roberta_tokenizer,
    texts=val_texts,
    desc="Tokenizing RoBERTa Validation",
    num_workers=8,
)
save_encodings(encodings=train_encodings_roberta, filename="train_encodings_roberta.pkl")
save_encodings(encodings=val_encodings_roberta, filename="val_encodings_roberta.pkl")

# --- T5 ---
train_encodings_t5 = parallel_tokenize_data(
    tokenizer=t5_tokenizer,
    texts=train_texts,
    desc="Tokenizing T5 Train",
    num_workers=8,
)
val_encodings_t5 = parallel_tokenize_data(
    tokenizer=t5_tokenizer,
    texts=val_texts,
    desc="Tokenizing T5 Validation",
    num_workers=8,
)
save_encodings(encodings=train_encodings_t5, filename="train_encodings_t5.pkl")
save_encodings(encodings=val_encodings_t5, filename="val_encodings_t5.pkl")

print("Tokenized encodings saved successfully in /content/drive/MyDrive/LLM/tokenizing/")


Loaded 2264594 training texts and 566149 validation texts.


Tokenizing T5 Train: 100%|██████████| 2264594/2264594 [02:09<00:00, 17437.48texts/s]


Tokenized encodings saved successfully in /content/drive/MyDrive/LLM/tokenizing/
