In [None]:
!pip install -U langchain-community bitsandbytes transformers accelerate
!pip install transformers datasets pypdf

In [None]:
import re
import json
import torch
import glob
import pandas as pd

from tqdm import tqdm
from typing import List, Dict
from datasets import Dataset, load_dataset, load_from_disk
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, pipeline, BitsAndBytesConfig
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# PDF Load & Split into Chunks

In [None]:
# Glob pattern for the source PDFs on the mounted Drive.
pdf_folder = "/content/drive/MyDrive/Dacon/Data/PDF_files/*.pdf"

# glob returns matches in arbitrary order; sort them so document (and chunk)
# order is reproducible between runs.
pdf_files = sorted(glob.glob(pdf_folder))

# Fail fast: an empty match (e.g. Drive not mounted, wrong path) would otherwise
# flow through the notebook and silently produce an empty training set.
if not pdf_files:
    raise FileNotFoundError(f"No PDF files matched: {pdf_folder}")

In [None]:
%%time

# 청크 분할기
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50
)

# 2. 모든 문서를 불러와 합치기
all_docs = []
for pdf_file in tqdm(pdf_files, desc = "PDF Load..."):
    loader = PyPDFLoader(pdf_file)
    docs = loader.load()  # 페이지 단위로 로드됨
    all_docs.extend(docs) # 리스트에 추가

all_chunks = text_splitter.split_documents(all_docs)

print(f"총 문서 수: {len(all_chunks)}")

# QA Set

In [None]:
model_name = "skt/A.X-4.0-Light"

# Load tokenizer and model (fp16 weights, automatic device placement).
# Left padding keeps every prompt adjacent to its generated continuation in a batch.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side = "left")
if tokenizer.pad_token is None:
    # Batched tokenization with padding=True needs a pad token; reuse EOS if none is defined.
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype = torch.float16,
    device_map = "auto"
)

# Turn every chunk into a generation prompt
contexts = [doc.page_content for doc in all_chunks]
prompts = [
    f"""
아래 문단을 바탕으로 객관식 질문 1개와 주관식 질문 1개를 만들어줘.
객관식은 4지선다로 하고 정답 표시 포함.

문단 :
{context}
""" for context in contexts
]

# Batched generation
batch_size = 32
qa_dataset = []

for i in tqdm(range(0, len(prompts), batch_size), desc = "QA Dataset..."):
    batch_prompts = prompts[i:i+batch_size]
    # Send inputs to wherever the model was placed instead of assuming "cuda".
    inputs = tokenizer(batch_prompts, return_tensors = "pt", padding = True, truncation = False).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens = 128,
            do_sample = False,
            eos_token_id = tokenizer.eos_token_id,
            pad_token_id = tokenizer.pad_token_id
        )

    # generate() returns prompt + continuation for decoder-only models. With left
    # padding every row shares the same prompt length, so slicing at that length
    # keeps only the newly generated tokens; otherwise each "qa" entry would repeat
    # the instruction and the context.
    prompt_length = inputs["input_ids"].shape[1]
    decoded = tokenizer.batch_decode(outputs[:, prompt_length:], skip_special_tokens = True)
    for context, qa_text in zip(contexts[i:i+batch_size], decoded):
        qa_dataset.append({"context": context, "qa": qa_text})

print(f"총 {len(qa_dataset)}개의 QA 생성 완료")

In [None]:
# Persist the generated QA pairs, one JSON record per line
with open("/content/drive/MyDrive/Dacon/Data/train_10.json", "w", encoding = "utf-8") as f:
    for item in qa_dataset:
        record = {
            "instruction" : "다음 문단에 대한 질문에 답하시오.",
            "input" : item["context"],
            "output" : item["qa"]
        }
        # ensure_ascii=False keeps the Korean text readable instead of \u escapes
        f.write(json.dumps(record, ensure_ascii = False) + "\n")

# Fine-tuning

In [None]:
# Base checkpoint to fine-tune; the same identifier is also assigned in the QA-generation cell above.
model_name = "skt/A.X-4.0-Light"

In [None]:
# Load the training data (the file written by the QA-generation step)
dataset = load_dataset("json", data_files = "/content/drive/MyDrive/Dacon/Data/train_10.json")

# Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = None,
    torch_dtype = torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# LoRA configuration.
# task_type tells PEFT to wrap the model with its causal-LM specific class
# rather than the generic PeftModel wrapper.
lora_config = LoraConfig(
    task_type = "CAUSAL_LM",
    lora_alpha = 32,
    lora_dropout = 0.05,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj"]
)

# Attach the LoRA adapters to the model
model = get_peft_model(model, lora_config)
# Sanity check: report how many parameters are actually trainable.
model.print_trainable_parameters()

# Move to GPU when available.
# Assigning the result keeps the cell from printing the entire module tree.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# load_dataset returns a dict of splits. Unwrap it only while it is still that dict,
# so re-running this cell does not try to index an already-unwrapped Dataset by "train".
if isinstance(dataset, dict):
    dataset = dataset["train"]

# Padding to a fixed length requires a pad token; fall back to EOS only when the
# tokenizer does not define one, instead of overwriting an existing pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 3. Prompt format
def format_prompt(ex):
    """Render one record as an instruction / input / response training prompt.

    Args:
        ex: mapping with the keys "instruction", "input" and "output".

    Returns:
        The sections joined by blank lines. The "### Input:" section is
        left out when ex["input"] is falsy (empty string, None, ...).
    """
    sections = [f"### Instruction:\n{ex['instruction']}"]
    if ex["input"]:
        sections.append(f"### Input:\n{ex['input']}")
    sections.append(f"### Response:\n{ex['output']}")
    return "\n\n".join(sections)

# 4. Tokenization
def preprocess_function(ex):
    """Tokenize one record into fixed-length model inputs with loss labels.

    Args:
        ex: mapping with the keys "instruction", "input" and "output".

    Returns:
        The tokenizer output (truncated / padded to 256 tokens) plus a "labels"
        entry: a copy of "input_ids" in which every padded position (attention
        mask 0) is replaced by -100 so the loss ignores it.
    """
    prompt = format_prompt(ex)
    tokenized = tokenizer(prompt, truncation=True, padding = "max_length", max_length = 256)
    # Mask by attention_mask rather than by token id: the pad token may be the same
    # id as EOS, and comparing ids would also hide genuine EOS tokens.
    tokenized["labels"] = [
        token_id if keep == 1 else -100
        for token_id, keep in zip(tokenized["input_ids"], tokenized["attention_mask"])
    ]
    return tokenized

# 5. Apply preprocessing row by row, dropping the original text columns
raw_columns = dataset.column_names
tokenized_dataset = dataset.map(preprocess_function, batched = False, remove_columns = raw_columns)

In [None]:
# Cache the tokenized dataset on Drive so it can be reloaded without re-tokenizing
tokenized_dataset_dir = "/content/drive/MyDrive/Dacon/Data/tokenized_dataset_10"
tokenized_dataset.save_to_disk(tokenized_dataset_dir)

In [None]:
# Reload the cached tokenized dataset from Drive
tokenized_dataset_dir = "/content/drive/MyDrive/Dacon/Data/tokenized_dataset_10"
tokenized_dataset = load_from_disk(tokenized_dataset_dir)

In [None]:
# Training arguments.
# Mixed precision follows the dtype the weights were loaded in: a bfloat16 model
# trains with bf16 autocast, anything else keeps the fp16 setting.
use_bf16 = next(model.parameters()).dtype == torch.bfloat16

training_args = TrainingArguments(
    output_dir = "/content/drive/MyDrive/Dacon/finetuned_model_10",
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,  # effective batch size per device: 4 * 4 = 16
    learning_rate = 2e-4,
    num_train_epochs = 3,
    warmup_ratio = 0.05,
    logging_steps = 50,
    save_strategy = "epoch",
    eval_strategy = "no",
    bf16 = use_bf16,
    fp16 = not use_bf16,
    push_to_hub = False,
    report_to = "none"
)

# Trainer.
# `processing_class` replaces the deprecated `tokenizer` argument.
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset,
    processing_class = tokenizer
)

In [None]:
# Run fine-tuning
trainer.train()

# Save the final model into the trainer's configured output directory
trainer.save_model(trainer.args.output_dir)