In [None]:
pip install --upgrade pip
!pip install trl wandb
!pip install -U bitsandbytes
!pip install "unsloth[cu121-ampere-torch220] @ git+https://github.com/unslothai/unsloth.git"
!pip install datasets

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset
import huggingface_hub
from transformers import (
    pipeline,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
    DataCollatorForSeq2Seq
)

from trl.core import LengthSampler
from trl import (
    PPOTrainer,
    PPOConfig,
    AutoModelForCausalLMWithValueHead,
    create_reference_model,
    DPOConfig,
    DPOTrainer,
)
from peft import (
    get_peft_model,
    AutoPeftModel,
    AutoPeftModelForCausalLM,
    PeftModel,
    LoraConfig,
    LoftQConfig,
    TaskType,
)
from trl import SFTTrainer, SFTConfig
# from unsloth import is_bfloat16_supported
# from unsloth import FastLanguageModel
# from unsloth.chat_templates import (
#     get_chat_template,
#     train_on_responses_only,
#     standardize_sharegpt,
# )

### Model load

In [None]:
# Log in to the Hugging Face Hub using huggingface_hub's login helper.
# NOTE(review): presumably needed because a checkpoint or dataset loaded below
# is gated or private — confirm, and drop this cell if everything is public/local.
huggingface_hub.login()

In [None]:
# Checkpoint to load. Earlier candidates were assigned and immediately
# overwritten, so only the last one ever took effect; they are kept here
# purely as a reference for switching models:
#   "meta-llama/Llama-3.2-3B-Instruct"
#   'Bllossom/llama-3.2-Korean-Bllossom-3B'
# NOTE(review): 'qa_kor_v11' looks like a local directory or private repo — confirm.
model_id = 'qa_kor_v11'

# Tokenizer and causal-LM weights come from the same checkpoint; weights are
# loaded in bfloat16 with automatic device placement.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

==((====))==  Unsloth 2024.12.8: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA A100 80GB PCIe. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [None]:
# Reuse the end-of-sequence token (and its id) as the padding token.
tokenizer.pad_token, tokenizer.pad_token_id = (
    tokenizer.eos_token,
    tokenizer.eos_token_id,
)

In [None]:
# Mark every parameter of the model as trainable. The trailing semicolon keeps
# the returned module from being echoed as the cell's output.
model.requires_grad_(True);

### Dataset load

In [None]:
# Load the dataset by its repository id.
dataset = load_dataset(path='HoJL/law_expc')

# Echo the loaded object so its splits/columns show up as the cell output.
dataset

In [None]:
instruction = """너는 주어지는 내용만을 보고 질문에 답을 하고 왜 이런 답을 했는지 추론도 해주는 역할이야. 
반드시 한국어로 답변해줘.
답변은 아래형식과 같이 추론과 답으로 이루어져 있고 마크다운으로 내줘.

### 추론:
### 답:
"""
inputs = f"""
### 내용:
{dataset['test'][2].get('관계법령_정리')}

### 질문:
{dataset['test'][2].get('질의요지')}
"""
messages = [
{"role": "system", "content": f"{instruction}"},
{"role": "user", "content": f"{inputs}"}
]
input = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False
)

In [None]:
tokenizer.pad_token_id = tokenizer.eos_token_id 
inputs = tokenizer(
    input,
    return_tensors="pt",
).to("cuda")


In [None]:
from transformers import TextStreamer


# Use the tokenizer's pad id for generation as well.
model.generation_config.pad_token_id = tokenizer.pad_token_id

# Generation stops as soon as either of these end markers is produced.
terminators = [
    tokenizer.convert_tokens_to_ids("<|end_of_text|>"),
    tokenizer.convert_tokens_to_ids("<|eot_id|>"),
]

# Stream decoded text as it is generated, leaving out the prompt itself.
# Pass skip_special_tokens=True here to hide special tokens in the stream.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# The returned token ids are discarded; the streamer already emits the text.
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=4096,  # upper bound on the number of generated tokens
    eos_token_id=terminators,  # stop criteria
    # temperature only affects sampling; enable it explicitly rather than
    # relying on whatever the checkpoint's generation config happens to set.
    do_sample=True,
    temperature=0.1,
    repetition_penalty=1.1,
)