In [None]:
!pip install python-dotenv

from dotenv import load_dotenv

load_dotenv()

In [None]:
import torch

# Show the installed PyTorch version, then read the compute capability of the
# current CUDA device. `major_version` is used by the install cell that follows.
print(torch.__version__)
capability = torch.cuda.get_device_capability()
major_version, minor_version = capability
capability

In [2]:
if major_version >= 8:
    # 새로운 GPU(예: Ampere, Hopper GPUs - RTX 30xx, RTX 40xx, A100, H100, L40)에 사용하세요.
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    # 오래된 GPU(예: V100, Tesla T4, RTX 20xx)에 사용하세요.
    !pip install --no-deps xformers trl peft accelerate bitsandbytes

In [3]:
# Run configuration shared by the model-loading, LoRA and trainer cells and
# logged to Weights & Biases as the run's hyperparameters.
config = dict(
    # Base checkpoint to fine-tune.
    architecture="yanolja/EEVE-Korean-Instruct-10.8B-v1.0",
    # Label for the dataset (recorded as run metadata).
    dataset="easyread",
    # Maximum sequence length passed to the model loader and the trainer.
    max_seq_length=4096,
    # Set to True to load with 4-bit quantization and reduce memory usage.
    load_in_4bit=False,
    # Whether to wrap the model with LoRA adapters.
    lora_use=True,
    # [rank, alpha], in that order.
    lora_rank_and_alpha=[16, 32],
    # Projection layers that receive LoRA adapters.
    lora_target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    epochs=1,
    batch_size=8,
    gradient_accumulation_steps=4,
    learning_rate=2e-5,
)

In [5]:
import wandb

# Start a new Weights & Biases run for this notebook. The run is logged under
# the given project and records `config` as its hyperparameters / metadata.
# (The original cell also carried a disabled `wandb.require("core")` call.)
wandb_run_options = {
    "project": "easyread-primary-try",
    "config": config,
}
wandb.init(**wandb_run_options)


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


Abort: 

In [6]:
import torch

# Fetch the major and minor compute-capability version of the CUDA device.
cuda_capability = torch.cuda.get_device_capability()
major_version, minor_version = cuda_capability
cuda_capability

ModuleNotFoundError: No module named 'torch'

In [5]:
import torch

# Print a short summary of the CUDA environment: availability, device count,
# and the name, compute capability and bfloat16 support of device 0.
if not torch.cuda.is_available():
    print("CUDA not available")
else:
    print(f"CUDA available: {torch.cuda.is_available()}")
    print(f"Device count: { torch.cuda.device_count()}")
    print(f"Current device name: {torch.cuda.get_device_name(0)}")
    print(f"Device capability: {torch.cuda.get_device_capability(0)}")
    print(f"bfloat16 support: {torch.cuda.is_bf16_supported()}")

CUDA available: True
Device count: 1
Current device name: NVIDIA RTX 6000 Ada Generation
Device capability: (8, 9)
bfloat16 support: True


In [6]:
from unsloth import FastLanguageModel
from transformers import BitsAndBytesConfig
import torch

# Load the base checkpoint and its tokenizer through Unsloth. Every tunable
# loading option comes from the shared `config` dictionary.
load_options = {
    "model_name": config["architecture"],
    "max_seq_length": config["max_seq_length"],
    "load_in_4bit": config["load_in_4bit"],
    "low_cpu_mem_usage": True,
}
model, tokenizer = FastLanguageModel.from_pretrained(**load_options)

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: NVIDIA RTX 6000 Ada Generation. Max memory: 47.507 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.1+cu121. CUDA = 8.9. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.25. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards: 100%|██████████| 5/5 [00:03<00:00,  1.29it/s]


In [7]:
# Wrap the loaded model with LoRA adapters when enabled in `config`.
if config["lora_use"]:
    # config["lora_rank_and_alpha"] stores [rank, alpha], in that order.
    rank_and_alpha = config["lora_rank_and_alpha"]
    lora_options = dict(
        r = rank_and_alpha[0],  # any number > 0; suggested 8, 16, 32, 64, 128
        target_modules = config["lora_target_modules"],
        lora_alpha = rank_and_alpha[1],
        lora_dropout = 0,       # any value is supported, but 0 is the optimized path
        bias = "none",          # any value is supported, but "none" is the optimized path
        # True or "unsloth"; per the original note, "unsloth" uses 30% less VRAM
        # and suits very long contexts.
        use_gradient_checkpointing = "unsloth",
        random_state = 123,
        use_rslora = False,     # rank-stabilized LoRA is available but disabled here
        loftq_config = None,    # LoftQ is available but not configured here
    )
    model = FastLanguageModel.get_peft_model(model, **lora_options)

Unsloth 2024.7 patched 48 layers with 48 QKV layers, 48 O layers and 48 MLP layers.


In [8]:
# End-of-sequence token string taken from the tokenizer; it is appended to
# every training text by the formatting function below.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def foramtting_prompts_end_token(examples, eos_token=None):
    """Append the end-of-sequence token to every entry of a batch's "text" column.

    Intended for ``Dataset.map(..., batched=True)``. The (misspelled) function
    name is kept unchanged because the mapping call refers to it by this name.

    Args:
        examples: Batch mapping whose "text" value is a list of strings.
        eos_token: Token string to append. When omitted, the module-level
            ``EOS_TOKEN`` is used, which matches the original behavior.

    Returns:
        A dict with a single "text" key holding the new list of strings; the
        input list is not modified.
    """
    if eos_token is None:
        # Resolved at call time so the function still follows the notebook's global.
        eos_token = EOS_TOKEN
    return {"text": [text + eos_token for text in examples["text"]]}
        

from datasets import load_dataset, concatenate_datasets

# Load the training split from the Hub and append the end token to each "text"
# entry in batched mode. Shuffling (seed 1234) was left disabled in the original cell.
dataset = load_dataset(
    "Suchae/judgment-transducer-primary-dataset",
    split="train",
).map(foramtting_prompts_end_token, batched=True)

print(dataset)

Dataset({
    features: ['instruction', 'input', 'output', 'system', 'text'],
    num_rows: 2097957
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 49620
})
Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 49620
})


In [9]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pad on the right-hand side of each sequence.
tokenizer.padding_side = "right"

# Supervised fine-tuning trainer over the "text" column of `dataset`.
# Tunable values are read from the shared `config` dictionary.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    # eval_dataset = valid_dataset,  # evaluation is currently disabled
    dataset_text_field = "text",
    max_seq_length = config["max_seq_length"],
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = config["batch_size"],
        gradient_accumulation_steps = config["gradient_accumulation_steps"],
        warmup_steps = 5,
        # Read from config (previously hard-coded to 1, ignoring config["epochs"]).
        num_train_epochs = config["epochs"],
        # do_eval=True,
        # evaluation_strategy="steps",
        logging_steps = 1,  # log every step
        learning_rate = config["learning_rate"],
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        # "adam" is not an accepted optimizer name for TrainingArguments and
        # raises ValueError; "adamw_torch" is a valid AdamW choice that also
        # honours weight_decay.
        optim = "adamw_torch",
        weight_decay = 0.01,
        lr_scheduler_type = "cosine",
        seed = 1234,
        output_dir = "outputs",
    ),
)

Map (num_proc=2): 100%|██████████| 49620/49620 [00:02<00:00, 22813.41 examples/s]


In [10]:
#@title Show current memory stats
def _bytes_to_gib(num_bytes):
    """Convert a byte count to GiB, rounded to three decimals."""
    return round(num_bytes / 1024 / 1024 / 1024, 3)


# Properties of GPU 0 plus the peak memory reserved so far by the caching allocator.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = _bytes_to_gib(torch.cuda.max_memory_reserved())
max_memory = _bytes_to_gib(gpu_stats.total_memory)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA RTX 6000 Ada Generation. Max memory = 47.507 GB.
20.35 GB of memory reserved.


In [13]:
# Run fine-tuning; the value returned by trainer.train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 49,620 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 3 | Gradient Accumulation steps = 12
\        /    Total batch size = 36 | Total steps = 1,378
 "-____-"     Number of trainable parameters = 31,457,280


Step,Training Loss
1,3.0534
2,3.2107
3,3.0405
4,2.9697
5,2.9716
6,2.9715
7,2.745
8,2.4566
9,2.1797
10,2.2385


KeyboardInterrupt: 

In [None]:
from transformers import TextStreamer

# Switch the model to Unsloth's inference mode (the original note says this makes inference 2x faster).
FastLanguageModel.for_inference(model)
# Tokenize a single prompt (system line, "Human:" request with a glossary and a
# judgment excerpt, then an open "Assistant: " turn) and move the tensors to the GPU.
inputs = tokenizer(
    [
        "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nHuman: 다음 법률용어를 참고하여 판결문 내용을 요약하고 쉽게 바꾼 뒤, json 형태로 출력해줘.\n<법률용어>\n{'상고이유': '상고심에서의 판결에 불복하여 상고를 제기하는 이유', '자동차손해배상 보장법': '자동차 사고로 인한 피해를 보상하기 위한 법률', '책임보험': '법적으로 의무적으로 가입해야 하는 보험', '보험금': '보험 계약에 따라 보험사가 보험 가입자나 피해자에게 지급하는 돈', '피해자': '사고 등으로 인해 피해를 입은 사람', '손해배상청구권': '피해를 입은 사람이 가해자에게 손해를 배상해달라고 요구할 수 있는 권리', '대위행사': '다른 사람의 권리를 그 사람을 대신하여 행사하는 것'}\n</법률용어>\n<판결문>\n【이유】\n상고이유를 판단한다.\n1. 원고의 상고이유에 대하여\n가. 구 자동차손해배상 보장법(2008. 2. 29. 법률 제8852호로 개정되기 전의 것, 이하 ‘법’이라 한다) 제26조 제1항은 “정부는 다음 각 호의 1에 해당하는 경우에는 피해자의 청구에 따라 책임보험의 보험금의 한도 안에서 그가 입은 피해를 보상한다.”고 규정하면서 그 제1호에서 “자동차보유자를 알 수 없는 자동차의 운행으로 인하여 사망하거나 부상한 경우”를 들고 있고, 법 제31조 제1항은 “정부는 제26조 제1항의 규정에 의하여 피해를 보상한 경우에는 그 보상금액의 한도 안에서 제3조의 규정에 의하여 손해배상책임이 있는 자에 대한 피해자의 손해배상청구권을 대위행사할 수 있다.”고 규정하고 있으므로, 법 제37조 제1항에 의하여 법 제26조 제1항의 규정에 따른 보장사업에 관한 업무를 건설교통부장관으로부터 위탁받은 보장사업자가 피해자에게 보상금을 지급한 경우 그 보장사업자는 법 제31조 제1항의 규정에 따라 법 제3조의 규정에 의하여 손해배상책임이 있는 자에 대하여 가지는 피해자의 손해배상청구권을 대위행사할 수 있다.\n</판결문>\nAssistant: "
    ],
    return_tensors="pt",
).to("cuda")


# Stream generated tokens to the cell output as they are produced.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(
    **inputs,
    streamer=text_streamer,
    max_new_tokens=2048,  # Sets the maximum number of newly generated tokens.
)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, use Hugging Face's `push_to_hub` to upload them or `save_pretrained` to save them locally.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Save the LoRA model locally, then upload a merged model and an f16 GGUF
# build to the Hugging Face Hub.
base_model = config["architecture"]
huggingface_repo = "suchae/EEVE-Korean-Judgment-Transducer-10.8B-v1.0"  # repository the model is uploaded to
# Options listed in the original cell: "merged_4bit", "merged_4bit_forced", "merged_16bit", "lora"
save_method = "merged_16bit"

# Local saving (the directory path is the same string as the Hub repository id).
model.save_pretrained(huggingface_repo)

model.push_to_hub_merged(huggingface_repo, tokenizer, save_method=save_method)

model.push_to_hub_gguf(
    "suchae/EEVE-Korean-Judgment-Transducer-10.8B-v1.0-GGUF",
    tokenizer,
    quantization_method="f16",
)

### GGUF / llama.cpp Conversion
Saving to `GGUF` for `llama.cpp` is now supported natively: `llama.cpp` is cloned and the model is saved as `q8_0` by default, and other methods such as `q4_k_m` are also allowed. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# Disabled GGUF export examples: change an individual `if False:` to `if True:`
# to run that step. The `token` arguments are empty placeholders.

# Save to 8bit Q8_0
if False:
    model.save_pretrained_gguf("model", tokenizer)
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF (the last example uses q5_k_m)
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q4_k_m", token="")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="q5_k_m", token="")

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`. You can install GPT4All by going [here](https://gpt4all.io/index.html).