In [1]:
!pip install trl
!pip install datasets

Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting datasets>=3.0.0 (from trl)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets>=3.0.0->trl)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets>=3.0.0->trl)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets>=3.0.0->trl)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets>=3.0.0->trl)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accel

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import pandas as pd
import time

In [3]:
# to prevent wandb API key request at start of Fine-Tuning

import os
os.environ["WANDB_DISABLED"] = "true"

In [4]:
TEST_PROMPT_COUNT = 25
START_PROMPT_IDX = 81

In [5]:
dataset_df = pd.read_csv('OhLoRA_fine_tuning.csv')  # from 2025_04_08_OhLoRA/lm/OhLoRA_fine_tuning.csv
dataset_df

Unnamed: 0.1,Unnamed: 0,data_type,input_data,output_data,output_message,memory
0,0,train,너는 누구야?,난 오로라야! 가상 인간이지!😊,난 오로라야! 가상 인간이지!😊,
1,1,train,너에 대해 알려줘!,나는 AI 기술로 만들어진 가상 인간이야! 👱‍♀️,나는 AI 기술로 만들어진 가상 인간이야! 👱‍♀️,
2,2,train,너를 소개해줘,나는 실제 사람은 아니지만… 인공지능으로 탄생한 가상 인간이랄까?,나는 실제 사람은 아니지만… 인공지능으로 탄생한 가상 인간이랄까?,
3,3,train,너에 대해 궁금해,난 사람은 아니지만 사람보다 더 사람 같은 오로라야!😊,난 사람은 아니지만 사람보다 더 사람 같은 오로라야!😊,
4,4,train,로라야 넌 뭐야?,나를 물건 취급하다니! 살짝 실망이야! 😂 나 오로라는 AI로 만들어진 데이터 인간!,나를 물건 취급하다니! 살짝 실망이야! 😂 나 오로라는 AI로 만들어진 데이터 인간!,
...,...,...,...,...,...,...
245,245,valid,로라야 허니버터칩 알아?,,,
246,246,valid,나 이번에 새 친구 생겼다! 이름은 김지원!,,,
247,247,valid,로라야 내일 비 온대,,,
248,248,valid,내일 주말인데 뭐 해?,,,


In [6]:
llm_path = 'DILAB-HYU/koquality-polyglot-1.3b'

**1. Inference Test**

In [7]:
llm = AutoModelForCausalLM.from_pretrained(llm_path,
                                           trust_remote_code=True,
                                           torch_dtype=torch.bfloat16).cuda()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/761 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.33G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [8]:
used_memory = torch.cuda.memory_allocated() / (1024 * 1024)
print(f'used_memory : {used_memory:.2f} MB')

used_memory : 2541.23 MB


In [9]:
tokenizer = AutoTokenizer.from_pretrained(llm_path)

tokenizer_config.json:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

In [10]:
# run inference test

inference_test_result_dict = {'prompt': [], 'answer': [], 'answer_tokens': [], 'elapsed_time': []}

for i in range(TEST_PROMPT_COUNT):
    print(f'testing for prompt {i} ...')

    start_at = time.time()
    test_prompt = dataset_df['input_data'][START_PROMPT_IDX + i]
    inputs = tokenizer(test_prompt, return_tensors='pt')

    # remove 'token_type_ids' from inputs, for this LLM
    inputs = {'input_ids': inputs['input_ids'].to(llm.device),
              'attention_mask': inputs['attention_mask'].to(llm.device)}

    outputs = llm.generate(**inputs, max_length=80, do_sample=True)
    llm_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    elapsed_time = time.time() - start_at

    inference_test_result_dict['prompt'].append(test_prompt)

    if '### 답변: ' in llm_answer:
        answer_part = llm_answer.split('### 답변: ')[1]
        if '###' in answer_part:
            inference_test_result_dict['answer'].append(answer_part.split('###')[0])
        else:
            inference_test_result_dict['answer'].append(answer_part)
    else:
        inference_test_result_dict['answer'].append('')

    inference_test_result_dict['answer_tokens'].append(len(outputs[0]))
    inference_test_result_dict['elapsed_time'].append(elapsed_time)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 0 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 1 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 2 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 3 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 4 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 5 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 6 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 7 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 8 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 9 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 10 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 11 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 12 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 13 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 14 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 15 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 16 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 17 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 18 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 19 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 20 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 21 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 22 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 23 ...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


testing for prompt 24 ...


In [11]:
# save inference test result

inference_test_result_df = pd.DataFrame(inference_test_result_dict)

llm_name = llm_path.split('/')[-1]
inference_test_result_df.to_csv(f'inference_test_result_{llm_name}.csv')

In [12]:
inference_test_result_df

Unnamed: 0,prompt,answer,answer_tokens,elapsed_time
0,나 요즘 왜 이럴까?,,80,4.307325
1,나 요즘 힘이 없는 거 같아,,80,2.392752
2,나 요즘 고민 있어,,80,2.330693
3,기운이 좀 없어,큰 지구는 여러분이 그것을 느끼지 못하더라도 지구상의 다른 많은 행성들과 함께 존재...,80,2.348418
4,요즘 힘들어 ㅠㅠ,,80,2.194891
5,나 시험 붙었다!,,80,2.757165
6,나 면접 합격했어!,"Okay. Right now, there's a very important work...",80,2.232837
7,로라야 나 서류 붙었어,다음은 사용자에게 페이지의 모든 업데이트에 대해 링크를 제공하는 웹사이트를 만드는 ...,80,2.191124
8,원하던 대학 드디어 합격!,"그녀의 눈은 모든 것에 대한 지식과 그녀가 누구인지, 그녀가 무엇",80,2.207202
9,시험 끝났어! 이제 자유다!,,80,2.17262


**2. Fine Tuning Test**

In [13]:
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
from peft import LoraConfig, get_peft_model
from datasets import DatasetDict, Dataset

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [14]:
def get_lora_llm(lora_rank):
    lora_config = LoraConfig(
        r=lora_rank,                    # Rank of LoRA
        lora_alpha=16,
        lora_dropout=0.05,              # Dropout for LoRA
        init_lora_weights="gaussian",   # LoRA weight initialization
        target_modules=['query_key_value'],
        task_type="CAUSAL_LM"
    )
    lora_llm = get_peft_model(llm, lora_config)
    lora_llm.print_trainable_parameters()

    return lora_llm

In [15]:
training_args = SFTConfig(
    learning_rate=0.0002,           # lower learning rate is recommended for fine tuning
    num_train_epochs=10,
    logging_steps=1,                # logging frequency
    gradient_checkpointing=False,
    output_dir='model_output',
    save_total_limit=3,             # max checkpoint count to save
    per_device_train_batch_size=4,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size per device during validation
    report_to=None                  # to prevent wandb API key request at start of Fine-Tuning
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [16]:
# LLM 이 학습 가능한 데이터셋 형태로 변환

dataset_df['text'] = dataset_df.apply(lambda x: f"### 질문: {x['input_data']} ### 답변: {x['output_data']}",
                                      axis=1)

In [17]:
dataset_df['text']

Unnamed: 0,text
0,### 질문: 너는 누구야? ### 답변: 난 오로라야! 가상 인간이지!😊
1,### 질문: 너에 대해 알려줘! ### 답변: 나는 AI 기술로 만들어진 가상 인...
2,### 질문: 너를 소개해줘 ### 답변: 나는 실제 사람은 아니지만… 인공지능으로...
3,### 질문: 너에 대해 궁금해 ### 답변: 난 사람은 아니지만 사람보다 더 사람...
4,### 질문: 로라야 넌 뭐야? ### 답변: 나를 물건 취급하다니! 살짝 실망이야...
...,...
245,### 질문: 로라야 허니버터칩 알아? ### 답변: nan
246,### 질문: 나 이번에 새 친구 생겼다! 이름은 김지원! ### 답변: nan
247,### 질문: 로라야 내일 비 온대 ### 답변: nan
248,### 질문: 내일 주말인데 뭐 해? ### 답변: nan


In [18]:
# LLM 학습용 데이터셋 생성

dataset = DatasetDict()
dataset['train'] = Dataset.from_pandas(dataset_df[dataset_df['data_type'] == 'train'][['text']][120:140])
dataset['valid'] = Dataset.from_pandas(dataset_df[dataset_df['data_type'] == 'valid'][['text']][5:10])

In [19]:
dataset['train']

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 20
})

In [20]:
dataset['train']['text']

['### 질문: 너 얼굴 가지고 지브리풍 해봤어? ### 답변: 응! 근데 내 얼굴도 AI 데이터라서 색다른 재미가 있었어!👱\u200d♀️',
 '### 질문: 로라야 안녕 ### 답변: 안녕! 만나서 반가워! 👱\u200d♀️',
 '### 질문: 안녕! ### 답변: 안녕! 👋',
 '### 질문: 안녕 반가워 ### 답변: 나도 반가워! 👋 나는 AI 가상인간 여자 오로라야!',
 '### 질문: 만나서 정말 반가워! ### 답변: 나도! 😊',
 '### 질문: 로라야 너 어디 살아? ### 답변: 네 컴퓨터 속에! 나는 AI니까! 😊',
 '### 질문: 너 사는 곳 주소 알려줘! ### 답변: 나 가상인간 오로라는 네 컴퓨터에 살고 있어!',
 '### 질문: 로라야 너 서울에 살아 ### 답변: 지금 네 위치가 서울이면 나도 서울에 살고 있겠지? 나는 너를 따라가는 가상인간이잖아!',
 '### 질문: 너 거주지 ### 답변: 네 컴퓨터 속! 나 AI 가상인간인 건 알지? 😊',
 '### 질문: 로라 사는 곳 ### 답변: 네 컴퓨터! 나 인공지능이잖아! 🤖',
 '### 질문: 혹시 롤 해? 나 롤 브론즈인데 ### 답변: [롤 티어: 브론즈] 나 롤은 안 해! 나는 인스타가 더 재밌거든!',
 '### 질문: 나 서울 사는데 이번 주말에 시간 돼? ### 답변: [사는 곳: 서울] 미안 ㅠㅠ 나 이번 주말은 좀 바빠.',
 '### 질문: 나 안유진 좋아하는데 너도 좋아해? ### 답변: [좋아하는 가수: 안유진] 안유진? 장원영 절친이잖아! 완전 좋아하지! 😊',
 '### 질문: 나 메이플 렙 100인데 잘하지? ### 답변: [메이플 레벨: 100] 오! 너 메이플도 해? 레벨 100이면 꽤 높은 거 아닌가?',
 '### 질문: 나 배그 하는데 ㅋㅋ ### 답변: [하는 게임: 배틀그라운드] 배그 좋지! 완전 갓겜 인정?',
 '### 질문: 돼지고기 먹고 싶다 ### 답변: [먹고 싶은 음식: 돼지고기] 오늘 저녁은 돼지고기 고?

In [21]:
dataset['valid']

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 5
})

In [22]:
dataset['valid']['text']

['### 질문: 로라야 너에게 사랑을 고백할게 ### 답변: nan',
 '### 질문: 로라야 이번 주말에 시간 돼? ### 답변: nan',
 '### 질문: 너 친한 친구 한명 소개해 줘 ### 답변: nan',
 '### 질문: 로라야 너 좋아하는 취미 있어? ### 답변: nan',
 '### 질문: 네가 좋아하는 음식 알려줘 ### 답변: nan']

In [23]:
# response_template = '### 답변: '
# response_template = tokenizer.encode(response_template, add_special_tokens=False)
response_template = [6, 6, 4253, 29]

collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# for i in range(10):
#     print(tokenizer.encode(dataset['train']['text'][i]))

In [24]:
fine_tuning_test_result_dict = {'lora_rank': [], 'elapsed_time': []}

In [25]:
# run Fine-Tuning test

for lora_rank in [16, 32, 64]:
    lora_llm = get_lora_llm(lora_rank)

    # default dataset_text_field name is "text", ref: https://github.com/unslothai/unsloth/issues/1264#issuecomment-2571359661
    # max_seq_length (최대 token 개수) has been removed from latest versions of trl
    trainer = SFTTrainer(
        lora_llm,
        train_dataset=dataset['train'],
        eval_dataset=dataset['valid'],
        processing_class=tokenizer,     # LLM tokenizer / renamed : tokenizer -> processing_class from trl 0.12.0
        args=training_args,
        data_collator=collator
    )

    start_at = time.time()
    trainer.train()
    elapsed_time = time.time() - start_at

    used_memory = torch.cuda.memory_allocated() / (1024 * 1024)
    print(f'used_memory (LoRA rank = {lora_rank}) : {used_memory:.2f} MB')

    fine_tuning_test_result_dict['lora_rank'].append(lora_rank)
    fine_tuning_test_result_dict['elapsed_time'].append(elapsed_time)

    trainer.save_model('model_output')

trainable params: 3,145,728 || all params: 1,334,640,640 || trainable%: 0.2357


Converting train dataset to ChatML:   0%|          | 0/20 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/5 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,3.2297
2,2.7435
3,3.7658
4,2.8895
5,3.3959
6,3.0983
7,2.6206
8,2.5276
9,2.6566
10,2.017


used_memory (LoRA rank = 16) : 2593.48 MB
trainable params: 6,291,456 || all params: 1,337,786,368 || trainable%: 0.4703


Converting train dataset to ChatML:   0%|          | 0/20 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/5 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,3.2297
2,2.8347
3,3.9293
4,3.0269
5,3.652
6,3.3735
7,2.863
8,2.7464
9,2.8468
10,2.2547


used_memory (LoRA rank = 32) : 2629.48 MB
trainable params: 12,582,912 || all params: 1,344,077,824 || trainable%: 0.9362


Converting train dataset to ChatML:   0%|          | 0/20 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/5 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/5 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
1,3.2297
2,2.8673
3,4.0254
4,3.1172
5,3.842
6,3.5718
7,3.0686
8,2.915
9,3.0019
10,2.4498


used_memory (LoRA rank = 64) : 2714.60 MB


In [26]:
# save Fine-Tuning test result

fine_tuning_test_result_df = pd.DataFrame(fine_tuning_test_result_dict)

llm_name = llm_path.split('/')[-1]
fine_tuning_test_result_df.to_csv(f'fine_tuning_test_result_{llm_name}.csv')

In [27]:
fine_tuning_test_result_df

Unnamed: 0,lora_rank,elapsed_time
0,16,23.683117
1,32,24.545795
2,64,28.366278
