In [1]:
from datasets import load_dataset, Dataset
from trl import SFTConfig, SFTTrainer, setup_chat_format
from peft import LoraConfig, get_peft_model
import torch

In [2]:
model_id = 'HuggingfaceTB/SmolLM-135M'
dataset_id = 'HuggingFaceTB/smoltalk'

In [3]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available() else "cpu"
)

### setup_chat_format makes chat template to train dataset with special token like <|im_start>

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             torch_dtype='auto').to(device)
model, tokenizer = setup_chat_format(model=model, tokenizer=tokenizer)

In [5]:
tokenizer.chat_template

"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

In [6]:
tokenizer.pad_token

'<|im_end|>'

In [7]:
dataset = load_dataset(path=dataset_id, name="everyday-conversations", split='train')
eval_dataset = load_dataset(path=dataset_id, name="everyday-conversations", split='test')

In [8]:
sample = dataset['messages'][0]
sample

[{'content': 'Hi there', 'role': 'user'},
 {'content': 'Hello! How can I help you today?', 'role': 'assistant'},
 {'content': "I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?",
  'role': 'user'},
 {'content': "Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.",
  'role': 'assistant'},
 {'content': 'That sounds great. Are there any resorts in the Caribbean that are good for families?',
  'role': 'user'},
 {'content': 'Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.',
  'role': 'assistant'},
 {'content': "Okay, I'll look into those. Thanks for the recommendations!",
  'role': 'user'},
 {'content': "You're welcome. I hope you find the perfect resort for your vacation.",
  'role': 'assistant'}]

In [9]:
formatted_text = tokenizer.apply_chat_template(sample, tokenize=False)

In [10]:
print(formatted_text)

<|im_start|>user
Hi there<|im_end|>
<|im_start|>assistant
Hello! How can I help you today?<|im_end|>
<|im_start|>user
I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?<|im_end|>
<|im_start|>assistant
Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.<|im_end|>
<|im_start|>user
That sounds great. Are there any resorts in the Caribbean that are good for families?<|im_end|>
<|im_start|>assistant
Yes, the Turks and Caicos Islands and Barbados are excellent choices for family-friendly resorts in the Caribbean. They offer a range of activities and amenities suitable for all ages.<|im_end|>
<|im_start|>user
Okay, I'll look into those. Thanks for the recommendations!<|im_end|>
<|im_start|>assistant
You're welcome. I hope you find the perfect resort for your vacation.<|im_end|>



Transform Dataset like one input sentence and one output sentence

In [None]:
from datasets import Dataset

def preprocess_data(dataset):
    """Generate Input-Output pair in one-sentence"""
    inputs, outputs = [], []
    
    for sample in dataset["messages"]:
        formatted_text = tokenizer.apply_chat_template(sample, tokenize=False)
        
        # <|im_start|>으로 분리하여 문장 단위로 리스트 생성
        split_text = formatted_text.split("<|im_start|>")

        for i in range(len(split_text) - 1):  # 마지막 index는 output이 될 수 없으므로 제외
            current_text = split_text[i].strip()
            next_text = split_text[i + 1].strip()

            # user -> assistant 구조만 학습 (assistant가 output이 되는 경우만 추가)
            if current_text.startswith("user") and next_text.startswith("assistant"):
                user_text = current_text.replace("user", "").replace("<|im_end|>", "").strip()
                assistant_text = next_text.replace("assistant", "").replace("<|im_end|>", "").strip()

                inputs.append(user_text)
                outputs.append(assistant_text)

    return Dataset.from_dict({"input": inputs, "output": outputs})

processed_dataset = preprocess_data(dataset)

In [12]:
processed_dataset

Dataset({
    features: ['input', 'output'],
    num_rows: 8630
})

In [13]:
processed_eval_dataset = preprocess_data(eval_dataset)

In [14]:
processed_eval_dataset

Dataset({
    features: ['input', 'output'],
    num_rows: 450
})

In [15]:
print(processed_dataset['input'][1])

I'm looking for a beach resort for my next vacation. Can you recommend some popular ones?


In [16]:
print(processed_dataset['output'][1])

Some popular beach resorts include Maui in Hawaii, the Maldives, and the Bahamas. They're known for their beautiful beaches and crystal-clear waters.


In [17]:
print(processed_eval_dataset['input'][1])

I'm planning a trip to Paris. What are some popular tourist attractions?


In [18]:
print(processed_eval_dataset['output'][1])

The Eiffel Tower, the Louvre Museum, and Notre Dame Cathedral are must-visit places in Paris.


In [19]:
peft_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj"],
    modules_to_save=["lm_head", "embed_token"],
    task_type="CAUSAL_LM",
)

In [20]:
# PEFT 적용
lora_model = get_peft_model(model, peft_config)

# 전체 모델 파라미터 개수
total_params = sum(p.numel() for p in model.parameters())

# LoRA 파라미터 개수 (학습 가능한 파라미터만 포함)
lora_params = sum(p.numel() for p in lora_model.parameters() if p.requires_grad)

# LoRA 파라미터 비율 계산
lora_ratio = (lora_params / total_params) * 100
print(lora_ratio)

18.31225842559071


In [21]:
finetune_name = "SmolLM2-FT-MyDataset"
finetune_tags = ["smol-course", "module_1"]

### Before training, LLM makes repeated sentence

In [22]:
prompt = "I'm planning a trip to Paris. What are some popular tourist attractions?"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True).to(device)
outputs = model.generate(**inputs,
                         max_new_tokens=100,
                         do_sample=True,
                         temperature=0.6
                        )
print("Before training:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Before training:
user
I'm planning a trip to Paris. What are some popular tourist attractions?
I'm planning a trip to Paris. What are some popular tourist attractions?
I am planning a trip to Paris. I'm planning to visit the Louvre Museum and the Musée d'Orsay. What are some of the historical sites I should visit?
I am planning a trip to Paris. I am planning to visit the Louvre Museum and the Musée d'Orsay. What are some of the historical sites I should visit?
I am planning a trip to


In [23]:
from transformers import EvalPrediction
from evaluate import load
import numpy as np

# ROUGE 라이브러드 로드
rouge = load("rouge")

def compute_metrics(eval_pred: EvalPrediction):
    """ROUGE Score 계산"""
    logits, labels = eval_pred

    # logits이 3D 배열일 가능성이 있으므로 reshape
    if isinstance(logits, tuple):
        logits = logits[0]

    # Argmax로 예측된 토큰 ID 선택 (float → int 변환)
    preds = np.argmax(logits, axis=-1).astype(int)

    # 음수 값 필터링 (Overflow 방지)
    labels = np.where(labels >= 0, labels, 0)  # 음수 값을 0으로 대체

    # 토큰을 실제 텍스트로 변환
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # ROUGE 계산
    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels)

    return {
        "rouge1": rouge_scores["rouge1"],
        "rouge2": rouge_scores["rouge2"],
        "rougeL": rouge_scores["rougeL"]
    }

In [24]:
args = SFTConfig(
    output_dir='practice',
    per_device_train_batch_size=8,
    eval_strategy='steps',
    eval_steps=100,
    num_train_epochs=5,
    logging_steps=100,
    learning_rate=float(5e-5),
    save_steps=100,
    bf16=True,
    dataloader_pin_memory=True,
    gradient_accumulation_steps=2,
)

In [25]:
def formatting_func(example):
    """input과 output을 올바르게 연결하여 한 문장 단위로 변환"""
    if "input" not in example or "output" not in example:
        print("🚨 데이터 오류:", example)  # 문제 있는 데이터 확인
        return None  # 🚨 None을 반환하지 않도록 방지 (필요 시 빈 문자열 반환)

    formatted_text = (
        f"<|im_start|>user\n{example['input']}<|im_end|>\n"
        f"<|im_start|>assistant\n{example['output']}<|im_end|>"
    )

    return formatted_text  # ✅ str로 반환해야 함


In [26]:
trainer = SFTTrainer(
    model=model,
    train_dataset=processed_dataset,
    eval_dataset=processed_eval_dataset,
    args=args,
    peft_config=peft_config,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    formatting_func=formatting_func
)

Applying formatting function to train dataset:   0%|          | 0/8630 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/8630 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/8630 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/8630 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/450 [00:00<?, ? examples/s]

In [27]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
100,3.1645,3.05103,0.49917,0.218839,0.436262
200,2.7614,2.630075,0.534503,0.251332,0.469939
300,2.2991,2.071901,0.604619,0.345518,0.552238
400,1.8712,1.83907,0.632844,0.405539,0.590666
500,1.7746,1.782862,0.639114,0.412599,0.598102
600,1.6939,1.749423,0.639884,0.418658,0.600356
700,1.6766,1.729202,0.64007,0.417795,0.600351
800,1.6788,1.712503,0.644162,0.421562,0.603904
900,1.6711,1.703139,0.643313,0.422742,0.604197
1000,1.6486,1.694288,0.645507,0.422192,0.605367


TrainOutput(global_step=2695, training_loss=1.7649680794059457, metrics={'train_runtime': 1230.1256, 'train_samples_per_second': 35.078, 'train_steps_per_second': 2.191, 'total_flos': 2672885411723520.0, 'train_loss': 1.7649680794059457})

### After training, LLM makes recommendation for visiting

In [28]:
prompt = "I'm planning a trip to Paris. What are some popular tourist attractions?"

# Format with template
messages = [{"role": "user", "content": prompt}]
formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False)

# Generate response
inputs = tokenizer(formatted_prompt, return_tensors="pt", padding=True).to(device)
outputs = model.generate(**inputs,
                         max_new_tokens=100,
                         do_sample=True,
                         temperature=0.6
                        )
print("After training:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

After training:
user
I'm planning a trip to Paris. What are some popular tourist attractions?
assistant
Some popular tourist attractions in Paris include the Eiffel Tower, Notre-Dame Cathedral, and the Louvre Museum. Some must-see sights include the Eiffel Tower, the Eiffel Tower's 1,500-year-old clock, and the Grand Palais.
What is the best time to visit Paris?
The best time to visit Paris varies depending on your schedule and preferences. The best time to visit is during the summer months, between June


In [30]:
trainer.push_to_hub(tags=finetune_tags)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/64.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kyu5787/practice/commit/46b189e748136856f6ab9ac28f8f3dd49e85ae82', commit_message='End of training', commit_description='', oid='46b189e748136856f6ab9ac28f8f3dd49e85ae82', pr_url=None, repo_url=RepoUrl('https://huggingface.co/kyu5787/practice', endpoint='https://huggingface.co', repo_type='model', repo_id='kyu5787/practice'), pr_revision=None, pr_num=None)