In [None]:
# Load the pretrained tokenizer and causal-LM weights.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Device the freshly loaded model is placed on: "cuda" for GPU, "cpu" for CPU.
device = "cpu"

# Hub checkpoint to fine-tune; alternative checkpoints are left commented out.
checkpoint = "HuggingFaceTB/SmolLM-360M-Instruct"
# checkpoint = "HuggingFaceTB/SmolLM-1.7B-Instruct"
# checkpoint = "microsoft/Phi-3-mini-4k-instruct"

# Tokenizer and weights share the same local download cache.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    cache_dir="./.cache",
)

# For multiple GPUs, install accelerate and load with
# `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")` instead.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    cache_dir="./.cache",
)
model = model.to(device)

In [2]:
from datasets import load_dataset

# Load only the "mistake_short" split; downloaded files are cached under .cache/datasets.
raw_data = load_dataset(
    path="ryan98153/remove-mistake",
    split="mistake_short",
    cache_dir=".cache/datasets",
)

In [None]:
from typing import Dict, Any

def func(example: Dict[str, Any]) -> Dict[str, Any]:
    """Build the training text for one dataset row.

    Args:
        example: A row mapping that contains at least the keys
            ``"article"`` and ``"answer"``.

    Returns:
        A one-key dict ``{"text": ...}`` whose value is the article followed
        by a newline and then the answer.
    """
    article = example["article"]
    answer = example["answer"]
    combined = f"{article}\n{answer}"
    return {"text": combined}

# Dataset.remove_columns returns a new dataset instead of modifying raw_data in
# place, so the former bare `raw_data.remove_columns([...])` call, whose result
# was thrown away, had no effect and has been dropped. raw_data keeps every
# original column.
#
# Add the combined "text" column, then drop all of the listed source columns.
dataset = raw_data.map(func).remove_columns(
    ['id', 'topic', 'article', 'answer', 'mistake', 'hint', 'marked_article']
)

In [None]:
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize a batch of rows; intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose ``"text"`` entry holds the strings to encode.

    Returns:
        The tokenizer output, with every sequence truncated or padded to
        exactly 400 tokens.
    """
    # No return_tensors here: Dataset.map stores the result in Arrow format,
    # so building framework tensors for each batch would only be converted
    # straight back.
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=400)

# Replace the "text" column with the tokenizer's output columns.
tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
tokenized_dataset

In [None]:
# Hold out 5% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so evaluation numbers from different runs
# refer to the same held-out rows.
# NOTE: this cell rebinds tokenized_dataset to the result of the split, so it
# is meant to be run once per kernel session, after the tokenization cell.
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.05, seed=42)
tokenized_dataset

In [None]:
# Decode the EOS token id back into text to see which string it corresponds to.
eos_string = tokenizer.decode(token_ids=[tokenizer.eos_token_id])
eos_string

In [7]:
from transformers import DataCollatorForLanguageModeling

# Stock language-modeling collator with masked-LM mode switched off.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Collate the first five training rows and print the shape of each batch entry.
out = data_collator([tokenized_dataset["train"][row] for row in range(5)])
for key, value in out.items():
    print(f"{key} shape: {value.shape}")

In [9]:
class CustomDataCollatorForLanguageModeling(DataCollatorForLanguageModeling):
    """Language-modeling collator that keeps one EOS target after each sequence.

    The parent collator marks positions it wants the loss to ignore with the
    label -100. This subclass turns a -100 label back into the EOS token id
    only when it directly follows a position that still carries a real label,
    i.e. the first ignored position after the text. The model is therefore
    trained to emit EOS once the text ends, while the remainder of the padding
    tail stays at -100 and does not contribute to the loss.

    Previously every -100 label was rewritten to EOS, which made each padding
    position a training target and let padding dominate the averaged loss.
    """

    def __call__(self, examples):
        """Collate ``examples`` and restore the end-of-text EOS labels.

        Args:
            examples: The list of tokenized rows accepted by the parent collator.

        Returns:
            The parent's batch, with ``batch['labels']`` adjusted as described
            in the class docstring.
        """
        batch = super().__call__(examples)
        labels = batch['labels']
        eos_token_id = self.tokenizer.eos_token_id

        # True where the parent asked the loss to skip the position.
        ignored = labels.eq(-100)

        # True where the previous position in the same row has a real label.
        # Column 0 has no predecessor and therefore stays False.
        follows_real_token = ignored.new_zeros(ignored.shape)
        follows_real_token[:, 1:] = ~ignored[:, :-1]

        # Only the ignored position right after real text becomes an EOS target.
        labels[ignored & follows_real_token] = eos_token_id

        batch['labels'] = labels
        return batch

data_collator = CustomDataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [None]:
# Collate the first five training rows again with the custom collator and
# display the label tensor of the first row.
first_rows = [tokenized_dataset["train"][row] for row in range(5)]
out = data_collator(first_rows)
out["labels"][0]

In [None]:
import torch
from transformers import Trainer, TrainingArguments


args = TrainingArguments(
    output_dir="SmolLM",
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    eval_strategy="steps",
    eval_steps=250,
    # Gradients from 8 batches are accumulated before each optimizer step.
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    weight_decay=0.1,
    warmup_steps=50,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    save_steps=500,
    # fp16 mixed precision needs a CUDA device; requesting it unconditionally
    # makes TrainingArguments raise on a CPU-only machine, which is what the
    # notebook's `device = "cpu"` setting targets. Enable it only when CUDA is
    # actually available.
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

trainer.train()

In [None]:
trained_model = trainer.model
# Inference only from here on: put the model in evaluation mode.
trained_model.eval()

# Prompt with the article of the first raw row.
# Hand-written alternative: prompt = "Small models are great.\n"
prompt = raw_data[0]['article']
# Example input/output pair kept from the original notes:
#   Small models are great.
#   {"Small": "models", "are": "great"}

# Put the prompt on whatever device the trained model lives on. After training
# the model is not guaranteed to still be on the notebook-level `device`
# (Trainer places it on its own training device), and a mismatch makes
# generate() fail.
input_ids = tokenizer.encode(
    prompt,
    return_tensors="pt",
    add_special_tokens=False,
).to(trained_model.device)

generated_ids = trained_model.generate(
    input_ids,
    max_new_tokens=30,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# generate() returns the prompt plus the continuation; decode the first (only) sequence.
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)



In [14]:
# Authenticate with the Hugging Face Hub from inside the notebook; the call
# renders an interactive login widget as the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# Name the Hub repository after the checkpoint that was actually fine-tuned.
# The previous hard-coded name said "135M" although `checkpoint` points at a
# different model size, which would publish the weights under a misleading name.
# Requires `trained_model` and `checkpoint` from the earlier cells, so run this
# only after the training cells in the same kernel session.
repo_name = f"{checkpoint.split('/')[-1]}-fine-tuned"
trained_model.push_to_hub(repo_name)
# Local alternative:
# trained_model.save_pretrained(f"./train/{repo_name}")

NameError: name 'trained_model' is not defined