In [None]:
# 先清舊件（沒有也沒關係）
!pip uninstall -y transformers trl peft accelerate datasets pyarrow fsspec gcsfs || true
!pip uninstall -y cudf-cu12 pylibcudf-cu12 dask-cudf-cu12 cuml-cu12 cugraph-cu12 bigframes || true
!pip cache purge

In [None]:
# 安裝可在 CPU 跑、彼此相容的一組
!pip install -U --no-cache-dir \
  "pyarrow==21.0.0" "fsspec==2024.5.0" "gcsfs==2024.5.0" \
  "transformers==4.57.1" "peft==0.13.2" "accelerate==1.4.0" "datasets==4.3.0"

In [None]:
import transformers, datasets, peft, accelerate

# Report the installed version of each core library as one "name: version" line.
for label, lib in (
    ("transformers", transformers),
    ("datasets", datasets),
    ("peft", peft),
    ("accelerate", accelerate),
):
    print(f"{label}:", lib.__version__)

In [None]:
import os, random, json
os.makedirs("/content/data", exist_ok=True)

# 做一個玩具語言：SunnyLang（可改成你的真實文本）
base_sent = [
  "mola toki suna ~ greet friend gently .",
  "suna kala numeri : one two three four five .",
  "mola think chain step by step then answer .",
  "define term : cross validation split folds average score .",
  "suna code style : short answer precise .",
]
# 擴增 400 行
lines = []
for i in range(400):
    t = random.choice(base_sent)
    lines.append(f"[S{1000+i}] {t}")

with open("/content/data/continued_corpus.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(lines))

# 每行一筆 text，用於 causal LM（無標註）
with open("/content/data/continued_corpus.jsonl", "w", encoding="utf-8") as f:
    for s in lines:
        f.write(json.dumps({"text": s}, ensure_ascii=False) + "\n")

!wc -l /content/data/continued_corpus.jsonl
!head -n 3 /content/data/continued_corpus.jsonl

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"  # small, CPU-friendly

# trust_remote_code is intentionally left at its default: enabling it allows
# Python code hosted in the model repository to run locally.
# NOTE(review): this checkpoint is expected to load with the classes built into
# transformers; if loading fails asking for remote code, review that code first.
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# Batching needs a pad token; fall back to EOS when the tokenizer defines none.
if tok.pad_token is None:
    tok.pad_token = tok.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float32,          # CPU: use FP32 for stability
    device_map={"": "cpu"},             # place the whole model on the CPU
)
# Keep the model config's special-token ids in sync with the tokenizer.
model.config.pad_token_id = tok.pad_token_id
model.config.eos_token_id = tok.eos_token_id
print("Loaded.")

In [None]:
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling

# Upper bound on tokens per example; passed to the tokenizer as max_length.
MAX_LEN = 256

# Read the JSONL corpus file as the "train" split.
ds = load_dataset(
    "json",
    data_files={"train": "/content/data/continued_corpus.jsonl"},
)
train_ds = ds["train"]
def tokenize(batch):
    """Tokenize the "text" entries of a batch, truncating to MAX_LEN tokens.

    Args:
        batch: Mapping with a "text" key, as passed by a batched dataset map.

    Returns:
        Whatever the notebook-level tokenizer ``tok`` returns for those texts.
    """
    return tok(batch["text"], max_length=MAX_LEN, truncation=True)

# Tokenize in batches and drop the original columns so only tokenizer output remains.
tok_ds = train_ds.map(
    tokenize,
    remove_columns=train_ds.column_names,
    batched=True,
)
# Causal LM objective, hence mlm=False.
collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tok)
# Preview: number of examples and the first tokenized record.
(len(tok_ds), tok_ds[0])

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings for the causal-LM task: rank 4, alpha 16, dropout 0.1,
# bias left untouched, adapters attached to the listed projection modules.
lora_cfg = LoraConfig(
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",   # q/k/v/o projections
        "gate_proj", "up_proj", "down_proj",       # gate/up/down projections
    ],
    task_type=TaskType.CAUSAL_LM,
)
# Wrap the loaded model with the adapters and report how many parameters train.
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the LoRA run; checkpoints go to output_dir.
args = TrainingArguments(
    output_dir="/content/cpt_lora_out",
    # Batching: 1 example per step x 8 accumulation steps = effective batch size 8.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Schedule: start with 200 steps (150/300 are reasonable alternatives).
    max_steps=200,
    learning_rate=5e-5,
    warmup_ratio=0.05,
    weight_decay=0.0,
    # Logging and checkpointing cadence; keep only one checkpoint on disk.
    logging_steps=10,
    save_steps=100,
    save_total_limit=1,
    report_to=[],
    # CPU run: mixed precision stays off.
    bf16=False,
    fp16=False,
)

trainer = Trainer(
    data_collator=collator,
    train_dataset=tok_ds,
    args=args,
    model=model,
)

trainer.train()

In [None]:
save_dir = "/content/cpt_lora_smolm2_cpu"
model.save_pretrained(save_dir)
tok.save_pretrained(save_dir)
print("Saved to:", save_dir)
!ls -l /content/cpt_lora_smolm2_cpu

In [None]:
import torch
# Switch the model to evaluation mode before running generation below.
model.eval()

def gen(prompt):
    """Greedily complete ``prompt`` and return only the newly generated text.

    The prompt is wrapped in the "### Instruction / ### Input / ### Response"
    template, tokenized with the notebook-level ``tok`` and completed by the
    notebook-level ``model`` (no sampling, at most 64 new tokens).

    Args:
        prompt: User text inserted into the "### Input" section.

    Returns:
        The decoded continuation, special tokens removed and whitespace stripped.
    """
    text = f"### Instruction:\nYou are a helpful assistant. Answer briefly.\n\n### Input:\n{prompt}\n\n### Response:\n"
    inputs = tok(text, return_tensors="pt")
    # The model may no longer be on the CPU (e.g. a trainer can move it to an
    # accelerator), so put the inputs wherever the model's weights are.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    prompt_len = inputs["input_ids"].shape[1]
    with torch.no_grad():
        out = model.generate(
            **inputs, max_new_tokens=64,
            do_sample=False, temperature=None, top_p=None,
            pad_token_id=tok.pad_token_id, eos_token_id=tok.eos_token_id
        )
    # Decode only the tokens after the prompt. Splitting the decoded text on
    # "### Response:" would drop part of the answer if the model echoes the marker.
    return tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()

# Smoke-test generation on two SunnyLang prompts, with a "---" line between the answers.
demo_prompts = (
    "Translate to SunnyLang: 'hello friend'",
    "Count to five in SunnyLang.",
)
for idx, demo_prompt in enumerate(demo_prompts):
    if idx:
        print("---")
    print(gen(demo_prompt))