In [1]:
!pip install --upgrade pip
!pip install unsloth[cuda-full]
!pip install datasets accelerate peft

Collecting pip
  Downloading pip-25.3-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.3-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.3
Collecting unsloth[cuda-full]
  Downloading unsloth-2025.12.5-py3-none-any.whl.metadata (65 kB)
[0mCollecting unsloth_zoo>=2025.12.4 (from unsloth[cuda-full])
  Downloading unsloth_zoo-2025.12.4-py3-none-any.whl.metadata (32 kB)
Collecting tyro (from unsloth[cuda-full])
  Downloading tyro-1.0.1-py3-none-any.whl.metadata (11 kB)
Collecting xformers>=0.0.27.post2 (from unsloth[cuda-full])
  Downloading xformers-0.0.33.post2-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (1.2 kB)
Collecting bitsandbytes!=0.46.0,!=0.48.0,>=0.45.5 (from unsloth[cu

In [4]:
import torch
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments

# QLoRA (4bit Quantization) でモデルをロード
model,tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit", # 使用したいモデル
    max_seq_length = 2048, # 最大シーケンス長
    dtype = None,
    load_in_4bit = True, # 4ビット量子化を有効化 (QLoRA)
)

# LoRAアダプターの設定
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # LoRAのランク。高いほど表現力が上がるがメモリ消費も増える
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16, # スケーリング係数
    lora_dropout = 0, # ドロップアウト率
    bias = "none",
    use_gradient_checkpointing = "unsloth", # メモリ効率化
    random_state = 3407,
    # use_input_packing = False,
    max_seq_length = 2048,
)

==((====))==  Unsloth 2025.12.5: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.12.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [9]:
from unsloth import FastLanguageModel
from datasets import load_dataset
import torch

# LLaMA 3.1 8B (約80億) よりも小さい Gemma 2B (約20億) に変更
model_name = "unsloth/gemma-2b-bnb-4bit"

# torch.bfloat16 (BFLOAT16) を使用できるか確認
# BFLOAT16はFP16よりも安定しており、メモリ効率が良い (T4 GPUで利用可能)
# 確保できない場合は torch.float16 にフォールバック
dtype = torch.bfloat16
if torch.cuda.is_available():
    if not torch.cuda.is_bf16_supported():
        dtype = torch.float16

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=512,
    dtype=dtype,             # 最適なデータ型を選択
    load_in_4bit=True,       # QLoRAを強制
    # SFTTrainerで使用する場合、use_gradient_checkpointing='unsloth' を後で設定することで、
    # さらにメモリを節約できますが、ロード時点では不要です。
)

# データセットの読み込み
data = load_dataset("yahma/alpaca-cleaned")
print("Dataset loaded successfully.")

==((====))==  Unsloth 2025.12.5: Fast Gemma patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.07G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/154 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Dataset loaded successfully.


In [10]:
# ① まず、全データのうち 10% だけを使用（90% を破棄）
small_data = data["train"].train_test_split(test_size=0.9, seed=42)["train"]

# ② small_data を Train+Val と Test に分割（例：Test に 10% を割り当てる）
split1 = small_data.train_test_split(test_size=0.1, seed=42)
train_val_data = split1["train"]  # 全体の 90%（small_data のうち）
test_data = split1["test"]        # 全体の 10%（small_data のうち）

# ③ train_val_data をさらに Train と Validation に分割（例：Validation に 10% を割り当てる）
split2 = train_val_data.train_test_split(test_size=0.1, seed=42)
train_data = split2["train"]  # train_val_data の 90%（全体で約 81%）
eval_data = split2["test"]     # train_val_data の 10%（全体で約 9%）

# ④ テンプレートを適用する関数の定義
def format_examples(example):
    instr = example["instruction"]
    inp = example["input"] if example["input"] else ""
    # テンプレート形式の文字列を作成
    example["text"] = (
        f"### Instruction:\n{instr}\n"
        f"### Input:\n{inp}\n"
        f"### Response:\n{example['output']}"
    )
    return example

# ⑤ 各データセットにフォーマットを適用
train_data = train_data.map(format_examples)
eval_data = eval_data.map(format_examples)
test_data = test_data.map(format_examples)

# ⑥ 変換後の例を確認（最初の 200 文字を表示）
print(train_data[0]["text"][:200])



Map:   0%|          | 0/4192 [00:00<?, ? examples/s]

Map:   0%|          | 0/466 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

### Instruction:
Find the next 3 terms in the following sequence: 4, 6, 9, 12
### Input:

### Response:
The next three terms in the sequence are: 15, 18, 21.


In [None]:
# トークナイズを行う関数を定義
def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=512)

# バッチ処理で高速化しつつトークナイズを適用
train_tokens = train_data.map(tokenize_batch, batched=True, remove_columns=["instruction","input","output","text"])
eval_tokens = eval_data.map(tokenize_batch, batched=True, remove_columns=["instruction","input","output","text"])

# トークナイズ後のデータ例を確認
print(train_tokens[0].keys())
# 出力例: dict_keys(['input_ids', 'attention_mask'])



LoRA適用: FastLanguageModel.get_peft_model を使って、現在のモデルにLoRAを組み込みました。またtarget_modulesではLLaMAモデル内のどの部分にLoRAを適用するかを指定しています。これにより、元のモデルは凍結されたまま、追加のLoRA層のみが学習対象となります。実質学習するパラメータ数はモデル全体のごく一部であり、メモリと計算の節約になります。

DataCollator: DataCollatorForLanguageModeling は言語モデル用のデータコラレータで、バッチ内の系列をパディングし、必要に応じて入力をそのままラベルとして複製してくれます。モデルの予測対象となるlabelsを自動的に生成できます。

TrainingArguments: TransformersのTrainingArgumentsで学習の細かな設定を行います。主要なものだけピックアップすると、per_device_train_batch_size=2（バッチサイズ2）、num_train_epochs=1（エポック数1）としてあります。実行時間の制約も考え1エポックに留めています。gradient_checkpointing=Trueで勾配チェックポイントを有効化しメモリ節約を図っています。また、fp16=Trueにより16-bit精度で学習を行います。

学習の実行: trainer.train() を呼ぶと実際に学習がスタートします。時間に制約がある場合は、データの一部のみで試すかステップ数を減らすなどして調整してください。

In [14]:
from unsloth import FastLanguageModel
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# 1. モデルにLoRAアダプタを挿入して学習モードにする
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none"
)
# LoRA適用後、学習可能パラメータ数を確認（任意）
model.print_trainable_parameters()

# 2. データコラレータ（バッチ処理時のデータ整形）を用意
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 3. 学習時の各種ハイパーパラメータを設定
training_args = TrainingArguments(
    output_dir="./llama-unsloth-model",
    per_device_train_batch_size=2,
    num_train_epochs=1, # 必要に応じて増やしてください
    learning_rate=2e-4,
    fp16=True,                         # 16ビット精度で計算（A100等のGPUではbf16推奨）
    gradient_checkpointing=True,       # 勾配チェックポイントでメモリ節約
    logging_steps=50,
    save_steps=200,
    # evaluation_strategy="epoch",       # 1エポックごとに評価
    save_total_limit=1,
)
# 4. Trainerオブジェクトの初期化（モデル・データ・設定の紐付け）
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_tokens,
    eval_dataset=eval_tokens,
    data_collator=data_collator,
    args=training_args
)

# 5. 学習の実行
trainer.train()



Unsloth: Already have LoRA adapters! We shall skip this step.
  trainer = Trainer(


trainable params: 19,611,648 || all params: 2,525,784,064 || trainable%: 0.7765


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 4,192 | Num Epochs = 1 | Total steps = 2,096
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2
 "-____-"     Trainable parameters = 19,611,648 of 2,525,784,064 (0.78% trained)
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
50,2.6355
100,1.3967
150,1.3558
200,1.3112
250,1.362
300,1.3195
350,1.2989
400,1.2381
450,1.2174
500,1.3331


Unsloth: Will smartly offload gradients to save VRAM!


TrainOutput(global_step=2096, training_loss=1.305524711390488, metrics={'train_runtime': 1202.3826, 'train_samples_per_second': 3.486, 'train_steps_per_second': 1.743, 'total_flos': 1.2374577661034496e+16, 'train_loss': 1.305524711390488, 'epoch': 1.0})

In [17]:

# 推論用のプロンプトを準備
instruction = "Explain the importance of sleep in simple terms."
input_text = ""  # 追加の入力がない場合は空文字
prompt = f"Instruction: \
{instruction} \
Input:{input_text} \
Response: \
"

# トークナイズしてモデルに入力
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# モデルによるテキスト生成
output_ids = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
# トークン列を文字列にデコード
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)



### Instruction: Explain the importance of sleep in simple terms. Input: Response: 
Sleep is an essential part of our daily routine and is vital for our overall health and well-being. It helps us to recover from physical and mental exhaustion, improves our cognitive abilities, and promotes overall health and wellness. It is important to get enough sleep every night, as it allows our bodies to rest and repair themselves, and helps us to be more productive and focused during the day. Without enough sleep, we can experience fatigue, irritability, and difficulty concentrating, which can have a negative impact


In [19]:
# 推論用のプロンプトを準備
instruction = "What supports the U.S. economy?"
input_text = ""  # 追加の入力がない場合は空文字
prompt = f"Instruction: \
{instruction} \
Input:{input_text} \
Response: \
"

# トークナイズしてモデルに入力
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# モデルによるテキスト生成
output_ids = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
# トークン列を文字列にデコード
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Instruction: What supports the U.S. economy? Input: Response: 1. The U.S. economy is supported by a diverse range of industries, including agriculture, manufacturing, technology, healthcare, and finance. 2. The government provides support through tax incentives, infrastructure investments, and other programs that encourage businesses to invest and grow. 3. The U.S. has a strong labor force, with a high level of education and training, which allows businesses to hire and retain skilled workers. 4. The U.S. has a strong financial system


In [21]:
# 推論用のプロンプトを準備
instruction = "What were the causes of the American Depression?"
input_text = ""  # 追加の入力がない場合は空文字
prompt = f"Instruction: \
{instruction} \
Input:{input_text} \
Response: \
"

# トークナイズしてモデルに入力
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# モデルによるテキスト生成
output_ids = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
# トークン列を文字列にデコード
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Instruction: What were the causes of the American Depression? Input: Response: 1. The Great Depression was caused by the stock market crash of 1929, which led to a severe economic downturn. 2. The Great Depression was also caused by the failure of the banking system, which led to a loss of confidence in the economy. 3. The Great Depression was also caused by the loss of jobs and the loss of income, which led to a decrease in consumer spending and a decrease in demand for goods and services. 4. The Great Depression was


In [22]:
# 推論用のプロンプトを準備
instruction = "Write code to calculate prime numbers."
input_text = ""  # 追加の入力がない場合は空文字
prompt = f"Instruction: \
{instruction} \
Input:{input_text} \
Response: \
"

# トークナイズしてモデルに入力
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# モデルによるテキスト生成
output_ids = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
# トークン列を文字列にデコード
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Instruction: Write code to calculate prime numbers. Input: Response: 

<code>def is_prime(number):
    if number < 2:
        return False
    for i in range(2, number):
        if number % i == 0:
            return False
    return True

def main():
    number = int(input("Enter a number: "))
    if is_prime(number):
        print(f"{number} is a prime number.")
    else:
        print(f"{number


In [27]:
# 推論用のプロンプトを準備
instruction = "Summarize the GDPs of the United States, Japan, and China with a table."
input_text = ""  # 追加の入力がない場合は空文字
prompt = f"Instruction: \
{instruction} \
Input:{input_text} \
Response: \
"

# トークナイズしてモデルに入力
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# モデルによるテキスト生成
output_ids = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
# トークン列を文字列にデコード
generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(generated_text)

Instruction: Summarize the GDPs of the United States, Japan, and China with a table. Input: Response: 

The GDP of the United States in 2021 was $20.4 trillion, Japan was $5.1 trillion, and China was $14.6 trillion. 

The GDP of the United States in 2021 was $20.4 trillion, Japan was $5.1 trillion, and China was $14.6 trillion. 

The GDP of the United States in 2021 was $20.4 trillion
