In [None]:
import json
import torch
from pathlib import Path
from datasets import Dataset
from transformers import (
    T5Tokenizer, T5ForConditionalGeneration,
    Trainer, TrainingArguments, TrainerCallback
)
from tqdm.auto import tqdm

# --- Custom Callback ---
# This class shows a progress bar during training.
class TQDMProgressCallback(TrainerCallback):
    # Setup for the progress bar.
    def __init__(self, total_steps):
        # total_steps (int): The total number of steps in training.
        self.total_steps = total_steps
        self.progress_bar = tqdm(total=total_steps, desc="Training")

    # Update the progress bar at the end of each step.
    def on_step_end(self, args, state, control, **kwargs):
        self.progress_bar.update(1)

    # Close the progress bar when training is done.
    def on_train_end(self, args, state, control, **kwargs):
        self.progress_bar.close()

# --- Setup ---
# Check the device and use MPS if available, otherwise use CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

# Load the T5 tokenizer and model.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name).to(device)

# --- Data Prep ---
# Load the JSONL data file.
data_path = Path("/Users/shangray/Desktop/sch_project/dataforstage2.jsonl")
samples = []
with data_path.open("r", encoding="utf-8") as f:
    # Loop through each line and get the input and output.
    for line in f:
        item = json.loads(line)
        # Check if the data is valid before using it.
        if isinstance(item.get("input"), str) and isinstance(item.get("output"), str):
            if item["input"].strip() and item["output"].strip():
                samples.append({"input_text": item["input"], "target_text": item["output"]})

print(f"Valid samples loaded: {len(samples)}")

# Create a Dataset object from the list.
dataset = Dataset.from_list(samples)

# --- Tokenization ---
max_input_length = 512
max_target_length = 128

def preprocess(example):
    """
    Tokenize the input and target texts.
    example (dict): A single data point.
    """
    # Tokenize the input text.
    model_inputs = tokenizer(example["input_text"], max_length=max_input_length, truncation=True, padding="max_length")
    # Tokenize the target text.
    labels = tokenizer(example["target_text"], max_length=max_target_length, truncation=True, padding="max_length")
    # Set the labels for training.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the tokenization function to the whole dataset.
tokenized_dataset = dataset.map(preprocess, remove_columns=["input_text", "target_text"])

# --- Training Setup ---
# Set the training parameters.
EPOCHS = 3
BATCH_SIZE = 4
total_steps = len(tokenized_dataset) * EPOCHS // BATCH_SIZE

training_args = TrainingArguments(
    output_dir="./t5_stage2_trained_local",
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    logging_steps=10,
    save_strategy="epoch",  # Save after each epoch.
    logging_dir="./logs",
    do_eval=False,          # Disable evaluation.
    fp16=False              # MPS does not support fp16.
)

# --- Training & Saving ---
# Set up the trainer with the model, data, and a progress bar.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    callbacks=[TQDMProgressCallback(total_steps=total_steps)],
)

# Start training the model.
trainer.train()

# Save the trained model and tokenizer to the disk.
trainer.save_model("./t5_stage2_trained_local")
tokenizer.save_pretrained("./t5_stage2_trained_local")

  from .autonotebook import tqdm as notebook_tqdm


✅ 使用裝置：mps


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


✅ 有效樣本數：10000


Map: 100%|██████████| 10000/10000 [00:02<00:00, 3760.36 examples/s]
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Training:   0%|          | 1/7500 [00:02<5:19:41,  2.56s/it]

Step,Training Loss
10,7.1292
20,1.7922
30,0.7499
40,0.5501
50,0.5302
60,0.4966
70,0.3758
80,0.3863
90,0.3693
100,0.3426


Training: 100%|██████████| 7500/7500 [4:36:09<00:00,  2.21s/it]     


('./t5_stage2_trained_local/tokenizer_config.json',
 './t5_stage2_trained_local/special_tokens_map.json',
 './t5_stage2_trained_local/spiece.model',
 './t5_stage2_trained_local/added_tokens.json')

In [None]:


from transformers import T5Tokenizer, T5ForConditionalGeneration


# ✅ 儲存模型本體
model.save_pretrained("./t5_stage2_trained_local")

# ✅ 儲存 slow tokenizer 結構（你目前使用的）
tokenizer.save_pretrained("./t5_stage2_trained_local")
222903552

('./t5_stage2_trained_local/tokenizer_config.json',
 './t5_stage2_trained_local/special_tokens_map.json',
 './t5_stage2_trained_local/spiece.model',
 './t5_stage2_trained_local/added_tokens.json')

In [8]:
print(model is trainer.model)  # True 才是正確的
trainer.save_model("./t5_stage2_trained_local")


True


In [9]:
import os
print(os.listdir("./t5_stage2_trained_local"))


['model.safetensors', 'added_tokens.json', 'tokenizer_config.json', 'special_tokens_map.json', 'config.json', 'generation_config.json', 'training_args.bin', 'spiece.model']


In [2]:
pip install --upgrade "transformers[torch]" "accelerate>=0.26.0"


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers[torch]
  Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
[K     |████████████████████████████████| 10.8 MB 11.5 MB/s eta 0:00:01
[?25hCollecting accelerate>=0.26.0
  Downloading accelerate-1.8.1-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 10.9 MB/s eta 0:00:01
Installing collected packages: transformers, accelerate
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.4
    Uninstalling transformers-4.52.4:
      Successfully uninstalled transformers-4.52.4
Successfully installed accelerate-1.8.1 transformers-4.53.0
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cpu
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.[0m
Note: you may need to restart the kernel to use updated packages.
