In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch
import json
from pathlib import Path

# --- Setup ---
# Pick the compute device: the MPS backend when torch reports it as
# available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained tokenizer and T5 model from the same checkpoint name,
# then move the model onto the selected device.
model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

# --- Data Processing ---
# Location of the JSONL training file, plus the list that will collect the
# parsed records.
jsonl_path = Path("/Users/shangray/Desktop/sch_project/output_10k_stage1_ready.jsonl")
lines = []

In [None]:
# Parse the JSONL training file into {"input_text", "target_text"} records.
# The accumulator is reset here so that re-running this cell does not append
# a second copy of every record to a list left over from an earlier run.
lines = []

with jsonl_path.open("r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at end of file) carry no
        # record; skip them quietly instead of reporting a parse failure.
        if not line.strip():
            continue
        try:
            # Parse the JSON object from the line.
            item = json.loads(line)
            input_text = item.get("input", "").strip()
            output_dict = item.get("output", {})
            # Records without input text or without a dict-shaped output
            # cannot be turned into a training pair.
            if not input_text or not isinstance(output_dict, dict):
                continue

            pos_list = output_dict.get("positive", [])
            neg_list = output_dict.get("negative", [])

            # Join positive and negative sentences into one string each;
            # non-list values are used via their string form.
            pos_str = "; ".join(pos_list) if isinstance(pos_list, list) else str(pos_list)
            neg_str = "; ".join(neg_list) if isinstance(neg_list, list) else str(neg_list)
            target_text = f"positive: {pos_str} negative: {neg_str}"

            lines.append({"input_text": input_text, "target_text": target_text})
        except (ValueError, AttributeError, TypeError) as e:
            # ValueError: malformed JSON (json.JSONDecodeError is a subclass).
            # AttributeError: the record or its "input" is not the expected
            #   type (no .get / .strip).
            # TypeError: a non-string item inside a positive/negative list.
            # Anything else is unexpected and is allowed to propagate.
            print(f"Skipping entry on line {line_no}:", e)

print(f"Loaded data entries: {len(lines)}")

# Create a Dataset object from the list.
dataset = Dataset.from_list(lines)

# --- Preprocessing Function ---
def preprocess(example):
    """
    Tokenize one example into encoder inputs and decoder labels.

    Args:
        example (dict): A single record with string fields 'input_text'
            and 'target_text' (this function handles one example at a
            time, not a batch).

    Returns:
        The tokenizer's encoding of the input text (truncated/padded to
        512 tokens) with an added 'labels' entry holding the target token
        IDs (truncated/padded to 128 tokens), where every padding position
        is replaced by -100.
    """
    # Tokenize the input text.
    model_inputs = tokenizer(
        example["input_text"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    # Tokenize the target text. `text_target` is the supported replacement
    # for the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=example["target_text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Targets are padded to a fixed length, so most label positions are pad
    # tokens. Mark them with -100 (the index the model's loss ignores) so
    # the model is not trained to predict padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        token_id if token_id != pad_id else -100
        for token_id in labels["input_ids"]
    ]
    return model_inputs

# Apply the preprocessing function to every example and drop the raw text
# columns so that only the tokenized fields are passed to the Trainer.
tokenized_dataset = dataset.map(preprocess, remove_columns=["input_text", "target_text"])

# --- Training ---
# Configure training arguments (output directory, batch size, epochs,
# logging location and frequency).
training_args = TrainingArguments(
    output_dir="./t5_stage1_trained",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    logging_steps=10,
)

# Set up the Trainer object with the model and data.
# The tokenizer is passed as `processing_class`; the older `tokenizer=`
# keyword is deprecated and emits a FutureWarning.
# NOTE(review): `processing_class` needs a transformers release that
# supports it (4.46 or later) -- confirm the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
)

# Start the training process.
trainer.train()

# Save the fine-tuned model and its tokenizer side by side in one directory
# so both can be reloaded from the same path.
save_dir = "./stage1_trained_local"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

  from .autonotebook import tqdm as notebook_tqdm


✅ 使用裝置：mps


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


✅ 載入資料筆數：10000


Map: 100%|██████████| 10000/10000 [00:03<00:00, 2937.44 examples/s]
  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss


KeyboardInterrupt: 