In [18]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModel
from datasets import load_dataset

# กำหนดโมเดลและ tokenizer
checkpoint = "gpt2"
model = TFAutoModel.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# กำหนด pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# โหลด dataset
ds = load_dataset("Exched/Hutao_furina_roleplay")

# ตรวจสอบตัวอย่าง dataset
print(ds)
print(ds["train"][0])  # ตัวอย่างข้อมูล

# ฟังก์ชัน tokenize
def tokenize_function(examples):
    return tokenizer(
        examples["input"],
        padding="max_length",
        truncation=True,
        max_length=512
    )

# Tokenize dataset
tokenized_datasets = ds.map(tokenize_function, batched=True)

# ดูตัวอย่างข้อมูลที่ tokenize แล้ว
print(tokenized_datasets["train"][0])


All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 787
    })
})
{'input': 'What are you planning today, Hu Tao?', 'output': 'Hmm, who knows? Maybe I’ll surprise you! Or maybe… I’m waiting for you to take the first step?'}
{'input': 'What are you planning today, Hu Tao?', 'output': 'Hmm, who knows? Maybe I’ll surprise you! Or maybe… I’m waiting for you to take the first step?', 'input_ids': [2061, 389, 345, 5410, 1909, 11, 11256, 32120, 30, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 502

In [None]:
from transformers import AutoTokenizer, TFAutoModelForCausalLM
from datasets import load_dataset
import tensorflow as tf

# โหลด tokenizer และ model
checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForCausalLM.from_pretrained(checkpoint)

# เพิ่ม pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# โหลด dataset
ds = load_dataset("Exched/Hutao_furina_roleplay")

# ฟังก์ชัน tokenize แบบ batch
def preprocess_function(examples):
    texts = ["input: " + i + " output: " + o for i, o in zip(examples["input"], examples["output"])]
    tokenized = tokenizer(
        texts,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="tf",
    )
    tokenized["labels"] = tokenized["input_ids"]
    return tokenized

# Tokenize dataset
tokenized_datasets = ds.map(preprocess_function, batched=True)

# แปลงเป็น tf.data.Dataset
train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],  # เพิ่ม labels
    shuffle=True,
    batch_size=8,
)

# กำหนด optimizer และ loss
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# คอมไพล์โมเดล
model.compile(optimizer=optimizer, loss=loss)

# ฝึกโมเดล
model.fit(train_dataset, epochs=3)


All PyTorch model weights were used when initializing TFGPT2LMHeadModel.

All the weights of TFGPT2LMHeadModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Map:   0%|          | 0/787 [00:00<?, ? examples/s]

Epoch 1/3