In [22]:
!pip install transformers
!pip install datasets
!pip install torch



In [23]:
from datasets import load_dataset

# 載入故事資料集
dataset = load_dataset("ShehryarAzhar/stories")

# 查看數據集的樣本
print(dataset['train'][0])

{'id': 1, 'title': 'Write an Adventure Story', 'story': "Far across the sea, on the distant shores of Avalon, an intrepid explorer named Captain Drake set sail in search of undiscovered lands. With his loyal crew and a map of unknown territories, he braved the treacherous waters of the open ocean, guided by the stars above. For months, they sailed through storms and calm seas, charting a course into the unknown.  One fateful night, as they neared the edge of the world, Captain Drake's ship was engulfed by a massive whirlpool that dragged them into the depths below. Miraculously, they emerged on the other side, but they soon realized they had entered a realm of myth and legend. Giant sea serpents, mermaids, and krakens roamed the waters, posing a constant threat to their journey.  Undeterred by the dangers that lurked beneath the waves, Captain Drake and his crew pressed on, driven by their thirst for adventure and the promise of untold riches. With each passing day, they faced new chal

In [24]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# 載入GPT-2模型和tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# 添加特殊token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# 定義數據集處理函數
def preprocess_data(examples):
    inputs = examples['story']
    inputs = [input_ for input_ in inputs if input_]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    model_inputs["labels"] = model_inputs["input_ids"].copy()
    return model_inputs

# 應用預處理函數
tokenized_datasets = dataset.map(preprocess_data, batched=True, remove_columns=["id", "title", "story"])

# 設置訓練參數
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
)

# 訓練模型
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["train"]
)

trainer.train()


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/1654 [00:00<?, ? examples/s]

Step,Training Loss
500,1.0821


TrainOutput(global_step=827, training_loss=0.946425690472198, metrics={'train_runtime': 570.1364, 'train_samples_per_second': 2.901, 'train_steps_per_second': 1.451, 'total_flos': 864354041856000.0, 'train_loss': 0.946425690472198, 'epoch': 1.0})

In [31]:
# 定義生成故事的函數
def generate_story(prompt, model, tokenizer, max_length=500):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    inputs = inputs.to(model.device)
    outputs = model.generate(inputs, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    story = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story

# 測試生成故事
prompt = "Write an Adventure Story"
print(generate_story(prompt, model, tokenizer))


Write an Adventure Story to a friend, a friend with a heart as deep as the sea, who had lost her virginity to a jealous rival.  Try as she might to maintain her virginity, a young woman with eyes like a moonlit sky and a heart as sharp as a razor blade, she wasn't.  Try as she might to maintain her virginity, a rival rival stole her virginity with a passion that threatened to tear her apart.  Try as she might to maintain her virginity, a jealous rival stole her virginity with a passion for revenge.  Try as she might to maintain her virginity, a rival rival stole her virginity with a passion for revenge.  Try as she might to maintain her virginity, a rival rival stole her virginity with a passion for revenge.  Try as she might to maintain her virginity, a rival rival stole her virginity with a passion for revenge.  Try as she might to maintain her virginity, a rival rival stole her virginity with a passion for revenge.  Try as she might to maintain her virginity, a rival stole her virgi

In [27]:
from nltk.translate.bleu_score import sentence_bleu

# 計算BLEU分數的函數
def calculate_bleu(reference, candidate):
    reference = [reference.split()]
    candidate = candidate.split()
    return sentence_bleu(reference, candidate)

# 示例評估
reference = "Once upon a time in a faraway land, there was a prince who..."
candidate = generate_story("Once upon a time in a faraway land", model, tokenizer)
print(f"BLEU score: {calculate_bleu(reference, candidate)}")


BLEU score: 0.04217842167255958
