In [28]:
# Step 1: Install Required Libraries
!pip install transformers datasets accelerate



In [29]:
# Step 2: Import Libraries
import torch
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments, pipeline

In [30]:
# Step 3: Load the pretrained GPT-2 model and its tokenizer
base_checkpoint = "gpt2"
model = GPT2LMHeadModel.from_pretrained(base_checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [31]:
# Step 4: Define the custom training corpus (one sentence per line)
text_data = """The knight crossed the valley with fire behind him.
The moonlight lit up her journal as she wrote her last words.
Beneath the waves, ancient ruins whispered tales of betrayal.
Every shadow told a story; every whisper carried a warning.
Let go, for even the river forgets its beginning.
Sometimes the questions matter more than the answers.
You are not broken; you are just unfolding.
Grief is just love without a place to land.
We are all stories pretending to be solid.
Truth waits in the quiet corners where ego dares not look.
"""

# Trim the surrounding whitespace, then break the corpus into individual lines.
trimmed_text = text_data.strip()
text_lines = trimmed_text.splitlines()

In [32]:
# Step 5: Build and Tokenize the Dataset
# One example per line of the custom text.
dataset = Dataset.from_dict({"text": text_lines})


def tokenize_function(examples, max_length=512):
    """Tokenize a batch of examples taken from the ``"text"`` column.

    No static padding is applied here: padding every short sentence out to
    ``max_length`` tokens makes each training step process hundreds of pad
    tokens for nothing. Padding is left to the data collator, which pads each
    batch only to the length of its longest example.

    Args:
        examples: Batch mapping passed in by ``Dataset.map(batched=True)``;
            ``examples["text"]`` is the list of strings to tokenize.
        max_length: Upper bound on tokens per example; longer texts are
            truncated.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)


# Drop the raw text column so only tokenizer outputs reach the collator.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=dataset.column_names,
)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [33]:
# Step 6: Prepare Training Components
# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=1,
    # 10 examples at batch size 2 for 5 epochs is only about 25 optimizer
    # steps on a single device, so the previous value of 100 never logged
    # a training loss.
    logging_steps=5,
    prediction_loss_only=True,
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only runtimes instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [None]:
# Step 7: Train the Model
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer
# (needs a transformers release that supports it; the unpinned install above
# pulls the latest). The tokenizer is still saved next to each checkpoint.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
)

trainer.train()

  trainer = Trainer(


In [None]:
# Step 8: Save the fine-tuned model, then the tokenizer, into one directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("gpt2-finetuned")

In [None]:
# Step 9: Generate Text from the Fine-Tuned Model
prompt = "Once upon a time"

# Sampling configuration forwarded to the generation call.
sampling_options = {
    "max_new_tokens": 100,
    "num_return_sequences": 1,
    "do_sample": True,
    "temperature": 0.9,
    "top_p": 0.95,
    "top_k": 50,
    "repetition_penalty": 1.2,
}

# Load the weights saved in the "gpt2-finetuned" directory.
generator = pipeline("text-generation", model="gpt2-finetuned", tokenizer=tokenizer)
output = generator(prompt, **sampling_options)

# Only one sequence is requested, so show the first (and only) result.
first_sample = output[0]
print(first_sample['generated_text'])

In [None]:
import nbformat
from google.colab import files

# 🔹 Replace this with your actual file path
original_path = "/content/drive/MyDrive/Colab Notebooks/gpt2_finetuning_poetic.ipynb.ipynb"
clean_path = "/content/drive/MyDrive/Colab Notebooks/gpt2_finetuning_poetic_clean.ipynb"

# Load notebook; the context manager closes the file handle, and the explicit
# encoding avoids depending on the platform default.
with open(original_path, encoding="utf-8") as notebook_file:
    nb = nbformat.read(notebook_file, as_version=4)

# Remove any broken widget metadata.
# NOTE(review): widget state is commonly stored under the notebook-level
# metadata rather than on individual cells, so both locations are cleared.
nb.metadata.pop("widgets", None)
for cell in nb.cells:
    cell.get("metadata", {}).pop("widgets", None)

# Save the cleaned notebook
nbformat.write(nb, clean_path)

# Optional: Download cleaned notebook if pushing manually
files.download(clean_path)
