In [41]:
# STEP 1: Install required packages
!pip install -q transformers datasets

# STEP 2: Import libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset

# STEP 3: Load pre-trained GPT-2
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)


# STEP 4: Create sample training text
sample_text = """
Once upon a time, there was a kingdom where machines could think and speak.
These machines learned from books, the internet, and conversations with people.
GPT-2 was one such machine that could write amazing stories and complete sentences logically.
Its creators fine-tuned it using lots of custom data to improve its results.
Artificial Intelligence grew smarter every day, with new models learning new skills.
In a classroom, a teacher used GPT-2 to help students write better essays.
The model analyzed grammar, style, and helped organize thoughts clearly.
By feeding GPT-2 lots of high-quality text, it learned how to imitate it in return.
Researchers even used it to create poetry, jokes, and technical writing.
GPT-2 proved that with the right data, language models could become very powerful tools.
"""

# STEP 5: Convert text into HuggingFace Dataset object
lines = sample_text.strip().split("\n")
dataset = Dataset.from_dict({"text": lines})

# STEP 6: Tokenize the text
def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# STEP 7 (Final Fix): Manually group tokens into blocks
block_size = 128

# Flatten all input_ids into one list
all_input_ids = sum(tokenized_dataset["input_ids"], [])

# Trim to block size multiple
total_length = (len(all_input_ids) // block_size) * block_size
all_input_ids = all_input_ids[:total_length]

# Split into chunks
input_ids = [all_input_ids[i:i + block_size] for i in range(0, total_length, block_size)]
attention_mask = [[1] * block_size] * len(input_ids)

# Build grouped dataset
from datasets import Dataset
lm_dataset = Dataset.from_dict({
    "input_ids": input_ids,
    "attention_mask": attention_mask
})


# STEP 8: Set training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=1,
    logging_steps=100,
    prediction_loss_only=True
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# STEP 9: Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset,
    data_collator=data_collator,
)

trainer.train()

# STEP 10: Save the model
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

print("✅ Training complete and model saved!")


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Step,Training Loss


✅ Training complete and model saved!


In [42]:
trainer.save_model("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [43]:
from transformers import pipeline, GPT2LMHeadModel, GPT2Tokenizer

# Load your fine-tuned model
model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Set pad token again (required for generation)
tokenizer.pad_token = tokenizer.eos_token

# Create text generation pipeline
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate text from a prompt
prompt = "Once upon a time"
output = generator(prompt, max_length=100, num_return_sequences=1, do_sample=True, temperature=0.8)

print("📝 Generated Text:\n")
print(output[0]["generated_text"])


Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


📝 Generated Text:

Once upon a time the world had been ruled by the Roman Empire, its greatest ruler was a ruler of kings and nobles as deep as the Mediterranean. He was known at the time as the Emperor of the Roman Empire. The people living in the world were known for their wisdom and wisdom, and they were wise enough to know that his rule was good.

He was known for not being a tyrant, but for being just human. He would make his decisions with a firm and decisive mind and a decisive heart. He was willing to listen to the advice of others, even if it meant risking his life to accomplish the same. His actions were pure and spontaneous, with the greatest possible amount of patience and concentration. He was a good leader who would have done anything for his people, even if it meant doing his best to avoid being caught in an act of political treason. He was a man who could always be more powerful than his opponents, and who could always take risks.

But the emperor was more than just a l