In [None]:
!pip install transformers datasets

In [None]:
# Step 2: Import dependencies
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
from datasets import load_dataset

# Step 3: Load the pretrained GPT-2 tokenizer and language-model head.
base_checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(base_checkpoint)

# The tokenizer gets no pad token here, so reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(base_checkpoint)

# Step 4: Load the raw training text as a dataset with a single "train" split.
dataset_path = "/content/stories.txt"  # Updated path
train_files = {"train": dataset_path}
dataset = load_dataset("text", data_files=train_files)

def tokenize_function(examples):
    """Tokenize a batch of raw text rows for causal language-model training.

    Args:
        examples: Batch mapping passed in by ``Dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated to
        at most 512 tokens and left as unpadded Python lists.
    """
    # No ``return_tensors``: ``Dataset.map`` stores the result as Arrow
    # columns, so building torch tensors here is a wasted round trip.
    # No ``padding="max_length"``: the language-modeling data collator pads
    # each batch to its own longest sequence, so short rows are no longer
    # stored and trained at the full 512 tokens.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )

# Drop blank rows before tokenizing. A blank line tokenizes to zero real
# tokens, so it adds nothing to the loss, and a batch made up only of such
# rows would have no target tokens at all.
non_empty_dataset = dataset.filter(lambda example: example["text"].strip() != "")

# Tokenize in batches and drop the raw "text" column so only model inputs remain.
tokenized_datasets = non_empty_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

# Collator for causal LM: builds the labels from the input ids at batch time.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # GPT-2 is not trained with masked language modeling
)

# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir="/content/gpt2-story-model",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,       # write a checkpoint every 500 optimizer steps
    save_total_limit=2,   # keep only the two most recent checkpoints
    logging_dir="/content/logs",
    logging_steps=100,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available()  # Use mixed precision if GPU is available
)

# Step 6: Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    data_collator=data_collator
)

trainer.train()

# Step 7: Save the fine-tuned model and its tokenizer side by side so the
# directory can be reloaded with ``from_pretrained``.
model.save_pretrained("/content/gpt2-story-model")
tokenizer.save_pretrained("/content/gpt2-story-model")

# Step 8: Generate text
def generate_text(prompt, max_length=200, temperature=0.7, top_p=0.9):
    """Sample a continuation of ``prompt`` from the module-level ``model``.

    Args:
        prompt: Text the generated output should start with. An empty prompt
            is replaced by the tokenizer's BOS token so generation still has
            a starting point.
        max_length: Upper bound on the total sequence length in tokens,
            prompt included.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        The decoded first generated sequence (prompt plus continuation) with
        special tokens stripped.
    """
    token_ids = tokenizer.encode(prompt)
    if not token_ids:
        # An empty prompt encodes to no tokens; seed generation with BOS.
        token_ids = [tokenizer.bos_token_id]

    # Build the inputs on the model's device: after training the model may
    # live on the GPU, and CPU inputs would raise a device-mismatch error.
    input_ids = torch.tensor([token_ids], dtype=torch.long, device=model.device)

    # A single unpadded prompt is attended to in full. Comparing against the
    # pad id would be wrong here because pad is aliased to EOS, so a genuine
    # EOS token inside the prompt would get masked out.
    attention_mask = torch.ones_like(input_ids)

    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled while sampling. The model stays in eval mode afterwards.
    model.eval()
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id  # Explicitly set pad token id
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Interactive demo: read a prompt from the user and print the generated story.
prompt = input("Enter your prompt for story generation:")
generated_story = generate_text(prompt)
print(generated_story)