<a href="https://colab.research.google.com/github/Parth-349/PRODIGY_GA_01/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install Required Libraries
!pip install transformers datasets accelerate

In [None]:
# Step 2: Import Libraries
import inspect

import torch
from datasets import load_dataset, Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from transformers import pipeline

In [None]:
# Step 3: Load Pretrained GPT-2 Tokenizer and Model
# Both objects are loaded from the same pretrained checkpoint name.
checkpoint = "gpt2"

model = GPT2LMHeadModel.from_pretrained(checkpoint)
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Step 4: Define Custom Dataset (using the provided text)
# NOTE(review): the "..." line looks like a placeholder for elided corpus text;
# it is kept as-is and is used as a training example like any other line.
text_data = """The knight crossed the valley with fire behind him.
The moonlight lit up her journal as she wrote her last words.
Beneath the waves, ancient ruins whispered tales of betrayal.
Every shadow told a story; every whisper carried a warning.
...
"""

# One training example per line. Blank or whitespace-only lines are dropped so
# that extending the corpus with paragraph breaks does not create empty examples.
text_lines = [line for line in text_data.splitlines() if line.strip()]

# Step 5: Load and Tokenize Dataset
# Wrap the list of lines in a Dataset with a single "text" column.
records = {"text": text_lines}
dataset = Dataset.from_dict(records)


def tokenize_function(examples, max_length=512):
    """Tokenize a batch of raw text lines for fine-tuning.

    Args:
        examples: Batch mapping as passed by ``Dataset.map(batched=True)``;
            its ``"text"`` entry is handed to the module-level ``tokenizer``.
        max_length: Maximum number of tokens kept per example; longer
            sequences are truncated. Defaults to 512.

    Returns:
        The tokenizer's output for the batch.
    """
    # Deliberately no padding="max_length": that would pad every short line
    # to the full max_length. Sequences are left at their natural length so
    # the language-modeling data collator can pad each batch dynamically.
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

# Apply the tokenizer to the whole dataset, one batch of rows per call.
tokenized_dataset = dataset.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
# Step 6: Prepare Training Components
# mlm=False selects causal (next-token) language modeling rather than
# masked-token prediction.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False,
)

training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=1,
    logging_steps=100,
    prediction_loss_only=True,
    # fp16 mixed precision needs a GPU; enabling it unconditionally makes the
    # notebook fail on a CPU-only runtime, so turn it on only when CUDA exists.
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [None]:
# Step 7: Train the Model
# The Trainer keyword that receives the tokenizer differs between transformers
# releases ("tokenizer" in older ones, "processing_class" in newer ones).
# The install step does not pin a version, so pick whichever name the
# installed Trainer actually accepts.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    **{tokenizer_kwarg: tokenizer},
)

trainer.train()

In [None]:
# Step 8: Save Model
# Write the model first, then the tokenizer, into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("gpt2-finetuned")

In [None]:
# Step 9: Generate Text from the Fine-Tuned Model
prompt = "Once upon a time"

# Generation settings gathered in one place so they are easy to tweak.
generation_options = {
    "max_new_tokens": 100,
    "num_return_sequences": 1,
    "repetition_penalty": 1.2,
}

# Load the model from the saved folder and reuse the in-memory tokenizer.
generator = pipeline(task="text-generation", model="gpt2-finetuned", tokenizer=tokenizer)
output = generator(prompt, **generation_options)

first_sample = output[0]
print(first_sample['generated_text'])