<a href="https://colab.research.google.com/github/Pavadareni/AI_Training/blob/main/Text%20Generation/Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch




In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.nn import functional as F

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Adjust tokenizer for GPT-2
tokenizer.pad_token = tokenizer.eos_token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Custom Dataset for Text Generation
class TextGenerationDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=128):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        self.examples = tokenizer(text, return_tensors="pt", max_length=block_size, truncation=True, padding="max_length")["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

# Paths to dataset files
train_file_path = "train.txt"
test_file_path = "test.txt"

# Load datasets
train_dataset = TextGenerationDataset(train_file_path, tokenizer)
test_dataset = TextGenerationDataset(test_file_path, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 3
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in train_dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")
print("Fine-tuned model saved.")

# Evaluation Loop
model.eval()
with torch.no_grad():
    eval_loss = 0
    for batch in test_dataloader:
        batch = batch.to(device)
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss
        eval_loss += loss.item()

    avg_eval_loss = eval_loss / len(test_dataloader)
    print(f"Evaluation Loss: {avg_eval_loss:.4f}")

# Text Generation with Fine-Tuned Model
def generate_text(prompt, max_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    output = model.generate(input_ids, max_length=max_length, num_return_sequences=1, temperature=0.7)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example text generation
prompt = "Once upon a time"
generated_text = generate_text(prompt)
print(f"Generated Text:\n{generated_text}")


Epoch [1/3], Loss: 2.7149
Epoch [2/3], Loss: 2.2406
Epoch [3/3], Loss: 1.9627
Fine-tuned model saved.


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Evaluation Loss: 2.9892
Generated Text:
Once upon a time, the world was a place of great beauty and great danger. The world was filled with people who were willing to sacrifice their lives for the good of the world.

The world was filled with people who were willing to sacrifice


In [6]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load fine-tuned GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Adjust tokenizer for GPT-2
tokenizer.pad_token = tokenizer.eos_token
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Function to generate text
def generate_text(prompt, max_length=50, temperature=0.7, top_k=50, top_p=0.95):
    model.eval()
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=1,
        do_sample=True,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Take user input for text generation
print("Enter a prompt for text generation:")
prompt = input().strip()

# Generate and print text
generated_text = generate_text(prompt)
print(f"\nGenerated Text:\n{generated_text}")


Enter a prompt for text generation:
once a upon a time


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated Text:
once a upon a time, but the time of the sun is at hand, and the sun and the moon are at hand.

A good friend of mine, Mr. Johnson, who was the chairman of the Committee on the Presentation of
