# <b> Text Generation with GPT-2 and Fine-Tuning <B>#

In [1]:
pip install faiss-cpu sentence-transformers transformers torch


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [6]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.nn import functional as F

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Adjust tokenizer for GPT-2
tokenizer.pad_token = tokenizer.eos_token  # Set padding token to eos token
device = torch.device("cuda")
model = model.to(device)

# Custom Dataset for Text Generation
class TextGenerationDataset(Dataset):
    def __init__(self, file_path, tokenizer, block_size=128):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        # Tokenize the input text and convert it into input_ids for GPT-2
        self.examples = tokenizer(text, return_tensors="pt", max_length=block_size, truncation=True, padding="max_length")["input_ids"]

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return self.examples[idx]

# Paths to dataset files
train_file_path = "/content/train.txt"  # Update with your training data file path
test_file_path = "/content/test.txt"    # Update with your test data file path

# Load datasets
train_dataset = TextGenerationDataset(train_file_path, tokenizer)
test_dataset = TextGenerationDataset(test_file_path, tokenizer)

train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=2)

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 3
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0
    for batch in train_dataloader:
        batch = batch.to(device)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / len(train_dataloader)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")
print("Fine-tuned model saved.")

# Evaluation Loop
model.eval()
with torch.no_grad():
    eval_loss = 0
    for batch in test_dataloader:
        batch = batch.to(device)
        outputs = model(input_ids=batch, labels=batch)
        loss = outputs.loss
        eval_loss += loss.item()

    avg_eval_loss = eval_loss / len(test_dataloader)
    print(f"Evaluation Loss: {avg_eval_loss:.4f}")

# Function to generate text with fine-tuned model
def generate_text(prompt, model, tokenizer, max_length=350, temperature=0.7, top_k=50, top_p=0.95):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
    output = model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        num_return_sequences=1,
        do_sample=True,  # Enable sampling to allow creativity in the output
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)


# Alternative example: Text generation with user input
print("\nEnter a prompt for text generation:")
user_prompt = input().strip()

generated_text_user = generate_text(user_prompt, model, tokenizer)
print(f"\nGenerated Text for your prompt:\n{generated_text_user}")


Epoch [1/3], Loss: 2.5187
Epoch [2/3], Loss: 1.9273
Epoch [3/3], Loss: 1.2865
Fine-tuned model saved.
Evaluation Loss: 2.0357

Enter a prompt for text generation:
there is a story


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



Generated Text for your prompt:
there is a story that the President told a reporter about a woman he met while he was in the White House. The story says that when he went to the White House, he asked her, 'What are you doing?' and she said, 'I'm in a room with the President.' He said, 'I don't know what I'm doing.' She said, 'You're not going to tell me anything.' He said, 'No, I'm not going to tell you anything. I'm going to make up stories.'"

It's possible that the two stories are connected, as some have suggested. But a new study by the National Center for Missing and Exploited Children and the National Center for Missing and Exploited Children at New York University's Langone Center for Child and Adolescent Mental Health finds that "no substantial connection exists between the two stories."

"It's a very interesting study," said William C. B. Kugel, a social psychologist at New York University and the lead author of the study. "We can see that people who don't know the story are 