#**Text Generation with GPT-2**

#1. Setting up the Environment

In [None]:
!pip install transformers
!pip install datasets
!pip install ipykernel
!pip install torch



# 2.Data Preparation

**a) Dataset collecting and formatting**

In [None]:
from datasets import load_dataset
import pandas as pd

In [None]:
dataset = load_dataset("csv", data_files="/deeplearning_questions.csv")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
df = pd.DataFrame(dataset['train'])
df.head()

Unnamed: 0,ID,DESCRIPTION
0,1,What is padding
1,2,Sigmoid Vs Softmax
2,3,What is PoS Tagging
3,4,What is tokenization
4,5,What is topic modeling


**b) Tokenization**

In [None]:
from transformers import GPT2Tokenizer

In [None]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['ID', 'DESCRIPTION'],
        num_rows: 111
    })
})


In [None]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    # Tokenize descriptions and set labels to be same as input_ids
    tokenized = tokenizer(examples["DESCRIPTION"], truncation=True, padding="max_length", max_length=128)
    tokenized["labels"] = tokenized["input_ids"]  # Set labels to be same as input_ids
    return tokenized

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/99 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

**c) Data splitting**

In [None]:
# Split the dataset into training and validation sets
from datasets import DatasetDict

# Create a validation split from the training data
dataset = DatasetDict({
    'train': tokenized_datasets['train'].shuffle(seed=42).select([i for i in list(range(int(0.9 * len(tokenized_datasets['train']))))]),
    'validation': tokenized_datasets['train'].shuffle(seed=42).select([i for i in list(range(int(0.9 * len(tokenized_datasets['train'])), len(tokenized_datasets['train'])))])
})

# 3. Fine Tuning GPT 2

**1. Load the Pre-trained GPT-2 Model:**

In [None]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

In [None]:
# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# If you added a padding token, resize the model’s embedding layer to match the tokenizer’s vocabulary size
model.resize_token_embeddings(len(tokenizer))

Embedding(50257, 768)

**2. Set Up Training Arguments:**

In [None]:
training_args = TrainingArguments(
    output_dir="./results",          # Directory to save model checkpoints
    overwrite_output_dir=True,       # Overwrite the content of the output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=4,   # Batch size per device during training
    save_steps=500,                  # Save checkpoint every 500 steps
    save_total_limit=2,              # Limit the total number of checkpoints
    prediction_loss_only=True,       # Only return loss in the evaluation
)


**3. Set Up the Trainer:**

In [None]:
trainer = Trainer(
    model=model,                        # The pre-trained GPT-2 model
    args=training_args,                 # Training arguments
    train_dataset=dataset['train'],     # Training dataset
    eval_dataset=dataset['validation']  # Validation dataset
)


**4. Start Training:**

In [None]:
trainer.train()

Step,Training Loss


TrainOutput(global_step=84, training_loss=0.6881452287946429, metrics={'train_runtime': 856.8833, 'train_samples_per_second': 0.389, 'train_steps_per_second': 0.098, 'total_flos': 21752561664000.0, 'train_loss': 0.6881452287946429, 'epoch': 3.0})

**5. Run a small test batch:**

In [None]:
# Select a small subset for testing
small_test_dataset = tokenized_datasets["train"].select([0, 1, 2])  # Select first 3 examples

# Update the Trainer to use the small test dataset
trainer.train_dataset = small_test_dataset

# Run training
trainer.train()


Step,Training Loss


TrainOutput(global_step=3, training_loss=0.13036566972732544, metrics={'train_runtime': 48.8927, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.061, 'total_flos': 587907072000.0, 'train_loss': 0.13036566972732544, 'epoch': 3.0})

**4. Check Model Output with Sample Data:**

In [None]:
# Load model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Tokenize a sample input
inputs = tokenizer("Example input text", return_tensors="pt")

# Perform a forward pass
outputs = model(**inputs, labels=inputs["input_ids"])
print(outputs.loss)  # Should print the loss value

tensor(8.3631, grad_fn=<NllLossBackward0>)


#4. Evaluation

In [None]:
results = trainer.evaluate()
print(results)


{'eval_loss': 10.494246482849121, 'eval_runtime': 14.5794, 'eval_samples_per_second': 0.823, 'eval_steps_per_second': 0.137}


#5. Save the Trained Model

In [None]:
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

#6. Generate Text

In [None]:
# Generate text
input_text = " What is tokenization"
input_ids = tokenizer.encode(input_text, return_tensors="pt")

In [None]:
# Generate output
output = model.generate(input_ids, max_length=100 , num_return_sequences=1, no_repeat_ngram_size=2)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [None]:
# Decode and print the generated text
print(tokenizer.decode(output[0], skip_special_tokens=True))

 What is tokenization?

Tokenization is a process of creating a new token. It is the process by which a token is created.
. The process is called tokenizing. This process takes a number of steps. First, it creates a unique identifier for the token, and then it generates a set of tokens. Then, the tokens are created and the new tokens can be used to create new accounts. Finally, tokens that are not created are used for other purposes. For example,
