# Step 1: Install Required Libraries


In [1]:
pip install transformers torch




# Step 2: Import Libraries


In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import random
import pandas as pd


# Step 3: Load the Pre-trained GPT-2 Model and Tokenizer


In [3]:
# Load GPT-2 tokenizer and model
model_name = "gpt2"  # You can also try "gpt2-medium", "gpt2-large" for better quality
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Set the model to evaluation mode to prevent training
model.eval()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

# Step 4: Define a Function for Text Generation


In [5]:
def generate_text_variations(prompt, max_length=50, num_variations=5, temperature=0.7):
    """
    Generate multiple text variations from a prompt using GPT-2.

    Args:
        prompt (str): Input text or sentence for generation.
        max_length (int): Maximum number of tokens to generate.
        num_variations (int): Number of text variations to create.
        temperature (float): Sampling temperature, controls the creativity of generation.

    Returns:
        List[str]: List of generated text variations.
    """
    generated_texts = []

    for _ in range(num_variations):
        # Tokenize input prompt
        input_ids = tokenizer.encode(prompt, return_tensors="pt") # convert to tensor

        # Generate text with specified parameters
        outputs = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=1, # number of variations to generate
            do_sample=True, # sample from the distribution
            top_k=50, # number of highest probability vocabulary tokens to consider at each step
            top_p=0.95 # cumulative probability threshold
        )

        # Decode and append generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_texts.append(generated_text)

    return generated_texts


# Step 5: Generate Synthetic Data for Your Dataset


In [6]:
# Sample dataset of sentences
original_data = [
    "The weather is nice today.",
    "I love playing football.",
    "Artificial intelligence is changing the world.",
    "Data science is a fascinating field."
]

# Dictionary to store augmented data
augmented_data = {}

for sentence in original_data:
    augmented_data[sentence] = generate_text_variations(sentence, num_variations=3)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask

# Step 6: Combine the Augmented Data with the Original Data


In [7]:
# Flatten the data and organize into a DataFrame
augmented_sentences = []
original_sentences = []

for original, variations in augmented_data.items():
    for variation in variations:
        original_sentences.append(original)
        augmented_sentences.append(variation)

# Create a DataFrame
df_augmented = pd.DataFrame({
    "Original Sentence": original_sentences,
    "Augmented Sentence": augmented_sentences
})

print(df_augmented.head())

# Save to CSV for later use
df_augmented.to_csv("augmented_data.csv", index=False)


            Original Sentence  \
0  The weather is nice today.   
1  The weather is nice today.   
2  The weather is nice today.   
3    I love playing football.   
4    I love playing football.   

                                  Augmented Sentence  
0  The weather is nice today. And I'm pretty sure...  
1  The weather is nice today. We are very lucky t...  
2  The weather is nice today.\n\nYou should be fi...  
3  I love playing football. I'm not going to sit ...  
4  I love playing football. I'm a big fan of the ...  


# Step 7: Customizing for NLP Tasks
 ```For specific tasks (e.g., classification), add a target label and balance classes by generating more data for underrepresented ones.```

In [8]:
# Sample labeled dataset
labeled_data = [
    {"text": "The movie was fantastic!", "label": "positive"},
    {"text": "I did not enjoy the book.", "label": "negative"}
]

augmented_labeled_data = []

for data in labeled_data:
    sentence, label = data["text"], data["label"]
    variations = generate_text_variations(sentence, num_variations=3)
    for variation in variations:
        augmented_labeled_data.append({"text": variation, "label": label})

# Convert to DataFrame
df_augmented_labeled = pd.DataFrame(augmented_labeled_data)
print(df_augmented_labeled.head())


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

                                                text     label
0  The movie was fantastic! The actors were great...  positive
1  The movie was fantastic! It was the best I hav...  positive
2  The movie was fantastic! I could have seen it ...  positive
3  I did not enjoy the book. I had some personal ...  negative
4  I did not enjoy the book. I enjoyed the story,...  negative
