In [5]:
!pip install transformers datasets torch




In [12]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load dataset
dataset = load_dataset("google-research-datasets/poem_sentiment")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_poems(examples):
    tokenized_inputs = tokenizer(examples['verse_text'], truncation=True, padding='max_length', max_length=128)
    tokenized_inputs["labels"] = tokenized_inputs.input_ids.copy()  # Set labels for language modeling
    return tokenized_inputs

# Apply the tokenization function to the dataset
dataset = dataset.map(tokenize_poems, batched=True)


dataset = dataset.map(tokenize_poems, batched=True)
train_dataset = dataset['train']

# Remove columns not needed for training
train_dataset = train_dataset.remove_columns(['id', 'verse_text', 'label'])


Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [13]:
model = GPT2LMHeadModel.from_pretrained("gpt2")


In [14]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_poems_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"  # Disabling wandb
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Ensure this dataset includes 'labels'
)

# Start training
trainer.train()


Step,Training Loss
10,8.7531
20,8.2559
30,7.0476
40,5.1236
50,2.9694
60,1.3733
70,0.5655
80,0.4809
90,0.4448
100,0.3977


TrainOutput(global_step=336, training_loss=1.316896990651176, metrics={'train_runtime': 27.1628, 'train_samples_per_second': 98.517, 'train_steps_per_second': 12.37, 'total_flos': 174804369408000.0, 'train_loss': 1.316896990651176, 'epoch': 3.0})

In [15]:
model.save_pretrained("./gpt2_poems_finetuned")


In [16]:
from transformers import GPT2LMHeadModel

# Load the model from the saved directory
model_path = "./gpt2_poems_finetuned"
model = GPT2LMHeadModel.from_pretrained(model_path)


In [21]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Assuming the model is loaded as `model`
# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Example prompt
prompt = "Write a poem about the sea"

# Encode the prompt
inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

# Generate text using the model with creative settings
outputs = model.generate(
    **inputs,
    max_length=150,
    num_return_sequences=1,
    temperature=0.9,    # Adds creativity to generation by influencing the probability distribution
    top_p=0.92,         # Nucleus sampling: focuses generation on the top 92% probability
    top_k=50,           # Top-k sampling: Limits the sampling pool to 50 tokens
    repetition_penalty=1.2,  # Penalizes repetition to reduce redundancy
    do_sample=True,     # Enables sampling, necessary for using temperature, top_p, and top_k
)

# Decode generated text
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Write a poem about the sea ----sour, sweet and perfect! with that poetical hand i read; what may be my future


In [22]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset

# Load dataset
dataset = load_dataset("google-research-datasets/poem_sentiment")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Necessary for models like GPT-2

# Tokenize the dataset
def tokenize_poems(examples):
    tokenized_inputs = tokenizer(examples['verse_text'], truncation=True, padding='max_length', max_length=128)
    tokenized_inputs["labels"] = tokenized_inputs['input_ids'].copy()
    return tokenized_inputs

dataset = dataset.map(tokenize_poems, batched=True)
train_dataset = dataset['train'].remove_columns(['id', 'verse_text', 'label'])


Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [23]:
# Load the pre-trained GPT-2 model
base_model = GPT2LMHeadModel.from_pretrained("gpt2")

# Example prompt
prompt = "Write a poem about the sea"

# Encode the prompt
inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

# Generate text using the base model
base_outputs = base_model.generate(**inputs, max_length=150, num_return_sequences=1, do_sample=True)
base_generated_text = tokenizer.decode(base_outputs[0], skip_special_tokens=True)
print("Output from base model:", base_generated_text)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output from base model: Write a poem about the sea

Like your father

The sea, this land, your mother

How little can we lose


In [24]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_poems_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none"
)

# Initialize the Trainer
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
)

# Start training
trainer.train()
base_model.save_pretrained("./gpt2_poems_finetuned")


Step,Training Loss


In [29]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2_poems_finetuned")
tokenizer.pad_token = tokenizer.eos_token

# Ensure pad token is set for models that require it
if tokenizer.eos_token is not None:
    fine_tuned_model.config.pad_token_id = tokenizer.eos_token_id

# Prepare the prompt
prompt = "Write a poem about the sea"
inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True, padding="max_length")

# Adjust generation settings
ft_outputs = fine_tuned_model.generate(
    **inputs,
    max_length=150,
    num_return_sequences=1,
    do_sample=True,         # Enable sampling
    temperature=0.8,        # Slightly lower temperature, still encourages creativity
    top_k=50,               # Limit the number of highest probability vocabulary tokens considered for generation
    top_p=0.92,             # Nucleus sampling
    repetition_penalty=1.2  # Penalize repetition
)

# Decode the generated text
ft_generated_text = tokenizer.decode(ft_outputs[0], skip_special_tokens=True)
print("Output from fine-tuned model:", ft_generated_text)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output from fine-tuned model: Write a poem about the sea
--and thy life, and our freedom. -- o'er his stream of sleep!--

 (End


In [27]:
# 4. Load the Fine-Tuned Model and Generate Output
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2_poems_finetuned")
ft_outputs = fine_tuned_model.generate(**inputs, max_length=150, num_return_sequences=1, do_sample=True)
print("Output from fine-tuned model:", tokenizer.decode(ft_outputs[0], skip_special_tokens=True))

# 5. Generate Creative Outputs Using Fine-Tuned Model
creative_outputs = fine_tuned_model.generate(
    **inputs,
    max_length=150,        # Maximum length of the poem
    num_return_sequences=3,  # Generate multiple poems for variety
    temperature=0.9,        # Slightly higher for more creativity
    top_k=40,               # Controls the diversity
    top_p=0.85,             # Nucleus sampling for focused creativity
    repetition_penalty=1.5, # Discourage repetition
    do_sample=True          # Enable probabilistic sampling
)

# Decode and print each generated text
for i, output in enumerate(creative_outputs, 1):
    print(f"Creative Output {i}:", tokenizer.decode(output, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output from fine-tuned model: Write a poem about the sea
Creative Output 1: Write a poem about the sea
'er with me, my wife. ----------------a little to-day? i hope not again
Creative Output 2: Write a poem about the sea boughs, and call it.
The sun's rise is over! i will sail again;--
Creative Output 3: Write a poem about the sea morn.
_--hoover, _a thousand urns?___; i'll try


In [48]:
import torch
import torch.nn as nn
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset and tokenize
def tokenize_poems(examples):
    tokenized_inputs = tokenizer(
        examples['verse_text'], truncation=True, padding="max_length", max_length=128
    )
    # Copy input_ids to labels and mask padding tokens in labels
    tokenized_inputs["labels"] = tokenized_inputs["input_ids"].copy()
    tokenized_inputs["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in label]
        for label in tokenized_inputs["labels"]
    ]
    return tokenized_inputs


dataset = dataset.map(tokenize_poems, batched=True)
train_dataset = dataset['train'].remove_columns(['id', 'verse_text', 'label'])


dataset = load_dataset("google-research-datasets/poem_sentiment")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
dataset = dataset.map(tokenize_poems, batched=True)
train_dataset = dataset['train'].remove_columns(['id', 'verse_text', 'label'])

# Load the base model and generate initial output
base_model = GPT2LMHeadModel.from_pretrained("gpt2")
inputs = tokenizer("Write a poem about the sea", return_tensors="pt", max_length=128, truncation=True, padding="max_length")
base_outputs = base_model.generate(**inputs, max_length=500, num_return_sequences=1, do_sample=True)
print("Output from base model:", tokenizer.decode(base_outputs[0], skip_special_tokens=True))



Map:   0%|          | 0/892 [00:00<?, ? examples/s]

Map:   0%|          | 0/105 [00:00<?, ? examples/s]

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Output from base model: Write a poem about the sea

From what you can glean from the ocean,

The fish can eat in water.


Let us live in a world of great danger.

For a boat will be built on a spot.

And when the sun will pass from the horizon.


But when that dark sea turns

From where the shore lies?

And the waters turn into clouds.


I am a true lover of peace.

When my husband and I die

My love will die out with him.


You shall never regret it

For no thing has so long taken my heart.


And in my heart, my spirit remains high! I do keep your words of heart.


Now, where I sit

If the sun shines out

In my presence

Showing love, peace and love.


O that I should speak

And live

This way.


But do you still feel a little weary

Of the world we live in?

Of the world we live in?


But you may feel a little comfort

In what I have to do

With the things I have to do.


"In your sleep are I able

To carry you to heaven.

"I shall tell you one thing

That is true for you."


And I w

In [63]:
# Define a LoRA layer
class LoRAAttention(nn.Module):
    def __init__(self, attention, config, r=16):  # Increase LoRA rank to 16
        super().__init__()
        self.attention = attention
        self.lora_dim = config.hidden_size // r
        self.lora_A = nn.Parameter(torch.randn(config.hidden_size, self.lora_dim))
        self.lora_B = nn.Parameter(torch.randn(self.lora_dim, config.hidden_size))
        nn.init.xavier_uniform_(self.lora_A)
        nn.init.xavier_uniform_(self.lora_B)

    def forward(self, *args, **kwargs):
        hidden_states = args[0]
        lora_adjusted = torch.matmul(hidden_states, self.lora_A)
        lora_adjusted = torch.matmul(lora_adjusted, self.lora_B)
        hidden_states = hidden_states + lora_adjusted
        args = (hidden_states, *args[1:])
        return self.attention(*args, **kwargs)

# Modify GPT-2 to integrate LoRA
class GPT2WithLoRA(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)
        self.lora_r = 4  # Rank for the low-rank matrices
        for i, layer in enumerate(self.transformer.h):
            layer.attn = LoRAAttention(layer.attn, config, r=self.lora_r)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
        return outputs  # Ensure that loss is computed if labels are provided

    def init_lora_params(self, base_model):
        """ Initialize LoRA parameters from the base model. """
        self.load_state_dict(base_model.state_dict(), strict=False)  # Load matching params
        for name, param in self.named_parameters():
            if "lora_A" in name or "lora_B" in name:
                nn.init.xavier_uniform_(param)  # Initialize new LoRA params



# Initialize LoRA model with updated rank
config = GPT2Config.from_pretrained("gpt2")
lora_model = GPT2WithLoRA(config)
lora_model.init_lora_params(base_model)

# Fine-tuning parameters
training_args = TrainingArguments(
    output_dir="./gpt2_with_lora_r16_finetuned",
    num_train_epochs=10,  # Increase epochs
    per_device_train_batch_size=4,
    learning_rate=1e-5,  # Lower learning rate
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none"
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()
lora_model.save_pretrained("./gpt2_with_lora_r16_finetuned")





Step,Training Loss
500,6.5557
1000,5.7621
1500,5.5041
2000,5.3696


In [65]:
# Load the fine-tuned models
fine_tuned_model_r16 = GPT2WithLoRA.from_pretrained("./gpt2_with_lora_r16_finetuned")

# Prompts for testing
prompts = [
    "Write a poem about the sea",
    "Describe the beauty of a forest in poetic form",
    "Compose a haiku about the mountains"
]

# Compare outputs
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", max_length=250, truncation=True, padding="max_length")

    # Generate with r = 16 model
    outputs_r16 = fine_tuned_model_r16.generate(
        **inputs,
        max_length=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.85,
        repetition_penalty=1.2
    )
    print(f"Prompt: {prompt}")
    print(f"Output (r = 16): {tokenizer.decode(outputs_r16[0], skip_special_tokens=True)}")
    print("-" * 50)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Write a poem about the sea
Output (r = 16): Write a poem about the seathe-s, and then he had heard--" by his back to thy brotherly eyes. in their lives of our home with her head; butler is that was not so you seein'er's marendi', i know
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Describe the beauty of a forest in poetic form
Output (r = 16): Describe the beauty of a forest in poetic formmore, with his wife's morn. 'tis'n't knowed their lives."--in', i sayings from behind me; and we all but my sisterhood—the king over at once more often called to you see him whom
--------------------------------------------------
Prompt: Compose a haiku about the mountains
Output (r = 16): Compose a haiku about the mountains'er's-blades, but not seen with her feet? -- and his forehead. my heart:--d on! in that maneldoque’s of thy brotherhood; when you have known only mortal tongue has been
--------------------------------------------------


In [66]:
# Define a LoRA layer
class LoRAAttention(nn.Module):
    def __init__(self, attention, config, r=32):  # Increase LoRA rank to 32
        super().__init__()
        self.attention = attention
        self.lora_dim = config.hidden_size // r
        self.lora_A = nn.Parameter(torch.randn(config.hidden_size, self.lora_dim))
        self.lora_B = nn.Parameter(torch.randn(self.lora_dim, config.hidden_size))
        nn.init.xavier_uniform_(self.lora_A)
        nn.init.xavier_uniform_(self.lora_B)

    def forward(self, *args, **kwargs):
        hidden_states = args[0]
        lora_adjusted = torch.matmul(hidden_states, self.lora_A)
        lora_adjusted = torch.matmul(lora_adjusted, self.lora_B)
        hidden_states = hidden_states + lora_adjusted
        args = (hidden_states, *args[1:])
        return self.attention(*args, **kwargs)

# Modify GPT-2 to integrate LoRA
class GPT2WithLoRA(GPT2LMHeadModel):
    def __init__(self, config):
        super().__init__(config)
        self.lora_r = 4  # Rank for the low-rank matrices
        for i, layer in enumerate(self.transformer.h):
            layer.attn = LoRAAttention(layer.attn, config, r=self.lora_r)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        outputs = super().forward(input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs)
        return outputs  # Ensure that loss is computed if labels are provided

    def init_lora_params(self, base_model):
        """ Initialize LoRA parameters from the base model. """
        self.load_state_dict(base_model.state_dict(), strict=False)  # Load matching params
        for name, param in self.named_parameters():
            if "lora_A" in name or "lora_B" in name:
                nn.init.xavier_uniform_(param)  # Initialize new LoRA params



# Initialize LoRA model with updated rank
config = GPT2Config.from_pretrained("gpt2")
lora_model = GPT2WithLoRA(config)
lora_model.init_lora_params(base_model)

# Fine-tuning parameters
training_args = TrainingArguments(
    output_dir="./gpt2_with_lora_r32_finetuned",
    num_train_epochs=10,  # Increase epochs
    per_device_train_batch_size=4,
    learning_rate=1e-5,  # Lower learning rate
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none"
)

trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()
lora_model.save_pretrained("./gpt2_with_lora_r32_finetuned")





Step,Training Loss
500,6.5406
1000,5.752
1500,5.4927
2000,5.3577


In [67]:
# Load the fine-tuned models
fine_tuned_model_r16 = GPT2WithLoRA.from_pretrained("./gpt2_with_lora_r32_finetuned")

# Prompts for testing
prompts = [
    "Write a poem about the sea",
    "Describe the beauty of a forest in poetic form",
    "Compose a haiku about the mountains"
]

# Compare outputs
for prompt in prompts:
    inputs = tokenizer(prompt, return_tensors="pt", max_length=250, truncation=True, padding="max_length")

    # Generate with r = 16 model
    outputs_r16 = fine_tuned_model_r16.generate(
        **inputs,
        max_length=300,
        do_sample=True,
        temperature=0.8,
        top_p=0.85,
        repetition_penalty=1.2
    )
    print(f"Prompt: {prompt}")
    print(f"Output (r = 32): {tokenizer.decode(outputs_r16[0], skip_special_tokens=True)}")
    print("-" * 50)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Write a poem about the sea
Output (r = 32): Write a poem about the seaThe road, and his eyes of youth's footsteps. she came hither; but forget him all that he who know how? -- to see thee! — i can scarce leaves us: 'the sun shines on my soul had nothilda-bl
--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Prompt: Describe the beauty of a forest in poetic form
Output (r = 32): Describe the beauty of a forest in poetic formThe earth's day and when they had been,--the people with me.  
s all night like an accident? as we see his pipe dream!’d'e-eyed from her lips to bear down on our children;
--------------------------------------------------
Prompt: Compose a haiku about the mountains
Output (r = 32): Compose a haiku about the mountainsa't see what you, i should not only mortal's name of our fatherly on. -- it is all human heart; and gals!--the road? !" she spoke his brows o'en again to us from heretics
--------------------------------------------------
