In [3]:
!pip install -q transformers datasets tokenizers wandb
!pip install --upgrade transformers
!pip install -q kaggle


!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d imbikramsaha/poems
!unzip -q poems.zip

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2024.12.0 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.5.3.2 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-cupti-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-cupti-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-nvrtc-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-nvrtc-cu12 12.5.82 which is incompatible.
torch 2.6.0+cu124 requires nvidia-cuda-runtime-cu12==12.4.127; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cuda-runtime-cu12 12.

In [4]:

import pandas as pd
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split



poetry_df = pd.read_csv('poems-100.csv')
poetry_df = poetry_df.dropna(subset=['text'])

print(f"Dataset contains {len(poetry_df)} poems")
print(poetry_df.head())

def prepare_dataset(poems, tokenizer, max_length=128):

    encodings = tokenizer(poems, truncation=True, padding="max_length",
                         max_length=max_length, return_tensors="pt")


    dataset = Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': encodings['input_ids'].clone()
    })

    return dataset


model_name = "gpt2"  #try g
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id


train_texts, val_texts = train_test_split(poetry_df['text'].tolist(), test_size=0.1, random_state=42)


train_dataset = prepare_dataset(train_texts, tokenizer)
val_dataset = prepare_dataset(val_texts, tokenizer)


training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    eval_steps=500,
    save_steps=500,
    warmup_steps=500,
    eval_strategy="steps",
    logging_dir="./logs",
    logging_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


trainer.train()


model_path = "./poetry-gpt2"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

Dataset contains 100 poems
                                                text
0  O my Luve's like a red, red rose\nThat’s newly...
1  The rose is red,\nThe violet's blue,\nSugar is...
2  How do I love thee? Let me count the ways.\nI ...
3  Had I the heavens' embroidered cloths,\nEnwrou...
4  I.\n    Enough! we're tired, my heart and I.\n...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss


('./poetry-gpt2/tokenizer_config.json',
 './poetry-gpt2/special_tokens_map.json',
 './poetry-gpt2/vocab.json',
 './poetry-gpt2/merges.txt',
 './poetry-gpt2/added_tokens.json')

# New Section

In [5]:
def generate_poem(prompt, model, tokenizer, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long)

    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        top_k=50,
        top_p=0.95,
        temperature=0.7,
        do_sample=True
    )

    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text


print("\n--- Generated Poems ---\n")
prompts = [
    "The sun sets over the horizon",
    "In the garden of dreams",
    "Whispers of the wind",
    "A life on the ocean wave",

]

for prompt in prompts:
    print(f"Prompt: {prompt}")
    poem = generate_poem(prompt, model, tokenizer)
    print(f"Generated poem:\n{poem}\n")
    print("-" * 50)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



--- Generated Poems ---

Prompt: The sun sets over the horizon


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated poem:
The sun sets over the horizon, and the moon sets in the far distance.

But the sun is not shining over that horizon. It is just a shadow,
The shadow of the star is
the sun and moon.

--------------------------------------------------
Prompt: In the garden of dreams


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated poem:
In the garden of dreams, we grow up, where we are, and we can't stop dreaming.

In dreams we're a part of the universe, a place where things come and go, but we live in the world that we've never seen before.

--------------------------------------------------
Prompt: Whispers of the wind


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated poem:
Whispers of the wind, you may not hear.

The sky is dark, and the rain is swift,
But that is not a lie, no man can tell
Nor that the moon is a shadow.

--------------------------------------------------
Prompt: A life on the ocean wave
Generated poem:
A life on the ocean wave.

From the first waves of the Great Ocean Wave, the sea was now a sea of light and cold. Its waves were a long, long way from us. The sea is now, and the waves are a far away thing.

--------------------------------------------------
