In [2]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer, 
    TrainingArguments, 
    Trainer,
    EarlyStoppingCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import warnings
warnings.filterwarnings('ignore')

print(f"GPU: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

# Model selection (use smaller model for Kaggle)
model_name = "gpt2-large"  # Better than base, fits in P100
# For better results (if you have GPU memory): "mistralai/Mistral-7B-v0.1"

print(f"Loading {model_name}...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load dataset
dataset = load_dataset("json", data_files="/kaggle/working/poems.jsonl")
dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

print(f"Train: {len(dataset['train'])} | Val: {len(dataset['test'])}")

# Tokenization function
def tokenize(examples):
    # Combine prompt and completion with proper formatting
    texts = [
        f"{prompt}{completion}"
        for prompt, completion in zip(examples["prompt"], examples["completion"])
    ]
    
    encodings = tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=512,
        return_tensors=None
    )
    
    encodings["labels"] = encodings["input_ids"].copy()
    return encodings

# Tokenize dataset
tokenized_dataset = dataset.map(
    tokenize,
    batched=True,
    remove_columns=dataset["train"].column_names
)

print("Dataset tokenized")

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# LoRA configuration
lora_config = LoraConfig(
    r=16,                        # Rank
    lora_alpha=32,               # Scaling factor
    target_modules=[             # Target attention layers
        "c_attn",                # For GPT-2
        # For Mistral use: "q_proj", "k_proj", "v_proj", "o_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments
training_args = TrainingArguments(
    output_dir="./poetry-lora-model",
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    weight_decay=0.01,
    warmup_steps=100,
    fp16=True,
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=200,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    seed=42,
    dataloader_num_workers=0,
    remove_unused_columns=False
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.01
        )
    ]
)

print("\nStarting LoRA training...")
trainer.train()

# Save model
output_dir = "./poetry-lora-final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print(f"\nLoRA model saved to {output_dir}")

# Test generation
print("\nTesting poetry generation:")
model.eval()

test_prompts = [
    "Write a poem about love",
    "Write a poem starting with: The moon",
    "Write a creative poem"
]

for prompt in test_prompts:
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=150,
            temperature=0.85,
            top_k=50,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id
        )
    
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"\n{'-'*60}")
    print(f"Prompt: {prompt}")
    print(f"\n{generated}")
    print(f"{'-'*60}")

print("\nTraining complete!")

2025-11-13 01:08:36.259036: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762996116.502499      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762996116.571623      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

GPU: Tesla P100-PCIE-16GB
Loading gpt2-large...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Train: 3931 | Val: 437


Map:   0%|          | 0/3931 [00:00<?, ? examples/s]

Map:   0%|          | 0/437 [00:00<?, ? examples/s]

Dataset tokenized


model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


trainable params: 2,949,120 || all params: 776,979,200 || trainable%: 0.3796

Starting LoRA training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss,Validation Loss
200,1.4832,1.417541
400,1.4149,1.395075
600,1.4355,1.386361
800,1.438,1.381945
1000,1.4706,1.379042



LoRA model saved to ./poetry-lora-final

Testing poetry generation:

------------------------------------------------------------
Prompt: Write a poem about love

Write a poem about love and the

Love and the sun
Will stay forever as you see them together in this day,
You will always be together like brothers and sisters and lovers for ever.
------------------------------------------------------------

------------------------------------------------------------
Prompt: Write a poem starting with: The moon

Write a poem starting with: The moon and

The moon and the stars are always together
And
I am on earth.
It is dark so dark and there are clouds everywhere,
But it is beautiful for me to see the bright colors of day and night;
In the daytime all my thoughts focus around these two;
To hear the birds sing and to look at the sunset from my window;
It was also a beautiful day to be in the garden.
------------------------------------------------------------

-----------------------------

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import warnings
warnings.filterwarnings('ignore')

print("Loading LoRA model...")

# Load base model and tokenizer
base_model_name = "gpt2-large"  # Must match training
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/poetry-lora-final")

model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load LoRA weights
model = PeftModel.from_pretrained(model, "/kaggle/working/poetry-lora-final")
model.eval()

print(f"Model loaded on {model.device}\n")

def generate_poem(prompt, max_length=200, temperature=0.85):
    """Generate a poem based on prompt"""
    
    # Format prompt if needed
    if not prompt.startswith("Write"):
        prompt = f"Write a poem about {prompt}"
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=temperature,
            top_k=50,
            top_p=0.92,
            do_sample=True,
            repetition_penalty=1.25,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract only the poem part (remove prompt)
    if prompt in generated:
        poem = generated.split(prompt, 1)[1].strip()
    else:
        poem = generated
    
    return poem

# Test examples
print("Sample Poems:\n")

test_themes = [
    "love and romance",
    "the moonlight",
    "nature and gardens",
    "time passing",
    "friendship"]]

for theme in test_themes:
    print(f"Theme: {theme}")
    poem = generate_poem(theme)
    print(poem)
    print("\n" + "="*60 + "\n")

# Interactive mode
print("Interactive Poetry Generator")
print("Enter themes or prompts (or 'quit' to exit)\n")

while True:
    user_input = input("Theme/Prompt: ").strip()
    
    if user_input.lower() in ['quit', 'exit', 'q']:
        print("Goodbye!")
        break
    
    if not user_input:
        continue
    
    print("\nGenerating...\n")
    poem = generate_poem(user_input)
    print(poem)
    print("\n" + "-"*60 + "\n")