In [None]:
# Install the Hugging Face libraries used by the cells below.
# NOTE(review): versions are unpinned, and bitsandbytes is not imported by any
# visible cell — confirm it is actually needed.
!pip install transformers datasets accelerate bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer
)

# Report which accelerator (if any) this runtime exposes before doing any work.
if not torch.cuda.is_available():
    print("No GPU detected - training will be slow")
else:
    # Device 0 is the only device queried; memory is reported in decimal GB.
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Load a smaller subset of the dataset for faster training
# (only the first 10,000 rows of the train split are requested).
dataset = load_dataset("euclaise/writingprompts", split="train[:10000]")
print(f"Loaded {len(dataset)} examples")
print(dataset.column_names)

# Let's examine the first example to understand the structure:
# the preprocessing step needs to know whether each field is a plain string
# or a list, so print the Python type of both fields.
print("\nExample structure:")
example = dataset[0]
print(f"Type of prompt: {type(example['prompt'])}")
print(f"Type of story: {type(example['story'])}")
# Only report a length when the prompt turns out to be a list.
if isinstance(example['prompt'], list):
    print(f"Length of prompt list: {len(example['prompt'])}")

# Initialize model and tokenizer
model_name = "facebook/opt-125m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # No torch_dtype is passed here (an earlier float16 setting was removed),
    # so the dtype is left to the library default — TODO confirm this is intended.
    device_map="auto"
)


# Set padding token
# The end-of-sequence token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Per-record tokenization that tolerates list-valued fields
def preprocess(example):
    """Tokenize one record as ``prompt + EOS + story``.

    Args:
        example: Mapping with ``"prompt"`` and ``"story"`` entries. Each entry
            may be a plain string or a list; for a list only its first
            element is used.

    Returns:
        Whatever ``tokenizer`` returns for the combined text, called with
        ``truncation=True`` and ``max_length=512``.
    """
    def _first_if_list(value):
        # Lists contribute only their leading element; anything else passes through.
        if isinstance(value, list):
            return value[0]
        return value

    pieces = (_first_if_list(example["prompt"]), _first_if_list(example["story"]))

    # The EOS token is the separator between the prompt and the story.
    text = tokenizer.eos_token.join(pieces)
    return tokenizer(text, truncation=True, max_length=512)

# Tokenize every record using 4 worker processes. The original text columns
# are removed so only the tokenizer's output remains in the mapped dataset.
tokenized_dataset = dataset.map(
    preprocess,
    batched=False,  # one record per call; preprocess expects a single example
    num_proc=4,
    remove_columns=dataset.column_names,
)

# Collator for the Trainer; mlm=False turns masked-language-modeling off.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    output_dir="./opt125m-writingprompts",
    # --- optimisation schedule ---
    num_train_epochs=1,
    learning_rate=3e-5,
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- batching and memory ---
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    dataloader_num_workers=2,  # Reduced based on warning
    # --- precision: both mixed-precision switches are off ---
    # (set bf16=True instead if the GPU supports it)
    fp16=False,
    bf16=False,
    # --- logging and checkpoints ---
    logging_steps=50,
    save_steps=1000,
    save_total_limit=1,
    report_to="none",
)

# Initialize trainer
# Only a training dataset is supplied (no evaluation dataset), so the run
# reports training loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train model
trainer.train()

# Save the final model
# Written to a separate directory from the checkpoint output_dir.
trainer.save_model("./opt125m-writingprompts-final")

# Test the model
# The model has just been trained, so put it in inference mode before sampling;
# otherwise dropout stays active during generation.
# NOTE(review): gradient_checkpointing=True is set in the training arguments;
# in training mode that can interfere with KV caching during generate(), a
# plausible cause of the degenerate sample recorded for this cell — confirm
# after re-running.
model.eval()

test_prompt = "Write a story about a magical forest:"
# Keep the full encoding (input_ids and attention_mask) on the model's device.
inputs = tokenizer(test_prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    output = model.generate(
        **inputs,  # forwards attention_mask together with input_ids
        max_length=200,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        # pad_token was set to eos_token earlier; pass it explicitly so
        # generate() does not have to guess the padding id.
        pad_token_id=tokenizer.pad_token_id,
    )

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Sample output:")
print(generated_text)

GPU available: Tesla T4
GPU memory: 15.83 GB
Loaded 10000 examples
['prompt', 'story']

Example structure:
Type of prompt: <class 'str'>
Type of story: <class 'str'>


Step,Training Loss
50,3.4702
100,3.2786
150,3.2458
200,3.2369
250,3.2267
300,3.2216
350,3.2161
400,3.1913
450,3.1944
500,3.1875




Sample output:
Write a story about a magical forest:
 other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other other also also also also and that the- " " " -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- -- --


In [None]:
# Pin transformers to version 4.38.0.
# NOTE(review): other cells install transformers without a version pin —
# confirm which version the notebook is meant to run against.
!pip install transformers==4.38.0



In [None]:
# Upgrade bitsandbytes to the newest available release.
# NOTE(review): no visible cell imports bitsandbytes — confirm this is still required.
!pip install -U bitsandbytes



In [None]:
# Install necessary packages
!pip install -q transformers datasets accelerate peft trl

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)

# Check GPU
# Always print availability; name and memory (decimal GB) are printed only
# when a CUDA device is present. Device index 0 is the only one queried.
print(f"GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
    print(f"GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Load a suitable small dataset of creative writing samples
# (first 5,000 rows of the train split).
dataset = load_dataset("euclaise/writingprompts", split="train[:5000]")
print(f"Loaded {len(dataset)} examples")

# Show a sample to understand structure
# Only the first 100 characters of each field are printed to keep output short.
print("\nSample data:")
print(f"Prompt: {dataset[0]['prompt'][:100]}...")
print(f"Story: {dataset[0]['story'][:100]}...")

# Load a small but capable pre-trained model
model_name = "distilgpt2"  # Smaller version of GPT-2
print(f"\nLoading {model_name}...")

# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The end-of-sequence token is reused as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# device_map="auto" is requested only when CUDA is available; otherwise no
# device map is passed and placement is left to the library default.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto" if torch.cuda.is_available() else None
)

# Batched tokenization used for fine-tuning
def preprocess_function(examples):
    """Tokenize a batch of records, each formatted as ``prompt + EOS + story``.

    Args:
        examples: Batch mapping whose ``"prompt"`` and ``"story"`` entries are
            parallel sequences of strings.

    Returns:
        Whatever ``tokenizer`` returns for the list of combined texts, called
        with ``truncation=True`` and ``max_length=512``.
    """
    separator = tokenizer.eos_token

    # Pair each prompt with its story and glue them together with EOS.
    texts = []
    for prompt, story in zip(examples["prompt"], examples["story"]):
        texts.append(prompt + separator + story)

    return tokenizer(texts, truncation=True, max_length=512)

# Tokenize the split in batches using 4 worker processes. The original text
# columns are removed so only the tokenizer's output remains.
tokenized_dataset = dataset.map(
    preprocess_function,
    remove_columns=dataset.column_names,
    num_proc=4,
    batched=True,
)

print(f"Processed dataset size: {len(tokenized_dataset)}")

# Collator for the Trainer; mlm=False means masked language modeling is not used.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration, grouped by concern
training_args = TrainingArguments(
    output_dir="./distilgpt2-story-generator",
    # --- optimisation schedule ---
    num_train_epochs=1,  # a single pass keeps the run short
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    # --- batching ---
    per_device_train_batch_size=4,  # tune to the available GPU memory
    gradient_accumulation_steps=4,  # 4 x 4 = effective batch size of 16
    # --- precision: fp16 is switched on exactly when CUDA is available ---
    fp16=torch.cuda.is_available(),
    # --- logging and checkpoints ---
    logging_steps=10,
    save_steps=200,
    save_total_limit=1,
    report_to="none",  # no wandb/tensorboard reporting
)

# Create trainer
# Only a training dataset is supplied (no evaluation dataset), so the run
# reports training loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the fine-tuned model
# Written to a separate directory from the checkpoint output_dir.
trainer.save_model("./distilgpt2-story-generator-final")

# Test the model
def generate_story(prompt, max_length=300):
    """Sample a story continuation for ``prompt`` from the fine-tuned model.

    Args:
        prompt: Text the story should start from.
        max_length: Forwarded to ``model.generate`` as ``max_length``
            (the cap on the generated sequence length).

    Returns:
        The decoded first generated sequence, with special tokens stripped.
    """
    # Add a separator to make it clear where the prompt ends
    # NOTE(review): the training examples were built as prompt + EOS + story,
    # so this "\n\n" separator differs from the fine-tuning format — consider
    # aligning the two.
    input_text = f"{prompt}\n\n"

    # Tokenize and move the whole encoding (input_ids and attention_mask) to
    # wherever the model actually lives, rather than assuming "cuda".
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Training leaves the model in train mode; make sure dropout is off.
    model.eval()

    # Generate
    with torch.no_grad():
        output = model.generate(
            **inputs,  # forwards attention_mask together with input_ids
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            repetition_penalty=1.1,
            # pad_token equals eos_token here, so generate() cannot infer the
            # padding id on its own; pass it explicitly.
            pad_token_id=tokenizer.pad_token_id,
        )

    # Decode
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
# Generate one sample from the freshly trained in-memory model and show it.
test_prompt = "Write a story about a magical forest:"
story = generate_story(test_prompt)
print("\nSample generated story:")
print(story)

# The block below is only printed, not executed: it is a copy-paste recipe for
# reloading the saved model from disk in a new session.
# NOTE(review): the printed recipe passes only input_ids to generate();
# consider also passing attention_mask and pad_token_id there.
print("\nTo generate more stories, use:")
print("""
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load model and tokenizer
model_path = "./distilgpt2-story-generator-final"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)
if torch.cuda.is_available():
    model = model.to("cuda")

# Generate stories
def generate_story(prompt, max_length=300):
    input_text = f"{prompt}\\n\\n"
    inputs = tokenizer(input_text, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")

    outputs = model.generate(
        inputs["input_ids"],
        max_length=max_length,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        repetition_penalty=1.1
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
prompt = "Write a story about a magical forest:"
story = generate_story(prompt)
print(story)
""")

GPU available: True
GPU name: Tesla T4
GPU memory: 15.83 GB
Loaded 5000 examples

Sample data:
Prompt: [ WP ] You 've finally managed to discover the secret to immortality . Suddenly , Death appears befo...
Story: So many times have I walked on ruins, the remainings of places that I loved and got used to.. At fir...

Loading distilgpt2...


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map (num_proc=4):   0%|          | 0/5000 [00:00<?, ? examples/s]

Processed dataset size: 5000


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.1704
20,3.9541
30,3.8
40,3.7989
50,3.7224
60,3.6056
70,3.6928
80,3.6722
90,3.6907
100,3.6485


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Sample generated story:
Write a story about a magical forest:


 
 It's been over four years since the last of us, and we've always felt our lives have ended in this one way. But that is what has now come to mind; an old man who died for nothing but love as he walked on his horse! He was so sweet with my heart I tried to n't let it go at once because when i came out from outside, I wanted to show them how wonderful things are... What would be if they knew everything? Well then no-one could even guess why their life had begun yet again… So there you go! You're right back here! The only thing left after being alone by me or something happened too far away isn’ t make any sense anymore! And just like most people do never want to know anything more than the ones around him.. No worries though.. They can tell all sorts of secrets before finally realizing these truths!! ” Why did she die!? She should still get rid of her family?! A woman saved herself off some time ago....she needed someone

In [None]:
# Recursively archive the saved model directory into a single zip file
# (presumably to download it from the runtime — confirm).
!zip -r distilgpt2-story-generator-final.zip distilgpt2-story-generator-final


  adding: distilgpt2-story-generator-final/ (stored 0%)
  adding: distilgpt2-story-generator-final/tokenizer_config.json (deflated 54%)
  adding: distilgpt2-story-generator-final/model.safetensors (deflated 7%)
  adding: distilgpt2-story-generator-final/special_tokens_map.json (deflated 60%)
  adding: distilgpt2-story-generator-final/merges.txt (deflated 53%)
  adding: distilgpt2-story-generator-final/vocab.json (deflated 59%)
  adding: distilgpt2-story-generator-final/config.json (deflated 51%)
  adding: distilgpt2-story-generator-final/training_args.bin (deflated 51%)
  adding: distilgpt2-story-generator-final/tokenizer.json (deflated 82%)
  adding: distilgpt2-story-generator-final/generation_config.json (deflated 24%)
