In [1]:
%%capture
!pip install -U datasets
!pip install -U accelerate
!pip install -U peft
!pip install -U trl
!pip install -U bitsandbytes
!pip install git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3

In [2]:
%%capture
!pip install -q unsloth
!pip install -q --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [3]:
from huggingface_hub import login

hf_token = "hf_YOUR_TOKEN"
login(hf_token)

In [4]:
from unsloth import FastModel

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-12b-it",
    max_seq_length = 2048, # Choose any for long context
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    full_finetuning = False,
    token=hf_token
)


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.18: Fast Gemma3 patching. Transformers: 4.50.0.dev0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Using float16 precision for gemma3 won't work! Using float32.


model.safetensors.index.json:   0%|          | 0.00/259k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/3.01G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/192 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/70.0 [00:00<?, ?B/s]

chat_template.json:   0%|          | 0.00/1.61k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/670 [00:00<?, ?B/s]

In [5]:
from datasets import load_dataset

dataset_name = "bexgboost/openai-agents-python-qa-firecrawl"
dataset = load_dataset(
    dataset_name, split = "train", trust_remote_code=True
)


README.md:   0%|          | 0.00/480 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/138k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/20.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/387 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/44 [00:00<?, ? examples/s]

In [6]:
EOS_TOKEN = tokenizer.eos_token

def format_instruction(example):

    prompt = """Below is an instruction that describes a task, paired with an input that provides further context.
    Write a response that appropriately completes the request.
    Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

    ### Instruction:
    You are a web scraping expert with advanced knowledge in Firecrawl, which is an AI-based web-scraping engine.
    Please answer the following question about Firecrawl.

    ### Question:
    {}

    ### Response:
    {}"""

    return {
        "text": prompt.format(example['question'], example['answer']) + EOS_TOKEN
    }

dataset = dataset.map(format_instruction)

Map:   0%|          | 0/387 [00:00<?, ? examples/s]

In [7]:
def tokenize_function(examples):
    return tokenizer(examples["text"], padding=False)

# Manually preprocess and tokenize
processed_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],  # Remove original text column
    desc="Tokenizing dataset"
)

Tokenizing dataset:   0%|          | 0/387 [00:00<?, ? examples/s]

In [8]:
model = FastModel.get_peft_model(
    model,
    r=8,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    finetune_vision_layers = False, # Turn off for just text!
    finetune_language_layers = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules = True,  # Should leave on always!
    lora_alpha=8,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=1000,
    use_rslora=False
)

Unsloth: Making `model.base_model.model.vision_tower.vision_model.encoder` require gradients


In [9]:
from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = processed_dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        dataset_kwargs = {"skip_prepare_dataset": True},
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1,
        learning_rate = 2e-4,
        logging_steps = 5,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none",
    ),
)

Unsloth: Switching to float32 training since model cannot work with float16


In [None]:
FastModel.for_training(model)

In [16]:
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 387 | Num Epochs = 1 | Total steps = 48
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 34,228,224/12,000,000,000 (0.29% trained)


Step,Training Loss
5,0.7842
10,0.7762
15,0.7534
20,0.8133
25,0.7443
30,0.7127
35,0.7642
40,0.7458
45,0.6923


TrainOutput(global_step=48, training_loss=0.7500371287266413, metrics={'train_runtime': 702.0766, 'train_samples_per_second': 0.551, 'train_steps_per_second': 0.068, 'total_flos': 7244008055653824.0, 'train_loss': 0.7500371287266413})

In [17]:
from transformers import TextStreamer

prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}
"""

FastModel.for_inference(model)

instruction = "How do you create an agent using OpenAI Agents SDK?"
message = prompt.format(instruction, "")
inputs = tokenizer([message], return_tensors="pt").to("cuda")
text_streamer = TextStreamer(tokenizer)

_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=512, use_cache=True)

<bos>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
How do you create an agent using OpenAI Agents SDK?

### Response:

To create an agent using the OpenAI Agents SDK, you need to define the agent's configuration using the `AgentConfig` class. This configuration includes the agent's name, instructions, tools, and other properties. Here is a basic example:

```python
from openai_agents import AgentConfig

agent_config = AgentConfig(
    name="Assistant",
    instructions="You are a web scraping expert.",
    tools=[
        ToolConfig(
            name="search_engine",
            description="Use this tool to search the web.",
        )
    ]
)
```

This configuration defines an agent named "Assistant" with instructions to act as a web scraping expert and provides a tool called "search_engine" for web searches.<end_of_turn>
