# 🇸🇦 Arabic FunctionGemma Fine-tuning

**Dataset:** Sa74ll/arabic-mobile-actions (45,729 samples)
**Model:** google/functiongemma-270m-it
**GPU:** A100 (40GB)

---

## Key Features:
- ✅ Checkpoint every 500 steps to Google Drive
- ✅ Auto-resume from last checkpoint if Colab crashes
- ✅ Optimized for A100 GPU
- ✅ WandB logging

## 1. Mount Google Drive (IMPORTANT!)

In [None]:
import os

from google.colab import drive

# Attach Google Drive to the Colab filesystem; everything under
# /content/drive is backed by Drive rather than the runtime's local disk.
drive.mount('/content/drive')

# Single Drive folder that receives every training checkpoint.
# exist_ok=True makes re-running this cell harmless.
CHECKPOINT_DIR = '/content/drive/MyDrive/arabic_functiongemma_checkpoints'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)
print(f"‚úÖ Checkpoints will be saved to: {CHECKPOINT_DIR}")

## 2. Install Dependencies

In [None]:
%%capture
# The %%capture magic above hides this cell's output; remove it temporarily
# when an install fails and you need to read pip's messages.
# Step 1: install the released unsloth package (this also installs its dependencies).
!pip install unsloth
# Step 2: replace it with the current GitHub version; --no-deps leaves the
# dependencies installed in step 1 untouched.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
# Weights & Biases client, used later for experiment logging.
!pip install wandb

## 3. Load Model with LoRA

In [None]:
from unsloth import FastModel
import torch

# Show which accelerator is attached so a wrong runtime type is spotted
# before any model download starts.
gpu_properties = torch.cuda.get_device_properties(0)
print(f"üñ•Ô∏è GPU: {torch.cuda.get_device_name(0)}")
print(f"üíæ VRAM: {gpu_properties.total_memory / 1e9:.1f} GB")

# Base model: FunctionGemma 270M (instruction-tuned checkpoint).
base_model_options = {
    "model_name": "google/functiongemma-270m-it",
    "max_seq_length": 2048,
    "load_in_4bit": True,  # Memory efficient
    "dtype": None,
}
model, tokenizer = FastModel.from_pretrained(**base_model_options)

# LoRA adapter settings. Adapters are attached to the attention projections
# (q/k/v/o) and the MLP projections (gate/up/down).
lora_options = {
    "r": 32,                   # LoRA rank
    "lora_alpha": 64,          # 2x the rank
    "lora_dropout": 0.05,      # Slight regularization
    "target_modules": [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
}
model = FastModel.get_peft_model(model, **lora_options)

print("‚úÖ Model loaded with LoRA!")

## 4. Load Arabic Dataset

In [None]:
from datasets import load_dataset

# Fetch the "train" split of the Arabic mobile-actions dataset.
dataset = load_dataset("Sa74ll/arabic-mobile-actions", split="train")
sample_count = len(dataset)
print(f"üìä Dataset size: {sample_count:,} samples")

# Preview the first 100 characters of message index 1 in the first row.
# NOTE(review): index 1 is presumably the user turn (index 0 being a
# developer/system message) - confirm against the dataset schema.
first_row = dataset[0]
query_preview = first_row['messages'][1]['content'][:100]
print(f"\nüìù Sample query: {query_preview}...")

## 5. Process Dataset for Training

In [None]:
def process_dataset(row, tokenizer):
    """Render one dataset row into a single training string.

    Args:
        row: Mapping with a "messages" entry (the conversation) and a
            "tools" entry (the tool declarations passed to the template).
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        A dict with the single key "text" holding whatever the chat template
        returns when asked for untokenized output without a trailing
        generation prompt.
    """
    conversation = row["messages"]
    template_options = {
        "tools": row["tools"],
        "tokenize": False,               # we want the string, not token ids
        "add_generation_prompt": False,  # the row already ends with the reply
    }
    rendered = tokenizer.apply_chat_template(conversation, **template_options)
    return {"text": rendered}

# Process all samples
# Render every row to text. The tokenizer is handed to process_dataset
# through fn_kwargs, and the work is spread over 4 worker processes.
map_options = {
    "fn_kwargs": {"tokenizer": tokenizer},
    "num_proc": 4,  # Parallel processing
}
dataset = dataset.map(process_dataset, **map_options)

print("‚úÖ Dataset processed!")
first_text = dataset[0]['text']
print(f"üìè Sample text length: {len(first_text)} chars")

## 6. Setup WandB (Optional but Recommended)

In [None]:
import wandb

# Login to WandB (run once, then comment out)
# !wandb login

# Hyperparameters recorded alongside the run. These are logged values only:
# changing them here does not change how the trainer is configured.
run_config = {
    "model": "functiongemma-270m",
    "dataset": "Sa74ll/arabic-mobile-actions",
    "dataset_size": len(dataset),
    "lora_r": 32,
    "learning_rate": 1e-4,
    "batch_size": 8,
    "gradient_accumulation": 4,
    "effective_batch_size": 32,
}
run_tags = ["arabic", "function-calling", "gemma"]

wandb.init(
    project="Arabic-FunctionGemma",
    name="arabic-ft-v1",
    config=run_config,
    tags=run_tags,
)

## 7. Configure Training (A100 Optimized)

### Key Settings:
- **Batch size 8** (A100 can handle it)
- **Gradient accumulation 4** → Effective batch = 32
- **Save every 500 steps** to Google Drive
- **Auto-resume** from last checkpoint

In [None]:
from trl import SFTTrainer, SFTConfig

# Hold out 500 rows for evaluation; the fixed seed keeps the split stable
# across re-runs of this cell.
split_dataset = dataset.train_test_split(test_size=500, shuffle=True, seed=3407)
train_split = split_dataset["train"]
eval_split = split_dataset["test"]
print(f"üìä Train: {len(train_split):,} | Eval: {len(eval_split):,}")

# === TRAINING CONFIG (A100 OPTIMIZED) ===
training_args = SFTConfig(
    # --- Core training ---
    dataset_text_field="text",           # column produced by process_dataset
    max_steps=6000,                      # ~4 epochs
    per_device_train_batch_size=8,       # A100 can handle 8
    gradient_accumulation_steps=4,       # 8 x 4 = effective batch of 32

    # --- Learning rate schedule ---
    learning_rate=1e-4,                  # Conservative for Arabic
    warmup_steps=200,                    # Warm start
    lr_scheduler_type="cosine",          # Smooth decay
    weight_decay=0.01,

    # --- Checkpointing (CRITICAL!) ---
    save_strategy="steps",
    save_steps=500,                      # Save every 500 steps
    save_total_limit=5,                  # Keep last 5 checkpoints
    output_dir=CHECKPOINT_DIR,           # Save to Google Drive!

    # --- Evaluation ---
    eval_strategy="steps",
    eval_steps=500,                      # Evaluate every 500 steps

    # --- Logging ---
    logging_steps=25,
    logging_first_step=True,
    report_to="wandb",
    run_name="arabic-functiongemma-v1",

    # --- Optimizer ---
    optim="adamw_8bit",                  # Memory efficient

    # --- Misc ---
    seed=3407,
    bf16=True,                           # A100 supports bf16
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_split,
    eval_dataset=eval_split,
    args=training_args,
)

print("‚úÖ Trainer configured!")
print(f"üìÅ Checkpoints: {CHECKPOINT_DIR}")

print("‚úÖ Trainer configured!")
print(f"üìÅ Checkpoints: {CHECKPOINT_DIR}")

## 8. Train Only on Model Responses

This is crucial! We only want the model to learn the **response** part, not the user query.

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Literal turn headers from the chat template: the first opens a user turn,
# the second opens a model turn. They are passed to the helper as the
# instruction / response delimiters.
user_turn_marker = "<start_of_turn>user\n"
model_turn_marker = "<start_of_turn>model\n"

# Re-wrap the trainer so that training targets the response part only.
trainer = train_on_responses_only(
    trainer,
    instruction_part=user_turn_marker,
    response_part=model_turn_marker,
)

print("‚úÖ Set to train on responses only!")

## 9. Check Memory Before Training

In [None]:
gpu_stats = torch.cuda.get_device_properties(0)

# Byte counts are reported in GiB (1024**3 bytes), rounded to 3 decimals.
bytes_per_gib = 1024 ** 3

# Baseline snapshot taken before training; the post-training stats cell
# subtracts this value. Note that max_memory_reserved() is the peak
# reservation so far, not the instantaneous one.
start_gpu_memory = round(torch.cuda.max_memory_reserved() / bytes_per_gib, 3)
max_memory = round(gpu_stats.total_memory / bytes_per_gib, 3)
headroom = max_memory - start_gpu_memory

print(f"üñ•Ô∏è GPU: {gpu_stats.name}")
print(f"üíæ Max Memory: {max_memory} GB")
print(f"üìä Currently Reserved: {start_gpu_memory} GB")
print(f"‚úÖ Available for Training: {headroom:.1f} GB")

## 10. 🚀 Start Training!

### Important Notes:
- Training will take ~1-1.5 hours on A100
- Checkpoints save to Drive every 500 steps (~8 minutes)
- If Colab crashes, just re-run from Cell 1 - it will auto-resume!

In [None]:
import os


def _latest_resumable_checkpoint(checkpoint_dir):
    """Return the path of the newest usable checkpoint, or None.

    An entry of ``checkpoint_dir`` qualifies only when:
      * its name is exactly ``checkpoint-<digits>`` (anything else, e.g.
        ``checkpoint-backup``, is ignored instead of crashing ``int()``), and
      * it contains a ``trainer_state.json`` file.

    The second condition skips folders left half-written when the runtime
    died in the middle of a save, so the previous complete checkpoint is
    used instead of failing on the broken one.
    NOTE(review): this treats ``trainer_state.json`` as the "save finished"
    marker; confirm the installed transformers version writes it for every
    checkpoint.

    Args:
        checkpoint_dir: Directory that holds the ``checkpoint-*`` folders.

    Returns:
        Full path of the qualifying checkpoint with the highest step number,
        or None when the directory is missing or holds no usable checkpoint.
    """
    if not os.path.isdir(checkpoint_dir):
        return None

    candidates = []
    for entry in os.listdir(checkpoint_dir):
        prefix, _, step = entry.partition('-')
        if prefix != 'checkpoint' or not step.isdecimal():
            continue
        path = os.path.join(checkpoint_dir, entry)
        # isfile() on the state file also guarantees that `path` is a directory.
        if os.path.isfile(os.path.join(path, 'trainer_state.json')):
            candidates.append((int(step), path))

    if not candidates:
        return None
    # Compare numerically so that checkpoint-1000 sorts after checkpoint-500.
    return max(candidates, key=lambda item: item[0])[1]


# Check for existing checkpoints to resume from
resume_path = _latest_resumable_checkpoint(CHECKPOINT_DIR)

if resume_path:
    print(f"üîÑ Resuming from: {resume_path}")
else:
    print("üÜï Starting fresh training...")

print("\n" + "="*50)
print("üöÄ STARTING TRAINING")
print("="*50)

# resume_from_checkpoint=None starts from step 0; a path continues that run.
trainer_stats = trainer.train(resume_from_checkpoint=resume_path)

print("\n" + "="*50)
print("‚úÖ TRAINING COMPLETE!")
print("="*50)

# Close the Weights & Biases run opened earlier in the notebook.
wandb.finish()

## 11. Training Stats

In [None]:
# Peak GPU memory reserved over the whole session, in GiB (3 decimals).
peak_reserved_bytes = torch.cuda.max_memory_reserved()
used_memory = round(peak_reserved_bytes / 1024 / 1024 / 1024, 3)

# Growth of that peak relative to the snapshot taken before training.
used_memory_for_training = round(used_memory - start_gpu_memory, 3)

# Wall-clock duration reported by the trainer, in seconds.
train_seconds = trainer_stats.metrics['train_runtime']

print(f"‚è±Ô∏è Training Time: {train_seconds:.0f} seconds")
print(f"‚è±Ô∏è Training Time: {train_seconds/60:.1f} minutes")
print(f"üíæ Peak Memory: {used_memory} GB")
print(f"üíæ Memory for Training: {used_memory_for_training} GB")

## 12. Test the Fine-tuned Model

In [None]:
# Test queries in Arabic.
# These literals had been corrupted (UTF-8 bytes decoded with a legacy
# single-byte code page), so the model was being prompted with unreadable
# text instead of Arabic. They are restored to the intended sentences.
test_queries = [
    # "What is the weather in Riyadh?"
    "ما هو الطقس في الرياض؟",
    # "Book me an appointment with a dentist"
    "احجز لي موعد مع دكتور أسنان",
    # "I want to know the prayer times in Cairo" (Egyptian dialect)
    "عايز أعرف مواعيد الصلاة في القاهرة",
]

# Sample tools for testing.
# Only a weather tool is declared, so the 2nd and 3rd queries have no
# matching tool; they show how the model reacts in that situation.
test_tools = [
    {
        "type": "function",
        "function": {
            "name": "get_weather",
            "description": "Get weather for a city",
            "parameters": {
                "type": "object",
                "properties": {"city": {"type": "string"}},
                "required": ["city"]
            }
        }
    }
]

from transformers import TextStreamer

for query in test_queries:
    print(f"\n{'='*50}")
    print(f"üìù Query: {query}")
    print(f"{'='*50}")

    messages = [
        {"role": "developer", "content": "You are a helpful assistant that can use tools."},
        {"role": "user", "content": query}
    ]

    # Render the prompt as text, ending with the header of the model's turn
    # (add_generation_prompt=True) so generation continues from there.
    # The leading '<bos>' is stripped - presumably because the tokenizer call
    # below adds its own BOS token; confirm before changing.
    text = tokenizer.apply_chat_template(
        messages,
        tools=test_tools,
        tokenize=False,
        add_generation_prompt=True,
    ).removeprefix('<bos>')

    # Stream the reply to stdout as it is generated; skip_prompt=True keeps
    # the prompt itself out of the printed output. The return value of
    # generate() is not needed.
    _ = model.generate(
        **tokenizer(text, return_tensors="pt").to("cuda"),
        max_new_tokens=256,
        streamer=TextStreamer(tokenizer, skip_prompt=True),
        temperature=0.1,
        do_sample=True,
    )

## 13. Save Final Model

In [None]:
# Destination folder on Google Drive for the finished model files.
FINAL_MODEL_PATH = "/content/drive/MyDrive/arabic_functiongemma_final"

# Write the model (LoRA adapters) first, then the tokenizer, into the same
# folder so it can be reloaded from a single path.
for artifact in (model, tokenizer):
    artifact.save_pretrained(FINAL_MODEL_PATH)

print(f"‚úÖ Model saved to: {FINAL_MODEL_PATH}")

## 14. (Optional) Push to HuggingFace Hub

In [None]:
# Uncomment and run to push to HuggingFace

# HF_TOKEN = "your_token_here"
# HF_REPO = "Sa74ll/arabic-functiongemma-270m"

# model.push_to_hub(HF_REPO, token=HF_TOKEN)
# tokenizer.push_to_hub(HF_REPO, token=HF_TOKEN)

# print(f"‚úÖ Pushed to: https://huggingface.co/{HF_REPO}")

## 15. (Optional) Merge & Export to GGUF

In [None]:
# Uncomment to merge LoRA and save as full model

# model.save_pretrained_merged(
#     "/content/drive/MyDrive/arabic_functiongemma_merged",
#     tokenizer,
#     save_method="merged_16bit"
# )

# For GGUF export (for llama.cpp):
# model.save_pretrained_gguf(
#     "/content/drive/MyDrive/arabic_functiongemma_gguf",
#     tokenizer,
#     quantization_method="Q8_0"
# )