## 1. Install Dependencies

In [None]:
!pip install -q transformers==4.49.0 datasets==3.3.2 accelerate==1.4.0 bitsandbytes==0.45.3 trl==0.15.2 peft==0.14.0 sentencepiece protobuf

## 2. Mount Google Drive

In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths used by
# later cells are all located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

## 3. Import Libraries and Check GPU

In [None]:
import os

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from transformers.trainer_utils import get_last_checkpoint
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset

# Report the accelerator and framework version this session is running on.
has_cuda = torch.cuda.is_available()
gpu_name = torch.cuda.get_device_name(0) if has_cuda else "None"

print("CUDA available:", has_cuda)
print("GPU:", gpu_name)
print("PyTorch version:", torch.__version__)

## 4. Load and Prepare Dataset

In [None]:
# JSONL splits stored on the mounted Drive; the eval file is exposed as the
# "test" split.
data_files = {
    "train": "/content/drive/MyDrive/phishing_project/train_gemma.jsonl",
    "test": "/content/drive/MyDrive/phishing_project/eval_gemma.jsonl",
}

print("Loading datasets from Google Drive...")
dataset = load_dataset("json", data_files=data_files)

# Report the size of each split.
for label, split in (("Train", "train"), ("Eval", "test")):
    print(f"{label} samples: {len(dataset[split])}")

# Peek at the first raw training record.
print("\nSample data:")
print(dataset['train'][0])

## 5. Convert to Conversational Format

In [None]:
def create_conversation(sample):
    """
    Convert one JSONL sample to the conversational format used by SFTTrainer.

    Args:
        sample: Mapping with 'instruction', 'input' and 'output' keys.

    Returns:
        A dict with a single 'messages' key holding a two-turn conversation:
        a user turn (instruction and input separated by a blank line) and an
        assistant turn (the expected output).

    A missing (None) or empty 'instruction'/'input' is skipped rather than
    being rendered as the literal text "None" or leaving a dangling separator.
    """
    prompt_parts = [sample['instruction'], sample['input']]
    user_content = "\n\n".join(
        str(part) for part in prompt_parts if part is not None and part != ""
    )
    return {
        "messages": [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": sample['output']}
        ]
    }

# Convert datasets (guarded so the cell can be re-run safely: once converted,
# the raw 'instruction'/'input'/'output' columns are gone and mapping again
# would raise a KeyError inside create_conversation).
if "messages" in dataset["train"].column_names:
    print("Dataset already in conversational format; skipping conversion.")
else:
    print("Converting to conversational format...")
    dataset = dataset.map(create_conversation, remove_columns=list(dataset["train"].features), batched=False)

print("\nConverted sample:")
print(dataset['train'][0])

## 6. Configure Quantization (QLoRA)

In [None]:
# QLoRA quantization settings: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
quantization_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
bnb_config = BitsAndBytesConfig(**quantization_options)

print("âœ“ QLoRA configuration ready")

## 7. Load Tokenizer

**Note:** Using instruction-tuned tokenizer which has chat template support

In [None]:
print("Loading tokenizer from HuggingFace...")
# The instruction-tuned checkpoint is used for the tokenizer because it ships
# a chat template. trust_remote_code is intentionally not set: Gemma is
# supported natively by transformers, so there is no need to allow execution
# of code from the model repository.
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2-2b-it",  # Instruction-tuned tokenizer with chat template
)

# Fall back to EOS as the padding token if the tokenizer defines none
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

print("âœ“ Tokenizer loaded")

## 8. Load Model

**Downloads ~5GB - takes ~30 seconds on Colab**

In [None]:
print("Loading Gemma-2-2b model from HuggingFace...")
print("This will download ~5GB (takes ~30 seconds on Colab)")

# Load the base model with the 4-bit quantization config defined earlier.
# trust_remote_code is intentionally not set: Gemma-2 is implemented inside
# transformers itself, so no repository-provided code needs to be executed.
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2-2b",  # Downloads automatically
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    attn_implementation="eager",
)

print("âœ“ Model loaded successfully")

## 9. Configure LoRA

In [None]:
# LoRA configuration (as per Google's Gemma guide)
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.05,
    r=16,
    bias="none",
    target_modules="all-linear",  # Google's recommendation for Gemma
    task_type="CAUSAL_LM",
)

print("âœ“ LoRA configuration ready")

## 10. Configure Training

**Training Parameters:**
- Epochs: 3
- Batch size: 4 (effective 16 with gradient accumulation)
- Learning rate: 2e-4
- Optimizer: paged_adamw_8bit (memory efficient)
- **Results saved to Google Drive** for persistence

In [None]:
# Training arguments
training_args = SFTConfig(
    output_dir="/content/drive/MyDrive/phishing_project/gemma-2-2b-phishing",  # Save to Drive!
    num_train_epochs=3,
    per_device_train_batch_size=4,  # Can use larger batch on Colab T4
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="constant",
    report_to=[],
)

print("âœ“ Training configuration ready")
print(f"Output will be saved to: {training_args.output_dir}")

## 11. Create Trainer

In [None]:
print("Setting up SFTTrainer...")

# Wire together the quantized base model, the LoRA config, the tokenizer and
# the conversational training split. Only the train split is handed to the
# trainer here.
trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    peft_config=peft_config,
    train_dataset=dataset["train"],
    args=training_args,
)

print("âœ“ Trainer ready")

## 12. Train Model

**This will take ~2-3 hours on T4 GPU**

**Important:** A checkpoint is written to Google Drive at the end of every epoch, so if Colab disconnects you lose at most the epoch that was in progress, not the epochs already completed.

In [None]:
print("ðŸš€ Starting training...")
print("Estimated time: 2-3 hours for 3 epochs")
print("Model checkpoints will be saved to Google Drive")
print()

# If a previous (interrupted) run left checkpoints in the output directory,
# continue from the most recent one instead of starting over. The directory
# check avoids an error when no run has created the folder yet.
checkpoint_root = trainer.args.output_dir
last_checkpoint = (
    get_last_checkpoint(checkpoint_root) if os.path.isdir(checkpoint_root) else None
)
if last_checkpoint is not None:
    print(f"Resuming from checkpoint: {last_checkpoint}")

# resume_from_checkpoint=None is the default and means "train from scratch".
trainer.train(resume_from_checkpoint=last_checkpoint)

print("\nâœ… Training complete!")

## 13. Save Final Model

In [None]:
# Destination for the final artifacts (kept separate from the per-epoch checkpoints).
output_dir = "/content/drive/MyDrive/phishing_project/gemma-2-2b-phishing-final"
print("Saving final model to Google Drive...")

# Persist what the trainer saves for the model, plus the tokenizer, side by side.
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

# Confirmation followed by instructions for using the model locally.
usage_notes = (
    f"\nâœ… Model saved to: {output_dir}",
    "\nðŸ“¥ To use locally:",
    "1. Download the 'gemma-2-2b-phishing-final' folder from Google Drive",
    "2. Extract to 'artifacts/models/gemma-2-2b-phishing' in your local project",
    "3. Load with: AutoModelForCausalLM.from_pretrained('artifacts/models/gemma-2-2b-phishing')",
)
for note in usage_notes:
    print(note)

## 14. Quick Test (Optional)

In [None]:
# Test the trained model
from transformers import pipeline

# Load the trained model from the saved folder. The tokenizer saved next to it
# is passed explicitly so the chat template is available at inference time.
pipe = pipeline(
    "text-generation",
    model=output_dir,
    tokenizer=output_dir,
    device_map="auto",
)

# Test sample
test_prompt = """You are a security model. Classify the following email as 'phishing' or 'safe'. Reply with exactly one word: phishing or safe.

Subject: Verify your account now!
From: support@paypa1.com
Body: Your account has been suspended. Click here to verify immediately.

Classification:"""

# Training data was formatted as user/assistant messages, so the prompt is sent
# as a user turn and the pipeline applies the same chat template.
messages = [{"role": "user", "content": test_prompt}]

# Greedy decoding: a one-word classification should be deterministic, and a
# temperature value is only meaningful when sampling is enabled.
result = pipe(messages, max_new_tokens=10, do_sample=False)
print("\nTest Result:")
# For chat input, generated_text is the full message list; the final entry is
# the newly generated assistant reply.
print(result[0]['generated_text'][-1]['content'])

## Summary

**Training Complete!**

Your fine-tuned Gemma-2-2b model is saved in Google Drive at:
- Checkpoints: `/MyDrive/phishing_project/gemma-2-2b-phishing/`
- Final model: `/MyDrive/phishing_project/gemma-2-2b-phishing-final/`

**Next Steps:**
1. Download the model folder from Google Drive
2. Use it in your local ensemble with DeBERTa
3. Evaluate performance and create ensemble metrics

**Benefits over local training:**
- ✅ 2-3 hours vs 12-15 hours locally
- ✅ Free GPU (T4)
- ✅ Persistent storage in Google Drive
- ✅ No local environment conflicts