# 🚀 Train BLOOMZ-560M with FLORES Parallel Translation Data

## 📝 What This Does:
This notebook trains a LoRA adapter on **high-quality parallel translation data** (FLORES-101).

Unlike the previous attempt with monolingual data, this uses **26,117 English↔Indian language translation pairs** so the model learns to actually translate!

## 🎯 Expected Results:
- ✅ Proper translations (not gibberish)
- ✅ Follows "Translate to [Language]:" instructions
- ✅ Much better quality than base model alone

---

## 📋 Quick Start:

### Step 1: Upload Data to Google Drive
1. Upload `flores_training_data.txt` to your Google Drive
2. Note the exact file path

### Step 2: Enable GPU
1. Runtime → Change runtime type → GPU (T4)
2. Click Save

### Step 3: Run All Cells
1. Click Runtime → Run all
2. Grant Drive access when prompted
3. Wait ~45-60 minutes (on a T4 GPU)

### Step 4: Download & Test
1. Download `gurukul_flores_adapter.zip`
2. Extract to `adapters/gurukul_lite/`
3. Test with `test_colab_adapter.py`

---


In [None]:
# Cell 1: Install Dependencies & Check GPU
print("Installing dependencies...\n")
!pip install -q transformers datasets peft accelerate bitsandbytes scipy

import torch
print(f"\nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("\nWARNING: No GPU detected!")
    print("Go to Runtime -> Change runtime type -> Select GPU")


In [None]:
# Cell 2: Mount Google Drive & Load Data
# Mounts Drive, checks that the training file exists, and splits it into
# blank-line-separated translation pairs stored in the global `pairs` list.
from google.colab import drive
import os
import re

print("Mounting Google Drive...\n")
drive.mount('/content/drive')

# UPDATE THIS PATH to where you uploaded flores_training_data.txt
data_file = "/content/drive/MyDrive/flores_training_data.txt"

# Check if file exists
if not os.path.exists(data_file):
    print(f"\nERROR: File not found: {data_file}")
    print("\nPlease:")
    print("1. Upload flores_training_data.txt to your Google Drive")
    print("2. Update the 'data_file' path above to match your file location")
    print("3. Re-run this cell")
    raise FileNotFoundError(f"Missing {data_file}")

print(f"\nFound data file!")
print(f"Size: {os.path.getsize(data_file) / 1e6:.2f} MB")

# Read the file.
# 'utf-8-sig' reads plain UTF-8 as well, but also drops a leading byte-order
# mark so it cannot leak into the first training example.
print("\nLoading translation pairs...")
with open(data_file, 'r', encoding='utf-8-sig') as f:
    content = f.read()

# Split into pairs (each pair separated by a blank line). The pattern also
# accepts separator lines that contain only whitespace, which a plain
# split('\n\n') would miss and thereby merge two pairs into one.
pairs = [p.strip() for p in re.split(r'\n\s*\n', content) if p.strip()]

# Fail here with a clear message instead of with an IndexError in a later cell.
if not pairs:
    raise ValueError(f"No translation pairs found in {data_file}")

print(f"Loaded {len(pairs):,} translation pairs")

# A well-formed pair has at least two lines (prompt, then translation).
single_line_count = sum(1 for p in pairs if '\n' not in p)
if single_line_count:
    print(f"WARNING: {single_line_count:,} entries have only one line "
          "(expected a prompt line followed by a translation line)")

print(f"\nSample pairs:")
for i, pair in enumerate(pairs[:3], 1):
    lines = pair.split('\n')
    if len(lines) >= 2:
        print(f"\n{i}. {lines[0][:60]}...")
        print(f"   {lines[1][:60]}...")


In [None]:
# Cell 3: Load Model & Tokenizer
# Loads BLOOMZ-560M in 8-bit and its tokenizer into the globals `model` and
# `tokenizer`. The peft names imported here are used by the LoRA cell.
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training

print("Loading BLOOMZ-560M...\n")

# 8-bit quantization for memory efficiency.
# No compute dtype is passed: BitsAndBytesConfig only has a compute-dtype
# option for 4-bit loading (`bnb_4bit_compute_dtype`); an 8-bit variant of
# that argument does not exist and would simply be ignored.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model_name = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# BLOOM is implemented inside transformers itself, so there is no need to
# allow execution of repository-supplied code via trust_remote_code.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Set padding token (fallback only when the tokenizer does not define one)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

print("Model loaded!")
print(f"Model size: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M parameters")


In [None]:
# Cell 4: Prepare Training Data
# Wraps the loaded translation pairs in a Hugging Face Dataset with a single
# 'text' column.
from datasets import Dataset

print("Preparing training data...\n")

# Each pair is already one complete training text of the form
# "Translate to [Lang]: [English]\n[Translation]", so it is used unchanged.
texts = list(pairs)

dataset = Dataset.from_dict({'text': texts})

print(f"Dataset created: {len(dataset):,} examples")
print(f"\nFirst example:")
preview = dataset[0]['text'][:200]
print(preview + "...")


In [None]:
# Cell 5: Tokenize Data
# Converts the 'text' column into token ids for causal-LM training.
print("Tokenizing data...\n")

def tokenize_function(examples):
    """Tokenize a batch of training texts, each terminated with EOS.

    Args:
        examples: batch dict from `Dataset.map(batched=True)`; only the
            'text' list is read.

    Returns:
        The tokenizer output for the batch (input ids and attention mask).
    """
    # Mark the end of every translation explicitly. Without an end-of-sequence
    # token in the training text the model never sees where a translation
    # stops and tends to keep generating until max_new_tokens at inference.
    # NOTE(review): if pad_token was set equal to eos_token, the LM data
    # collator masks those positions in the labels — confirm the tokenizer
    # has a distinct pad token.
    eos = tokenizer.eos_token or ""
    texts_with_eos = [text + eos for text in examples['text']]
    return tokenizer(
        texts_with_eos,
        truncation=True,
        max_length=256  # Keep it short for translation pairs
    )

# Tokenize; the raw 'text' column is dropped afterwards
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['text'],
    desc="Tokenizing"
)

print(f"Tokenization complete!")
print(f"Dataset size: {len(tokenized_dataset):,} examples")
print(f"Sample token length: {len(tokenized_dataset[0]['input_ids'])}")


In [None]:
# Cell 6: Apply LoRA Adapter
# Prepares the quantized base model for training and wraps it with LoRA.
print("Applying LoRA adapter...\n")

# Prepare model.
# prepare_model_for_kbit_training() turns gradient checkpointing ON by default,
# and the Trainer flag `gradient_checkpointing=False` does not turn it back
# off. The training cell is tuned for speed with checkpointing disabled, so it
# is disabled explicitly here. Set this to True if you run out of GPU memory.
model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=False)

# LoRA configuration (proven settings for BLOOM)
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,  # Rank
    lora_alpha=16,  # Alpha
    lora_dropout=0.05,
    target_modules=['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h'],
    bias="none"
)

model = get_peft_model(model, lora_config)

# Count trainable vs. total parameters
trainable_tensors = [p for p in model.parameters() if p.requires_grad]
trainable_params = sum(p.numel() for p in trainable_tensors)
total_params = sum(p.numel() for p in model.parameters())

# Estimate adapter size from each tensor's real dtype instead of assuming
# 2 bytes (fp16) per parameter.
adapter_bytes = sum(p.numel() * p.element_size() for p in trainable_tensors)

print("LoRA applied!")
print(f"Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
print(f"Total params: {total_params:,}")
print(f"\nAdapter size: ~{adapter_bytes / 1e6:.1f} MB")


In [None]:
# Cell 7: Train the Adapter!
# Builds the Trainer, resumes from the newest checkpoint in output_dir if one
# exists, trains, and lists the checkpoints that were saved.
import os

from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling


def _checkpoint_step(dirname):
    """Return the step number of a 'checkpoint-<step>' name, or None otherwise."""
    prefix, _, step = dirname.partition("-")
    if prefix == "checkpoint" and step.isdecimal():
        return int(step)
    return None


print("Starting training...\n")
print("OPTIMIZED for speed - should take 45-60 minutes on T4 GPU\n")

# SPEED OPTIMIZATIONS:
# 1. Larger batch size (8 instead of 4) - 2x faster
# 2. Fewer epochs (3 instead of 5) - still good quality
# 3. Gradient accumulation of 2, keeping the effective batch size at 16
# 4. Gradient checkpointing not requested - faster but uses more VRAM
# 5. Frequent checkpoints every 500 steps

# Training configuration
training_args = TrainingArguments(
    output_dir="./adapter_training",
    num_train_epochs=3,  # Reduced from 5 - still effective
    per_device_train_batch_size=8,  # Increased from 4 - 2x faster!
    gradient_accumulation_steps=2,  # Effective batch size = 16 (same)
    learning_rate=3e-4,  # Slightly higher since fewer epochs
    fp16=True,  # Mixed precision for speed
    logging_steps=50,  # More frequent logging
    save_steps=500,  # CHECKPOINT every 500 steps (~every 6-8 minutes)
    save_total_limit=3,  # Keep last 3 checkpoints
    warmup_steps=100,  # Reduced warmup
    logging_dir="./logs",
    report_to="none",
    optim="adamw_torch",
    lr_scheduler_type="cosine",
    # NOTE: False only means the Trainer will not enable checkpointing itself;
    # it does not switch it off if it was already enabled on the model.
    gradient_checkpointing=False,  # DISABLED for speed (uses more VRAM but faster)
    dataloader_num_workers=2,  # Use 2 workers for faster data loading
    dataloader_pin_memory=True,  # Pin memory for faster GPU transfer
    max_grad_norm=1.0,  # Gradient clipping for stability
)

# Data collator (mlm=False -> causal language modeling)
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Train!
print("="*80)
print("TRAINING INFO:")
print(f"  Total steps: {len(tokenized_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps) * training_args.num_train_epochs}")
print(f"  Checkpoint every: {training_args.save_steps} steps (~6-8 minutes)")
print(f"  Checkpoints saved to: {training_args.output_dir}/")
print(f"  If training crashes, you can resume from last checkpoint!")
print("="*80)
print("\nStarting training...\n")

# Check if we have a checkpoint to resume from. Only names of the exact form
# 'checkpoint-<integer>' count, so stray entries such as 'checkpoint-backup'
# cannot crash the step parsing.
if os.path.isdir(training_args.output_dir):
    checkpoints = [d for d in os.listdir(training_args.output_dir) if _checkpoint_step(d) is not None]
else:
    checkpoints = []
resume_from_checkpoint = None

if checkpoints:
    # Pick the checkpoint with the highest step number
    latest_checkpoint = max(checkpoints, key=_checkpoint_step)
    resume_from_checkpoint = os.path.join(training_args.output_dir, latest_checkpoint)
    print(f"Found checkpoint: {latest_checkpoint}")
    print(f"Resuming training from step {_checkpoint_step(latest_checkpoint)}...\n")

# Train (will resume if checkpoint found)
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

print("\n" + "="*80)
print("TRAINING COMPLETE!")
print("="*80)
print(f"\nCheckpoints saved in: {training_args.output_dir}/")
print("You can find:")
# Sort by step number so that e.g. checkpoint-500 is listed before
# checkpoint-1000 (a plain string sort would reverse them).
saved_checkpoints = sorted(
    (d for d in os.listdir(training_args.output_dir) if _checkpoint_step(d) is not None),
    key=_checkpoint_step,
)
for checkpoint in saved_checkpoints:
    print(f"  - {checkpoint}")
print("="*80)


---

## 💾 CHECKPOINT MANAGEMENT

**Checkpoints are saved every 500 steps (~6-8 minutes).**

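If training crashes or Colab disconnects:
1. Just re-run Cell 7 - it will automatically resume from the last checkpoint! (If the runtime was restarted, re-run Cells 1-6 first so the model and data are loaded again.)
2. No need to start from scratch

**Note:** checkpoints are written to `adapter_training/` on the Colab runtime's local disk, so resuming only works while that runtime still exists. If Colab recycles the runtime, its local files (including checkpoints) are gone - download or copy to Drive any checkpoint you want to keep.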

**To manually download a checkpoint during training:**
- Navigate to the Files tab (📁) on the left
- Go to `adapter_training/checkpoint-XXXX/`
- Right-click → Download (or compress and download)

**Each checkpoint contains:**
- `adapter_model.safetensors` - The trained adapter weights
- `adapter_config.json` - Configuration
- `optimizer.pt` - Optimizer state (for resuming)
- `trainer_state.json` - Training progress

---


In [None]:
# Cell 9: Test the Trained Adapter
# Generates a translation for a few fixed prompts and prints each result.
print("Testing the trained adapter...\n")

test_prompts = [
    "Translate to Hindi: Hello friend, how are you?",
    "Translate to Bengali: Good morning, have a nice day.",
    "Translate to Tamil: Thank you very much.",
    "Translate to Telugu: Welcome to our school.",
    "Translate to Gujarati: How can I help you?",
]

model.eval()

print("="*80)
print("TEST RESULTS")
print("="*80)

for i, prompt in enumerate(test_prompts, 1):
    # Training texts have the form "Translate to [Lang]: [English]\n[Translation]",
    # so the instruction line is terminated with a newline here as well; the
    # model then continues directly with the translation.
    inputs = tokenizer(prompt + "\n", return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            min_new_tokens=10,
            temperature=0.3,  # Lower for more focused output
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # generate() returns prompt + continuation; decode only the new tokens so
    # "Output" shows the translation rather than an echo of the prompt.
    generated = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

    print(f"\n{i}. Prompt: {prompt}")
    print(f"   Output: {generated}")

print("\n" + "="*80)


In [None]:
# Cell 10: Save & Download Adapter
# Writes the adapter and tokenizer to a folder, zips it, and starts a
# browser download of the archive.
import shutil
from google.colab import files

print("Saving adapter...\n")

# Save adapter weights/config together with the tokenizer files
output_dir = "gurukul_flores_adapter"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

print(f"Adapter saved to {output_dir}/")

# Create ZIP file (make_archive appends the '.zip' extension itself)
zip_file = "gurukul_flores_adapter"
zip_path = zip_file + '.zip'
shutil.make_archive(zip_file, 'zip', output_dir)

print(f"\nCreated {zip_file}.zip")
print(f"Size: {os.path.getsize(zip_path) / 1e6:.2f} MB")

# Download
print("\nDownloading...")
files.download(zip_path)

banner = "="*80
print("\n" + banner)
print("ALL DONE!")
print(banner)
print("\nNext steps:")
for step in (
    "1. Extract gurukul_flores_adapter.zip to adapters/gurukul_lite/ on your PC",
    "2. Run: python test_colab_adapter.py",
    "3. Compare with base model results!",
):
    print(step)
print("\n" + banner)
