# 🚀 Train BLOOMZ-560M Adapter on YOUR Multilingual Data

## 📝 Instructions:

### Step 1: Upload Your Data (GOOGLE DRIVE - RECOMMENDED!)

**Option A: Google Drive (Faster & Easier)** ✅
1. Upload ALL `.txt` files from your `data/training/` folder to Google Drive:
   - Create a folder in Google Drive called `multilingual_training_data`
   - Upload: `hi_train.txt`, `bn_train.txt`, `ta_train.txt`, `te_train.txt`, etc.
2. In Cell 3 below, keep `USE_GOOGLE_DRIVE = True`
3. Update the `data_folder` path in Cell 3 so that it matches the name of your Drive folder

**Option B: Direct Upload to Colab** (Slower)
1. Set `USE_GOOGLE_DRIVE = False` in Cell 3
2. Click the **folder icon** on the left sidebar in Colab
3. Create a folder called `training_data`
4. Manually upload all `.txt` files (this can take 10-15 minutes)

### Step 2: Enable GPU
- Click **Runtime → Change runtime type**
- Select **GPU** (T4 is free)
- Click **Save**

### Step 3: Run All Cells
- Click **Runtime → Run all**
- Wait ~20-30 minutes for training

### Step 4: Download
- Download the `gurukul_adapter.zip` file (the last cell starts the download for you)
- Extract it to `adapters/gurukul_lite/` on your PC

---


In [None]:
# Cell 1: Install Dependencies & Check GPU
print("📦 Installing dependencies...\n")
!pip install -q transformers datasets peft accelerate bitsandbytes scipy

import torch
print(f"\n✅ GPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"   GPU: {torch.cuda.get_device_name(0)}")
    print(f"   VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("⚠️ WARNING: No GPU detected. Training will be VERY slow!")
    print("   Go to Runtime → Change runtime type → Select GPU")


In [None]:
# Cell 2: Load Model & Prepare Training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import Dataset
import glob
import random

print("🤖 Loading BLOOMZ-560M...\n")

# Quantization config for memory efficiency: load the weights in 8-bit.
# BitsAndBytesConfig has no 8-bit compute-dtype option (only
# `bnb_4bit_compute_dtype` exists), so the previous `bnb_8bit_compute_dtype`
# argument was discarded as an unused kwarg and has been removed.
# NOTE(review): transformers is expected to default the non-quantized modules
# to float16 when loading in 8-bit — confirm for the installed version.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model_name = "bigscience/bloomz-560m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# BLOOM is implemented inside transformers itself, so there is no need to
# grant permission to execute code from the model repository
# (`trust_remote_code=True` was dropped).
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

# Set padding token: fall back to EOS only when the tokenizer defines none,
# and keep the model config in sync with that choice.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    model.config.pad_token_id = tokenizer.eos_token_id

print("✅ Model loaded!\n")

# Training config (adjust these if needed). Read by the data-loading,
# tokenization, LoRA and training cells below.
config = {
    'max_samples': 5000,      # Total samples across all languages
    'max_length': 256,        # Max token length
    'num_epochs': 3,          # Training epochs
    'batch_size': 4,          # Batch size
    'learning_rate': 3e-4,    # Learning rate
    'lora_r': 8,              # LoRA rank
    'lora_alpha': 16,         # LoRA alpha
    'lora_dropout': 0.05      # LoRA dropout
}

print("📊 Training Config:")
for k, v in config.items():
    print(f"   {k}: {v}")


In [None]:
# Cell 3: Load YOUR Training Data
import os

print("📂 Loading your training data...\n")

# Data source switch:
#   True  -> read the .txt files from a mounted Google Drive folder
#            (RECOMMENDED - faster!)
#   False -> read them from a `training_data` folder uploaded to Colab
USE_GOOGLE_DRIVE = True  # Change to False if uploading directly to Colab

if USE_GOOGLE_DRIVE:
    from google.colab import drive
    print("🔗 Mounting Google Drive...")
    drive.mount('/content/drive')

    # CHANGE THIS PATH to where you uploaded your training files in Google Drive
    # Example: If you created a folder "My Drive/multilingual_training_data/"
    data_folder = "/content/drive/MyDrive/multilingual_training_data"

    print(f"✅ Google Drive mounted!")
    print(f"   Looking for data in: {data_folder}\n")
else:
    # OPTION 2: Direct upload to Colab
    data_folder = "training_data"

# Check if data folder exists; print recovery steps for the chosen option
# before aborting the cell.
if not os.path.exists(data_folder):
    print(f"❌ ERROR: Folder '{data_folder}' not found!")
    if USE_GOOGLE_DRIVE:
        print("\n⚠️ Please:")
        print("   1. Upload your training .txt files to Google Drive")
        print("   2. Update 'data_folder' path above to match your Drive folder")
        print("   3. Re-run this cell")
    else:
        print("\n⚠️ Please:")
        print("   1. Click the folder icon on the left")
        print("   2. Create a folder called 'training_data'")
        print("   3. Upload all your .txt files from data/training/")
        print("   4. Re-run this cell")
    raise FileNotFoundError(f"Missing {data_folder}/")

# Find all .txt files. glob returns them in arbitrary order, so sort to make
# the listing and the sampling below independent of filesystem ordering.
txt_files = sorted(glob.glob(f"{data_folder}/*.txt"))

if not txt_files:
    print(f"❌ ERROR: No .txt files found in {data_folder}/")
    print("\n⚠️ Please upload your training files!")
    raise FileNotFoundError("No training data found")

print(f"✅ Found {len(txt_files)} files:\n")
for f in txt_files:
    size_mb = os.path.getsize(f) / 1e6
    print(f"   • {os.path.basename(f):20s} ({size_mb:.2f} MB)")

# Load and combine data from all files
all_texts = []
lang_counts = {}

# Dedicated, seeded generator so the same files always yield the same sample
# without touching the global `random` state. Override with config['seed'].
rng = random.Random(config.get('seed', 42))

# Every file gets an equal share of the overall sample budget.
per_file_budget = config['max_samples'] // len(txt_files)

for filepath in txt_files:
    lang_code = os.path.basename(filepath).split('_')[0]  # e.g., 'hi' from 'hi_train.txt'

    # Keep only non-blank lines, stripped of surrounding whitespace.
    with open(filepath, 'r', encoding='utf-8') as f:
        lines = [line.strip() for line in f if line.strip()]

    # Sample from each file (a short file contributes all of its lines)
    samples_per_file = min(len(lines), per_file_budget)
    sampled = rng.sample(lines, samples_per_file)

    all_texts.extend(sampled)
    # Accumulate rather than assign: several files may share one language
    # prefix (e.g. 'hi_train.txt' and 'hi_val.txt').
    lang_counts[lang_code] = lang_counts.get(lang_code, 0) + len(sampled)

# Fail here with a clear message instead of later during tokenization. This
# happens when every file is blank or there are more files than max_samples.
if not all_texts:
    print("❌ ERROR: No non-empty lines could be sampled from the training files!")
    raise ValueError("No training samples loaded")

print(f"\n📊 Samples per language:")
for lang, count in sorted(lang_counts.items()):
    print(f"   {lang}: {count:,}")

print(f"\n📈 Total samples: {len(all_texts):,}")

# Shuffle so languages are interleaved instead of grouped by file
rng.shuffle(all_texts)

# Create dataset with a single 'text' column
dataset = Dataset.from_dict({'text': all_texts})
print(f"\n✅ Dataset created with {len(dataset):,} examples")


In [None]:
# Cell 4: Tokenize Data
print("🔤 Tokenizing data...\n")


def tokenize_function(examples):
    """Tokenize one batch of rows taken from the dataset's 'text' column.

    Sequences longer than config['max_length'] tokens are truncated. No
    padding or labels are added here; that is left to the data collator
    set up in the training cell.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True, max_length=config['max_length'])


# Run the tokenizer over the whole dataset, 100 rows per call, and drop the
# raw 'text' column from the result.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    batch_size=100,
    remove_columns=['text'],
    desc="Tokenizing",
)

print(f"\n✅ Tokenization complete!")
print(f"   Dataset size: {len(tokenized_dataset):,} examples")

# Sanity check: show the first example's token ids
sample = tokenized_dataset[0]
first_ids = sample['input_ids']
print(f"   Sample input length: {len(first_ids)}")
print(f"   First 10 tokens: {first_ids[:10]}")


In [None]:
# Cell 5: Apply LoRA Adapter
print("🔧 Applying LoRA adapter...\n")

# Ready the quantized base model for k-bit training
model = prepare_model_for_kbit_training(model)

# Names of the modules that receive LoRA adapters
lora_targets = ['query_key_value', 'dense', 'dense_h_to_4h', 'dense_4h_to_h']

# LoRA hyper-parameters are taken from the shared `config` dict
lora_config = LoraConfig(
    r=config['lora_r'],
    lora_alpha=config['lora_alpha'],
    lora_dropout=config['lora_dropout'],
    target_modules=lora_targets,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the adapter
model = get_peft_model(model, lora_config)

# Count trainable vs. total parameters in a single pass over the model
trainable_params = 0
total_params = 0
for param in model.parameters():
    n_elements = param.numel()
    total_params += n_elements
    if param.requires_grad:
        trainable_params += n_elements

print(f"✅ LoRA applied!")
print(f"   Trainable params: {trainable_params:,} ({100 * trainable_params / total_params:.2f}%)")
print(f"   Total params: {total_params:,}")


In [None]:
# Cell 6: Train the Model!
print("🚀 Starting training...\n")
print("⏱️ This will take 20-30 minutes depending on GPU.\n")

# Training arguments, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs go
    output_dir="./adapter_training",
    logging_dir="./logs",
    report_to="none",
    # Schedule length and batch shape (values come from `config`)
    num_train_epochs=config['num_epochs'],
    per_device_train_batch_size=config['batch_size'],
    gradient_accumulation_steps=4,
    # Optimizer and learning-rate schedule
    optim="adamw_torch",
    learning_rate=config['learning_rate'],
    lr_scheduler_type="cosine",
    warmup_steps=100,
    # Memory / speed trade-offs
    fp16=True,  # Mixed precision for speed
    gradient_checkpointing=True,
    dataloader_num_workers=0,  # Use 0 to avoid multiprocessing issues
    # Logging and checkpoint cadence
    logging_steps=50,
    save_steps=500,
    save_total_limit=2,
)

# Built-in language-modeling collator; mlm=False selects causal LM
# (not masked LM)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer: adapter-wrapped model + tokenized dataset from the earlier cells
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

# Train!
trainer.train()

# Completion banner
separator = "="*80
print("\n" + separator)
print("🎉 TRAINING COMPLETE!")
print(separator)


In [None]:
# Cell 7: Test the Trained Adapter
print("🧪 Testing the trained adapter...\n")

# Prompts used for a quick qualitative check of the adapter
test_prompts = [
    "Translate to Hindi: Hello friend, how are you?",
    "Translate to Bengali: Good morning.",
    "Translate to Tamil: Thank you.",
]

# Switch the model to evaluation mode before generating
model.eval()

# Sampling settings shared by every prompt
generation_kwargs = dict(
    max_new_tokens=50,
    min_new_tokens=10,
    temperature=0.7,
    do_sample=True,
    top_p=0.9,
    repetition_penalty=1.2,
    no_repeat_ngram_size=3,
    pad_token_id=tokenizer.pad_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

for idx, prompt in enumerate(test_prompts, start=1):
    # Encode the prompt and move the tensors to the model's device
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    # No gradients are needed for generation
    with torch.no_grad():
        outputs = model.generate(**encoded, **generation_kwargs)

    # Decode the first (only) returned sequence
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print(f"{idx}. Prompt: {prompt}")
    print(f"   Output: {generated}")
    print()


In [None]:
# Cell 8: Save & Package Adapter
print("💾 Saving adapter...\n")

# Write the adapter and the tokenizer into the same output directory
adapter_dir = "gurukul_adapter"
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

# Zip the output directory as gurukul_adapter.zip
import shutil
shutil.make_archive(adapter_dir, 'zip', adapter_dir)

print("✅ Adapter saved!\n")
print("📦 Download 'gurukul_adapter.zip' and extract to adapters/gurukul_lite/\n")

# Hand the archive to the browser via Colab's download helper
from google.colab import files
print("⬇️ Click below to download:")
files.download('gurukul_adapter.zip')

# Closing banner followed by the follow-up checklist
separator = "="*80
print("\n" + separator)
print("🎉 ALL DONE!")
print(separator)
print("\nNext steps:")
for step in (
    "1. Download gurukul_adapter.zip",
    "2. Extract to adapters/gurukul_lite/ on your PC",
    "3. Test with: python test_colab_adapter.py",
):
    print(step)
