# 🚀 Multilingual Fine-tuning Training Script for Google Colab

This notebook fine-tunes a language model on multilingual data (Hindi, Sanskrit, Marathi, English) using LoRA/PEFT for efficient training.

## Features:
- ✅ All dependencies installed by the first cell
- ✅ Sample data generation for demo
- ✅ Memory-optimized for Colab's GPU constraints
- ✅ Automatic GPU detection and configuration
- ✅ Progress tracking and logging
- ✅ LoRA/PEFT support for efficient training

## Usage:
1. Enable GPU in Colab (Runtime > Change runtime type > GPU)
2. Run each cell in sequence
3. Download the zipped output (LoRA adapters plus tokenizer when PEFT is enabled) once training completes


In [None]:
# Install required packages.
# NOTE: the `%pip` lines are IPython magics, so this cell only runs inside a
# notebook kernel (e.g. Colab), not as a plain Python script.
# The index URL below points at the PyTorch wheel index for CUDA 12.1 (cu121).
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
# Hugging Face training stack plus bitsandbytes (needed for the 8-bit loading config)
%pip install -q transformers datasets accelerate peft bitsandbytes
# NOTE(review): langdetect is not imported by any visible cell - confirm it is needed
%pip install -q sentencepiece langdetect

# Verify installation: print the installed PyTorch build and, when a CUDA
# device is visible, the CUDA version PyTorch was built with and the GPU name.
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")


In [None]:
# Import libraries and setup
import logging
import os
import gc
import hashlib
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch
import numpy as np

# Set up logging.
# force=True replaces any handlers already attached to the root logger.
# Without it, basicConfig() silently does nothing when handlers exist (which
# notebook hosts commonly pre-install), and the INFO-level progress messages
# emitted by the later cells would never be shown.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    force=True,
)
logger = logging.getLogger(__name__)

# Configuration
MODEL_NAME = "AhinsaAI/ahinsa0.5-llama3.2-3B"  # Change this to your preferred model
OUTPUT_DIR = "fine_tuned_model"  # Checkpoints, adapters and tokenizer are written here
EPOCHS = 2
BATCH_SIZE = 1  # Per-device batch size for both training and evaluation
GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch = BATCH_SIZE * this value
# NOTE(review): with the bundled 20-line demo corpus the run has far fewer
# than 100 optimizer steps, so warmup never finishes - lower for tiny datasets.
WARMUP_STEPS = 100
LEARNING_RATE = 5e-5
MAX_LENGTH = 512  # Truncation length (tokens) used when tokenizing
USE_QUANTIZATION = True  # 8-bit loading; only applied when CUDA is available
USE_PEFT = True  # Wrap the model with LoRA adapters

print(f"🤖 Model: {MODEL_NAME}")
print(f"📊 Training Epochs: {EPOCHS}")
print(f"🔧 Quantization: {'Enabled' if USE_QUANTIZATION else 'Disabled'}")
print(f"🔧 PEFT/LoRA: {'Enabled' if USE_PEFT else 'Disabled'}")


In [None]:
# Utility functions
def clear_gpu_memory():
    """Run garbage collection, then release cached CUDA memory.

    The collection runs first on purpose: tensors that are only reachable
    through reference cycles are freed by ``gc.collect()``, and only after
    that can ``torch.cuda.empty_cache()`` hand their blocks back to the
    driver. Emptying the cache before collecting would leave that memory
    cached.

    Safe to call on CPU-only machines; the CUDA calls are skipped there.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Let queued kernels finish so their buffers are no longer in use
        torch.cuda.synchronize()
        torch.cuda.empty_cache()

def check_gpu_usage():
    """Log the current CUDA device and its memory counters.

    Returns:
        True when CUDA is available (device name plus allocated, reserved
        and total memory are logged in GB), False otherwise (a CPU notice is
        logged instead).
    """
    if not torch.cuda.is_available():
        logger.info("CUDA not available - using CPU")
        return False

    bytes_per_gb = 1024**3
    dev = torch.cuda.current_device()
    name = torch.cuda.get_device_name(dev)
    allocated_gb = torch.cuda.memory_allocated(dev) / bytes_per_gb
    reserved_gb = torch.cuda.memory_reserved(dev) / bytes_per_gb
    total_gb = torch.cuda.get_device_properties(dev).total_memory / bytes_per_gb

    logger.info(f"GPU: {name}")
    logger.info(
        f"GPU Memory - Allocated: {allocated_gb:.2f} GB, "
        f"Reserved: {reserved_gb:.2f} GB, Total: {total_gb:.2f} GB"
    )
    return True


# Report the GPU state once at notebook start
check_gpu_usage()


In [None]:
# Create sample multilingual training data
def create_sample_data():
    """Write a tiny four-language demo corpus under ``data/``.

    For each language, five sentences go to
    ``data/training/<lang>_train.txt`` (one per line) and the first two of
    those same sentences go to ``data/validation/<lang>_val.txt``. The
    validation lines therefore also appear in the training set, which is
    fine for a smoke test but not for a real evaluation.

    Existing files with the same names are overwritten. The created paths
    are printed once everything has been written.
    """
    corpus = {
        "hindi": [
            "मैं एक भारतीय हूं और मुझे हिंदी भाषा सीखना अच्छा लगता है।",
            "यह दुनिया बहुत सुंदर है और हमें इसे संरक्षित करना चाहिए।",
            "शिक्षा सबसे महत्वपूर्ण चीज है जो एक व्यक्ति को सफल बना सकती है।",
            "प्रेम और करुणा हमारे जीवन को सुंदर बनाते हैं।",
            "भारत विविधता में एकता का देश है।",
        ],
        "sanskrit": [
            "सर्वे भवन्तु सुखिनः सर्वे सन्तु निरामयाः।",
            "विद्या ददाति विनयं विनयाद्याति पात्रताम्।",
            "सत्यमेव जयते नानृतं सत्येन पन्था विततो देवयानः।",
            "अहिंसा परमो धर्मः धर्मस्य प्रतिष्ठा।",
            "वसुधैव कुटुम्बकम् इति सुभाषितम्।",
        ],
        "marathi": [
            "मी एक महाराष्ट्रीय आहे आणि मला मराठी भाषा आवडते।",
            "शिक्षण हे माणसाच्या जीवनातील सर्वात महत्वाची गोष्ट आहे।",
            "प्रेम आणि करुणा यांनी जग सुंदर बनतो।",
            "महाराष्ट्र हा संस्कृतीचा आणि परंपरेचा गौरवशाली राज्य आहे।",
            "एकता ही शक्तीचा स्रोत आहे।",
        ],
        "english": [
            "I am learning multiple languages to understand different cultures better.",
            "Education is the key to success and personal development.",
            "Love and compassion make the world a better place to live.",
            "India is a diverse country with unity in diversity.",
            "Technology has revolutionized the way we communicate and learn.",
        ],
    }

    # Make sure both target directories exist before writing
    for folder in ("data/training", "data/validation"):
        os.makedirs(folder, exist_ok=True)

    written_paths = []
    for language, sentences in corpus.items():
        train_path = f"data/training/{language}_train.txt"
        val_path = f"data/validation/{language}_val.txt"

        # Validation reuses the first two training sentences
        for path, subset in ((train_path, sentences), (val_path, sentences[:2])):
            with open(path, 'w', encoding='utf-8') as handle:
                handle.writelines(sentence + "\n" for sentence in subset)
            written_paths.append(path)

    print("✅ Sample data created successfully!")
    print("📁 Training data files:")
    for path in written_paths:
        print(f"  - {path}")

# Create sample data: writes the demo corpus files under data/training and
# data/validation (relative to the current working directory).
create_sample_data()


In [None]:
# Load model and tokenizer with optimizations
def load_model_and_tokenizer():
    """Load the tokenizer and causal LM, optionally 8-bit quantized and LoRA-wrapped.

    Reads the module-level settings MODEL_NAME, USE_QUANTIZATION and USE_PEFT.

    * 8-bit quantization is applied only when USE_QUANTIZATION is set and
      CUDA is available; otherwise the model is loaded in float32.
    * When both quantization and LoRA are active, the model is first passed
      through ``prepare_model_for_kbit_training``, the PEFT-documented
      preparation step for training on top of a quantized base model.
    * If the tokenizer has no pad token, the EOS token is reused.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    # Imported here so this cell does not depend on editing the import cell
    from peft import prepare_model_for_kbit_training

    clear_gpu_memory()

    # Load tokenizer
    logger.info(f"Loading tokenizer from {MODEL_NAME}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
        logger.info("Added EOS token as padding token")

    cuda_available = torch.cuda.is_available()

    # Configure quantization
    quantization_config = None
    if USE_QUANTIZATION and cuda_available:
        quantization_config = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )
        logger.info("🔧 Using 8-bit quantization")
    elif USE_QUANTIZATION:
        # Make the silent fallback visible: the float32 model is far larger
        logger.warning("Quantization requested but CUDA is not available - loading in float32")

    # Load model
    logger.info(f"Loading model from {MODEL_NAME}")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=quantization_config,
        device_map="auto" if cuda_available else None,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16 if quantization_config else torch.float32,
    )

    # Apply LoRA if enabled
    if USE_PEFT:
        if quantization_config is not None:
            # Must run before get_peft_model so the adapters are attached to
            # the prepared base model (by default this also turns on
            # gradient checkpointing, trading speed for memory).
            model = prepare_model_for_kbit_training(model)
            logger.info("🔧 Prepared quantized model for k-bit training")

        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        )

        model = get_peft_model(model, lora_config)
        logger.info("🔧 Applied LoRA adapters")
        model.print_trainable_parameters()
    elif quantization_config is not None:
        logger.warning(
            "Model is 8-bit quantized but USE_PEFT is disabled - a quantized model "
            "without trainable adapters is not expected to be trainable"
        )

    return model, tokenizer


# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer()
check_gpu_usage()


In [None]:
# Prepare training data
def _read_corpus_lines(directory, files_by_language, split_name, min_chars=10):
    """Collect the usable lines from one split's per-language text files.

    Args:
        directory: Folder that holds the files.
        files_by_language: Mapping of language label -> file name.
        split_name: Label used in log messages (e.g. "training").
        min_chars: Lines whose stripped length is not greater than this are
            dropped.

    Returns:
        list[str]: Stripped lines from every file that exists, in mapping order.
    """
    texts = []
    for lang, filename in files_by_language.items():
        filepath = os.path.join(directory, filename)
        if not os.path.exists(filepath):
            # Missing files are still skipped, but no longer silently
            logger.warning(f"No {lang} {split_name} file at {filepath} - skipping")
            continue

        logger.info(f"Loading {lang} {split_name} data from {filepath}")
        with open(filepath, 'r', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            kept = [text for text in stripped if len(text) > min_chars]
        texts.extend(kept)
        logger.info(f"Loaded {len(kept)} {lang} {split_name} samples")
    return texts


def load_training_data():
    """Read the raw training and validation text into ``datasets.Dataset`` objects.

    Looks for ``<lang>_train.txt`` under ``data/training`` and
    ``<lang>_val.txt`` under ``data/validation`` for hindi, sanskrit, marathi
    and english. Lines are stripped and kept only if longer than 10
    characters. No tokenization happens here.

    Returns:
        tuple: ``(train_dataset, eval_dataset)``, each with a single ``text``
        column. ``eval_dataset`` is None when no validation lines were found.
    """
    languages = ("hindi", "sanskrit", "marathi", "english")
    corpus_files = {lang: f"{lang}_train.txt" for lang in languages}
    val_files = {lang: f"{lang}_val.txt" for lang in languages}

    train_texts = _read_corpus_lines("data/training", corpus_files, "training")
    eval_texts = _read_corpus_lines("data/validation", val_files, "validation")

    logger.info(f"Total training samples: {len(train_texts)}")
    logger.info(f"Total validation samples: {len(eval_texts)}")
    if not train_texts:
        logger.warning("No training samples were loaded - check the files under data/training")

    # Create datasets
    train_dataset = Dataset.from_dict({"text": train_texts})
    eval_dataset = Dataset.from_dict({"text": eval_texts}) if eval_texts else None

    return train_dataset, eval_dataset

# Load the raw text datasets
train_dataset, eval_dataset = load_training_data()


def tokenize_fn(examples):
    """Tokenize a batch of rows from the ``text`` column.

    Sequences are truncated to MAX_LENGTH but deliberately NOT padded here.
    Padding inside a batched ``map`` pads every row to the longest row of
    the whole chunk and stores those pad tokens in the dataset; leaving rows
    unpadded lets the data collator pad each training batch only as far as
    that batch needs.

    Args:
        examples: Batch dict containing a ``text`` list.

    Returns:
        The tokenizer output for the batch (plain Python lists).
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding=False,
    )


# Tokenize datasets (the raw text column is dropped afterwards)
logger.info("Tokenizing training dataset...")
tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

tokenized_eval = None
if eval_dataset is not None:
    logger.info("Tokenizing validation dataset...")
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

logger.info("✅ Data tokenization completed!")


In [None]:
# Setup training
# Causal-LM collator (mlm=False): pads each batch and builds the labels.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

# Step-based evaluation and best-checkpoint reloading both need an eval set.
# Requesting them while tokenized_eval is None makes Trainer fail at
# construction, so they are enabled only when validation data exists.
evaluate_during_training = USE_PEFT and tokenized_eval is not None

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    learning_rate=LEARNING_RATE,
    fp16=False,
    dataloader_drop_last=True,
    dataloader_pin_memory=False,
    # The string "none" is what disables reporting integrations; Python None
    # does not, and can leave e.g. a wandb login prompt blocking the run.
    report_to="none",
    dataloader_num_workers=0,
    save_total_limit=2,
    max_grad_norm=1.0,
    save_strategy="steps" if USE_PEFT else "epoch",
    eval_strategy="steps" if evaluate_during_training else "no",
    load_best_model_at_end=evaluate_during_training,
    remove_unused_columns=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

# Add progress callback
from transformers import TrainerCallback


class ProgressCallback(TrainerCallback):
    """Log the step counter and the most recent training loss every 10 steps."""

    def on_step_end(self, args, state, control, **kwargs):
        """Emit one progress line when ``state.global_step`` is a multiple of 10.

        Step-level log entries store the loss under the ``loss`` key
        (``train_loss`` only appears in the end-of-training summary), and
        the newest entry may be an evaluation record without a loss at all.
        The history is therefore scanned backwards for the latest entry that
        has one; "N/A" is shown until a loss has been logged.
        """
        if state.global_step % 10 != 0:
            return

        latest_loss = next(
            (entry["loss"] for entry in reversed(state.log_history) if "loss" in entry),
            "N/A",
        )
        logger.info(f"Training step {state.global_step}/{state.max_steps} - Loss: {latest_loss}")


trainer.add_callback(ProgressCallback())

print("✅ Training setup completed!")


In [None]:
# Execute training: free cached memory, report the starting GPU footprint,
# then run the trainer. Any failure is logged, memory is released, and the
# exception is re-raised so the cell still fails visibly.
clear_gpu_memory()

if torch.cuda.is_available():
    logger.info(f"GPU memory before training: {torch.cuda.memory_allocated() / 1024**3:.2f} GB")

logger.info("🚀 Starting training...")
try:
    trainer.train()
except Exception as err:
    if isinstance(err, torch.cuda.OutOfMemoryError):
        # Out-of-memory gets its own message plus a tuning hint
        logger.error(f"CUDA out of memory error: {err}")
        logger.info("Try reducing BATCH_SIZE further or MAX_LENGTH")
    else:
        logger.error(f"Training error: {err}")
    clear_gpu_memory()
    raise
else:
    logger.info("✅ Training completed successfully!")


In [None]:
# Save model and create download package
logger.info(f"Saving model to {OUTPUT_DIR}")
if USE_PEFT:
    # Saving the PEFT-wrapped model writes the adapter files to OUTPUT_DIR
    model.save_pretrained(OUTPUT_DIR)
    logger.info("✅ Saved LoRA adapters")
else:
    trainer.save_model()

tokenizer.save_pretrained(OUTPUT_DIR)

# Create zip file for download. Entries are stored relative to OUTPUT_DIR so
# the archive unpacks without the leading directory name.
import zipfile
zip_filename = f"{OUTPUT_DIR}.zip"

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _dirs, files_in_dir in os.walk(OUTPUT_DIR):
        for file in files_in_dir:
            file_path = os.path.join(root, file)
            arcname = os.path.relpath(file_path, OUTPUT_DIR)
            zipf.write(file_path, arcname)

logger.info(f"✅ Model saved and packaged as {zip_filename}")

# Download the model. google.colab only exists inside Colab, so the import is
# guarded: elsewhere the cell finishes normally and the archive stays on disk
# instead of the cell failing after the work is already done.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    print(f"\n📥 Download your trained model:")
    files.download(zip_filename)
else:
    logger.info(f"google.colab is not available - skipping browser download; archive is at {zip_filename}")

print(f"\n🎉 Training completed! Model saved to: {OUTPUT_DIR}")
print(f"📦 Download package: {zip_filename}")
print("🤖 You can now use this model for inference!")


In [None]:
# Test the fine-tuned model (Optional)
def test_model():
    """Generate a short sampled continuation for one prompt per language.

    Uses the module-level ``model`` and ``tokenizer``. The model is switched
    to eval mode for the duration of the test so dropout (including LoRA
    dropout) does not perturb generation, and its previous train/eval mode
    is restored afterwards. Inputs are moved to the device that holds the
    model's parameters rather than unconditionally to CUDA.

    Prints each prompt and the decoded output; returns None.
    """
    # Test prompts in different languages
    test_prompts = [
        "मैं एक भारतीय हूं",  # Hindi
        "सर्वे भवन्तु सुखिनः",  # Sanskrit
        "मी एक महाराष्ट्रीय आहे",  # Marathi
        "I am learning multiple languages"  # English
    ]

    print("🧪 Testing fine-tuned model with sample prompts:")
    print("=" * 60)

    # Device of the first parameter; inputs must live where the model does
    device = next(model.parameters()).device
    was_training = model.training
    model.eval()
    try:
        for prompt in test_prompts:
            print(f"\n📝 Prompt: {prompt}")

            # Tokenize input and place it on the model's device
            inputs = tokenizer(prompt, return_tensors="pt").to(device)

            # Generate response
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=50,
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode response (the output sequence includes the prompt tokens)
            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            print(f"🤖 Response: {response}")
            print("-" * 40)
    finally:
        # Leave the model in whatever mode it was in before the test
        model.train(was_training)


# Test the model
test_model()
