# 🚀 Multilingual Fine-tuning Training Script for Google Colab

This notebook fine-tunes a language model on multilingual data (Hindi, Sanskrit, Marathi, English) using LoRA/PEFT for efficient training.

## Features:
- ✅ All dependencies included
- ✅ Sample data generation for demo
- ✅ Memory-optimized for Colab's GPU constraints
- ✅ Automatic GPU detection and configuration
- ✅ Progress tracking and logging
- ✅ LoRA/PEFT support for efficient training

## Usage:
1. Enable GPU in Colab (Runtime > Change runtime type > GPU)
2. Run each cell in sequence
3. **Note**: The notebook is configured to disable all external logging (including Weights & Biases), so no W&B API key is needed; if you are prompted for one, make sure the W&B-disabling cells were run
4. Download the fine-tuned model when complete


In [None]:
# Install required packages
%pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
%pip install -q transformers datasets accelerate peft bitsandbytes
%pip install -q sentencepiece langdetect

# Verify installation
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")


In [None]:
# Import libraries and setup
import logging
import os
import gc
import hashlib
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
import torch
import numpy as np

# Set up logging: timestamped INFO-level records; `logger` is the module-level
# logger used by the rest of the notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Configuration
# -- model / output
MODEL_NAME = "AhinsaAI/ahinsa0.5-llama3.2-3B"  # Change this to your preferred model
OUTPUT_DIR = "fine_tuned_model"
# -- optimisation schedule
EPOCHS = 2
BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 4
WARMUP_STEPS = 100
LEARNING_RATE = 5e-5
# -- sequence length and memory-saving switches
MAX_LENGTH = 512
USE_QUANTIZATION = True
USE_PEFT = True


def _on_off(flag):
    """Render a boolean switch as 'Enabled' / 'Disabled' for the summary."""
    if flag:
        return 'Enabled'
    return 'Disabled'


# Echo the effective configuration.
print("ü§ñ Model:", MODEL_NAME)
print("üìä Training Epochs:", EPOCHS)
print("üîß Quantization:", _on_off(USE_QUANTIZATION))
print("üîß PEFT/LoRA:", _on_off(USE_PEFT))


In [None]:
# Utility functions
def clear_gpu_memory():
    """Run garbage collection, then release cached CUDA memory.

    Safe to call on CPU-only machines: the CUDA calls are skipped when no
    GPU is available.

    Returns:
        None
    """
    # Collect first: empty_cache() only returns *unoccupied* cached blocks, so
    # tensors that are merely unreachable (e.g. held in reference cycles) must
    # be freed by the collector before their memory can be released.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.synchronize()

def check_gpu_usage():
    """Log the current CUDA device and its memory figures.

    Returns:
        True when CUDA is available (device name and allocated / reserved /
        total memory in GB are logged), False otherwise (a CPU notice is
        logged instead).
    """
    if not torch.cuda.is_available():
        logger.info("CUDA not available - using CPU")
        return False

    bytes_per_gb = 1024**3
    dev = torch.cuda.current_device()
    name = torch.cuda.get_device_name(dev)
    allocated = torch.cuda.memory_allocated(dev) / bytes_per_gb
    reserved = torch.cuda.memory_reserved(dev) / bytes_per_gb
    total = torch.cuda.get_device_properties(dev).total_memory / bytes_per_gb

    logger.info(f"GPU: {name}")
    logger.info(f"GPU Memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB, Total: {total:.2f} GB")
    return True


# Check GPU
check_gpu_usage()


## 📁 Data Configuration

**IMPORTANT**: Make sure your Google Drive has the following folder structure:

```
Google Drive/
└── Data/
    ├── training/
    │   ├── hi_train.txt    (Hindi training data)
    │   ├── sa_train.txt    (Sanskrit training data)
    │   ├── mr_train.txt    (Marathi training data)
    │   └── en_train.txt    (English training data)
    └── validation/
        ├── hi_val.txt      (Hindi validation data)
        ├── sa_val.txt      (Sanskrit validation data)
        ├── mr_val.txt      (Marathi validation data)
        └── en_val.txt      (English validation data)
```

**If your files have different names**, modify the `TRAINING_FILES` and `VALIDATION_FILES` dictionaries in the file-name configuration cell below to match your actual file names.


In [None]:
# Mount Google Drive and derive the training / validation data folders
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Data folders inside the mounted Drive
DRIVE_DATA_PATH = "/content/drive/MyDrive/Data"  # Adjust this path if your folder is different
TRAINING_DATA_PATH = os.path.join(DRIVE_DATA_PATH, "training")
VALIDATION_DATA_PATH = os.path.join(DRIVE_DATA_PATH, "validation")

print("üîó Google Drive mounted successfully!")
for caption, location in (
    ("üìÅ Data path", DRIVE_DATA_PATH),
    ("üìÅ Training data path", TRAINING_DATA_PATH),
    ("üìÅ Validation data path", VALIDATION_DATA_PATH),
):
    print(f"{caption}: {location}")

# Report whether each folder exists and, if so, what it contains
if not os.path.exists(TRAINING_DATA_PATH):
    print(f"‚ùå Training data folder not found at: {TRAINING_DATA_PATH}")
    print("Please check your Google Drive folder structure")
else:
    print("‚úÖ Training data folder found!")
    train_files = os.listdir(TRAINING_DATA_PATH)
    print(f"üìÑ Training files: {train_files}")

if not os.path.exists(VALIDATION_DATA_PATH):
    print(f"‚ùå Validation data folder not found at: {VALIDATION_DATA_PATH}")
    print("Please check your Google Drive folder structure")
else:
    print("‚úÖ Validation data folder found!")
    val_files = os.listdir(VALIDATION_DATA_PATH)
    print(f"üìÑ Validation files: {val_files}")


In [None]:
# Per-language corpus file names.
# Edit the values below if your files are named differently.

# Training file names, keyed by language
TRAINING_FILES = dict(
    hindi="hi_train.txt",
    sanskrit="sa_train.txt",
    marathi="mr_train.txt",
    english="en_train.txt",
)

# Validation file names, keyed by language
VALIDATION_FILES = dict(
    hindi="hi_val.txt",
    sanskrit="sa_val.txt",
    marathi="mr_val.txt",
    english="en_val.txt",
)

# Echo the configuration, one section per split
print("üìã Current file configuration:")
for heading, mapping in (
    ("Training files:", TRAINING_FILES),
    ("\nValidation files:", VALIDATION_FILES),
):
    print(heading)
    for lang, filename in mapping.items():
        print(f"  {lang}: {filename}")

print("\nüîç If you need to change file names, modify the TRAINING_FILES and VALIDATION_FILES dictionaries above.")
print("üìÅ Make sure your Google Drive Data folder has the same structure as shown in the previous cell.")


In [None]:
# Turn off Weights & Biases reporting and tokenizer worker parallelism
import os

os.environ.update({
    "WANDB_DISABLED": "true",
    "TOKENIZERS_PARALLELISM": "false",
})

# If wandb is installed, start it in disabled mode; a missing package is fine.
try:
    import wandb
    wandb.init(mode="disabled")
except ImportError:
    pass

for notice in (
    "‚úÖ Disabled Weights & Biases and other logging integrations",
    "üìä Training progress will be shown in console logs only",
):
    print(notice)


In [None]:
# Extra environment switches so W&B never prompts for credentials
import os

for env_name, env_value in {
    "WANDB_DISABLED": "true",
    "WANDB_MODE": "disabled",
    "TOKENIZERS_PARALLELISM": "false",
}.items():
    os.environ[env_name] = env_value

# When wandb is importable, also blank out the project name; either way,
# report which situation applied.
try:
    import wandb
    os.environ["WANDB_PROJECT"] = ""
    wandb_status = "‚úÖ Weights & Biases disabled via environment variables"
except ImportError:
    wandb_status = "‚úÖ Weights & Biases not installed - no action needed"
print(wandb_status)

print("üìä All external logging integrations disabled")


In [None]:
# Load model and tokenizer with optimizations
def load_model_and_tokenizer():
    """Load the tokenizer and causal LM named by MODEL_NAME.

    When USE_QUANTIZATION is set and CUDA is available the model is loaded
    with an 8-bit bitsandbytes configuration; when USE_PEFT is set it is
    wrapped with LoRA adapters. If the tokenizer has no padding token, its
    EOS token is used for padding.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    clear_gpu_memory()

    # Tokenizer
    logger.info(f"Loading tokenizer from {MODEL_NAME}")
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)

    # 8-bit quantization is only configured when a GPU is present
    has_cuda = torch.cuda.is_available()
    quant_cfg = None
    if USE_QUANTIZATION and has_cuda:
        quant_cfg = BitsAndBytesConfig(
            load_in_8bit=True,
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
        )
        logger.info("üîß Using 8-bit quantization")

    # Base model: fp16 alongside quantization, fp32 otherwise
    logger.info(f"Loading model from {MODEL_NAME}")
    load_kwargs = {
        "quantization_config": quant_cfg,
        "device_map": "auto" if has_cuda else None,
        "low_cpu_mem_usage": True,
        "torch_dtype": torch.float16 if quant_cfg else torch.float32,
    }
    net = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **load_kwargs)

    # LoRA adapters on the attention and MLP projection layers
    # NOTE(review): PEFT's quantization guide recommends calling
    # prepare_model_for_kbit_training() on k-bit models before
    # get_peft_model() - confirm whether that should be added here.
    if USE_PEFT:
        adapter_cfg = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=16,
            lora_alpha=32,
            lora_dropout=0.1,
            target_modules=[
                "q_proj", "k_proj", "v_proj", "o_proj",
                "gate_proj", "up_proj", "down_proj",
            ],
        )
        net = get_peft_model(net, adapter_cfg)
        logger.info("üîß Applied LoRA adapters")
        net.print_trainable_parameters()

    # Fall back to EOS as the padding token when none is defined
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
        logger.info("Added EOS token as padding token")

    return net, tok


# Load model and tokenizer
model, tokenizer = load_model_and_tokenizer()
check_gpu_usage()


In [None]:
# Prepare training data from Google Drive
def _read_split_texts(directory, files, split, min_chars=10):
    """Read every configured corpus file of one split into a list of lines.

    Args:
        directory: Folder that holds the files of this split.
        files: Mapping of language name -> file name inside ``directory``.
        split: "training" or "validation"; only used in log messages.
        min_chars: Lines whose stripped length is not greater than this are
            dropped.

    Returns:
        The stripped, length-filtered lines of all files that exist, in the
        iteration order of ``files``. Missing files are logged and skipped.
    """
    texts = []
    for lang, filename in files.items():
        filepath = os.path.join(directory, filename)
        if not os.path.exists(filepath):
            logger.warning(f"{split.capitalize()} file not found: {filepath}")
            listing = os.listdir(directory) if os.path.exists(directory) else 'Directory not found'
            logger.info(f"Available files in {split} directory: {listing}")
            continue

        logger.info(f"Loading {lang} {split} data from {filepath}")
        # utf-8-sig reads plain UTF-8 unchanged but drops a leading BOM, which
        # would otherwise end up glued to the first sample of the file.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            # Stream the file instead of materialising it with readlines().
            kept = [text for text in (line.strip() for line in f) if len(text) > min_chars]
        texts.extend(kept)
        logger.info(f"Loaded {len(kept)} {lang} {split} samples")
    return texts


def load_training_data():
    """Load the training and validation corpora from Google Drive.

    Reads the files named in TRAINING_FILES / VALIDATION_FILES from
    TRAINING_DATA_PATH / VALIDATION_DATA_PATH and wraps the resulting lines
    in ``datasets.Dataset`` objects with a single "text" column. This
    function does not tokenize.

    Returns:
        ``(train_dataset, eval_dataset)``. ``(None, None)`` when no training
        text could be loaded; ``eval_dataset`` alone is ``None`` when no
        validation text could be loaded.
    """
    train_texts = _read_split_texts(TRAINING_DATA_PATH, TRAINING_FILES, "training")
    eval_texts = _read_split_texts(VALIDATION_DATA_PATH, VALIDATION_FILES, "validation")

    if not train_texts:
        logger.error("‚ùå No training data loaded! Please check your file paths and names.")
        # Remind the user of the default folder layout.
        for hint in (
            "Expected file structure:",
            "Google Drive/Data/training/",
            "  ‚îú‚îÄ‚îÄ hi_train.txt",
            "  ‚îú‚îÄ‚îÄ sa_train.txt",
            "  ‚îú‚îÄ‚îÄ mr_train.txt",
            "  ‚îî‚îÄ‚îÄ en_train.txt",
            "Google Drive/Data/validation/",
            "  ‚îú‚îÄ‚îÄ hi_val.txt",
            "  ‚îú‚îÄ‚îÄ sa_val.txt",
            "  ‚îú‚îÄ‚îÄ mr_val.txt",
            "  ‚îî‚îÄ‚îÄ en_val.txt",
        ):
            logger.info(hint)
        return None, None

    logger.info(f"‚úÖ Total training samples: {len(train_texts)}")
    logger.info(f"‚úÖ Total validation samples: {len(eval_texts)}")

    # Create datasets
    train_dataset = Dataset.from_dict({"text": train_texts})
    eval_dataset = Dataset.from_dict({"text": eval_texts}) if eval_texts else None

    return train_dataset, eval_dataset

# Load the raw text datasets
train_dataset, eval_dataset = load_training_data()

# Fail fast, naming the split that is actually missing: load_training_data()
# returns None for the training set when no training text was found, and None
# for the validation set alone when only the validation text is missing.
if train_dataset is None:
    raise ValueError("‚ùå Failed to load training data. Please check your Google Drive folder structure and file names.")
if eval_dataset is None:
    raise ValueError("No validation data loaded. Please check the validation folder in your Google Drive and the VALIDATION_FILES names.")

# Tokenization function
def tokenize_fn(examples):
    """Tokenize one batch of raw text rows.

    Args:
        examples: Batch mapping with a "text" entry holding a list of strings
            (the form ``Dataset.map(..., batched=True)`` passes in).

    Returns:
        The tokenizer output for the batch, truncated to MAX_LENGTH tokens
        and left unpadded.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH,
        # Do not pad here: with batched map, padding=True pads every sample to
        # the longest sample of its whole map batch, so short lines would be
        # stored and trained at that inflated length. Padding is left to the
        # data collator, which pads each training batch on its own.
        padding=False,
        return_tensors=None,
    )

# Tokenize datasets
logger.info("Tokenizing training dataset...")
tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

tokenized_eval = None
# Explicit None check: the dataset is optional, and truthiness of a Dataset
# depends on its length rather than on whether it exists.
if eval_dataset is not None:
    logger.info("Tokenizing validation dataset...")
    tokenized_eval = eval_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

logger.info("‚úÖ Data tokenization completed!")


In [None]:
# Setup training
# Causal-LM collator (mlm=False): pads each batch to a multiple of 8 tokens and
# returns PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
    return_tensors="pt",
)

# Step-wise evaluation (and picking the best checkpoint at the end) needs a
# validation set; without one the Trainer refuses an eval strategy other than
# "no", so only enable it when tokenized validation data exists.
evaluate_during_training = USE_PEFT and tokenized_eval is not None

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    overwrite_output_dir=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
    warmup_steps=WARMUP_STEPS,
    logging_steps=50,
    save_steps=200,
    eval_steps=200,
    learning_rate=LEARNING_RATE,
    fp16=False,
    dataloader_drop_last=True,
    dataloader_pin_memory=False,
    # The string "none" is what disables reporting integrations. Passing the
    # Python value None is treated as "not set" and, in transformers 4.x,
    # falls back to reporting to every installed integration (e.g. W&B).
    report_to="none",
    dataloader_num_workers=0,
    save_total_limit=2,
    max_grad_norm=1.0,
    save_strategy="steps" if USE_PEFT else "epoch",
    eval_strategy="steps" if evaluate_during_training else "no",
    load_best_model_at_end=evaluate_during_training,
    remove_unused_columns=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    data_collator=data_collator,
)

# Add progress callback
from transformers import TrainerCallback


class ProgressCallback(TrainerCallback):
    """Log the step counter and the latest training loss every 10th step."""

    @staticmethod
    def _latest_loss(log_history):
        """Return the most recently logged training loss, or 'N/A'.

        The Trainer's periodic training logs store the loss under "loss";
        "train_loss" only appears in the end-of-training summary, and
        evaluation entries carry neither. Walk backwards so an evaluation
        entry at the tail does not hide the last training loss.
        """
        for entry in reversed(log_history):
            if "loss" in entry:
                return entry["loss"]
            if "train_loss" in entry:
                return entry["train_loss"]
        return 'N/A'

    def on_step_end(self, args, state, control, **kwargs):
        """Emit a progress line whenever the global step is a multiple of 10."""
        if state.global_step % 10 == 0:
            logger.info(f"Training step {state.global_step}/{state.max_steps} - Loss: {self._latest_loss(state.log_history)}")


trainer.add_callback(ProgressCallback())

print("‚úÖ Training setup completed!")


In [None]:
# Execute training
clear_gpu_memory()

if torch.cuda.is_available():
    allocated_gb = torch.cuda.memory_allocated() / 1024**3
    logger.info(f"GPU memory before training: {allocated_gb:.2f} GB")

logger.info("üöÄ Starting training...")
try:
    trainer.train()
    logger.info("‚úÖ Training completed successfully!")
except Exception as err:
    # Out-of-memory gets a dedicated message plus a tuning hint; every other
    # failure is logged generically. Either way the GPU cache is cleared and
    # the original exception propagates.
    if isinstance(err, torch.cuda.OutOfMemoryError):
        logger.error(f"CUDA out of memory error: {err}")
        logger.info("Try reducing BATCH_SIZE further or MAX_LENGTH")
    else:
        logger.error(f"Training error: {err}")
    clear_gpu_memory()
    raise


In [None]:
# Save model and create download package
logger.info(f"Saving model to {OUTPUT_DIR}")
if USE_PEFT:
    # Saves the PEFT-wrapped model via its own save_pretrained()
    model.save_pretrained(OUTPUT_DIR)
    logger.info("‚úÖ Saved LoRA adapters")
else:
    trainer.save_model()

tokenizer.save_pretrained(OUTPUT_DIR)

# Create zip file for download
import zipfile
zip_filename = f"{OUTPUT_DIR}.zip"

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, dirs, filenames in os.walk(OUTPUT_DIR):
        if root == OUTPUT_DIR:
            # OUTPUT_DIR is also the Trainer's output_dir, so it holds the
            # intermediate "checkpoint-<step>" folders (including optimizer
            # state). Prune them in place so os.walk skips them and the
            # download contains only the final model and tokenizer files.
            dirs[:] = [d for d in dirs if not d.startswith("checkpoint-")]
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # Store paths relative to OUTPUT_DIR so the archive unpacks flat
            arcname = os.path.relpath(file_path, OUTPUT_DIR)
            zipf.write(file_path, arcname)

logger.info(f"‚úÖ Model saved and packaged as {zip_filename}")

# Download the model
from google.colab import files
print(f"\nüì• Download your trained model:")
files.download(zip_filename)

print(f"\nüéâ Training completed! Model saved to: {OUTPUT_DIR}")
print(f"üì¶ Download package: {zip_filename}")
print("ü§ñ You can now use this model for inference!")


In [None]:
# Test the fine-tuned model (Optional)
def test_model():
    """Generate a short sampled continuation for one prompt per language.

    Uses the notebook-level ``model`` and ``tokenizer``. For each prompt the
    prompt and the decoded generation (up to 50 new tokens, sampling with
    temperature 0.7) are printed.

    Returns:
        None
    """

    # Test prompts in different languages
    test_prompts = [
        "मैं एक भारतीय हूं",  # Hindi
        "सर्वे भवन्तु सुखिनः",  # Sanskrit
        "मी एक महाराष्ट्रीय आहे",  # Marathi
        "I am learning multiple languages"  # English
    ]

    print("üß™ Testing fine-tuned model with sample prompts:")
    print("=" * 60)

    # Switch to inference mode so dropout (including the LoRA dropout) is
    # inactive while generating.
    model.eval()
    # Send inputs to wherever the model's weights actually live instead of
    # assuming the default CUDA device.
    device = next(model.parameters()).device

    for prompt in test_prompts:
        print(f"\nüìù Prompt: {prompt}")

        # Tokenize input
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate response
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                temperature=0.7,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id
            )

        # Decode response
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"ü§ñ Response: {response}")
        print("-" * 40)

# Test the model
test_model()
