In [56]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/legal-fine-tuning-data/fine_tuning_data_train.jsonl
/kaggle/input/legal-fine-tuning-data/fine_tuning_data_val.jsonl


In [57]:
#!pip install -q transformers datasets accelerate sentencepiece sacrebleu unbabel-comet evaluate peft

In [58]:
#pip install --upgrade pyarrow==12.0.1

In [59]:
pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [60]:
import os
import json
import torch
import numpy as np
from pathlib import Path
from dataclasses import dataclass
from typing import Dict, List, Optional
from tqdm.auto import tqdm

# Set memory optimization environment variables
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:128"

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    get_linear_schedule_with_warmup,
    set_seed
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    PeftModel
)
from datasets import Dataset, DatasetDict
from accelerate import Accelerator, DistributedDataParallelKwargs
from torch.utils.data import DataLoader
import evaluate

# Set random seed for reproducibility
set_seed(42)

# Clear CUDA cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # Set memory fraction to prevent over-allocation
    for i in range(torch.cuda.device_count()):
        torch.cuda.set_per_process_memory_fraction(0.95, i)

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

# Print GPU memory info
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        print(f"GPU {i}: {props.name} - {props.total_memory / 1024**3:.2f} GB")

PyTorch version: 2.8.0+cu126
CUDA available: True
Number of GPUs: 2
GPU 0: Tesla T4 - 14.74 GB
GPU 1: Tesla T4 - 14.74 GB


In [61]:
@dataclass
class Config:
    """Training configuration"""
    # Model
    model_name: str = "facebook/nllb-200-distilled-600M"
    
    # Languages (NLLB codes)
    src_lang: str = "fra_Latn"  # French
    tgt_lang: str = "arb_Arab"  # Arabic
    
    # Data
    train_file: str = "/kaggle/input/legal-fine-tuning-data/fine_tuning_data_train.jsonl" 
    val_file: str = "/kaggle/input/legal-fine-tuning-data/fine_tuning_data_val.jsonl"  
    max_length: int = 256  # Can use full length with LoRA
    
    # LoRA Configuration
    lora_r: int = 16  # LoRA rank (higher = more parameters, better quality)
    lora_alpha: int = 32  # LoRA alpha (scaling factor)
    lora_dropout: float = 0.05
    lora_target_modules: List[str] = None  # Will be set automatically
    
    # Training
    num_epochs: int = 5
    per_device_train_batch_size: int = 8  # Can use larger batch with LoRA!
    per_device_eval_batch_size: int = 16
    gradient_accumulation_steps: int = 4
    learning_rate: float = 3e-4  # Higher LR is better for LoRA
    weight_decay: float = 0.01
    warmup_ratio: float = 0.1
    max_grad_norm: float = 1.0
    
    # Optimization
    fp16: bool = True
    
    # Checkpointing (based on optimizer steps)
    output_dir: str = "/kaggle/working/nllb-fr-ar-legal-lora"
    save_steps: int = 5
    eval_steps: int = 5
    logging_steps: int = 2
    
    # Evaluation
    num_beams: int = 5  # Can use more beams with LoRA
    early_stopping: bool = True
    
    # Memory optimization
    gradient_checkpointing: bool = False  # Not needed with LoRA

config = Config()
os.makedirs(config.output_dir, exist_ok=True)

print("\n" + "="*80)
print("LoRA Configuration:")
print(f"  Rank (r): {config.lora_r}")
print(f"  Alpha: {config.lora_alpha}")
print(f"  Dropout: {config.lora_dropout}")
print(f"  Learning Rate: {config.learning_rate}")
print("="*80 + "\n")


LoRA Configuration:
  Rank (r): 16
  Alpha: 32
  Dropout: 0.05
  Learning Rate: 0.0003



In [62]:
def load_jsonl_dataset(file_path: str) -> List[Dict[str, str]]:
    """Load JSONL dataset"""
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line.strip()))
    return data

def create_datasets(config: Config):
    """Create train and validation datasets"""
    
    # Load raw data
    print("Loading datasets...")
    train_data = load_jsonl_dataset(config.train_file)
    val_data = load_jsonl_dataset(config.val_file)
    
    print(f"Train samples: {len(train_data)}")
    print(f"Validation samples: {len(val_data)}")
    
    # Create HuggingFace datasets
    train_dataset = Dataset.from_list(train_data)
    val_dataset = Dataset.from_list(val_data)
    
    return DatasetDict({
        "train": train_dataset,
        "validation": val_dataset
    })

# Load datasets
try:
    raw_datasets = create_datasets(config)
    print("\nExample training sample:")
    print(f"Source: {raw_datasets['train'][0]['src_text'][:100]}...")
    print(f"Target: {raw_datasets['train'][0]['tgt_text'][:100]}...")
except FileNotFoundError:
    print("‚ö†Ô∏è  Dataset files not found. Creating dummy dataset for demonstration...")
    # Create dummy dataset for testing
    dummy_data = [
        {"src_text": f"Article {i} du code civil fran√ßais.", 
         "tgt_text": f"ÿßŸÑŸÖÿßÿØÿ© {i} ŸÖŸÜ ÿßŸÑŸÇÿßŸÜŸàŸÜ ÿßŸÑŸÖÿØŸÜŸä ÿßŸÑŸÅÿ±ŸÜÿ≥Ÿä."}
        for i in range(1, 101)
    ]
    raw_datasets = DatasetDict({
        "train": Dataset.from_list(dummy_data[:80]),
        "validation": Dataset.from_list(dummy_data[80:])
    })
    print("‚úì Dummy dataset created for demonstration")

Loading datasets...
Train samples: 300
Validation samples: 31

Example training sample:
Source: Il n‚Äôy a pas d‚Äôinfraction, ni de peine ou de mesures de s√ªret√© sans loi....
Target: ŸÑÿß ÿ¨ÿ±ŸäŸÖÿ© ŸàŸÑÿß ÿπŸÇŸàÿ®ÿ© ÿ£Ÿà ÿ™ÿØÿßÿ®Ÿäÿ± ÿ£ŸÖŸÜ ÿ®ÿ∫Ÿäÿ± ŸÇÿßŸÜŸàŸÜ.

...


In [63]:
print("Loading tokenizer and model...")

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    config.model_name,
    src_lang=config.src_lang,
    tgt_lang=config.tgt_lang
)

# Get forced_bos_token_id for Arabic
if hasattr(tokenizer, 'lang_code_to_id'):
    forced_bos_token_id = tokenizer.lang_code_to_id[config.tgt_lang]
else:
    forced_bos_token_id = tokenizer.convert_tokens_to_ids(config.tgt_lang)

print(f"Forced BOS token ID for {config.tgt_lang}: {forced_bos_token_id}")

# Load base model in fp16 for memory efficiency
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    config.model_name,
    torch_dtype=torch.float16 if config.fp16 else torch.float32
)
base_model.config.forced_bos_token_id = forced_bos_token_id

print(f"Base model parameters: {base_model.num_parameters():,}")
print(f"Base model dtype: {base_model.dtype}")

Loading tokenizer and model...
Forced BOS token ID for arb_Arab: 256011
Base model parameters: 615,073,792
Base model dtype: torch.float16


In [64]:
# Configure LoRA
# Target modules: for NLLB, we target q_proj, v_proj in attention layers
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    target_modules=["q_proj", "v_proj", "k_proj", "out_proj", "fc1", "fc2"],  # Transformer layers
    lora_dropout=config.lora_dropout,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

# Apply LoRA to the model
model = get_peft_model(base_model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()

print(f"\n‚úì LoRA applied successfully!")
print(f"Total parameters: {model.num_parameters():,}")
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"Trainable %: {100 * sum(p.numel() for p in model.parameters() if p.requires_grad) / model.num_parameters():.2f}%")

trainable params: 8,650,752 || all params: 623,724,544 || trainable%: 1.3870

‚úì LoRA applied successfully!
Total parameters: 623,724,544
Trainable parameters: 8,650,752
Trainable %: 1.39%


In [65]:
def preprocess_function(examples):
    """Tokenize inputs and targets"""
    # Set source language
    tokenizer.src_lang = config.src_lang
    
    # Tokenize inputs
    model_inputs = tokenizer(
        examples["src_text"],
        max_length=config.max_length,
        truncation=True,
        padding=False
    )
    
    # Tokenize targets
    tokenizer.src_lang = config.tgt_lang
    labels = tokenizer(
        examples["tgt_text"],
        max_length=config.max_length,
        truncation=True,
        padding=False
    )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization
print("Tokenizing datasets...")
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
    desc="Tokenizing"
)

print(f"Tokenized train samples: {len(tokenized_datasets['train'])}")
print(f"Tokenized val samples: {len(tokenized_datasets['validation'])}")

Tokenizing datasets...


Tokenizing:   0%|          | 0/300 [00:00<?, ? examples/s]

Tokenizing:   0%|          | 0/31 [00:00<?, ? examples/s]

Tokenized train samples: 300
Tokenized val samples: 31


In [66]:
@dataclass
class DataCollatorForSeq2Seq:
    """Custom data collator with dynamic padding"""
    tokenizer: AutoTokenizer
    model: Optional[AutoModelForSeq2SeqLM] = None
    padding: bool = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    
    def __call__(self, features: List[Dict[str, List[int]]]) -> Dict[str, torch.Tensor]:
        labels = [f["labels"] for f in features] if "labels" in features[0] else None
        features_without_labels = [{k: v for k, v in f.items() if k != "labels"} for f in features]
        
        batch = self.tokenizer.pad(
            features_without_labels,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt"
        )
        
        if labels is not None:
            max_label_length = max(len(l) for l in labels)
            if self.pad_to_multiple_of is not None:
                max_label_length = (
                    (max_label_length + self.pad_to_multiple_of - 1)
                    // self.pad_to_multiple_of
                    * self.pad_to_multiple_of
                )
            
            padded_labels = []
            for label in labels:
                remainder = [self.label_pad_token_id] * (max_label_length - len(label))
                padded_labels.append(label + remainder)
            
            batch["labels"] = torch.tensor(padded_labels)
        
        return batch

data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8 if config.fp16 else None
)

In [67]:
# Configure DDP
ddp_kwargs = DistributedDataParallelKwargs(find_unused_parameters=False)

# Initialize accelerator
accelerator = Accelerator(
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    mixed_precision="fp16" if config.fp16 else "no",
    log_with=None,
    kwargs_handlers=[ddp_kwargs]
)

print(f"Distributed type: {accelerator.distributed_type}")
print(f"Number of processes: {accelerator.num_processes}")
print(f"Process index: {accelerator.process_index}")
print(f"Device: {accelerator.device}")

# Create dataloaders
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    batch_size=config.per_device_train_batch_size,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=2,
    pin_memory=True
)

eval_dataloader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=config.per_device_eval_batch_size,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=2,
    pin_memory=True
)


Distributed type: DistributedType.NO
Number of processes: 1
Process index: 0
Device: cuda


In [68]:
# Optimizer - only optimize LoRA parameters
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=config.learning_rate,
    weight_decay=config.weight_decay
)

# Calculate total training steps
num_update_steps_per_epoch = len(train_dataloader) // config.gradient_accumulation_steps
max_train_steps = config.num_epochs * num_update_steps_per_epoch
num_warmup_steps = int(max_train_steps * config.warmup_ratio)

# Scheduler
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=max_train_steps
)

print(f"Total training steps: {max_train_steps}")
print(f"Warmup steps: {num_warmup_steps}")
print(f"Steps per epoch: {num_update_steps_per_epoch}")


Total training steps: 45
Warmup steps: 4
Steps per epoch: 9


In [69]:
# Prepare everything with accelerator
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

print("‚úì Model and dataloaders prepared for distributed training")
print(f"‚úì Model now on device: {accelerator.device}")

‚úì Model and dataloaders prepared for distributed training
‚úì Model now on device: cuda


In [70]:
pip install sacrebleu

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [71]:
# Load metrics
bleu_metric = evaluate.load("sacrebleu")

def postprocess_text(preds, labels):
    """Post-process predictions and labels for metric computation"""
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels

def compute_metrics(predictions, references):
    """Compute BLEU score"""
    decoded_preds, decoded_labels = postprocess_text(predictions, references)
    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [72]:
def evaluate_model(model, eval_dataloader, accelerator, config, forced_bos_token_id):
    """Evaluate model on validation set"""
    model.eval()
    
    all_predictions = []
    all_references = []
    total_loss = 0
    
    # Clear cache before evaluation
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    
    with torch.no_grad():
        for batch in tqdm(eval_dataloader, desc="Evaluating", disable=not accelerator.is_local_main_process):
            # Compute loss
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += accelerator.gather(loss).mean().item()
            
            # Generate predictions
            generated_tokens = accelerator.unwrap_model(model).generate(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    max_length=config.max_length,
    num_beams=config.num_beams,
    early_stopping=config.early_stopping,
    forced_bos_token_id=forced_bos_token_id
            )
            
            # Gather predictions and labels from all processes
            generated_tokens = accelerator.pad_across_processes(
                generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
            )
            labels = accelerator.pad_across_processes(
                batch["labels"], dim=1, pad_index=-100
            )
            
            generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
            labels = accelerator.gather(labels).cpu().numpy()
            
            # Replace -100 in labels
            labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
            
            # Decode
            if isinstance(generated_tokens, tuple):
                generated_tokens = generated_tokens[0]
            
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
            
            all_predictions.extend(decoded_preds)
            all_references.extend(decoded_labels)
    
    # Compute metrics only on main process
    metrics = {}
    if accelerator.is_main_process:
        avg_loss = total_loss / len(eval_dataloader)
        metrics = compute_metrics(all_predictions, all_references)
        metrics["eval_loss"] = avg_loss
    
    model.train()
    return metrics

In [73]:
def train(model, train_dataloader, eval_dataloader, optimizer, lr_scheduler, accelerator, config, forced_bos_token_id):
    """Main training loop"""
    
    global_step = 0
    best_bleu = 0
    
    # Calculate actual optimizer steps per epoch
    optimizer_steps_per_epoch = len(train_dataloader) // config.gradient_accumulation_steps
    
    accelerator.print("=" * 80)
    accelerator.print("Starting LoRA fine-tuning...")
    accelerator.print(f"Dataloader steps per epoch: {len(train_dataloader)}")
    accelerator.print(f"Gradient accumulation steps: {config.gradient_accumulation_steps}")
    accelerator.print(f"Optimizer steps per epoch: {optimizer_steps_per_epoch}")
    accelerator.print(f"Total optimizer steps: {max_train_steps}")
    accelerator.print(f"Evaluation every {config.eval_steps} optimizer steps")
    accelerator.print(f"Checkpoint every {config.save_steps} optimizer steps")
    accelerator.print("=" * 80)
    
    model.train()
    
    for epoch in range(config.num_epochs):
        accelerator.print(f"\nEpoch {epoch + 1}/{config.num_epochs}")
        
        # Clear cache at start of each epoch
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        
        progress_bar = tqdm(
            total=len(train_dataloader),
            disable=not accelerator.is_local_main_process,
            desc=f"Epoch {epoch + 1}"
        )
        
        for step, batch in enumerate(train_dataloader):
            with accelerator.accumulate(model):
                outputs = model(**batch)
                loss = outputs.loss
                
                accelerator.backward(loss)
                
                if accelerator.sync_gradients:
                    accelerator.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
            
            if accelerator.sync_gradients:
                global_step += 1
                progress_bar.update(1)
                
                # Logging
                if global_step % config.logging_steps == 0:
                    avg_loss = accelerator.gather(loss).mean().item()
                    accelerator.print(
                        f"Step {global_step} | Loss: {avg_loss:.4f} | LR: {lr_scheduler.get_last_lr()[0]:.2e}"
                    )
                
                # Evaluation
                if global_step % config.eval_steps == 0:
                    accelerator.print("\n" + "=" * 80)
                    accelerator.print(f"Evaluating at step {global_step}...")
                    
                    metrics = evaluate_model(model, eval_dataloader, accelerator, config, forced_bos_token_id)
                    
                    if accelerator.is_main_process:
                        accelerator.print(f"Eval Loss: {metrics['eval_loss']:.4f}")
                        accelerator.print(f"BLEU Score: {metrics['bleu']:.2f}")
                        
                        # Save best model
                        if metrics['bleu'] > best_bleu:
                            best_bleu = metrics['bleu']
                            accelerator.print(f"‚úì New best BLEU: {best_bleu:.2f}")
                            
                            # Save LoRA adapter
                            unwrapped_model = accelerator.unwrap_model(model)
                            unwrapped_model.save_pretrained(
                                config.output_dir,
                                is_main_process=accelerator.is_main_process,
                                save_function=accelerator.save
                            )
                            tokenizer.save_pretrained(config.output_dir)
                            accelerator.print(f"‚úì LoRA adapter saved to {config.output_dir}")
                    
                    accelerator.print("=" * 80 + "\n")
                
                # Save periodic checkpoint
                if global_step % config.save_steps == 0:
                    checkpoint_dir = os.path.join(config.output_dir, f"checkpoint-{global_step}")
                    os.makedirs(checkpoint_dir, exist_ok=True)
                    
                    unwrapped_model = accelerator.unwrap_model(model)
                    unwrapped_model.save_pretrained(
                        checkpoint_dir,
                        is_main_process=accelerator.is_main_process,
                        save_function=accelerator.save
                    )
                    if accelerator.is_main_process:
                        tokenizer.save_pretrained(checkpoint_dir)
                        accelerator.print(f"‚úì Checkpoint saved at step {global_step}")
        
        progress_bar.close()
    
    accelerator.print("\n" + "=" * 80)
    accelerator.print("Training completed!")
    accelerator.print(f"Best BLEU score: {best_bleu:.2f}")
    accelerator.print("=" * 80)
    
    return best_bleu

In [74]:
best_bleu = train(
    model=model,
    train_dataloader=train_dataloader,
    eval_dataloader=eval_dataloader,
    optimizer=optimizer,
    lr_scheduler=lr_scheduler,
    accelerator=accelerator,
    config=config,
    forced_bos_token_id=forced_bos_token_id
)

Starting LoRA fine-tuning...
Dataloader steps per epoch: 38
Gradient accumulation steps: 4
Optimizer steps per epoch: 9
Total optimizer steps: 45
Evaluation every 5 optimizer steps
Checkpoint every 5 optimizer steps

Epoch 1/5


Epoch 1:   0%|          | 0/38 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELI

Step 2 | Loss: 3.1471 | LR: 1.50e-04
Step 4 | Loss: 2.2406 | LR: 3.00e-04

Evaluating at step 5...


Evaluating:   0%|          | 0/2 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELI

OutOfMemoryError: CUDA out of memory. Tried to allocate 3.67 GiB. GPU 1 has a total capacity of 14.74 GiB of which 2.05 GiB is free. Process 3951 has 12.69 GiB memory in use. 14.00 GiB allowed; Of the allocated memory 12.15 GiB is allocated by PyTorch, and 419.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
if accelerator.is_main_process:
    print("\n" + "=" * 80)
    print("Running final evaluation with COMET...")
    print("=" * 80)
    
    # Load COMET metric
    try:
        comet_metric = evaluate.load("comet")
        
        # Prepare data for COMET
        eval_samples = raw_datasets["validation"].select(range(min(100, len(raw_datasets["validation"]))))
        sources = eval_samples["src_text"]
        references = eval_samples["tgt_text"]
        
        # Generate translations
        model.eval()
        predictions = []
        
        for i in tqdm(range(0, len(sources), config.per_device_eval_batch_size), desc="Generating"):
            batch_sources = sources[i:i + config.per_device_eval_batch_size]
            
            tokenizer.src_lang = config.src_lang
            inputs = tokenizer(
                batch_sources,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=config.max_length
            ).to(accelerator.device)
            
            with torch.no_grad():
                generated = accelerator.unwrap_model(model).generate(
                    **inputs,
                    max_length=config.max_length,
                    num_beams=config.num_beams,
                    early_stopping=True
                )
            
            decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
            predictions.extend(decoded)
        
        # Compute COMET
        comet_input = {
            "sources": sources,
            "predictions": predictions,
            "references": references
        }
        
        comet_score = comet_metric.compute(**comet_input, model_name="Unbabel/wmt22-comet-da")
        
        print(f"\n‚úì Final COMET Score: {comet_score['mean_score']:.4f}")
        print(f"‚úì Final BLEU Score: {best_bleu:.2f}")
        
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not compute COMET score: {e}")
        print("This is normal on Kaggle due to resource constraints.")


In [None]:
if accelerator.is_main_process:
    print("\n" + "=" * 80)
    print("Saving final LoRA adapter...")
    
    # Save the LoRA adapter (only a few MB!)
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(config.output_dir)
    tokenizer.save_pretrained(config.output_dir)
    
    # Save config
    with open(os.path.join(config.output_dir, "training_config.json"), "w") as f:
        json.dump(vars(config), f, indent=2)
    
    print(f"‚úì LoRA adapter saved to: {config.output_dir}")
    print(f"‚úì Adapter size: Only trainable parameters saved (~10-50 MB)")
    print("=" * 80)


In [None]:
if accelerator.is_main_process:
    print("\n" + "=" * 80)
    print("Testing inference with LoRA fine-tuned model...")
    print("=" * 80)
    
    # Reload base model and LoRA adapter for inference
    print("Loading base model...")
    inference_base_model = AutoModelForSeq2SeqLM.from_pretrained(
        config.model_name,
        torch_dtype=torch.float16 if config.fp16 else torch.float32
    )
    
    print("Loading LoRA adapter...")
    inference_model = PeftModel.from_pretrained(inference_base_model, config.output_dir)
    inference_model = inference_model.merge_and_unload()  # Merge LoRA weights into base model
    
    inference_tokenizer = AutoTokenizer.from_pretrained(config.output_dir)
    inference_model.to(accelerator.device)
    inference_model.eval()
    
    # Test samples
    test_texts = [
        "Article 1er : La loi r√©git l'ensemble du territoire national.",
        "Le code civil r√©git les droits patrimoniaux et extrapatrimoniaux.",
        "Conform√©ment aux dispositions du journal officiel, le d√©cret entre en vigueur imm√©diatement."
    ]
    
    print("\nüîç Translation Examples:\n")
    
    for i, text in enumerate(test_texts, 1):
        # Tokenize
        inference_tokenizer.src_lang = config.src_lang
        inputs = inference_tokenizer(
            text,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=config.max_length
        ).to(accelerator.device)
        
        # Generate
        with torch.no_grad():
            generated = inference_model.generate(
                **inputs,
                max_length=config.max_length,
                num_beams=5,
                early_stopping=True
            )
        
        # Decode
        translation = inference_tokenizer.decode(generated[0], skip_special_tokens=True)
        
        print(f"Example {i}:")
        print(f"  FR: {text}")
        print(f"  AR: {translation}")
        print()
    
    print("=" * 80)
    print("‚úÖ LoRA fine-tuning and evaluation complete!")
    print(f"üìÅ LoRA adapter location: {config.output_dir}")
    print("=" * 80)

In [None]:
"""
# ========================================================================
# USING THE LoRA FINE-TUNED MODEL
# ========================================================================

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# Method 1: Load base model + LoRA adapter separately (saves memory)
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/nllb-200-distilled-600M",
    torch_dtype=torch.float16
)
model = PeftModel.from_pretrained(base_model, "/kaggle/working/nllb-fr-ar-legal-lora")
tokenizer = AutoTokenizer.from_pretrained("/kaggle/working/nllb-fr-ar-legal-lora")

# Method 2: Merge LoRA weights into base model (for deployment)
base_model = AutoModelFor