In [None]:
# Install the third-party packages imported below; sacrebleu backs the metric
# requested later through evaluate's load("sacrebleu"). Versions are not pinned.
%pip install datasets transformers peft torch evaluate sacrebleu

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-no

In [None]:
import os
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import DataLoader
import torch
from evaluate import load
import gc
import time

In [None]:
# Central configuration read by the data-loading, LoRA-setup and training cells.
project_config = {
    "model_name": "Helsinki-NLP/opus-mt-fr-en",  # checkpoint loaded for both tokenizer and model
    "dataset_name": "opus100",
    "language_pair": "en-fr",  # dataset config name; examples are read via their "fr" and "en" keys
    "max_samples": 50000,  # only the first N rows of the train split are loaded (before filtering)
    "train_test_split": 0.8,  # fraction of the filtered rows kept for training; the rest is the test set
    "lora_config": {
        "r": 128,  # LoRA rank actually used (an older note here said "16 to 32", which no longer matches)
        "lora_alpha": 128,  # LoRA alpha passed to LoraConfig
        "lora_dropout": 0.05,  # Reduced from 0.1 for less regularization
        "target_modules": ["q", "v"]  # Not read by setup_lora_model, which derives q_proj/v_proj names from the model
    },
    "training_config": {
        "batch_size": 32,  # used for the DataLoaders and as the tokenization map batch size
        "learning_rate":  1e-4,  # AdamW learning rate (start of the cosine schedule)
        "epochs": 5  # Increased from 3 to 5 for more training
    }
}

In [None]:
def clear_memory():
    """Run Python garbage collection, then drop PyTorch's cached CUDA memory if a GPU is available."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Pick the compute device once; later cells move the model and batches to it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

def analyze_model_size(model):
    """Print and return parameter counts and parameter memory footprint of ``model``.

    Sizes are computed from each parameter's real element size, so half-precision
    weights are reported at 2 bytes per value instead of an assumed 4. Only
    ``model.parameters()`` is inspected; buffers are not counted.

    Args:
        model: Any object exposing ``parameters()`` that yields torch parameters.

    Returns:
        dict with keys ``total_params``, ``trainable_params`` (ints) and
        ``total_size_mb``, ``trainable_size_mb`` (floats, in MiB).
    """
    total_params = 0
    trainable_params = 0
    total_bytes = 0
    trainable_bytes = 0
    # Single pass: accumulate counts and byte sizes together.
    for param in model.parameters():
        count = param.numel()
        n_bytes = count * param.element_size()
        total_params += count
        total_bytes += n_bytes
        if param.requires_grad:
            trainable_params += count
            trainable_bytes += n_bytes

    bytes_per_mb = 1024 * 1024
    total_size_mb = total_bytes / bytes_per_mb
    trainable_size_mb = trainable_bytes / bytes_per_mb
    # A parameter-free module would otherwise raise ZeroDivisionError here.
    trainable_pct = trainable_params / total_params * 100 if total_params else 0.0

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,} ({trainable_pct:.2f}%)")
    print(f"Total model size: {total_size_mb:.2f} MB")
    print(f"Trainable portion size: {trainable_size_mb:.2f} MB")
    return {"total_params": total_params, "trainable_params": trainable_params, "total_size_mb": total_size_mb, "trainable_size_mb": trainable_size_mb}

Using device: cuda


In [None]:
def load_dataset_for_translation():
    """Load the configured slice of the train split, filter it, and split it in order.

    Reads ``dataset_name``, ``language_pair``, ``max_samples`` and
    ``train_test_split`` from ``project_config``. The split is positional (no
    shuffling): the leading fraction becomes training data, the remainder test data.

    Returns:
        (train_dataset, test_dataset)
    """
    print("Loading dataset...")
    raw_pairs = load_dataset(
        project_config["dataset_name"],
        project_config["language_pair"],
        split=f"train[:{project_config['max_samples']}]",
    )

    def is_aligned(example):
        """Keep pairs whose French/English word-count ratio lies within [0.5, 2.0]."""
        pair = example["translation"]
        fr_words = len(pair["fr"].split())
        en_words = len(pair["en"].split())
        # max(..., 1) prevents a division by zero when the English side has no words.
        return 0.5 <= fr_words / max(en_words, 1) <= 2.0

    aligned_pairs = raw_pairs.filter(is_aligned)
    n_total = len(aligned_pairs)
    n_train = int(project_config["train_test_split"] * n_total)
    train_dataset = aligned_pairs.select(range(n_train))
    test_dataset = aligned_pairs.select(range(n_train, n_total))
    print(f"Loaded {len(train_dataset)} training and {len(test_dataset)} test samples after filtering")
    return train_dataset, test_dataset

train_dataset, test_dataset = load_dataset_for_translation()

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/327k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/334k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

Loaded 38060 training and 9516 test samples after filtering


In [None]:
def setup_tokenizer_and_model():
    """Load the tokenizer and seq2seq model named in ``project_config`` and report model size.

    The model is moved to the module-level ``device`` before its statistics are printed.

    Returns:
        (tokenizer, model, base_stats) where ``base_stats`` is the dict produced by
        ``analyze_model_size``.
    """
    print(f"Loading model: {project_config['model_name']}")
    checkpoint = project_config["model_name"]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    model = model.to(device)
    print("\nBase model statistics:")
    return tokenizer, model, analyze_model_size(model)

tokenizer, base_model, base_model_stats = setup_tokenizer_and_model()

Loading model: Helsinki-NLP/opus-mt-fr-en


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Base model statistics:
Total parameters: 75,133,952
Trainable parameters: 74,609,664 (99.30%)
Total model size: 286.61 MB
Trainable portion size: 284.61 MB


In [None]:
def evaluate_model(model, tokenizer, test_dataset, src_lang="fr", tgt_lang="en", batch_size=1):
    """Translate ``test_dataset`` with beam search and score the output with sacreBLEU.

    Args:
        model: Seq2seq model exposing ``eval()`` and ``generate()``; it is left in
            eval mode when this function returns.
        tokenizer: Tokenizer matching ``model``.
        test_dataset: Indexable collection whose items have a ``"translation"``
            dict keyed by language code.
        src_lang: Key of the source text inside ``"translation"``.
        tgt_lang: Key of the reference text inside ``"translation"``.
        batch_size: Number of sentences sent to ``generate`` at once. The default
            of 1 reproduces the original one-sentence-at-a-time behaviour; larger
            values trade memory for far fewer ``generate`` calls.

    Returns:
        dict with a single key ``"bleu"`` holding the corpus-level score.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    bleu_metric = load("sacrebleu")
    model.eval()
    predictions, references = [], []
    total = len(test_dataset)
    print("Evaluating model on test set...")
    for start in range(0, total, batch_size):
        stop = min(start + batch_size, total)
        pairs = [test_dataset[i]["translation"] for i in range(start, stop)]
        sources = [pair[src_lang] for pair in pairs]
        # padding=True pads to the longest sentence of the batch only.
        tokenized_input = tokenizer(sources, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            output = model.generate(**tokenized_input, max_length=128, num_beams=5, length_penalty=0.8, early_stopping=True)
        predictions.extend(tokenizer.batch_decode(output, skip_special_tokens=True))
        # The metric expects a list of references per prediction.
        references.extend([pair[tgt_lang]] for pair in pairs)
        # Report whenever a multiple of 50 examples has been passed.
        if stop // 50 > start // 50:
            print(f"Processed {stop}/{total} examples")
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)
    print(f"BLEU score: {bleu_result['score']:.2f}")
    return {"bleu": bleu_result["score"]}

print("Evaluating base model...")
base_model_results = evaluate_model(base_model, tokenizer, test_dataset)
clear_memory()

Evaluating base model...


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Evaluating model on test set...
Processed 50/9516 examples
Processed 100/9516 examples
Processed 150/9516 examples
Processed 200/9516 examples
Processed 250/9516 examples
Processed 300/9516 examples
Processed 350/9516 examples
Processed 400/9516 examples
Processed 450/9516 examples
Processed 500/9516 examples
Processed 550/9516 examples
Processed 600/9516 examples
Processed 650/9516 examples
Processed 700/9516 examples
Processed 750/9516 examples
Processed 800/9516 examples
Processed 850/9516 examples
Processed 900/9516 examples
Processed 950/9516 examples
Processed 1000/9516 examples
Processed 1050/9516 examples
Processed 1100/9516 examples
Processed 1150/9516 examples
Processed 1200/9516 examples
Processed 1250/9516 examples
Processed 1300/9516 examples
Processed 1350/9516 examples
Processed 1400/9516 examples
Processed 1450/9516 examples
Processed 1500/9516 examples
Processed 1550/9516 examples
Processed 1600/9516 examples
Processed 1650/9516 examples
Processed 1700/9516 examples
Pr

In [None]:
def setup_lora_model(base_model):
    """Wrap ``base_model`` with LoRA adapters on every q_proj and v_proj module.

    Target module names are discovered from the model's state-dict keys. LoRA
    hyper-parameters come from ``project_config["lora_config"]`` (its
    ``target_modules`` entry is not used). The reported size reduction compares
    the trainable size of the wrapped model with the module-level
    ``base_model_stats``.

    NOTE(review): PEFT is expected to inject the adapters into ``base_model``
    itself rather than into a copy -- confirm before reusing ``base_model`` as an
    untouched baseline.

    Returns:
        (model, lora_stats) -- the PEFT-wrapped model and its ``analyze_model_size`` dict.
    """
    print("Inspecting model structure...")
    state_keys = list(base_model.state_dict().keys())

    def modules_matching(weight_name):
        # Turn "<module path>.weight" keys into module paths.
        return [key.replace('.weight', '') for key in state_keys if weight_name in key]

    # All query projections first, then all value projections.
    target_modules = modules_matching('q_proj.weight') + modules_matching('v_proj.weight')
    print(f"Using {len(target_modules)} target modules: {target_modules[:5]}...")

    adapter_settings = project_config["lora_config"]
    lora_config = LoraConfig(
        r=adapter_settings["r"],
        lora_alpha=adapter_settings["lora_alpha"],
        target_modules=target_modules,
        lora_dropout=adapter_settings["lora_dropout"],
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM,
    )
    model = get_peft_model(base_model, lora_config)

    print("\nLoRA-adapted model statistics:")
    lora_stats = analyze_model_size(model)
    trainable_share = lora_stats["trainable_size_mb"] / base_model_stats["total_size_mb"]
    size_reduction = (1 - trainable_share) * 100
    print(f"Size reduction through LoRA: {size_reduction:.2f}%")
    return model, lora_stats

lora_model, lora_stats = setup_lora_model(base_model)

Inspecting model structure...
Using 36 target modules: ['model.encoder.layers.0.self_attn.q_proj', 'model.encoder.layers.1.self_attn.q_proj', 'model.encoder.layers.2.self_attn.q_proj', 'model.encoder.layers.3.self_attn.q_proj', 'model.encoder.layers.4.self_attn.q_proj']...

LoRA-adapted model statistics:
Total parameters: 79,852,544
Trainable parameters: 4,718,592 (5.91%)
Total model size: 304.61 MB
Trainable portion size: 18.00 MB
Size reduction through LoRA: 93.72%


In [None]:
def preprocess_dataset(dataset, tokenizer, src_lang="fr", tgt_lang="en", max_length=128):
    """Tokenize a translation dataset into ``input_ids``/``attention_mask``/``labels``.

    Both sides are truncated and padded to ``max_length`` so the default DataLoader
    collation can stack them. Padding positions in ``labels`` are replaced with
    -100 so that they are ignored by the model's loss; without this the loss is
    dominated by trivially predicted pad tokens and looks misleadingly low.

    Args:
        dataset: Dataset whose rows carry a ``"translation"`` dict keyed by language.
        tokenizer: Tokenizer used for both source and target text.
        src_lang: Key of the source text inside ``"translation"``.
        tgt_lang: Key of the target text inside ``"translation"``.
        max_length: Fixed sequence length for inputs and labels.

    Returns:
        The mapped dataset with the original columns removed.
    """
    pad_token_id = tokenizer.pad_token_id

    def preprocess_function(examples):
        inputs = [ex[src_lang] for ex in examples["translation"]]
        targets = [ex[tgt_lang] for ex in examples["translation"]]
        # text_target replaces the deprecated as_target_tokenizer() context manager
        # and returns the target ids under "labels".
        model_inputs = tokenizer(
            inputs,
            text_target=targets,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )
        # Mask padding so it does not contribute to the training loss.
        model_inputs["labels"] = [
            [token_id if token_id != pad_token_id else -100 for token_id in label_ids]
            for label_ids in model_inputs["labels"]
        ]
        return model_inputs

    print("Tokenizing dataset...")
    processed_dataset = dataset.map(preprocess_function, batched=True, batch_size=project_config["training_config"]["batch_size"], remove_columns=dataset.column_names)
    return processed_dataset

processed_train = preprocess_dataset(train_dataset, tokenizer)
processed_test = preprocess_dataset(test_dataset, tokenizer)

Tokenizing dataset...


Map:   0%|          | 0/38060 [00:00<?, ? examples/s]



Tokenizing dataset...


Map:   0%|          | 0/9516 [00:00<?, ? examples/s]

In [None]:
def create_dataloaders(processed_train, processed_test):
    """Build the training (shuffled) and test (ordered) DataLoaders.

    Both datasets are switched to torch output format in place via
    ``set_format``; the batch size comes from ``project_config["training_config"]``.

    Returns:
        (train_dataloader, test_dataloader)
    """
    for split in (processed_train, processed_test):
        split.set_format(type="torch")

    batch_size = project_config["training_config"]["batch_size"]
    train_dataloader = DataLoader(processed_train, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(processed_test, batch_size=batch_size)
    print(f"Created dataloaders with batch size {project_config['training_config']['batch_size']}")
    return train_dataloader, test_dataloader

train_dataloader, test_dataloader = create_dataloaders(processed_train, processed_test)

Created dataloaders with batch size 32


In [None]:
def train_model(model, train_dataloader, num_epochs):
    """Train ``model`` with AdamW and a per-step cosine learning-rate schedule.

    The learning rate is read from ``project_config["training_config"]``; the
    cosine schedule spans all steps of all epochs. Gradients are clipped to a
    norm of 1.0 before every optimizer step. Batches are moved to the
    module-level ``device``.

    Args:
        model: Model whose forward pass returns an object with a ``loss`` attribute.
        train_dataloader: Iterable of dict batches of tensors.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        (model, total_training_time) with the time in seconds, summed over epochs.
    """
    base_lr = project_config["training_config"]["learning_rate"]
    schedule_steps = len(train_dataloader) * num_epochs
    optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=schedule_steps)
    model.train()
    best_loss = float('inf')
    total_training_time = 0

    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch+1}/{num_epochs}")
        start_time = time.time()
        epoch_loss = 0
        for batch_idx, batch in enumerate(train_dataloader):
            on_device = {name: tensor.to(device) for name, tensor in batch.items()}
            loss = model(**on_device).loss
            epoch_loss += loss.item()

            # Backward pass, clip, update weights, advance the schedule, reset grads.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            # Log only every tenth batch.
            if (batch_idx + 1) % 10 != 0:
                continue
            print(f"  Batch {batch_idx+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}, LR: {scheduler.get_last_lr()[0]:.6f}")

        epoch_time = time.time() - start_time
        total_training_time += epoch_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} completed. Average loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s")
        # The best loss is tracked for reporting only; no checkpoint is kept.
        if avg_loss < best_loss:
            best_loss = avg_loss
            print(f"New best loss: {best_loss:.4f}")

    print(f"Training completed. Total time: {total_training_time:.2f}s")
    return model, total_training_time

trained_model, training_time = train_model(lora_model, train_dataloader, num_epochs=project_config["training_config"]["epochs"])

Starting epoch 1/5
  Batch 10/1190, Loss: 1.5710, LR: 0.000100
  Batch 20/1190, Loss: 0.2860, LR: 0.000100
  Batch 30/1190, Loss: 0.2942, LR: 0.000100
  Batch 40/1190, Loss: 0.2179, LR: 0.000100
  Batch 50/1190, Loss: 0.2962, LR: 0.000100
  Batch 60/1190, Loss: 0.1997, LR: 0.000100
  Batch 70/1190, Loss: 0.2328, LR: 0.000100
  Batch 80/1190, Loss: 0.2129, LR: 0.000100
  Batch 90/1190, Loss: 0.3380, LR: 0.000100
  Batch 100/1190, Loss: 0.2892, LR: 0.000100
  Batch 110/1190, Loss: 0.2681, LR: 0.000100
  Batch 120/1190, Loss: 0.3708, LR: 0.000100
  Batch 130/1190, Loss: 0.2446, LR: 0.000100
  Batch 140/1190, Loss: 0.1841, LR: 0.000100
  Batch 150/1190, Loss: 0.2208, LR: 0.000100
  Batch 160/1190, Loss: 0.4133, LR: 0.000100
  Batch 170/1190, Loss: 0.3200, LR: 0.000100
  Batch 180/1190, Loss: 0.1806, LR: 0.000100
  Batch 190/1190, Loss: 0.2420, LR: 0.000100
  Batch 200/1190, Loss: 0.2401, LR: 0.000100
  Batch 210/1190, Loss: 0.2811, LR: 0.000100
  Batch 220/1190, Loss: 0.2050, LR: 0.000100


In [None]:
# Score the fine-tuned model on the same test examples used for the base model.
print("Evaluating LoRA fine-tuned model...")
lora_evaluation_results = evaluate_model(trained_model, tokenizer, test_dataset)
# Computed once and reused for both the printout and the saved JSON.
bleu_improvement = lora_evaluation_results["bleu"] - base_model_results["bleu"]
print(f"LoRA model BLEU score: {lora_evaluation_results['bleu']:.2f}")
print(f"Improvement over base model: {bleu_improvement:.2f}")

output_dir = "lora_fr_en_improved"
# exist_ok avoids the separate existence check and makes re-running this cell safe.
os.makedirs(output_dir, exist_ok=True)
# NOTE(review): trained_model is the PEFT wrapper, so this presumably writes only
# the adapter weights -- confirm.
trained_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Metrics plus the configuration that produced them, for later comparison.
results_to_save = {
    "base_model_bleu": base_model_results["bleu"],
    "lora_model_bleu": lora_evaluation_results["bleu"],
    "improvement": bleu_improvement,
    "training_config": project_config["training_config"],
    "lora_config": project_config["lora_config"],
    "training_time_seconds": training_time
}
with open(os.path.join(output_dir, "evaluation_results.json"), "w") as f:
    json.dump(results_to_save, f, indent=4)
print(f"Model and results saved to {output_dir}")

Evaluating LoRA fine-tuned model...
Evaluating model on test set...
Processed 50/9516 examples
Processed 100/9516 examples
Processed 150/9516 examples
Processed 200/9516 examples
Processed 250/9516 examples
Processed 300/9516 examples
Processed 350/9516 examples
Processed 400/9516 examples
Processed 450/9516 examples
Processed 500/9516 examples
Processed 550/9516 examples
Processed 600/9516 examples
Processed 650/9516 examples
Processed 700/9516 examples
Processed 750/9516 examples
Processed 800/9516 examples
Processed 850/9516 examples
Processed 900/9516 examples
Processed 950/9516 examples
Processed 1000/9516 examples
Processed 1050/9516 examples
Processed 1100/9516 examples
Processed 1150/9516 examples
Processed 1200/9516 examples
Processed 1250/9516 examples
Processed 1300/9516 examples
Processed 1350/9516 examples
Processed 1400/9516 examples
Processed 1450/9516 examples
Processed 1500/9516 examples
Processed 1550/9516 examples
Processed 1600/9516 examples
Processed 1650/9516 exam

In [None]:
# Additionally dump everything trained_model.state_dict() returns into the same
# output directory, alongside the files written by save_pretrained.
torch.save(
    trained_model.state_dict(),
    os.path.join(output_dir, "pytorch_model.pth"),
)
