In [None]:
%pip install datasets transformers peft torch evaluate sacrebleu accelerate bitsandbytes

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.4-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB

In [None]:
import os
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import DataLoader
import torch
from evaluate import load
import gc
import time

In [None]:
# Central experiment configuration; the helper functions below read it as a global.
project_config = {
    "model_name": "Helsinki-NLP/opus-mt-fr-en",
    "dataset_name": "opus100",
    # Dataset config name passed to load_dataset(); the code below reads "fr" as the
    # source side and "en" as the target side of each pair.
    "language_pair": "en-fr",
    # Rows taken from the head of the train split *before* the alignment filter runs.
    "max_samples": 5000,
    # Fraction of the filtered rows used for training; the remainder is the test set.
    "train_test_split": 0.8,
    "lora_config": {
        "r": 64,  # LoRA rank (raised over earlier runs for more capacity)
        "lora_alpha": 64,
        "lora_dropout": 0.05,  # Reduced from 0.1 for less regularization
        "target_modules": ["q", "v"]  # Placeholder: setup_lora_model() ignores this and derives the q_proj/v_proj module names itself
    },
    "training_config": {
        "batch_size": 4,
        "learning_rate":  1e-5,  # Reduced from 1e-4 for stability
        "epochs": 6  # Number of passes over the training set (raised over earlier runs)
    }
}

In [None]:
def clear_memory():
    """Run the garbage collector, then release PyTorch's cached CUDA memory when a GPU is available."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Pick the GPU when PyTorch can see one; later cells move the model and batches to `device`.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

def analyze_model_size(model):
    """Count the parameters of ``model`` and estimate their memory footprint.

    Sizes are estimated at 4 bytes per parameter, so they are only accurate for
    weights stored as 32-bit values.

    Args:
        model: Object exposing ``parameters()`` that yields tensors with ``numel()``
            and ``requires_grad`` (e.g. a ``torch.nn.Module``).

    Returns:
        dict with ``total_params``, ``trainable_params``, ``total_size_mb`` and
        ``trainable_size_mb``. A model without parameters yields zeros instead of
        raising ``ZeroDivisionError``.
    """
    bytes_per_param = 4
    bytes_per_mb = 1024 * 1024

    # Single pass over the parameters instead of iterating the model twice.
    total_params = 0
    trainable_params = 0
    for param in model.parameters():
        count = param.numel()
        total_params += count
        if param.requires_grad:
            trainable_params += count

    total_size_mb = total_params * bytes_per_param / bytes_per_mb
    trainable_size_mb = trainable_params * bytes_per_param / bytes_per_mb
    # Guard the percentage so a parameter-free module does not divide by zero.
    trainable_pct = trainable_params / total_params * 100 if total_params else 0.0

    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,} ({trainable_pct:.2f}%)")
    print(f"Total model size: {total_size_mb:.2f} MB")
    print(f"Trainable portion size: {trainable_size_mb:.2f} MB")
    return {"total_params": total_params, "trainable_params": trainable_params, "total_size_mb": total_size_mb, "trainable_size_mb": trainable_size_mb}

Using device: cuda


In [None]:
def load_dataset_for_translation():
    """Load the head of the configured train split, drop badly aligned pairs, and split it.

    Reads ``dataset_name``, ``language_pair``, ``max_samples`` and ``train_test_split``
    from the global ``project_config``. The split is positional: the first
    ``train_test_split`` fraction of the filtered rows is the training set and the
    rest is the test set (no shuffling).

    Returns:
        ``(train_dataset, test_dataset)``.
    """
    print("Loading dataset...")
    sample_cap = project_config["max_samples"]
    raw_pairs = load_dataset(
        project_config["dataset_name"],
        project_config["language_pair"],
        split=f"train[:{sample_cap}]",
    )

    def is_aligned(example):
        # Keep a pair only when the French/English word-count ratio lies in [0.5, 2.0].
        pair = example["translation"]
        fr_count = len(pair["fr"].split())
        en_count = len(pair["en"].split())
        # An empty English side counts as one word so the division cannot fail.
        return 0.5 <= fr_count / max(en_count, 1) <= 2.0

    aligned_pairs = raw_pairs.filter(is_aligned)
    num_aligned = len(aligned_pairs)
    cutoff = int(project_config["train_test_split"] * num_aligned)
    train_dataset = aligned_pairs.select(range(cutoff))
    test_dataset = aligned_pairs.select(range(cutoff, num_aligned))
    print(f"Loaded {len(train_dataset)} training and {len(test_dataset)} test samples after filtering")
    return train_dataset, test_dataset

train_dataset, test_dataset = load_dataset_for_translation()

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/327k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/334k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

Loaded 3780 training and 946 test samples after filtering


In [None]:
def setup_tokenizer_and_model():
    """Load the tokenizer and seq2seq model named in ``project_config`` and report the model size.

    The model is moved to the global ``device`` before its parameters are counted.

    Returns:
        ``(tokenizer, model, base_stats)`` where ``base_stats`` is the dict returned
        by ``analyze_model_size``.
    """
    checkpoint = project_config["model_name"]
    print(f"Loading model: {checkpoint}")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
    model = model.to(device)
    print("\nBase model statistics:")
    return tokenizer, model, analyze_model_size(model)

tokenizer, base_model, base_model_stats = setup_tokenizer_and_model()

Loading model: Helsinki-NLP/opus-mt-fr-en


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Base model statistics:
Total parameters: 75,133,952
Trainable parameters: 74,609,664 (99.30%)
Total model size: 286.61 MB
Trainable portion size: 284.61 MB


In [None]:
def evaluate_model(model, tokenizer, test_dataset, src_lang="fr", tgt_lang="en"):
    """Translate every test example one at a time and score the result with sacreBLEU.

    The model is switched to eval mode and left in it. Each source sentence is
    truncated to 128 tokens and decoded with 5-beam search on the global ``device``.

    Args:
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used both to encode sources and decode predictions.
        test_dataset: Indexable collection of rows with a ``"translation"`` dict.
        src_lang: Key of the source sentence inside ``"translation"``.
        tgt_lang: Key of the reference sentence inside ``"translation"``.

    Returns:
        ``{"bleu": <corpus score>}``.
    """
    bleu_metric = load("sacrebleu")
    model.eval()
    predictions = []
    references = []
    num_examples = len(test_dataset)
    generation_kwargs = dict(max_length=128, num_beams=5, length_penalty=0.8, early_stopping=True)

    print("Evaluating model on test set...")
    for position in range(num_examples):
        pair = test_dataset[position]["translation"]
        source_text = pair[src_lang]
        reference_text = pair[tgt_lang]
        encoded = tokenizer(source_text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            generated = model.generate(**encoded, **generation_kwargs)
        predictions.append(tokenizer.decode(generated[0], skip_special_tokens=True))
        # The metric takes a list of references per prediction, hence the nested list.
        references.append([reference_text])
        processed = position + 1
        if processed % 50 == 0:
            print(f"Processed {processed}/{num_examples} examples")

    score = bleu_metric.compute(predictions=predictions, references=references)["score"]
    print(f"BLEU score: {score:.2f}")
    return {"bleu": score}

# Baseline: score the model before any adapter is attached, then free cached memory.
print("Evaluating base model...")
base_model_results = evaluate_model(base_model, tokenizer, test_dataset)
clear_memory()

Evaluating base model...


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 38.24


In [None]:
def setup_lora_model(base_model):
    """Wrap ``base_model`` with LoRA adapters on every ``q_proj`` and ``v_proj`` layer.

    Target module names are discovered from the model's state-dict keys; rank,
    alpha and dropout come from ``project_config["lora_config"]`` (its
    ``target_modules`` entry is not used). The reported "size reduction" compares
    the trainable adapter size with the global ``base_model_stats["total_size_mb"]``.

    NOTE(review): get_peft_model presumably injects the adapters into ``base_model``
    itself rather than into a copy — confirm before reusing ``base_model`` as a baseline.

    Returns:
        ``(model, lora_stats)`` with ``lora_stats`` from ``analyze_model_size``.
    """
    print("Inspecting model structure...")
    weight_names = list(base_model.state_dict().keys())

    def modules_with(weight_suffix):
        # Module path = state-dict key with the '.weight' part removed.
        return [name.replace('.weight', '') for name in weight_names if weight_suffix in name]

    # All query projections first, then all value projections.
    target_modules = modules_with('q_proj.weight') + modules_with('v_proj.weight')
    print(f"Using {len(target_modules)} target modules: {target_modules[:5]}...")

    lora_settings = project_config["lora_config"]
    lora_config = LoraConfig(
        r=lora_settings["r"],
        lora_alpha=lora_settings["lora_alpha"],
        target_modules=target_modules,
        lora_dropout=lora_settings["lora_dropout"],
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM,
    )
    model = get_peft_model(base_model, lora_config)

    print("\nLoRA-adapted model statistics:")
    lora_stats = analyze_model_size(model)
    trainable_fraction = lora_stats["trainable_size_mb"] / base_model_stats["total_size_mb"]
    size_reduction = (1 - trainable_fraction) * 100
    print(f"Size reduction through LoRA: {size_reduction:.2f}%")
    return model, lora_stats

lora_model, lora_stats = setup_lora_model(base_model)

Inspecting model structure...
Using 36 target modules: ['model.encoder.layers.0.self_attn.q_proj', 'model.encoder.layers.1.self_attn.q_proj', 'model.encoder.layers.2.self_attn.q_proj', 'model.encoder.layers.3.self_attn.q_proj', 'model.encoder.layers.4.self_attn.q_proj']...

LoRA-adapted model statistics:
Total parameters: 77,493,248
Trainable parameters: 2,359,296 (3.04%)
Total model size: 295.61 MB
Trainable portion size: 9.00 MB
Size reduction through LoRA: 96.86%


In [None]:
def preprocess_dataset(dataset, tokenizer, src_lang="fr", tgt_lang="en", max_length=128):
    """Tokenize translation pairs into ``input_ids`` / ``attention_mask`` / ``labels``.

    Sources and targets are truncated and padded to ``max_length``. Padding
    positions in ``labels`` are replaced with -100 so the cross-entropy loss skips
    them; without this the loss is computed mostly on pad tokens, because every
    target is padded to ``max_length``.

    Args:
        dataset: Dataset with a ``"translation"`` column of ``{lang: text}`` dicts.
        tokenizer: Tokenizer that accepts ``text_target`` and exposes ``pad_token_id``.
        src_lang: Key of the source text inside each translation dict.
        tgt_lang: Key of the target text inside each translation dict.
        max_length: Fixed sequence length for both inputs and labels.

    Returns:
        The mapped dataset with the original columns removed.
    """
    pad_token_id = tokenizer.pad_token_id
    ignore_index = -100  # default ignore_index of torch.nn.CrossEntropyLoss

    def preprocess_function(examples):
        inputs = [ex[src_lang] for ex in examples["translation"]]
        targets = [ex[tgt_lang] for ex in examples["translation"]]
        # `text_target` tokenizes the targets with the target-side vocabulary and
        # stores them under "labels"; it replaces the deprecated
        # `with tokenizer.as_target_tokenizer():` context manager.
        model_inputs = tokenizer(
            inputs,
            text_target=targets,
            max_length=max_length,
            truncation=True,
            padding="max_length",
        )
        # Mask label padding so it does not contribute to the loss.
        model_inputs["labels"] = [
            [token_id if token_id != pad_token_id else ignore_index for token_id in label_ids]
            for label_ids in model_inputs["labels"]
        ]
        return model_inputs

    print("Tokenizing dataset...")
    processed_dataset = dataset.map(preprocess_function, batched=True, batch_size=project_config["training_config"]["batch_size"], remove_columns=dataset.column_names)
    return processed_dataset

processed_train = preprocess_dataset(train_dataset, tokenizer)
processed_test = preprocess_dataset(test_dataset, tokenizer)

Tokenizing dataset...
Tokenizing dataset...


In [None]:
# Sanity check: show the columns and row count of the tokenized training set.
print(processed_train)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3780
})


In [None]:
def create_dataloaders(processed_train, processed_test):
    """Build PyTorch dataloaders over the tokenized train and test sets.

    Both datasets are switched to torch format in place. The batch size is read
    from ``project_config`` at call time; only the training loader shuffles.

    Returns:
        ``(train_dataloader, test_dataloader)``.
    """
    for split in (processed_train, processed_test):
        split.set_format(type="torch")
    batch_size = project_config["training_config"]["batch_size"]
    train_dataloader = DataLoader(processed_train, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(processed_test, batch_size=batch_size)
    print(f"Created dataloaders with batch size {batch_size}")
    return train_dataloader, test_dataloader

train_dataloader, test_dataloader = create_dataloaders(processed_train, processed_test)

Created dataloaders with batch size 8


Starting epoch 1/6
  Batch 10/945, Loss: 7.1143, LR: 0.000010
  Batch 20/945, Loss: 4.8895, LR: 0.000010
  Batch 30/945, Loss: 6.0929, LR: 0.000010
  Batch 40/945, Loss: 4.6819, LR: 0.000010
  Batch 50/945, Loss: 4.0722, LR: 0.000010
  Batch 60/945, Loss: 5.3589, LR: 0.000010
  Batch 70/945, Loss: 4.7103, LR: 0.000010
  Batch 80/945, Loss: 3.5967, LR: 0.000010
  Batch 90/945, Loss: 4.3438, LR: 0.000010
  Batch 100/945, Loss: 3.4671, LR: 0.000010
  Batch 110/945, Loss: 3.1459, LR: 0.000010
  Batch 120/945, Loss: 2.2820, LR: 0.000010
  Batch 130/945, Loss: 3.4255, LR: 0.000010
  Batch 140/945, Loss: 1.5920, LR: 0.000010
  Batch 150/945, Loss: 1.8451, LR: 0.000010
  Batch 160/945, Loss: 0.9739, LR: 0.000010
  Batch 170/945, Loss: 0.4648, LR: 0.000010
  Batch 180/945, Loss: 0.3856, LR: 0.000010
  Batch 190/945, Loss: 0.3276, LR: 0.000010
  Batch 200/945, Loss: 0.2409, LR: 0.000010
  Batch 210/945, Loss: 0.2843, LR: 0.000010
  Batch 220/945, Loss: 0.4260, LR: 0.000010
  Batch 230/945, Loss:

In [None]:
def train_model(model, train_dataloader, num_epochs):
    """Train ``model`` with AdamW and cosine LR annealing, logging loss every 10 batches.

    The learning rate comes from ``project_config["training_config"]``; the
    scheduler is stepped once per batch over ``len(train_dataloader) * num_epochs``
    steps, and the gradient norm is clipped to 1.0 before each optimizer step.

    Args:
        model: Module whose forward returns an object with a ``.loss`` attribute.
        train_dataloader: Iterable of dict batches of tensors; must support ``len()``.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        ``(model, total_training_time)`` with the time in seconds summed over epochs.
    """
    steps_per_epoch = len(train_dataloader)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=project_config["training_config"]["learning_rate"],
        weight_decay=0.01,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=steps_per_epoch * num_epochs)
    model.train()
    total_training_time = 0
    best_loss = float('inf')  # tracked for logging only; no checkpoint is kept

    for epoch_number in range(1, num_epochs + 1):
        print(f"Starting epoch {epoch_number}/{num_epochs}")
        running_loss = 0
        epoch_started = time.time()

        for batch_number, batch in enumerate(train_dataloader, start=1):
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            loss = model(**batch).loss
            step_loss = loss.item()
            running_loss += step_loss

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if batch_number % 10 == 0:
                print(f"  Batch {batch_number}/{steps_per_epoch}, Loss: {step_loss:.4f}, LR: {scheduler.get_last_lr()[0]:.6f}")

        epoch_time = time.time() - epoch_started
        total_training_time += epoch_time
        avg_loss = running_loss / steps_per_epoch
        print(f"Epoch {epoch_number} completed. Average loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s")
        if avg_loss < best_loss:
            best_loss = avg_loss
            print(f"New best loss: {best_loss:.4f}")

    print(f"Training completed. Total time: {total_training_time:.2f}s")
    return model, total_training_time

In [None]:
# Fine-tune the LoRA-wrapped model for the configured number of epochs.
# `trained_model` and `training_time` are consumed by the evaluation/saving cells below.
trained_model, training_time = train_model(lora_model, train_dataloader, num_epochs=project_config["training_config"]["epochs"])

In [None]:
# Score the fine-tuned model on the same test set as the baseline and report the delta.
# Requires `trained_model` from the training cell; the NameError recorded under this
# cell shows it was last executed without that cell having run.
print("Evaluating LoRA fine-tuned model...")
lora_evaluation_results = evaluate_model(trained_model, tokenizer, test_dataset)
print(f"LoRA model BLEU score: {lora_evaluation_results['bleu']:.2f}")
print(f"Improvement over base model: {lora_evaluation_results['bleu'] - base_model_results['bleu']:.2f}")

# Persist the model and tokenizer via save_pretrained into a local output directory.
output_dir = "lora_fr_en_improved"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
trained_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Record the metrics together with the configuration that produced them.
results_to_save = {
    "base_model_bleu": base_model_results["bleu"],
    "lora_model_bleu": lora_evaluation_results["bleu"],
    "improvement": lora_evaluation_results["bleu"] - base_model_results["bleu"],
    "training_config": project_config["training_config"],
    "lora_config": project_config["lora_config"],
    "training_time_seconds": training_time
}
with open(f"{output_dir}/evaluation_results.json", "w") as f:
    json.dump(results_to_save, f, indent=4)
print(f"Model and results saved to {output_dir}")

Evaluating LoRA fine-tuned model...


NameError: name 'trained_model' is not defined

In [None]:
# Additionally dump everything in trained_model.state_dict() to a single .pth file in
# `output_dir` (set in the cell above), next to the files written by save_pretrained.
torch.save(trained_model.state_dict(), os.path.join(output_dir, "pytorch_model.pth"))


In [None]:
def train_model_bleu(model, train_dataloader, tokenizer, test_dataset, num_epochs=12):
    """Train ``model`` and finish with the weights from the best-BLEU evaluation.

    Runs AdamW with cosine LR annealing over ``len(train_dataloader) * num_epochs``
    steps, clipping the gradient norm to 1.0. After every 4th epoch the model is
    scored with ``evaluate_model`` on ``test_dataset``; when the score improves, the
    trainable parameters are copied into a snapshot, and that snapshot is written
    back into the model once training ends.

    The snapshot must be a copy: ``model.state_dict()`` hands back tensors that
    share storage with the live parameters, so keeping that dict would silently
    track later optimizer steps and "restoring" it would change nothing.
    Only parameters with ``requires_grad=True`` are snapshotted (frozen weights are
    not touched by the optimizer); buffers are not saved.

    NOTE(review): the checkpoint is selected on ``test_dataset``, so the reported
    best BLEU is not a held-out estimate.

    Args:
        model: Module whose forward returns an object with a ``.loss`` attribute.
        train_dataloader: Iterable of dict batches of tensors; must support ``len()``.
        tokenizer: Passed through to ``evaluate_model``.
        test_dataset: Passed through to ``evaluate_model``.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        ``(model, epoch_bleu_scores)`` where ``epoch_bleu_scores`` is a list of
        ``(epoch_number, bleu)`` tuples, one per evaluation.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=project_config["training_config"]["learning_rate"], weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_dataloader) * num_epochs)

    model.train()
    best_bleu = -1
    best_model = None  # name -> cloned tensor for every trainable parameter
    epoch_bleu_scores = []

    for epoch in range(num_epochs):
        print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            epoch_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if (step + 1) % 10 == 0:
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item():.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")

        epoch_time = time.time() - start_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

        # Only evaluate BLEU every 4 epochs
        if (epoch + 1) % 4 == 0:
            model.eval()
            bleu_result = evaluate_model(model, tokenizer, test_dataset)
            epoch_bleu = bleu_result['bleu']
            epoch_bleu_scores.append((epoch + 1, epoch_bleu))

            print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
            model.train()

            if epoch_bleu > best_bleu:
                best_bleu = epoch_bleu
                # Clone so the snapshot is decoupled from further optimizer steps.
                best_model = {
                    name: param.detach().clone()
                    for name, param in model.named_parameters()
                    if param.requires_grad
                }
                print("🎉 New best model found!")

    if best_model is not None:
        # Copy the best trainable weights back into the live parameters.
        with torch.no_grad():
            for name, param in model.named_parameters():
                if name in best_model:
                    param.copy_(best_model[name])
        print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f}")
    else:
        print(f"\n🏁 Training finished. No BLEU improvement during training.")

    return model, epoch_bleu_scores


In [None]:
def evaluate_model(model, tokenizer, test_dataset, src_lang="fr", tgt_lang="en", force_cpu=False):
    """Translate the test set sentence by sentence and return its sacreBLEU score.

    This definition replaces the earlier ``evaluate_model`` and adds ``force_cpu``.
    Side effects: the model is put in eval mode and moved to the evaluation device
    (``"cpu"`` when ``force_cpu`` is true, otherwise the global ``device``); it is
    not moved back afterwards.

    Args:
        model: Model exposing ``eval()``, ``to()`` and ``generate()``.
        tokenizer: Tokenizer used to encode sources and decode predictions.
        test_dataset: Indexable collection of rows with a ``"translation"`` dict.
        src_lang: Key of the source sentence inside ``"translation"``.
        tgt_lang: Key of the reference sentence inside ``"translation"``.
        force_cpu: Run generation on the CPU regardless of the global ``device``.

    Returns:
        ``{"bleu": <corpus score>}``.
    """
    bleu_metric = load("sacrebleu")
    model.eval()

    predictions, references = [], []

    device_eval = device if not force_cpu else "cpu"
    model.to(device_eval)

    num_examples = len(test_dataset)
    print("Evaluating model on test set...")
    for row_index in range(num_examples):
        translation = test_dataset[row_index]["translation"]
        source, reference = translation[src_lang], translation[tgt_lang]

        encoded = tokenizer(source, return_tensors="pt", padding=True, truncation=True, max_length=128)
        model_inputs = {field: tensor.to(device_eval) for field, tensor in encoded.items()}

        with torch.no_grad():
            beams = model.generate(
                **model_inputs,
                max_length=128,
                num_beams=5,
                length_penalty=0.8,
                early_stopping=True,
            )

        predictions.append(tokenizer.decode(beams[0], skip_special_tokens=True))
        # One reference per prediction, wrapped in a list as the metric expects.
        references.append([reference])

        seen = row_index + 1
        if seen % 50 == 0:
            print(f"Processed {seen}/{num_examples} examples")

    bleu_score = bleu_metric.compute(predictions=predictions, references=references)["score"]
    print(f"BLEU score: {bleu_score:.2f}")
    return {"bleu": bleu_score}


In [None]:
# from torch.quantization import quantize_dynamic

# def quantize_lora_model(model):
#     print("Starting dynamic quantization of LoRA model...")

#     # Only quantize nn.Linear layers — safe for transformer models
#     quantized_model = quantize_dynamic(
#         model,
#         {torch.nn.Linear},  # Target modules
#         dtype=torch.qint8   # 8-bit quantization
#     )

#     print("Quantization complete.")
#     return quantized_model

# quantized_lora_model = quantize_lora_model(trained_model.cpu())  # Make sure model is on CPU


In [None]:
# quantized_eval_results = evaluate_model(quantized_lora_model, tokenizer, test_dataset, force_cpu=True)
# print(f"Quantized LoRA BLEU Score: {quantized_eval_results['bleu']:.2f}")


In [None]:
# base_model_cpu = base_model.cpu()  # move to CPU
from torch.quantization import quantize_dynamic
from transformers import AutoModelForSeq2SeqLM

# Load a fresh copy of the base checkpoint (no .to(device) call, so it is not moved
# to the GPU here) and convert its nn.Linear layers to int8 dynamic quantization.
base_model_cpu = AutoModelForSeq2SeqLM.from_pretrained(project_config["model_name"])
quantized_base_model = quantize_dynamic(base_model_cpu, {torch.nn.Linear}, dtype=torch.qint8)


In [None]:
# NOTE(review): `model` is not assigned at notebook level in any cell visible above, so
# this call depends on leftover kernel state; pass the intended model explicitly
# (presumably `quantized_base_model` — confirm that it can be trained at all).
# The traceback recorded under this cell names train_model_bleu(), so the saved
# output appears to come from an earlier version of this line.
quantized_finetuned_model, quantized_finetuned_training_time = train_model(model, train_dataloader, num_epochs=project_config["training_config"]["epochs"])


TypeError: train_model_bleu() missing 2 required positional arguments: 'tokenizer' and 'test_dataset'

In [None]:
# The dynamically quantized model was built from a CPU-loaded checkpoint; evaluate it
# on the CPU explicitly. Without force_cpu, evaluate_model() would call
# model.to(device) and try to move the int8 model to the GPU when one is available.
quantized_base_results = evaluate_model(quantized_base_model, tokenizer, test_dataset, force_cpu=True)
print(f"Quantized base model BLEU: {quantized_base_results['bleu']:.2f}")

In [None]:
# Score the fine-tuned quantized model. The label previously read "Quantized base
# model BLEU" (copied from the cell above), which mislabelled this result.
# NOTE(review): if this model is dynamically quantized as well, it presumably needs
# force_cpu=True like the base-model evaluation — confirm.
quantized_finetuned_results = evaluate_model(quantized_finetuned_model, tokenizer, test_dataset)
print(f"Quantized fine-tuned model BLEU: {quantized_finetuned_results['bleu']:.2f}")

NameError: name 'quantized_finetuned_model' is not defined

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization; matmuls are computed in float16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Same checkpoint string as project_config["model_name"], repeated here as a literal.
model_name = "Helsinki-NLP/opus-mt-fr-en"

# NOTE: this rebinds the notebook-level `tokenizer` created earlier.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE: this rebinds the notebook-level `base_model`; cells that run after this one
# see the 4-bit model under that name, not the model loaded at the top.
base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare the quantized model for adapter training (QLoRA)
base_model = prepare_model_for_kbit_training(base_model)




In [None]:
# Adapter settings for the 4-bit model. These are literals, independent of
# project_config["lora_config"]; the short names "q_proj"/"v_proj" are used here
# instead of the full module paths collected by setup_lora_model().
lora_config = LoraConfig(
    r=128,
    lora_alpha=128,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

# Attach the adapters to the quantized base model and report the trainable share.
qlora_base_model = get_peft_model(base_model, lora_config)
qlora_base_model.print_trainable_parameters()


trainable params: 4,718,592 || all params: 79,852,544 || trainable%: 5.9091


In [None]:
# Mutates the shared config in place. Dataloaders built before this cell keep the
# batch size they were created with; call create_dataloaders() again for the new
# value to take effect.
project_config["training_config"].update({
    "batch_size": 8,       # adjust if CUDA memory allows
    "learning_rate": 2e-5,
    "epochs": 4
})

In [None]:
def train_model_with_bleu(model, train_dataloader, tokenizer, test_dataset, num_epochs=12):
    """Train ``model``, evaluate BLEU every 4th epoch, and return the best-scoring weights.

    Runs AdamW with cosine LR annealing over ``len(train_dataloader) * num_epochs``
    steps, clipping the gradient norm to 1.0. When an evaluation improves on the
    best BLEU so far, the trainable parameters are cloned into a snapshot; after
    the last epoch that snapshot is copied back into the model, so the returned
    model matches the "Best BLEU" that is reported.

    Previously the snapshot was ``model.state_dict()``, whose tensors share storage
    with the live parameters, and it was never loaded back, so the function always
    returned the final-epoch weights. Only parameters with ``requires_grad=True``
    are snapshotted (frozen weights are not touched by the optimizer); buffers are
    not saved.

    NOTE(review): the checkpoint is selected on ``test_dataset``, so the reported
    best BLEU is not a held-out estimate.

    Args:
        model: Module whose forward returns an object with a ``.loss`` attribute.
        train_dataloader: Iterable of dict batches of tensors; must support ``len()``.
        tokenizer: Passed through to ``evaluate_model``.
        test_dataset: Passed through to ``evaluate_model``.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        ``(model, epoch_bleu_scores)`` where ``epoch_bleu_scores`` is a list of
        ``(epoch_number, bleu)`` tuples, one per evaluation.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=project_config["training_config"]["learning_rate"], weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_dataloader) * num_epochs)

    model.train()
    best_bleu = -1
    best_model = None  # name -> cloned tensor for every trainable parameter
    epoch_bleu_scores = []

    for epoch in range(num_epochs):
        print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            epoch_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if (step + 1) % 10 == 0:
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item():.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")

        epoch_time = time.time() - start_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

        # Only evaluate BLEU every 4 epochs
        if (epoch + 1) % 4 == 0:
            model.eval()
            bleu_result = evaluate_model(model, tokenizer, test_dataset)
            epoch_bleu = bleu_result['bleu']
            epoch_bleu_scores.append((epoch + 1, epoch_bleu))

            print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
            model.train()

            if epoch_bleu > best_bleu:
                best_bleu = epoch_bleu
                # Clone so the snapshot is decoupled from further optimizer steps.
                best_model = {
                    name: param.detach().clone()
                    for name, param in model.named_parameters()
                    if param.requires_grad
                }
                print("🎉 New best model found!")

    if best_model is not None:
        # Copy the best trainable weights back into the live parameters.
        with torch.no_grad():
            for name, param in model.named_parameters():
                if name in best_model:
                    param.copy_(best_model[name])
        print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f}")
    else:
        print(f"\n🏁 Training finished. No BLEU improvement during training.")

    return model, epoch_bleu_scores


In [None]:
# def preprocess_dataset(dataset, tokenizer, src_lang="fr", tgt_lang="en", max_length=128):
#     def preprocess_function(examples):
#         inputs = [ex[src_lang] for ex in examples["translation"]]
#         targets = [ex[tgt_lang] for ex in examples["translation"]]
#         model_inputs = tokenizer(
#             inputs, max_length=max_length, truncation=True, padding="max_length"
#         )
#         with tokenizer.as_target_tokenizer():
#             labels = tokenizer(
#                 targets, max_length=max_length, truncation=True, padding="max_length"
#             )
#         model_inputs["labels"] = labels["input_ids"]
#         return model_inputs

#     print("Tokenizing dataset...")
#     # ✅ Only remove other columns, NOT translation
#     keep_translation = [col for col in dataset.column_names if col != "translation"]
#     processed_dataset = dataset.map(
#         preprocess_function,
#         batched=True,
#         batch_size=project_config["training_config"]["batch_size"],
#         remove_columns=keep_translation
#     )

#     return processed_dataset


# processed_train = preprocess_dataset(train_dataset, tokenizer)
# processed_test = preprocess_dataset(test_dataset, tokenizer)

Tokenizing dataset...
Tokenizing dataset...


In [None]:
from peft import LoraConfig

# Raise adapter capacity and training length in the shared config first, then build
# the LoraConfig from those values so the numbers are written down only once.
project_config["lora_config"].update(r=256, lora_alpha=512, lora_dropout=0.01)
project_config["training_config"].update(batch_size=8, learning_rate=2e-5, epochs=12)

lora_config = LoraConfig(
    r=project_config["lora_config"]["r"],
    lora_alpha=project_config["lora_config"]["lora_alpha"],
    target_modules=["q_proj", "v_proj"],
    lora_dropout=project_config["lora_config"]["lora_dropout"],  # less regularization
    bias="none",
    task_type="SEQ_2_SEQ_LM",
)


In [None]:
# from torch.cuda.amp import autocast
# from torch.amp import GradScaler
# from torch.nn import CrossEntropyLoss
# from transformers import DataCollatorForSeq2Seq, get_scheduler
# import torch.nn.functional as F
# import time
# import os

# def train_model_with_bleu(
#     model,
#     tokenizer,
#     train_dataset,
#     test_dataset,
#     output_dir="qlora_best_bleu",
#     num_epochs=12,
#     eval_every=5,
#     patience=3,
#     gradient_accumulation_steps=2,
#     label_smoothing=0.1,
#     use_wandb=False
# ):
#     # ✅ Optional: W&B logging
#     if use_wandb:
#         import wandb
#         wandb.init(project="qlora-fr-en", config=project_config)

#     # ✅ Data collator to handle variable-length & skip 'translation'
#     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#     train_dataloader = DataLoader(
#         train_dataset,
#         batch_size=project_config["training_config"]["batch_size"],
#         shuffle=True,
#         collate_fn=data_collator
#     )

#     optimizer = torch.optim.AdamW(
#         model.parameters(),
#         lr=project_config["training_config"]["learning_rate"],
#         weight_decay=0.01
#     )

#     scaler = GradScaler(device='cuda')
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
#         optimizer, T_0=2, T_mult=2
#     )

#     loss_fn = CrossEntropyLoss(ignore_index=-100, label_smoothing=label_smoothing)

#     model.train()
#     best_bleu = -1
#     best_model = None
#     epoch_bleu_scores = []
#     no_improve_epochs = 0

#     os.makedirs(output_dir, exist_ok=True)

#     for epoch in range(num_epochs):
#         print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
#         epoch_loss = 0
#         start_time = time.time()

#         for step, batch in enumerate(train_dataloader):
#             batch = {k: v.to(device) for k, v in batch.items()}

#             with autocast(dtype=torch.float16):
#               outputs = model(**batch)
#               logits = outputs.logits
#               loss = loss_fn(logits.view(-1, logits.size(-1)), batch["labels"].view(-1))
#               loss = loss / gradient_accumulation_steps


#             scaler.scale(loss).backward()

#             if (step + 1) % gradient_accumulation_steps == 0:
#                 scaler.unscale_(optimizer)
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#                 scaler.step(optimizer)
#                 scaler.update()
#                 scheduler.step()
#                 optimizer.zero_grad()

#             epoch_loss += loss.item() * gradient_accumulation_steps

#             if (step + 1) % 10 == 0:
#                 print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item()*gradient_accumulation_steps:.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")

#         epoch_time = time.time() - start_time
#         avg_loss = epoch_loss / len(train_dataloader)
#         print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

#         if use_wandb:
#             wandb.log({"epoch": epoch+1, "avg_loss": avg_loss, "lr": scheduler.get_last_lr()[0]})

#         if (epoch + 1) % eval_every == 0:
#             model.eval()
#             bleu_result = evaluate_model(model, tokenizer, test_dataset)
#             epoch_bleu = bleu_result['bleu']
#             epoch_bleu_scores.append((epoch + 1, epoch_bleu))
#             print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
#             model.train()

#             if use_wandb:
#                 wandb.log({"epoch": epoch+1, "bleu": epoch_bleu})

#             if epoch_bleu > best_bleu:
#                 best_bleu = epoch_bleu
#                 best_model = model.state_dict()
#                 model.save_pretrained(output_dir)
#                 tokenizer.save_pretrained(output_dir)
#                 print(f"💾 New best model saved to {output_dir} with BLEU: {best_bleu:.2f}")
#                 no_improve_epochs = 0
#             else:
#                 no_improve_epochs += 1
#                 print(f"😕 No improvement. Patience {no_improve_epochs}/{patience}")
#                 if no_improve_epochs >= patience:
#                     print("⛔ Early stopping triggered.")
#                     break

#     print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f}")
#     return model, epoch_bleu_scores


In [None]:
from torch.cuda.amp import autocast, GradScaler
from torch.nn import CrossEntropyLoss
from transformers import get_scheduler
import time

def train_model_with_bleu(
    model,
    train_dataloader,
    tokenizer,
    test_dataset,
    num_epochs=12,
    eval_every=5,
    patience=3,
    gradient_accumulation_steps=2,
    label_smoothing=0.1,
    warmup_steps=200
):
    """Fine-tune `model` with mixed precision and BLEU-based early stopping.

    Args:
        model: Model whose forward pass accepts a batch as keyword arguments and
            returns an object exposing `.logits`. It is trained in place.
        train_dataloader: Sized iterable of dict batches of tensors; every batch
            must contain a "labels" entry.
        tokenizer: Passed through to `evaluate_model`.
        test_dataset: Passed through to `evaluate_model`.
        num_epochs: Maximum number of passes over `train_dataloader`.
        eval_every: Run a BLEU evaluation after every `eval_every`-th epoch.
        patience: Stop after this many consecutive evaluations without a new
            best BLEU.
        gradient_accumulation_steps: Micro-batches accumulated per optimizer update.
        label_smoothing: Label-smoothing factor of the cross-entropy loss.
        warmup_steps: Linear warmup length, counted in optimizer updates.

    Returns:
        (model, epoch_bleu_scores): the same model object, carrying the trainable
        weights from its best-scoring evaluation (when at least one evaluation
        ran), and a list of (epoch, bleu) tuples.

    Raises:
        ValueError: If `gradient_accumulation_steps` is smaller than 1.
    """
    if gradient_accumulation_steps < 1:
        raise ValueError("gradient_accumulation_steps must be >= 1")

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=project_config["training_config"]["learning_rate"],
        weight_decay=0.01,
    )

    num_batches = len(train_dataloader)
    # The scheduler is stepped once per optimizer update, not once per batch, so
    # its horizon has to be counted in updates for warmup/decay to finish on time.
    updates_per_epoch = (num_batches + gradient_accumulation_steps - 1) // gradient_accumulation_steps
    total_steps = updates_per_epoch * num_epochs
    scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps,
    )

    # Device-generic AMP API (torch.cuda.amp.* is deprecated); mixed precision is
    # only enabled when CUDA is actually present.
    use_amp = torch.cuda.is_available()
    scaler = torch.amp.GradScaler("cuda", enabled=use_amp)
    loss_fn = CrossEntropyLoss(ignore_index=-100, label_smoothing=label_smoothing)

    model.train()
    best_bleu = -1
    # CPU copy of the trainable weights at the best evaluation. model.state_dict()
    # would only alias the live tensors and silently follow later updates.
    best_trainable_state = None
    epoch_bleu_scores = []
    no_improve_epochs = 0

    for epoch in range(num_epochs):
        print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}

            with torch.autocast(device_type="cuda", enabled=use_amp):
                outputs = model(**batch)
                logits = outputs.logits
                loss = loss_fn(logits.view(-1, logits.size(-1)), batch["labels"].view(-1))
                loss = loss / gradient_accumulation_steps

            scaler.scale(loss).backward()

            # Also flush on the final batch so a trailing partial accumulation
            # window is applied instead of leaking into the next epoch.
            is_last_batch = (step + 1) == num_batches
            if (step + 1) % gradient_accumulation_steps == 0 or is_last_batch:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            # Undo the accumulation scaling so the logged value is the per-batch loss.
            epoch_loss += loss.item() * gradient_accumulation_steps

            if (step + 1) % 10 == 0:
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item()*gradient_accumulation_steps:.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")

        epoch_time = time.time() - start_time
        avg_loss = epoch_loss / max(num_batches, 1)
        print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

        if (epoch + 1) % eval_every == 0:
            model.eval()
            bleu_result = evaluate_model(model, tokenizer, test_dataset)
            epoch_bleu = bleu_result['bleu']
            epoch_bleu_scores.append((epoch + 1, epoch_bleu))
            print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
            model.train()

            if epoch_bleu > best_bleu:
                best_bleu = epoch_bleu
                # Snapshot only parameters that train; frozen weights never change,
                # so they do not need to be copied or reloaded.
                best_trainable_state = {
                    name: param.detach().cpu().clone()
                    for name, param in model.named_parameters()
                    if param.requires_grad
                }
                no_improve_epochs = 0
                print("🎉 New best model found!")
            else:
                no_improve_epochs += 1
                print(f"😕 No improvement. Patience {no_improve_epochs}/{patience}")
                if no_improve_epochs >= patience:
                    print("⛔ Early stopping triggered.")
                    break

    if best_trainable_state is not None:
        # Copy the best trainable weights back in place; this avoids a full
        # load_state_dict over the frozen weights.
        with torch.no_grad():
            for name, param in model.named_parameters():
                if name in best_trainable_state:
                    param.copy_(best_trainable_state[name].to(param.device))
        print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f} (best trainable weights restored; frozen base weights untouched)")
    else:
        # Reached only when no evaluation ran at all (num_epochs < eval_every).
        print(f"\n🏁 Training finished. No BLEU improvement during training.")

    return model, epoch_bleu_scores


In [None]:
# train_model_with_bleu returns (model, bleu_history). Wall-clock time is measured
# here so that `qlora_training_time` holds seconds rather than the BLEU history.
qlora_train_start = time.time()
trained_qlora_model, bleu_scores = train_model_with_bleu(
    model=qlora_base_model,
    train_dataloader=train_dataloader,
    tokenizer=tokenizer,
    test_dataset=test_dataset,
    num_epochs=25
)
qlora_training_time = time.time() - qlora_train_start
# trained_model, bleu_scores = train_model_with_bleu(
#     model=qlora_base_model,
#     tokenizer=tokenizer,
#     train_dataset=processed_train,
#     test_dataset=processed_test,
#     output_dir="qlora_best_bleu",
#     num_epochs=20,
#     # use_wandb=True  # or False if not using W&B
# )


  scaler = GradScaler()
  with autocast():



🔁 Epoch 1/25
  Step 10/473 | Loss: 6.0284 | LR: 0.000001
  Step 20/473 | Loss: 6.7399 | LR: 0.000001
  Step 30/473 | Loss: 6.7809 | LR: 0.000002
  Step 40/473 | Loss: 6.0555 | LR: 0.000002
  Step 50/473 | Loss: 6.5590 | LR: 0.000003
  Step 60/473 | Loss: 5.9080 | LR: 0.000003
  Step 70/473 | Loss: 6.2935 | LR: 0.000003
  Step 80/473 | Loss: 7.0202 | LR: 0.000004
  Step 90/473 | Loss: 5.9373 | LR: 0.000005
  Step 100/473 | Loss: 7.0344 | LR: 0.000005
  Step 110/473 | Loss: 5.1919 | LR: 0.000006
  Step 120/473 | Loss: 5.6854 | LR: 0.000006
  Step 130/473 | Loss: 5.2614 | LR: 0.000007
  Step 140/473 | Loss: 5.3777 | LR: 0.000007
  Step 150/473 | Loss: 5.2681 | LR: 0.000008
  Step 160/473 | Loss: 5.7684 | LR: 0.000008
  Step 170/473 | Loss: 5.4491 | LR: 0.000008
  Step 180/473 | Loss: 3.9313 | LR: 0.000009
  Step 190/473 | Loss: 3.9059 | LR: 0.000010
  Step 200/473 | Loss: 3.8621 | LR: 0.000010
  Step 210/473 | Loss: 3.8142 | LR: 0.000011
  Step 220/473 | Loss: 3.5750 | LR: 0.000011
  Ste

In [None]:
import matplotlib.pyplot as plt

# `bleu_scores` is a list of (epoch, bleu) tuples recorded only on evaluation
# epochs, so split it into x/y series instead of plotting the tuples themselves.
if bleu_scores:
    eval_epochs, eval_bleu = zip(*bleu_scores)
else:
    eval_epochs, eval_bleu = (), ()

plt.plot(eval_epochs, eval_bleu, marker='o')
plt.xlabel("Epoch")
plt.ylabel("BLEU Score")
plt.title("BLEU Score per Epoch")
plt.grid(True)
plt.show()


In [None]:
# Write the fine-tuned model and its tokenizer into one directory so they can be
# loaded together later.
qlora_output_dir = "qlora_fr_en"
trained_qlora_model.save_pretrained(qlora_output_dir)
tokenizer.save_pretrained(qlora_output_dir)


('qlora_fr_en/tokenizer_config.json',
 'qlora_fr_en/special_tokens_map.json',
 'qlora_fr_en/vocab.json',
 'qlora_fr_en/source.spm',
 'qlora_fr_en/target.spm',
 'qlora_fr_en/added_tokens.json')

In [None]:
# def evaluate_model(model, tokenizer, test_dataset, src_lang="fr", tgt_lang="en"):
#     from evaluate import load
#     bleu_metric = load("sacrebleu")
#     model.eval()

#     predictions = []
#     references = []

#     print("Evaluating QLoRA model on test set...")
#     for i in range(len(test_dataset)):
#         example = test_dataset[i]
#         source = example["translation"][src_lang]
#         reference = example["translation"][tgt_lang]

#         tokenized_input = tokenizer(source, return_tensors="pt", padding=True, truncation=True, max_length=128)
#         # Move input tensors to model's device (usually GPU)
#         input_device = next(model.parameters()).device
#         tokenized_input = {k: v.to(input_device) for k, v in tokenized_input.items()}

#         with torch.no_grad():
#             output = model.generate(**tokenized_input, max_length=128, num_beams=5, length_penalty=0.8, early_stopping=True)

#         prediction = tokenizer.decode(output[0], skip_special_tokens=True)
#         predictions.append(prediction)
#         references.append([reference])

#         if (i + 1) % 50 == 0:
#             print(f"Processed {i + 1}/{len(test_dataset)} examples")

#     bleu_result = bleu_metric.compute(predictions=predictions, references=references)
#     print(f"BLEU score: {bleu_result['score']:.2f}")
#     return {"bleu": bleu_result["score"]}


In [None]:
# Score the fine-tuned QLoRA model on the test set and report its BLEU.
qlora_finetuned_results = evaluate_model(trained_qlora_model, tokenizer, test_dataset)
finetuned_bleu = qlora_finetuned_results['bleu']
print(f"QLoRA Model BLEU Score: {finetuned_bleu:.2f}")



Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 38.22
QLoRA Model BLEU Score: 38.22


In [None]:
# train_model_with_bleu trains the model it is given in place and returns that
# same object, so once the training cell has run `qlora_base_model` IS the
# fine-tuned model. To score the untouched base weights, switch the LoRA adapters
# off for this evaluation; if the model exposes no `disable_adapter` (i.e. it is
# not a PEFT-wrapped model), fall back to a plain evaluation.
from contextlib import nullcontext

adapters_off = (
    qlora_base_model.disable_adapter()
    if hasattr(qlora_base_model, "disable_adapter")
    else nullcontext()
)
with adapters_off:
    qlora_base_results = evaluate_model(qlora_base_model, tokenizer, test_dataset)
print(f"QLoRA Base Model BLEU Score: {qlora_base_results['bleu']:.2f}")

Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 38.06
QLoRA Model BLEU Score: 38.06


In [None]:
# The parameter summary lives on the model; `qlora_base_results` is the plain
# dict returned by evaluate_model and has no such method.
qlora_base_model.print_trainable_parameters()

AttributeError: 'dict' object has no attribute 'print_trainable_parameters'

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# "Helsinki-NLP/opus-mt-tiny" is not a published checkpoint, so loading it raises
# OSError. Build the student from the teacher checkpoint instead, keeping fewer
# encoder/decoder layers: the config overrides below make from_pretrained
# instantiate a shallower model and load only the weights that still exist in it.
# Sharing the teacher's tokenizer/vocabulary also keeps student and teacher logits
# comparable, which the KL-based distillation loss requires.
student_model_name = project_config["model_name"]
STUDENT_ENCODER_LAYERS = 3  # NOTE(review): student depth is a tunable choice
STUDENT_DECODER_LAYERS = 3

student_tokenizer = AutoTokenizer.from_pretrained(student_model_name)
student_model = AutoModelForSeq2SeqLM.from_pretrained(
    student_model_name,
    encoder_layers=STUDENT_ENCODER_LAYERS,
    decoder_layers=STUDENT_DECODER_LAYERS,
).to(device)


OSError: Helsinki-NLP/opus-mt-tiny is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from torch.nn import functional as F

def train_student_model(student_model, teacher_model, train_dataloader, alpha=0.7, temperature=2.0, epochs=3):
    """Distil `teacher_model` into `student_model` with a KL + label loss.

    Each step minimises
    `alpha * T^2 * KL(teacher || student) + (1 - alpha) * student_label_loss`,
    where the KL term is computed on temperature-softened distributions and
    averaged over the target positions whose label is not -100.

    Args:
        student_model: Model being trained; called with `input_ids`,
            `attention_mask` and `labels`, and expected to return an object with
            `.logits` and `.loss`.
        teacher_model: Model providing soft targets; put in eval mode and run
            without gradients. Its logits must have the same shape as the
            student's (same vocabulary).
        train_dataloader: Sized iterable of dict batches with "input_ids",
            "attention_mask" and "labels" tensors.
        alpha: Weight of the distillation term (the label loss gets 1 - alpha).
        temperature: Softmax temperature applied to both sets of logits.
        epochs: Number of passes over `train_dataloader`.

    Returns:
        The trained `student_model` (same object that was passed in).

    Raises:
        ValueError: If student and teacher logits differ in shape.
    """
    optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)
    student_model.train()
    teacher_model.eval()

    # Use the device the student already lives on instead of a notebook global.
    student_device = next(student_model.parameters()).device
    num_batches = len(train_dataloader)

    total_training_time = 0.0
    best_loss = float('inf')

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        epoch_loss = 0.0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            input_ids = batch["input_ids"].to(student_device)
            attention_mask = batch["attention_mask"].to(student_device)
            labels = batch["labels"].to(student_device)

            # Forward pass through teacher. It receives the same labels as the
            # student so both models are driven by the same target-side inputs
            # and their logits line up position by position.
            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                teacher_logits = teacher_outputs.logits.float() / temperature

            # Forward pass through student
            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            student_logits = student_outputs.logits.float() / temperature
            student_loss = student_outputs.loss

            if student_logits.shape != teacher_logits.shape:
                raise ValueError(
                    f"Student logits {tuple(student_logits.shape)} and teacher logits "
                    f"{tuple(teacher_logits.shape)} must match for distillation"
                )

            # Per-position KL divergence, summed over the vocabulary axis.
            token_kl = F.kl_div(
                input=F.log_softmax(student_logits, dim=-1),
                target=F.softmax(teacher_logits, dim=-1),
                reduction="none"
            ).sum(dim=-1)

            # Ignore positions labelled -100 (the ignore index used for the label
            # loss in this notebook) and average over the remaining tokens.
            valid_tokens = (labels != -100).to(token_kl.dtype)
            distill_loss = (token_kl * valid_tokens).sum() / valid_tokens.sum().clamp(min=1.0)
            distill_loss = distill_loss * (temperature ** 2)

            total_loss = alpha * distill_loss + (1 - alpha) * student_loss

            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += total_loss.item()
            if (step + 1) % 10 == 0:
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {total_loss.item():.4f}")

        elapsed = time.time() - start_time
        total_training_time += elapsed
        avg_loss = epoch_loss / max(num_batches, 1)
        print(f"Epoch {epoch+1} done | Avg Loss: {avg_loss:.4f} | Time: {elapsed:.2f}s")

        if avg_loss < best_loss:
            best_loss = avg_loss
            print(f"✅ New best model found with loss: {best_loss:.4f}")

    print(f"Total training time: {total_training_time:.2f}s")
    return student_model


In [None]:
# The teacher is the fine-tuned QLoRA model produced by the training cell
# (`trained_qlora_model`); no variable named `trained_model` is defined.
trained_student_model = train_student_model(
    student_model=student_model,
    teacher_model=trained_qlora_model,  # Your QLoRA fine-tuned model
    train_dataloader=train_dataloader,
    alpha=0.7,  # 70% distillation, 30% true label
    temperature=2.0,
    epochs=3
)


In [None]:
# Score the distilled student on the test set, using the student's own tokenizer.
student_eval_results = evaluate_model(trained_student_model, student_tokenizer, test_dataset)
student_bleu = student_eval_results['bleu']
print(f"Student Model BLEU Score: {student_bleu:.2f}")


In [None]:
quantized_student_dir = "student_model_quantized"
os.makedirs(quantized_student_dir, exist_ok=True)

# Quantize in inference mode; dynamic quantization targets CPU execution, hence
# the move to CPU (note that .cpu() also moves `trained_student_model` itself).
trained_student_model.eval()
quantized_student = torch.quantization.quantize_dynamic(
    trained_student_model.cpu(),
    {torch.nn.Linear},
    dtype=torch.qint8
)

# The quantized Linear layers hold packed int8 parameters that from_pretrained
# cannot rebuild, so store the quantized state dict with torch.save and keep the
# config + tokenizer alongside it. To reload: build the float student from the
# saved config, apply the same quantize_dynamic call, then load_state_dict().
torch.save(
    quantized_student.state_dict(),
    os.path.join(quantized_student_dir, "quantized_state_dict.pt"),
)
trained_student_model.config.save_pretrained(quantized_student_dir)
student_tokenizer.save_pretrained(quantized_student_dir)

print("✅ Quantized student model saved for CPU/mobile inference.")
