In [None]:
%pip install datasets transformers peft torch evaluate sacrebleu accelerate bitsandbytes

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB

In [None]:
import os
import json
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import LoraConfig, get_peft_model, TaskType
from torch.utils.data import DataLoader
import torch
from evaluate import load
import gc
import time

In [None]:
project_config = {
    "model_name": "Helsinki-NLP/opus-mt-fr-en",
    "dataset_name": "opus100",
    "language_pair": "en-fr",
    "max_samples": 20000,
    "train_test_split": 0.8,
    "lora_config": {
        "r": 64,  # Increased from 16 to 32 for more capacity
        "lora_alpha": 64,
        "lora_dropout": 0.05,  # Reduced from 0.1 for less regularization
        "target_modules": ["q", "v"]  # Will be set dynamically
    },
    "training_config": {
        "batch_size": 4,
        "learning_rate":  1e-5,  # Reduced from 1e-4 for stability
        "epochs": 6  # Increased from 3 to 5 for more training
    }
}

In [None]:
def clear_memory():
    gc.collect()
    torch.cuda.empty_cache() if torch.cuda.is_available() else None

device = 'cuda' if torch.cuda.is_available()  else 'cpu'
print(f"Using device: {device}")

def analyze_model_size(model):
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_size_mb = total_params * 4 / (1024 * 1024)
    trainable_size_mb = trainable_params * 4 / (1024 * 1024)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,} ({trainable_params/total_params*100:.2f}%)")
    print(f"Total model size: {total_size_mb:.2f} MB")
    print(f"Trainable portion size: {trainable_size_mb:.2f} MB")
    return {"total_params": total_params, "trainable_params": trainable_params, "total_size_mb": total_size_mb, "trainable_size_mb": trainable_size_mb}

Using device: cuda


In [None]:
def load_dataset_for_translation():
    print("Loading dataset...")
    dataset = load_dataset(project_config["dataset_name"], project_config["language_pair"], split=f"train[:{project_config['max_samples']}]")

    # Filter misaligned examples
    def is_aligned(example):
        fr_words = len(example["translation"]["fr"].split())
        en_words = len(example["translation"]["en"].split())
        ratio = fr_words / max(en_words, 1)
        return 0.5 <= ratio <= 2.0  # Stricter alignment check

    dataset = dataset.filter(is_aligned)
    train_size = int(project_config["train_test_split"] * len(dataset))
    train_dataset = dataset.select(range(train_size))
    test_dataset = dataset.select(range(train_size, len(dataset)))
    print(f"Loaded {len(train_dataset)} training and {len(test_dataset)} test samples after filtering")
    return train_dataset, test_dataset

train_dataset, test_dataset = load_dataset_for_translation()

Loading dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/327k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/334k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20000 [00:00<?, ? examples/s]

Loaded 15210 training and 3803 test samples after filtering


In [None]:
def setup_tokenizer_and_model():
    print(f"Loading model: {project_config['model_name']}")
    tokenizer = AutoTokenizer.from_pretrained(project_config["model_name"])
    model = AutoModelForSeq2SeqLM.from_pretrained(project_config["model_name"]).to(device)
    print("\nBase model statistics:")
    base_stats = analyze_model_size(model)
    return tokenizer, model, base_stats

tokenizer, base_model, base_model_stats = setup_tokenizer_and_model()

Loading model: Helsinki-NLP/opus-mt-fr-en


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]


Base model statistics:
Total parameters: 75,133,952
Trainable parameters: 74,609,664 (99.30%)
Total model size: 286.61 MB
Trainable portion size: 284.61 MB


In [None]:
def evaluate_model(model, tokenizer, test_dataset, src_lang="fr", tgt_lang="en"):
    bleu_metric = load("sacrebleu")
    model.eval()
    predictions, references = [], []
    print("Evaluating model on test set...")
    for i in range(len(test_dataset)):
        example = test_dataset[i]
        source = example["translation"][src_lang]
        reference = example["translation"][tgt_lang]
        tokenized_input = tokenizer(source, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)
        with torch.no_grad():
            output = model.generate(**tokenized_input, max_length=128, num_beams=5, length_penalty=0.8, early_stopping=True)
        prediction = tokenizer.decode(output[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append([reference])
        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1}/{len(test_dataset)} examples")
    bleu_result = bleu_metric.compute(predictions=predictions, references=references)
    print(f"BLEU score: {bleu_result['score']:.2f}")
    return {"bleu": bleu_result["score"]}

print("Evaluating base model...")
base_model_results = evaluate_model(base_model, tokenizer, test_dataset)
clear_memory()

Evaluating base model...


Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 38.24


In [None]:
def setup_lora_model(base_model):
    print("Inspecting model structure...")
    model_keys = list(base_model.state_dict().keys())
    q_modules = [key.replace('.weight', '') for key in model_keys if 'q_proj.weight' in key]
    v_modules = [key.replace('.weight', '') for key in model_keys if 'v_proj.weight' in key]
    target_modules = q_modules + v_modules  # Use all q and v projections
    print(f"Using {len(target_modules)} target modules: {target_modules[:5]}...")

    lora_config = LoraConfig(
        r=project_config["lora_config"]["r"],
        lora_alpha=project_config["lora_config"]["lora_alpha"],
        target_modules=target_modules,
        lora_dropout=project_config["lora_config"]["lora_dropout"],
        bias="none",
        task_type=TaskType.SEQ_2_SEQ_LM
    )
    model = get_peft_model(base_model, lora_config)
    print("\nLoRA-adapted model statistics:")
    lora_stats = analyze_model_size(model)
    size_reduction = (1 - lora_stats["trainable_size_mb"] / base_model_stats["total_size_mb"]) * 100
    print(f"Size reduction through LoRA: {size_reduction:.2f}%")
    return model, lora_stats

lora_model, lora_stats = setup_lora_model(base_model)

Inspecting model structure...
Using 36 target modules: ['model.encoder.layers.0.self_attn.q_proj', 'model.encoder.layers.1.self_attn.q_proj', 'model.encoder.layers.2.self_attn.q_proj', 'model.encoder.layers.3.self_attn.q_proj', 'model.encoder.layers.4.self_attn.q_proj']...

LoRA-adapted model statistics:
Total parameters: 77,493,248
Trainable parameters: 2,359,296 (3.04%)
Total model size: 295.61 MB
Trainable portion size: 9.00 MB
Size reduction through LoRA: 96.86%


In [None]:
def preprocess_dataset(dataset, tokenizer, src_lang="fr", tgt_lang="en", max_length=128):
    def preprocess_function(examples):
        inputs = [ex[src_lang] for ex in examples["translation"]]
        targets = [ex[tgt_lang] for ex in examples["translation"]]
        model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print("Tokenizing dataset...")
    processed_dataset = dataset.map(preprocess_function, batched=True, batch_size=project_config["training_config"]["batch_size"], remove_columns=dataset.column_names)
    return processed_dataset

processed_train = preprocess_dataset(train_dataset, tokenizer)
processed_test = preprocess_dataset(test_dataset, tokenizer)

Tokenizing dataset...


Map:   0%|          | 0/15210 [00:00<?, ? examples/s]



Tokenizing dataset...


Map:   0%|          | 0/3803 [00:00<?, ? examples/s]

In [None]:
print(processed_train)

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3780
})


In [None]:
def create_dataloaders(processed_train, processed_test):
    processed_train.set_format(type="torch")
    processed_test.set_format(type="torch")
    train_dataloader = DataLoader(processed_train, batch_size=project_config["training_config"]["batch_size"], shuffle=True)
    test_dataloader = DataLoader(processed_test, batch_size=project_config["training_config"]["batch_size"])
    print(f"Created dataloaders with batch size {project_config['training_config']['batch_size']}")
    return train_dataloader, test_dataloader

train_dataloader, test_dataloader = create_dataloaders(processed_train, processed_test)

Created dataloaders with batch size 4


Starting epoch 1/6
  Batch 10/945, Loss: 7.1143, LR: 0.000010
  Batch 20/945, Loss: 4.8895, LR: 0.000010
  Batch 30/945, Loss: 6.0929, LR: 0.000010
  Batch 40/945, Loss: 4.6819, LR: 0.000010
  Batch 50/945, Loss: 4.0722, LR: 0.000010
  Batch 60/945, Loss: 5.3589, LR: 0.000010
  Batch 70/945, Loss: 4.7103, LR: 0.000010
  Batch 80/945, Loss: 3.5967, LR: 0.000010
  Batch 90/945, Loss: 4.3438, LR: 0.000010
  Batch 100/945, Loss: 3.4671, LR: 0.000010
  Batch 110/945, Loss: 3.1459, LR: 0.000010
  Batch 120/945, Loss: 2.2820, LR: 0.000010
  Batch 130/945, Loss: 3.4255, LR: 0.000010
  Batch 140/945, Loss: 1.5920, LR: 0.000010
  Batch 150/945, Loss: 1.8451, LR: 0.000010
  Batch 160/945, Loss: 0.9739, LR: 0.000010
  Batch 170/945, Loss: 0.4648, LR: 0.000010
  Batch 180/945, Loss: 0.3856, LR: 0.000010
  Batch 190/945, Loss: 0.3276, LR: 0.000010
  Batch 200/945, Loss: 0.2409, LR: 0.000010
  Batch 210/945, Loss: 0.2843, LR: 0.000010
  Batch 220/945, Loss: 0.4260, LR: 0.000010
  Batch 230/945, Loss:

In [None]:
def train_model(model, train_dataloader, num_epochs):
    optimizer = torch.optim.AdamW(model.parameters(), lr=project_config["training_config"]["learning_rate"], weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_dataloader) * num_epochs)
    model.train()
    total_training_time = 0
    best_loss = float('inf')

    for epoch in range(num_epochs):
        print(f"Starting epoch {epoch+1}/{num_epochs}")
        epoch_loss = 0
        start_time = time.time()
        for batch_idx, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            epoch_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            if (batch_idx + 1) % 10 == 0:
                print(f"  Batch {batch_idx+1}/{len(train_dataloader)}, Loss: {loss.item():.4f}, LR: {scheduler.get_last_lr()[0]:.6f}")
        epoch_time = time.time() - start_time
        total_training_time += epoch_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} completed. Average loss: {avg_loss:.4f}, Time: {epoch_time:.2f}s")
        if avg_loss < best_loss:
            best_loss = avg_loss
            print(f"New best loss: {best_loss:.4f}")
    print(f"Training completed. Total time: {total_training_time:.2f}s")
    return model, total_training_time

In [None]:
trained_model, training_time = train_model(lora_model, train_dataloader, num_epochs=project_config["training_config"]["epochs"])

In [None]:
print("Evaluating LoRA fine-tuned model...")
lora_evaluation_results = evaluate_model(trained_model, tokenizer, test_dataset)
print(f"LoRA model BLEU score: {lora_evaluation_results['bleu']:.2f}")
print(f"Improvement over base model: {lora_evaluation_results['bleu'] - base_model_results['bleu']:.2f}")

output_dir = "lora_fr_en_improved"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
trained_model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

results_to_save = {
    "base_model_bleu": base_model_results["bleu"],
    "lora_model_bleu": lora_evaluation_results["bleu"],
    "improvement": lora_evaluation_results["bleu"] - base_model_results["bleu"],
    "training_config": project_config["training_config"],
    "lora_config": project_config["lora_config"],
    "training_time_seconds": training_time
}
with open(f"{output_dir}/evaluation_results.json", "w") as f:
    json.dump(results_to_save, f, indent=4)
print(f"Model and results saved to {output_dir}")

Evaluating LoRA fine-tuned model...


NameError: name 'trained_model' is not defined

In [None]:
torch.save(trained_model.state_dict(), os.path.join(output_dir, "pytorch_model.pth"))


In [None]:
def train_model_bleu(model, train_dataloader, tokenizer, test_dataset, num_epochs=12):
    optimizer = torch.optim.AdamW(model.parameters(), lr=project_config["training_config"]["learning_rate"], weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_dataloader) * num_epochs)

    model.train()
    total_training_time = 0
    best_bleu = -1
    best_model = None
    epoch_bleu_scores = []

    for epoch in range(num_epochs):
        print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            epoch_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if (step + 1) % 10 == 0:
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item():.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")

        epoch_time = time.time() - start_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

        # Only evaluate BLEU every 4 epochs
        if (epoch + 1) % 4 == 0:
            model.eval()
            bleu_result = evaluate_model(model, tokenizer, test_dataset)
            epoch_bleu = bleu_result['bleu']
            epoch_bleu_scores.append((epoch + 1, epoch_bleu))

            print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
            model.train()

            if epoch_bleu > best_bleu:
                best_bleu = epoch_bleu
                best_model = model.state_dict()
                print("🎉 New best model found!")

    if best_model is not None:
        model.load_state_dict(best_model)
        print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f}")
    else:
        print(f"\n🏁 Training finished. No BLEU improvement during training.")

    return model, epoch_bleu_scores


In [None]:
def evaluate_model(model, tokenizer, test_dataset, src_lang="fr", tgt_lang="en", force_cpu=False):
    bleu_metric = load("sacrebleu")
    model.eval()

    predictions = []
    references = []

    device_eval = "cpu" if force_cpu else device
    model.to(device_eval)

    print("Evaluating model on test set...")
    for i in range(len(test_dataset)):
        example = test_dataset[i]
        source = example["translation"][src_lang]
        reference = example["translation"][tgt_lang]
        tokenized_input = tokenizer(source, return_tensors="pt", padding=True, truncation=True, max_length=128)
        tokenized_input = {k: v.to(device_eval) for k, v in tokenized_input.items()}

        with torch.no_grad():
            output = model.generate(**tokenized_input, max_length=128, num_beams=5, length_penalty=0.8, early_stopping=True)

        prediction = tokenizer.decode(output[0], skip_special_tokens=True)
        predictions.append(prediction)
        references.append([reference])

        if (i + 1) % 50 == 0:
            print(f"Processed {i + 1}/{len(test_dataset)} examples")

    bleu_result = bleu_metric.compute(predictions=predictions, references=references)
    print(f"BLEU score: {bleu_result['score']:.2f}")
    return {"bleu": bleu_result["score"]}


In [None]:
# from torch.quantization import quantize_dynamic

# def quantize_lora_model(model):
#     print("Starting dynamic quantization of LoRA model...")

#     # Only quantize nn.Linear layers — safe for transformer models
#     quantized_model = quantize_dynamic(
#         model,
#         {torch.nn.Linear},  # Target modules
#         dtype=torch.qint8   # 8-bit quantization
#     )

#     print("Quantization complete.")
#     return quantized_model

# quantized_lora_model = quantize_lora_model(trained_model.cpu())  # Make sure model is on CPU


In [None]:
# quantized_eval_results = evaluate_model(quantized_lora_model, tokenizer, test_dataset, force_cpu=True)
# print(f"Quantized LoRA BLEU Score: {quantized_eval_results['bleu']:.2f}")


In [None]:
# base_model_cpu = base_model.cpu()  # move to CPU
from torch.quantization import quantize_dynamic
from transformers import AutoModelForSeq2SeqLM

base_model_cpu = AutoModelForSeq2SeqLM.from_pretrained(project_config["model_name"])

quantized_base_model = torch.quantization.quantize_dynamic(
    base_model_cpu,
    {torch.nn.Linear},
    dtype=torch.qint8
)


In [None]:
quantized_finetuned_model, quantized_finetuned_training_time = train_model(model, train_dataloader, num_epochs=project_config["training_config"]["epochs"])


TypeError: train_model_bleu() missing 2 required positional arguments: 'tokenizer' and 'test_dataset'

In [None]:
quantized_base_results = evaluate_model(quantized_base_model, tokenizer, test_dataset)
print(f"Quantized base model BLEU: {quantized_base_results['bleu']:.2f}")

In [None]:
quantized_finetuned_results = evaluate_model(quantized_finetuned_model, tokenizer, test_dataset)
print(f"Quantized base model BLEU: {quantized_finetuned_results['bleu']:.2f}")

NameError: name 'quantized_finetuned_model' is not defined

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model_name = "Helsinki-NLP/opus-mt-fr-en"

tokenizer = AutoTokenizer.from_pretrained(model_name)

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto"
)

# Prepare for QLoRA
base_model = prepare_model_for_kbit_training(base_model)


In [None]:
lora_config = LoraConfig(
    r=128,
    lora_alpha=128,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)

qlora_base_model = get_peft_model(base_model, lora_config)
qlora_base_model.print_trainable_parameters()


trainable params: 4,718,592 || all params: 79,852,544 || trainable%: 5.9091


In [None]:
project_config["training_config"].update({
    "batch_size": 8,       # adjust if CUDA memory allows
    "learning_rate": 2e-5,
    "epochs": 4
})

In [None]:
def train_model_with_bleu(model, train_dataloader, tokenizer, test_dataset, num_epochs=12):
    optimizer = torch.optim.AdamW(model.parameters(), lr=project_config["training_config"]["learning_rate"], weight_decay=0.01)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_dataloader) * num_epochs)

    model.train()
    total_training_time = 0
    best_bleu = -1
    best_model = None
    epoch_bleu_scores = []

    for epoch in range(num_epochs):
        print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            epoch_loss += loss.item()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

            if (step + 1) % 10 == 0:
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item():.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")

        epoch_time = time.time() - start_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

        # Only evaluate BLEU every 4 epochs
        if (epoch + 1) % 4 == 0:
            model.eval()
            bleu_result = evaluate_model(model, tokenizer, test_dataset)
            epoch_bleu = bleu_result['bleu']
            epoch_bleu_scores.append((epoch + 1, epoch_bleu))

            print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
            model.train()

            if epoch_bleu > best_bleu:
                best_bleu = epoch_bleu
                best_model = model.state_dict()
                print("🎉 New best model found!")

    if best_model is not None:
        print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f}")
    else:
        print(f"\n🏁 Training finished. No BLEU improvement during training.")

    return model, epoch_bleu_scores


In [None]:
# def preprocess_dataset(dataset, tokenizer, src_lang="fr", tgt_lang="en", max_length=128):
#     def preprocess_function(examples):
#         inputs = [ex[src_lang] for ex in examples["translation"]]
#         targets = [ex[tgt_lang] for ex in examples["translation"]]
#         model_inputs = tokenizer(
#             inputs, max_length=max_length, truncation=True, padding="max_length"
#         )
#         with tokenizer.as_target_tokenizer():
#             labels = tokenizer(
#                 targets, max_length=max_length, truncation=True, padding="max_length"
#             )
#         model_inputs["labels"] = labels["input_ids"]
#         return model_inputs

#     print("Tokenizing dataset...")
#     # ✅ Only remove other columns, NOT translation
#     keep_translation = [col for col in dataset.column_names if col != "translation"]
#     processed_dataset = dataset.map(
#         preprocess_function,
#         batched=True,
#         batch_size=project_config["training_config"]["batch_size"],
#         remove_columns=keep_translation
#     )

#     return processed_dataset


# processed_train = preprocess_dataset(train_dataset, tokenizer)
# processed_test = preprocess_dataset(test_dataset, tokenizer)

Tokenizing dataset...
Tokenizing dataset...


In [None]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,
    lora_alpha=512,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.01,  # less regularization
    bias="none",
    task_type="SEQ_2_SEQ_LM"
)
project_config["lora_config"].update({
    "r": 256,
    "lora_alpha": 512,
    "lora_dropout": 0.01
})

project_config["training_config"].update({
    "batch_size": 8,
    "learning_rate": 2e-5,
    "epochs": 12
})


In [None]:
# from torch.cuda.amp import autocast
# from torch.amp import GradScaler
# from torch.nn import CrossEntropyLoss
# from transformers import DataCollatorForSeq2Seq, get_scheduler
# import torch.nn.functional as F
# import time
# import os

# def train_model_with_bleu(
#     model,
#     tokenizer,
#     train_dataset,
#     test_dataset,
#     output_dir="qlora_best_bleu",
#     num_epochs=12,
#     eval_every=5,
#     patience=3,
#     gradient_accumulation_steps=2,
#     label_smoothing=0.1,
#     use_wandb=False
# ):
#     # ✅ Optional: W&B logging
#     if use_wandb:
#         import wandb
#         wandb.init(project="qlora-fr-en", config=project_config)

#     # ✅ Data collator to handle variable-length & skip 'translation'
#     data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

#     train_dataloader = DataLoader(
#         train_dataset,
#         batch_size=project_config["training_config"]["batch_size"],
#         shuffle=True,
#         collate_fn=data_collator
#     )

#     optimizer = torch.optim.AdamW(
#         model.parameters(),
#         lr=project_config["training_config"]["learning_rate"],
#         weight_decay=0.01
#     )

#     scaler = GradScaler(device='cuda')
#     scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
#         optimizer, T_0=2, T_mult=2
#     )

#     loss_fn = CrossEntropyLoss(ignore_index=-100, label_smoothing=label_smoothing)

#     model.train()
#     best_bleu = -1
#     best_model = None
#     epoch_bleu_scores = []
#     no_improve_epochs = 0

#     os.makedirs(output_dir, exist_ok=True)

#     for epoch in range(num_epochs):
#         print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
#         epoch_loss = 0
#         start_time = time.time()

#         for step, batch in enumerate(train_dataloader):
#             batch = {k: v.to(device) for k, v in batch.items()}

#             with autocast(dtype=torch.float16):
#               outputs = model(**batch)
#               logits = outputs.logits
#               loss = loss_fn(logits.view(-1, logits.size(-1)), batch["labels"].view(-1))
#               loss = loss / gradient_accumulation_steps


#             scaler.scale(loss).backward()

#             if (step + 1) % gradient_accumulation_steps == 0:
#                 scaler.unscale_(optimizer)
#                 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
#                 scaler.step(optimizer)
#                 scaler.update()
#                 scheduler.step()
#                 optimizer.zero_grad()

#             epoch_loss += loss.item() * gradient_accumulation_steps

#             if (step + 1) % 10 == 0:
#                 print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item()*gradient_accumulation_steps:.4f} | LR: {scheduler.get_last_lr()[0]:.6f}")

#         epoch_time = time.time() - start_time
#         avg_loss = epoch_loss / len(train_dataloader)
#         print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

#         if use_wandb:
#             wandb.log({"epoch": epoch+1, "avg_loss": avg_loss, "lr": scheduler.get_last_lr()[0]})

#         if (epoch + 1) % eval_every == 0:
#             model.eval()
#             bleu_result = evaluate_model(model, tokenizer, test_dataset)
#             epoch_bleu = bleu_result['bleu']
#             epoch_bleu_scores.append((epoch + 1, epoch_bleu))
#             print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
#             model.train()

#             if use_wandb:
#                 wandb.log({"epoch": epoch+1, "bleu": epoch_bleu})

#             if epoch_bleu > best_bleu:
#                 best_bleu = epoch_bleu
#                 best_model = model.state_dict()
#                 model.save_pretrained(output_dir)
#                 tokenizer.save_pretrained(output_dir)
#                 print(f"💾 New best model saved to {output_dir} with BLEU: {best_bleu:.2f}")
#                 no_improve_epochs = 0
#             else:
#                 no_improve_epochs += 1
#                 print(f"😕 No improvement. Patience {no_improve_epochs}/{patience}")
#                 if no_improve_epochs >= patience:
#                     print("⛔ Early stopping triggered.")
#                     break

#     print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f}")
#     return model, epoch_bleu_scores


In [None]:
import torch
from torch.cuda.amp import autocast, GradScaler
from torch.nn import CrossEntropyLoss
from torch.optim.lr_scheduler import ReduceLROnPlateau
import time

def train_model_with_bleu(
    model,
    train_dataloader,
    tokenizer,
    test_dataset,
    num_epochs=12,
    eval_every=5,
    patience=3,
    gradient_accumulation_steps=2,
    label_smoothing=0.1
):
    optimizer = torch.optim.AdamW(model.parameters(), lr=project_config["training_config"]["learning_rate"], weight_decay=0.01)

    # Loss-based LR scheduler
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=1, verbose=True)

    scaler = GradScaler()
    loss_fn = CrossEntropyLoss(ignore_index=-100, label_smoothing=label_smoothing)

    model.train()
    best_bleu = -1
    best_model = None
    epoch_bleu_scores = []
    no_improve_epochs = 0

    for epoch in range(num_epochs):
        print(f"\n🔁 Epoch {epoch + 1}/{num_epochs}")
        epoch_loss = 0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}

            with autocast():
                outputs = model(**batch)
                logits = outputs.logits
                loss = loss_fn(logits.view(-1, logits.size(-1)), batch["labels"].view(-1))
                loss = loss / gradient_accumulation_steps

            scaler.scale(loss).backward()

            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            epoch_loss += loss.item() * gradient_accumulation_steps

            if (step + 1) % 10 == 0:
                current_lr = optimizer.param_groups[0]["lr"]
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {loss.item()*gradient_accumulation_steps:.4f} | LR: {current_lr:.6f}")

        epoch_time = time.time() - start_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"✅ Epoch {epoch + 1} complete | Avg Loss: {avg_loss:.4f} | Time: {epoch_time:.2f}s")

        # Update learning rate based on validation loss
        scheduler.step(avg_loss)

        if (epoch + 1) % eval_every == 0:
            model.eval()
            bleu_result = evaluate_model(model, tokenizer, test_dataset)
            epoch_bleu = bleu_result['bleu']
            epoch_bleu_scores.append((epoch + 1, epoch_bleu))
            print(f"📈 BLEU after Epoch {epoch + 1}: {epoch_bleu:.2f}")
            model.train()

            if epoch_bleu > best_bleu:
                best_bleu = epoch_bleu
                best_model = model.state_dict()
                no_improve_epochs = 0
                print("🎉 New best model found!")
            else:
                no_improve_epochs += 1
                print(f"😕 No improvement. Patience {no_improve_epochs}/{patience}")
                if no_improve_epochs >= patience:
                    print("⛔ Early stopping triggered.")
                    break

    if best_model is not None:
        print(f"\n🏁 Training finished. Best BLEU: {best_bleu:.2f} (model kept in memory — not reloaded due to quantization)")
    else:
        print(f"\n🏁 Training finished. No BLEU improvement during training.")

    return model, epoch_bleu_scores


In [None]:
trained_qlora_model, qlora_training_time = trained_qlora_model, bleu_scores = train_model_with_bleu(
    model=qlora_base_model,
    train_dataloader=train_dataloader,
    tokenizer=tokenizer,
    test_dataset=test_dataset,
    num_epochs=25
)
# trained_model, bleu_scores = train_model_with_bleu(
#     model=qlora_base_model,
#     tokenizer=tokenizer,
#     train_dataset=processed_train,
#     test_dataset=processed_test,
#     output_dir="qlora_best_bleu",
#     num_epochs=20,
#     # use_wandb=True  # or False if not using W&B
# )



🔁 Epoch 1/25


  scaler = GradScaler()
  with autocast():
  return fn(*args, **kwargs)


  Step 10/945 | Loss: 6.8759 | LR: 0.000020
  Step 20/945 | Loss: 5.9413 | LR: 0.000020
  Step 30/945 | Loss: 6.2237 | LR: 0.000020
  Step 40/945 | Loss: 5.3457 | LR: 0.000020
  Step 50/945 | Loss: 4.7824 | LR: 0.000020
  Step 60/945 | Loss: 3.7923 | LR: 0.000020
  Step 70/945 | Loss: 2.4274 | LR: 0.000020
  Step 80/945 | Loss: 2.7142 | LR: 0.000020
  Step 90/945 | Loss: 2.6740 | LR: 0.000020
  Step 100/945 | Loss: 2.0217 | LR: 0.000020
  Step 110/945 | Loss: 1.7670 | LR: 0.000020
  Step 120/945 | Loss: 1.8396 | LR: 0.000020
  Step 130/945 | Loss: 1.8169 | LR: 0.000020
  Step 140/945 | Loss: 1.7570 | LR: 0.000020
  Step 150/945 | Loss: 1.7380 | LR: 0.000020
  Step 160/945 | Loss: 1.7485 | LR: 0.000020
  Step 170/945 | Loss: 1.6678 | LR: 0.000020
  Step 180/945 | Loss: 1.7562 | LR: 0.000020
  Step 190/945 | Loss: 1.7110 | LR: 0.000020
  Step 200/945 | Loss: 1.6508 | LR: 0.000020
  Step 210/945 | Loss: 1.6235 | LR: 0.000020
  Step 220/945 | Loss: 1.6441 | LR: 0.000020
  Step 230/945 | Lo

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 38.20
📈 BLEU after Epoch 5: 38.20
🎉 New best model found!

🔁 Epoch 6/25
  Step 10/945 | Loss: 1.5823 | LR: 0.000020
  Step 20/945 | Loss: 1.5345 | LR: 0.000020
  Step 30/945 | Loss: 1.6180 | LR: 0.000020
  Step 40/945 | Loss: 1.6330 | LR: 0.000020
  Step 50/945 | Loss: 1.5554 | LR: 0.000020
  Step 60/945 | Loss: 1.6191 | LR: 0.000020
  Step 70/945 | Loss: 1.7406 | LR: 0.000020
  Step 80/945 | Loss: 1.4862 | LR: 0.000020
  Step 90/945 | Loss: 1.4680 | LR: 0.000020
  S

In [None]:
import matplotlib.pyplot as plt

plt.plot(range(1, len(bleu_scores) + 1), bleu_scores, marker='o')
plt.xlabel("Epoch")
plt.ylabel("BLEU Score")
plt.title("BLEU Score per Epoch")
plt.grid(True)
plt.show()


In [None]:
trained_qlora_model.save_pretrained("qlora_fr_en")
tokenizer.save_pretrained("qlora_fr_en")


('qlora_fr_en/tokenizer_config.json',
 'qlora_fr_en/special_tokens_map.json',
 'qlora_fr_en/vocab.json',
 'qlora_fr_en/source.spm',
 'qlora_fr_en/target.spm',
 'qlora_fr_en/added_tokens.json')

In [None]:
# def evaluate_model(model, tokenizer, test_dataset, src_lang="fr", tgt_lang="en"):
#     from evaluate import load
#     bleu_metric = load("sacrebleu")
#     model.eval()

#     predictions = []
#     references = []

#     print("Evaluating QLoRA model on test set...")
#     for i in range(len(test_dataset)):
#         example = test_dataset[i]
#         source = example["translation"][src_lang]
#         reference = example["translation"][tgt_lang]

#         tokenized_input = tokenizer(source, return_tensors="pt", padding=True, truncation=True, max_length=128)
#         # Move input tensors to model's device (usually GPU)
#         input_device = next(model.parameters()).device
#         tokenized_input = {k: v.to(input_device) for k, v in tokenized_input.items()}

#         with torch.no_grad():
#             output = model.generate(**tokenized_input, max_length=128, num_beams=5, length_penalty=0.8, early_stopping=True)

#         prediction = tokenizer.decode(output[0], skip_special_tokens=True)
#         predictions.append(prediction)
#         references.append([reference])

#         if (i + 1) % 50 == 0:
#             print(f"Processed {i + 1}/{len(test_dataset)} examples")

#     bleu_result = bleu_metric.compute(predictions=predictions, references=references)
#     print(f"BLEU score: {bleu_result['score']:.2f}")
#     return {"bleu": bleu_result["score"]}


In [None]:
qlora_finetuned_results = evaluate_model(trained_qlora_model, tokenizer, test_dataset)
print(f"QLoRA Model BLEU Score: {qlora_finetuned_results['bleu']:.2f}")



Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 38.22
QLoRA Model BLEU Score: 38.22


In [None]:
qlora_base_results = evaluate_model(qlora_base_model, tokenizer, test_dataset)
print(f"QLoRA Base Model BLEU Score: {qlora_base_results['bleu']:.2f}")

Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 38.06
QLoRA Model BLEU Score: 38.06


In [None]:
qlora_base_results.print_trainable_parameters()

AttributeError: 'dict' object has no attribute 'print_trainable_parameters'

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

student_model_name = "Helsinki-NLP/opus-mt-tiny"

student_tokenizer = AutoTokenizer.from_pretrained(student_model_name)
student_model = AutoModelForSeq2SeqLM.from_pretrained(student_model_name).to(device)


OSError: Helsinki-NLP/opus-mt-tiny is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

In [None]:
from torch.nn import functional as F

def train_student_model(student_model, teacher_model, train_dataloader, alpha=0.7, temperature=2.0, epochs=3):
    optimizer = torch.optim.AdamW(student_model.parameters(), lr=5e-5)
    student_model.train()
    teacher_model.eval()

    total_training_time = 0
    best_loss = float('inf')

    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        epoch_loss = 0
        start_time = time.time()

        for step, batch in enumerate(train_dataloader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass through teacher
            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask)
                teacher_logits = teacher_outputs.logits / temperature

            # Forward pass through student
            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            student_logits = student_outputs.logits / temperature
            student_loss = student_outputs.loss

            # Compute distillation loss (KL divergence)
            distill_loss = F.kl_div(
                input=F.log_softmax(student_logits, dim=-1),
                target=F.softmax(teacher_logits, dim=-1),
                reduction="batchmean"
            ) * (temperature ** 2)

            total_loss = alpha * distill_loss + (1 - alpha) * student_loss

            total_loss.backward()
            torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)
            optimizer.step()
            optimizer.zero_grad()

            epoch_loss += total_loss.item()
            if (step + 1) % 10 == 0:
                print(f"  Step {step+1}/{len(train_dataloader)} | Loss: {total_loss.item():.4f}")

        elapsed = time.time() - start_time
        avg_loss = epoch_loss / len(train_dataloader)
        print(f"Epoch {epoch+1} done | Avg Loss: {avg_loss:.4f} | Time: {elapsed:.2f}s")

        if avg_loss < best_loss:
            best_loss = avg_loss
            print(f"✅ New best model found with loss: {best_loss:.4f}")

    return student_model


In [None]:
trained_student_model = train_student_model(
    student_model=student_model,
    teacher_model=trained_model,  # Your QLoRA fine-tuned model
    train_dataloader=train_dataloader,
    alpha=0.7,  # 70% distillation, 30% true label
    temperature=2.0,
    epochs=3
)


In [None]:
student_eval_results = evaluate_model(trained_student_model, student_tokenizer, test_dataset)
print(f"Student Model BLEU Score: {student_eval_results['bleu']:.2f}")


In [None]:
quantized_student = torch.quantization.quantize_dynamic(
    trained_student_model.cpu(),
    {torch.nn.Linear},
    dtype=torch.qint8
)

quantized_student.save_pretrained("student_model_quantized")
student_tokenizer.save_pretrained("student_model_quantized")

print("✅ Quantized student model saved for CPU/mobile inference.")


In [None]:
from torch.nn import functional as F
from transformers import AutoModelForSeq2SeqLM
from torch.optim import AdamW
from peft import LoraConfig, get_peft_model, TaskType
from transformers import MarianMTModel, MarianConfig


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load pretrained T5-small model & tokenizer as student
student_model_name = "t5-small"
student_tokenizer = AutoTokenizer.from_pretrained(student_model_name)
student_model = AutoModelForSeq2SeqLM.from_pretrained(student_model_name).to(device)


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
def dual_tokenize_dataset(dataset, teacher_tokenizer, student_tokenizer, max_length=128):
    def preprocess_function(examples):
        # For teacher
        teacher_inputs = [ex["fr"] for ex in examples["translation"]]
        teacher_targets = [ex["en"] for ex in examples["translation"]]
        teacher_model_inputs = teacher_tokenizer(teacher_inputs, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        teacher_labels = teacher_tokenizer(teacher_targets, max_length=max_length, truncation=True, padding="max_length")["input_ids"]

        # For student
        student_inputs = [f"translate French to English: {ex['fr']}" for ex in examples["translation"]]
        student_targets = [ex["en"] for ex in examples["translation"]]
        student_model_inputs = student_tokenizer(student_inputs, max_length=max_length, truncation=True, padding="max_length", return_tensors="pt")
        student_labels = student_tokenizer(student_targets, max_length=max_length, truncation=True, padding="max_length")["input_ids"]
        student_labels = [[(token if token != student_tokenizer.pad_token_id else -100) for token in label_seq] for label_seq in student_labels]

        return {
            "teacher_input_ids": teacher_model_inputs["input_ids"],
            "teacher_attention_mask": teacher_model_inputs["attention_mask"],
            "teacher_labels": teacher_labels,
            "input_ids": student_model_inputs["input_ids"],
            "attention_mask": student_model_inputs["attention_mask"],
            "labels": student_labels
        }

    dataset = dataset.remove_columns([col for col in dataset.column_names if col != "translation"])
    return dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)


In [None]:
processed_train = dual_tokenize_dataset(train_dataset, tokenizer, student_tokenizer)
processed_test = dual_tokenize_dataset(test_dataset, tokenizer, student_tokenizer)

train_dataloader, test_dataloader = create_dataloaders(processed_train, processed_test)


Map:   0%|          | 0/3780 [00:00<?, ? examples/s]

Map:   0%|          | 0/946 [00:00<?, ? examples/s]

Created dataloaders with batch size 1


In [None]:
# # Define LoRA config with target_modules set
# lora_cfg = LoraConfig(
#     r=project_config["lora_config"]["r"],
#     lora_alpha=project_config["lora_config"]["lora_alpha"],
#     lora_dropout=project_config["lora_config"]["lora_dropout"],
#     bias="none",
#     task_type=TaskType.SEQ_2_SEQ_LM,
#     target_modules=["self_attn.k_proj", "self_attn.v_proj", "self_attn.q_proj"]
# )


# # Apply LoRA to student
# student_model = get_peft_model(student_model, lora_cfg)


ValueError: Target modules {'self_attn.v_proj', 'self_attn.k_proj', 'self_attn.q_proj'} not found in the base model. Please check the target modules and try again.

In [None]:

# Freeze teacher model
base_model.eval()
for param in base_model.parameters():
    param.requires_grad = False

In [None]:
analyze_model_size(base_model)

Total parameters: 75,133,952
Trainable parameters: 0 (0.00%)
Total model size: 286.61 MB
Trainable portion size: 0.00 MB


{'total_params': 75133952,
 'trainable_params': 0,
 'total_size_mb': 286.61328125,
 'trainable_size_mb': 0.0}

In [None]:
analyze_model_size(student_model)

Total parameters: 60,506,624
Trainable parameters: 60,506,624 (100.00%)
Total model size: 230.81 MB
Trainable portion size: 230.81 MB


{'total_params': 60506624,
 'trainable_params': 60506624,
 'total_size_mb': 230.814453125,
 'trainable_size_mb': 230.814453125}

In [None]:
# Adjust batch size for safer debugging
project_config["training_config"]["batch_size"] = 1  # Temporarily use batch size = 1
optimizer = AdamW(student_model.parameters(), lr=project_config["training_config"]["learning_rate"])

# Training with distillation + batch debugging
def train_student_with_distillation(teacher_model, student_model, train_loader, tokenizer, epochs=3, alpha=0.5, temperature=2.0):
    print("Starting distillation training...")
    student_model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        start_time = time.time()

        for step, batch in enumerate(train_loader):
            print(f"\n[Batch {step}]")

            # Inspect batch BEFORE .to(device)
            for k, v in batch.items():
                print(f"{k}: shape={v.shape}, dtype={v.dtype}, min={v.min()}, max={v.max()}")
                if k == "labels":
                    print("Labels sample:", v[0])

            # Safely move tensors to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Teacher outputs (no grad)
            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                teacher_logits = teacher_outputs.logits / temperature

            # Student outputs
            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            student_logits = student_outputs.logits / temperature
            student_loss = student_outputs.loss

            # Debug: check shape
            if student_logits.shape != teacher_logits.shape:
                print("[ERROR] Shape mismatch!")
                print("Teacher:", teacher_logits.shape)
                print("Student:", student_logits.shape)
                break

            # KL Divergence
            loss_kl = F.kl_div(
                F.log_softmax(student_logits, dim=-1),
                F.softmax(teacher_logits, dim=-1),
                reduction='batchmean'
            ) * (temperature ** 2)

            # Final loss
            loss = alpha * student_loss + (1 - alpha) * loss_kl

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Time: {time.time() - start_time:.2f}s")

    return student_model

# Start training
student_model = train_student_with_distillation(
    base_model, student_model, train_dataloader, student_tokenizer,
    epochs=project_config["training_config"]["epochs"]
)

NameError: name 'project_config' is not defined

In [None]:
student_model_results = evaluate_model(student_model, tokenizer, test_dataset)
print(f"Student Model BLEU Score: {student_model_results['bleu']:.2f}")

Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 0.00


TypeError: 'PeftModelForSeq2SeqLM' object is not subscriptable

In [None]:
print(f"Student Model BLEU Score: {student_model_results['bleu']:.2f}")

Student Model BLEU Score: 0.00


In [None]:
# Full distillation code with separate tokenizers and debug-safe training loop

import os
import gc
import time
import torch
from datasets import load_dataset
from torch.nn import functional as F
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    T5Tokenizer,
    T5ForConditionalGeneration,
)
from torch.optim import AdamW

# Enable full error trace
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# Configuration
project_config = {
    "teacher_model_name": "Helsinki-NLP/opus-mt-fr-en",
    "student_model_name": "t5-small",
    "dataset_name": "opus100",
    "language_pair": "en-fr",
    "max_samples": 5000,
    "train_test_split": 0.8,
    "training_config": {
        "batch_size": 1,  # debug-safe
        "learning_rate": 1e-5,
        "epochs": 3
    }
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

def clear_memory():
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

# Load dataset
def load_dataset_for_translation():
    dataset = load_dataset(project_config["dataset_name"], project_config["language_pair"],
                           split=f"train[:{project_config['max_samples']}]")

    def is_aligned(example):
        fr_words = len(example["translation"]["fr"].split())
        en_words = len(example["translation"]["en"].split())
        ratio = fr_words / max(en_words, 1)
        return 0.5 <= ratio <= 2.0

    dataset = dataset.filter(is_aligned)
    train_size = int(project_config["train_test_split"] * len(dataset))
    return dataset.select(range(train_size)), dataset.select(range(train_size, len(dataset)))

train_dataset, test_dataset = load_dataset_for_translation()

# Load models and tokenizers
teacher_tokenizer = AutoTokenizer.from_pretrained(project_config["teacher_model_name"])
# from transformers import BitsAndBytesConfig
# bnb_config = BitsAndBytesConfig(load_in_8bit=True)
# teacher_model = AutoModelForSeq2SeqLM.from_pretrained(
#     project_config["teacher_model_name"],
#     device_map="auto",
#     quantization_config=bnb_config
# )

teacher_model = AutoModelForSeq2SeqLM.from_pretrained(
    project_config["teacher_model_name"],
    device_map="auto"
).to(device)
teacher_model.eval()

student_tokenizer = T5Tokenizer.from_pretrained(project_config["student_model_name"])
student_tokenizer.pad_token = student_tokenizer.eos_token
student_model = T5ForConditionalGeneration.from_pretrained(project_config["student_model_name"]).to(device)

# Preprocessing function
def dual_tokenize_dataset(dataset, teacher_tokenizer, student_tokenizer, max_length=128):
    def preprocess_function(examples):
        teacher_inputs = [ex["fr"] for ex in examples["translation"]]
        teacher_targets = [ex["en"] for ex in examples["translation"]]
        teacher_model_inputs = teacher_tokenizer(teacher_inputs, max_length=max_length, truncation=True, padding="max_length")
        teacher_labels = teacher_tokenizer(teacher_targets, max_length=max_length, truncation=True, padding="max_length")["input_ids"]

        student_inputs = [f"translate French to English: {ex['fr']}" for ex in examples["translation"]]
        student_targets = [ex["en"] for ex in examples["translation"]]
        student_model_inputs = student_tokenizer(student_inputs, max_length=max_length, truncation=True, padding="max_length")
        student_labels = student_tokenizer(student_targets, max_length=max_length, truncation=True, padding="max_length")["input_ids"]
        student_labels = [[(token if token != student_tokenizer.pad_token_id else -100) for token in seq] for seq in student_labels]

        return {
            "teacher_input_ids": teacher_model_inputs["input_ids"],
            "teacher_attention_mask": teacher_model_inputs["attention_mask"],
            "teacher_labels": teacher_labels,
            "input_ids": student_model_inputs["input_ids"],
            "attention_mask": student_model_inputs["attention_mask"],
            "labels": student_labels
        }

    dataset = dataset.remove_columns([col for col in dataset.column_names if col != "translation"])
    return dataset.map(preprocess_function, batched=True, remove_columns=dataset.column_names)

processed_train = dual_tokenize_dataset(train_dataset, teacher_tokenizer, student_tokenizer)
processed_test = dual_tokenize_dataset(test_dataset, teacher_tokenizer, student_tokenizer)

def create_dataloaders(processed_train, processed_test):
    processed_train.set_format(type="torch")
    processed_test.set_format(type="torch")
    return DataLoader(processed_train, batch_size=project_config["training_config"]["batch_size"], shuffle=True), \
           DataLoader(processed_test, batch_size=project_config["training_config"]["batch_size"])

train_dataloader, test_dataloader = create_dataloaders(processed_train, processed_test)



Using device: cuda




In [None]:
from evaluate import load

# Optimizer
optimizer = AdamW(student_model.parameters(), lr=project_config["training_config"]["learning_rate"])

# BLEU evaluator
bleu_metric = load("bleu")

# Decoding helper (removes -100 from labels before decoding)
def decode_predictions(tokenizer, sequences):
    if isinstance(sequences, torch.Tensor):
        sequences = sequences.tolist()
    clean_sequences = [[token for token in seq if token != -100] for seq in sequences]
    return tokenizer.batch_decode(clean_sequences, skip_special_tokens=True)

# BLEU calculation helper
def compute_bleu(preds, labels):
    return bleu_metric.compute(predictions=preds, references=[[ref] for ref in labels])

# 🔁 Training function (BLEU only after final epoch)
def train_student_with_distillation(student_model, teacher_model, train_loader, test_loader, student_tokenizer, epochs=3):
    student_model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        start_time = time.time()

        for step, batch in enumerate(train_loader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Forward pass
            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = student_outputs.loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if step % 100 == 0:
                print(f"Step {step}: loss = {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        print(f"\n✅ Epoch {epoch+1}/{epochs} completed in {time.time() - start_time:.2f}s - Avg Loss: {avg_loss:.4f}")

        # Save model after each epoch (optional)
        ckpt_path = f"student_model_epoch{epoch+1}.pt"
        torch.save(student_model.state_dict(), ckpt_path)
        print(f"💾 Model checkpoint saved: {ckpt_path}")

    # 📊 Run BLEU score evaluation AFTER training
    print("\n🔍 Evaluating final model BLEU score on test set...")
    student_model.eval()
    predictions, references = [], []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"]

            outputs = student_model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=128)
            decoded_preds = decode_predictions(student_tokenizer, outputs)
            decoded_labels = decode_predictions(student_tokenizer, labels)

            predictions.extend(decoded_preds)
            references.extend(decoded_labels)

    bleu = compute_bleu(predictions, references)
    print(f"\n🌍 Final BLEU Score: {bleu['bleu'] * 100:.2f}")

    return student_model


In [None]:
student_model = train_student_with_distillation(
    student_model=student_model,
    teacher_model=teacher_model,  # passed for completeness, not used in this version
    train_loader=train_dataloader,
    test_loader=test_dataloader,
    student_tokenizer=student_tokenizer,
    epochs=4
)

Step 0: loss = 9.6097
Step 100: loss = 3.2551
Step 200: loss = 3.8772
Step 300: loss = 2.8415
Step 400: loss = 4.2499
Step 500: loss = 2.8886
Step 600: loss = 0.7706
Step 700: loss = 1.1121
Step 800: loss = 3.0050
Step 900: loss = 4.3171
Step 1000: loss = 3.0375
Step 1100: loss = 4.8906
Step 1200: loss = 1.7190
Step 1300: loss = 3.4334
Step 1400: loss = 3.1860
Step 1500: loss = 1.7361
Step 1600: loss = 1.4790


In [None]:
student_model_results = evaluate_model(student_model, student_tokenizer, test_dataset)
print(f"Student Model BLEU Score: {student_model_results['bleu']:.2f}")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Evaluating model on test set...
Processed 50/946 examples
Processed 100/946 examples
Processed 150/946 examples
Processed 200/946 examples
Processed 250/946 examples
Processed 300/946 examples
Processed 350/946 examples
Processed 400/946 examples
Processed 450/946 examples
Processed 500/946 examples
Processed 550/946 examples
Processed 600/946 examples
Processed 650/946 examples
Processed 700/946 examples
Processed 750/946 examples
Processed 800/946 examples
Processed 850/946 examples
Processed 900/946 examples
BLEU score: 7.76
Student Model BLEU Score: 7.76


In [None]:
T5_model = T5ForConditionalGeneration.from_pretrained("t5-small")
student_model_results = evaluate_model(student_model, student_tokenizer, test_dataset)
print(f"Student Model BLEU Score: {student_model_results['bleu']:.2f}")

In [None]:
# Step-by-step: Define and load a smaller MarianMT model (MiniMarian) as the student

from transformers import MarianConfig, MarianMTModel, AutoTokenizer

# Step 1: Use the same tokenizer as the teacher (vocab alignment is critical)
student_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

# Step 2: Define a smaller Marian config manually
mini_marian_config = MarianConfig(
    vocab_size=student_tokenizer.vocab_size,  # ensure compatibility
    encoder_layers=3,
    decoder_layers=3,
    d_model=256,           # hidden size (default: 512)
    encoder_attention_heads=4,
    decoder_attention_heads=4,
    decoder_ffn_dim=512,
    encoder_ffn_dim=512,
    dropout=0.1,
    activation_function="relu",
    max_position_embeddings=512,
    init_std=0.02,
    scale_embedding=True,
    forced_bos_token_id=student_tokenizer.convert_tokens_to_ids(student_tokenizer.bos_token),
    pad_token_id=student_tokenizer.pad_token_id,
    eos_token_id=student_tokenizer.eos_token_id
)

# Step 3: Initialize a student model with this config (weights are random — for distillation training)
student_model = MarianMTModel(mini_marian_config).to("cuda" if torch.cuda.is_available() else "cpu")

# Step 4: Save and reload if needed
student_model.save_pretrained("mini_marian_student_model")
student_tokenizer.save_pretrained("mini_marian_student_model")

student_model.eval()
print("✅ Mini MarianMT student model initialized and ready for distillation.")



✅ Mini MarianMT student model initialized and ready for distillation.


In [None]:
# Re-run everything necessary after kernel reset
import os
import torch
from transformers import MarianConfig, MarianMTModel, AutoTokenizer

# Load tokenizer from Helsinki-NLP Marian model
student_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")

# Define a small MarianMT student configuration
mini_marian_config = MarianConfig(
    vocab_size=student_tokenizer.vocab_size,
    encoder_layers=3,
    decoder_layers=3,
    d_model=256,
    encoder_attention_heads=4,
    decoder_attention_heads=4,
    decoder_ffn_dim=512,
    encoder_ffn_dim=512,
    dropout=0.1,
    activation_function="relu",
    max_position_embeddings=512,
    init_std=0.02,
    scale_embedding=True,
    forced_bos_token_id=student_tokenizer.convert_tokens_to_ids(student_tokenizer.bos_token),
    pad_token_id=student_tokenizer.pad_token_id,
    eos_token_id=student_tokenizer.eos_token_id
)

# Initialize the student model from scratch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
student_model = MarianMTModel(mini_marian_config).to(device)

# Save the new student model and tokenizer
student_model.save_pretrained("mini_marian_student_model")
student_tokenizer.save_pretrained("mini_marian_student_model")

student_model.eval()
"✅ Mini MarianMT student model initialized and saved as 'mini_marian_student_model'."




"✅ Mini MarianMT student model initialized and saved as 'mini_marian_student_model'."

In [None]:
# Adjust batch size for safer debugging
project_config["training_config"]["batch_size"] = 1  # Temporarily use batch size = 1
optimizer = AdamW(student_model.parameters(), lr=project_config["training_config"]["learning_rate"])

# Training with distillation + batch debugging
def train_student_with_distillation(teacher_model, student_model, train_loader, tokenizer, epochs=3, alpha=0.5, temperature=2.0):
    print("Starting distillation training...")
    student_model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        start_time = time.time()

        for step, batch in enumerate(train_loader):
            # print(f"\n[Batch {step}]")

            # Inspect batch BEFORE .to(device)
            # for k, v in batch.items():
            #     print(f"{k}: shape={v.shape}, dtype={v.dtype}, min={v.min()}, max={v.max()}")
            #     if k == "labels":
            #         print("Labels sample:", v[0])

            # Safely move tensors to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            # Teacher outputs (no grad)
            with torch.no_grad():
                teacher_outputs = teacher_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                teacher_logits = teacher_outputs.logits / temperature

            # Student outputs
            student_outputs = student_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            student_logits = student_outputs.logits / temperature
            student_loss = student_outputs.loss

            # Debug: check shape
            # if student_logits.shape != teacher_logits.shape:
            #     print("[ERROR] Shape mismatch!")
            #     print("Teacher:", teacher_logits.shape)
            #     print("Student:", student_logits.shape)
            #     break

            # KL Divergence
            loss_kl = F.kl_div(
                F.log_softmax(student_logits, dim=-1),
                F.softmax(teacher_logits, dim=-1),
                reduction='batchmean'
            ) * (temperature ** 2)

            # Final loss
            loss = alpha * student_loss + (1 - alpha) * loss_kl

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f} - Time: {time.time() - start_time:.2f}s")

    return student_model

# Start training
student_model = train_student_with_distillation(
    base_model, student_model, train_dataloader, student_tokenizer,
    epochs=4
)

Starting distillation training...
Epoch 1/4 - Loss: 39.1861 - Time: 302.14s
Epoch 2/4 - Loss: 32.2857 - Time: 299.22s
Epoch 3/4 - Loss: 31.1791 - Time: 299.07s


KeyboardInterrupt: 

In [None]:
import torch
import torch.nn.functional as F
from transformers import MarianMTModel, MarianConfig, AutoTokenizer
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset
import evaluate
import time

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define project_config (assuming this exists in your environment; adjust as needed)
project_config = {"training_config": {"batch_size": 8, "learning_rate": 5e-5}}

# Step 1: Load teacher model and tokenizer
teacher_model = MarianMTModel.from_pretrained("Helsinki-NLP/opus-mt-fr-en").to(device)
teacher_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-fr-en")
teacher_model.eval()

# Step 2: Define and initialize a smaller student model
student_tokenizer = teacher_tokenizer  # Reuse tokenizer for vocab alignment
mini_marian_config = MarianConfig(
    vocab_size=student_tokenizer.vocab_size,
    encoder_layers=4,
    decoder_layers=4,
    d_model=384,
    encoder_attention_heads=6,
    decoder_attention_heads=6,
    decoder_ffn_dim=1024,
    encoder_ffn_dim=1024,
    dropout=0.1,
    activation_function="relu",
    max_position_embeddings=512,
    init_std=0.02,
    scale_embedding=True,
    forced_bos_token_id=student_tokenizer.convert_tokens_to_ids(student_tokenizer.bos_token),
    pad_token_id=student_tokenizer.pad_token_id,
    eos_token_id=student_tokenizer.eos_token_id
)
student_model = MarianMTModel(mini_marian_config).to(device)

# Step 3: Load and preprocess dataset
train_dataset = load_dataset("wmt14", "fr-en", split="train[:1%]")  # ~408,000 examples
test_dataset = load_dataset("wmt14", "fr-en", split="test[:100]")   # Small test set for demo

def preprocess_dataset(dataset, tokenizer, src_lang="fr", tgt_lang="en", max_length=128):
    def preprocess_function(examples):
        inputs = [ex[src_lang] for ex in examples["translation"]]
        targets = [ex[tgt_lang] for ex in examples["translation"]]
        model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print("Tokenizing dataset...")
    processed_dataset = dataset.map(
        preprocess_function,
        batched=True,
        batch_size=project_config["training_config"]["batch_size"],
        remove_columns=dataset.column_names
    )
    return processed_dataset

# Preprocess datasets
processed_train = preprocess_dataset(train_dataset, student_tokenizer)
processed_test = preprocess_dataset(test_dataset, student_tokenizer)

# Step 4: Create dataloaders
def create_dataloaders(processed_train, processed_test):
    processed_train.set_format(type="torch")
    processed_test.set_format(type="torch")
    train_dataloader = DataLoader(processed_train, batch_size=project_config["training_config"]["batch_size"], shuffle=True)
    test_dataloader = DataLoader(processed_test, batch_size=project_config["training_config"]["batch_size"])
    print(f"Created dataloaders with batch size {project_config['training_config']['batch_size']}")
    return train_dataloader, test_dataloader

train_dataloader, test_dataloader = create_dataloaders(processed_train, processed_test)

# Step 5: Distillation training function
def train_student_with_distillation(teacher_model, student_model, train_loader, epochs=5, alpha=0.7, temperature=2.0):
    optimizer = AdamW(student_model.parameters(), lr=project_config["training_config"]["learning_rate"])
    student_model.train()
    print("Starting distillation training...")

    for epoch in range(epochs):
        total_loss = 0.0
        start_time = time.time()

        for step, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Shift labels for decoder input
            decoder_input_ids = student_model._shift_right(labels)

            # Teacher outputs
            with torch.no_grad():
                teacher_outputs = teacher_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    decoder_input_ids=decoder_input_ids
                )
                teacher_logits = teacher_outputs.logits / temperature

            # Student outputs
            student_outputs = student_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids
            )
            student_logits = student_outputs.logits / temperature
            student_loss = student_outputs.loss

            # KL Divergence
            loss_kl = F.kl_div(
                F.log_softmax(student_logits, dim=-1),
                F.softmax(teacher_logits, dim=-1),
                reduction="batchmean"
            ) * (temperature ** 2)

            # Combined loss
            loss = alpha * student_loss + (1 - alpha) * loss_kl

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)
            optimizer.step()

            total_loss += loss.item()
            if step % 50 == 0:
                print(f"[Epoch {epoch+1}, Step {step}] Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Avg Loss: {avg_loss:.4f} - Time: {time.time() - start_time:.2f}s")

    return student_model

# Step 6: Train the student model
student_model = train_student_with_distillation(
    teacher_model, student_model, train_dataloader,
    epochs=5, alpha=0.7, temperature=2.0
)

# Step 7: Save the student model
student_model.save_pretrained("mini_marian_distilled")
student_tokenizer.save_pretrained("mini_marian_distilled")
print("✅ Student model saved as 'mini_marian_distilled'.")

# Step 8: Evaluate BLEU score
bleu = evaluate.load("bleu")
def evaluate_model(model, tokenizer, test_loader):
    model.eval()
    predictions = []
    references = []
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        outputs = model.generate(input_ids, max_length=128, num_beams=4)
        pred = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ref = tokenizer.batch_decode(batch["labels"].to(device), skip_special_tokens=True)
        predictions.extend([p.split() for p in pred])
        references.extend([[r.split()] for r in ref])
    return bleu.compute(predictions=predictions, references=references)

# Evaluate on test set
bleu_score = evaluate_model(student_model, student_tokenizer, test_dataloader)
print(f"BLEU Score: {bleu_score['bleu']:.4f}")

# Compare model sizes
teacher_params = sum(p.numel() for p in teacher_model.parameters())
student_params = sum(p.numel() for p in student_model.parameters())
print(f"Teacher Params: {teacher_params:,} | Student Params: {student_params:,} | Reduction: {teacher_params/student_params:.2f}x")

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/30 [00:00<?, ?it/s]

Tokenizing dataset...


Map:   0%|          | 0/408367 [00:00<?, ? examples/s]



Tokenizing dataset...


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Created dataloaders with batch size 8
Starting distillation training...


AttributeError: 'MarianMTModel' object has no attribute '_shift_right'

In [None]:

def preprocess_dataset(dataset, tokenizer, src_lang="fr", tgt_lang="en", max_length=128):
    def preprocess_function(examples):
        inputs = [ex[src_lang] for ex in examples["translation"]]
        targets = [ex[tgt_lang] for ex in examples["translation"]]
        model_inputs = tokenizer(inputs, max_length=max_length, truncation=True, padding="max_length")
        with tokenizer.as_target_tokenizer():
            labels = tokenizer(targets, max_length=max_length, truncation=True, padding="max_length")
        model_inputs["labels"] = labels["input_ids"]
        return model_inputs

    print("Tokenizing dataset...")
    processed_dataset = dataset.map(
        preprocess_function,
        batched=True,
        batch_size=project_config["training_config"]["batch_size"],
        remove_columns=dataset.column_names
    )
    return processed_dataset

# Preprocess datasets
processed_train = preprocess_dataset(train_dataset, student_tokenizer)
processed_test = preprocess_dataset(test_dataset, student_tokenizer)

# Step 4: Create dataloaders
def create_dataloaders(processed_train, processed_test):
    processed_train.set_format(type="torch")
    processed_test.set_format(type="torch")
    train_dataloader = DataLoader(processed_train, batch_size=project_config["training_config"]["batch_size"], shuffle=True)
    test_dataloader = DataLoader(processed_test, batch_size=project_config["training_config"]["batch_size"])
    print(f"Created dataloaders with batch size {project_config['training_config']['batch_size']}")
    return train_dataloader, test_dataloader

train_dataloader, test_dataloader = create_dataloaders(processed_train, processed_test)

# Custom shift_right function for MarianMT
def shift_right(input_ids, bos_token_id, pad_token_id):
    """
    Shift input_ids right and prepend bos_token_id (or fallback) for decoder input.
    """
    batch_size, seq_len = input_ids.shape
    shifted = torch.full((batch_size, seq_len), pad_token_id, dtype=torch.long, device=input_ids.device)
    effective_bos_token_id = bos_token_id if bos_token_id is not None else pad_token_id
    shifted[:, 0] = effective_bos_token_id  # Prepend BOS or pad token
    shifted[:, 1:] = input_ids[:, :-1]  # Shift right, dropping the last token
    return shifted

# Step 5: Distillation training function
def train_student_with_distillation(teacher_model, student_model, train_loader, epochs=5, alpha=0.7, temperature=2.0):
    optimizer = AdamW(student_model.parameters(), lr=project_config["training_config"]["learning_rate"])
    student_model.train()
    print("Starting distillation training...")

    bos_token_id = student_tokenizer.bos_token_id  # May be None
    pad_token_id = student_tokenizer.pad_token_id

    for epoch in range(epochs):
        total_loss = 0.0
        start_time = time.time()

        for step, batch in enumerate(train_loader):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Shift labels for decoder input
            decoder_input_ids = shift_right(labels, bos_token_id, pad_token_id)

            # Teacher outputs (with labels for consistency)
            with torch.no_grad():
                teacher_outputs = teacher_model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    decoder_input_ids=decoder_input_ids,
                    labels=labels  # Pass labels to compute loss
                )
                teacher_logits = teacher_outputs.logits / temperature

            # Student outputs (with labels to compute student_loss)
            student_outputs = student_model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                labels=labels  # Pass labels to compute loss
            )
            student_logits = student_outputs.logits / temperature
            student_loss = student_outputs.loss  # Now this should not be None

            # KL Divergence
            loss_kl = F.kl_div(
                F.log_softmax(student_logits, dim=-1),
                F.softmax(teacher_logits, dim=-1),
                reduction="batchmean"
            ) * (temperature ** 2)

            # Combined loss
            loss = alpha * student_loss + (1 - alpha) * loss_kl

            # Backprop
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(student_model.parameters(), 1.0)
            optimizer.step()

            total_loss += loss.item()
            if step % 50 == 0:
                print(f"[Epoch {epoch+1}, Step {step}] Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}/{epochs} - Avg Loss: {avg_loss:.4f} - Time: {time.time() - start_time:.2f}s")

    return student_model

# Step 6: Train the student model
student_model = train_student_with_distillation(
    teacher_model, student_model, train_dataloader,
    epochs=5, alpha=0.7, temperature=2.0
)

# Step 7: Save the student model
student_model.save_pretrained("mini_marian_distilled")
student_tokenizer.save_pretrained("mini_marian_distilled")
print("✅ Student model saved as 'mini_marian_distilled'.")

# Step 8: Evaluate BLEU score
bleu = evaluate.load("bleu")
def evaluate_model(model, tokenizer, test_loader):
    model.eval()
    predictions = []
    references = []
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        outputs = model.generate(input_ids, max_length=128, num_beams=4)
        pred = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        ref = tokenizer.batch_decode(batch["labels"].to(device), skip_special_tokens=True)
        predictions.extend([p.split() for p in pred])
        references.extend([[r.split()] for r in ref])
    return bleu.compute(predictions=predictions, references=references)

# Evaluate on test set
bleu_score = evaluate_model(student_model, student_tokenizer, test_dataloader)
print(f"BLEU Score: {bleu_score['bleu']:.4f}")

# Compare model sizes
teacher_params = sum(p.numel() for p in teacher_model.parameters())
student_params = sum(p.numel() for p in student_model.parameters())
print(f"Teacher Params: {teacher_params:,} | Student Params: {student_params:,} | Reduction: {teacher_params/student_params:.2f}x")

Tokenizing dataset...
Tokenizing dataset...
Created dataloaders with batch size 8
Starting distillation training...
[Epoch 1, Step 0] Loss: 48.0017
[Epoch 1, Step 50] Loss: 32.4909
[Epoch 1, Step 100] Loss: 28.5807
[Epoch 1, Step 150] Loss: 23.3613
[Epoch 1, Step 200] Loss: 19.2672
[Epoch 1, Step 250] Loss: 18.5126
[Epoch 1, Step 300] Loss: 17.1889
[Epoch 1, Step 350] Loss: 15.5781
[Epoch 1, Step 400] Loss: 15.9402
[Epoch 1, Step 450] Loss: 17.4205
[Epoch 1, Step 500] Loss: 13.5171
[Epoch 1, Step 550] Loss: 18.7437
[Epoch 1, Step 600] Loss: 18.3456
[Epoch 1, Step 650] Loss: 14.3741
[Epoch 1, Step 700] Loss: 15.4193
[Epoch 1, Step 750] Loss: 13.9459
[Epoch 1, Step 800] Loss: 11.6753
[Epoch 1, Step 850] Loss: 13.6115
[Epoch 1, Step 900] Loss: 12.3071
[Epoch 1, Step 950] Loss: 13.7299
[Epoch 1, Step 1000] Loss: 16.0068
[Epoch 1, Step 1050] Loss: 12.4517
[Epoch 1, Step 1100] Loss: 13.0098
[Epoch 1, Step 1150] Loss: 16.2655
[Epoch 1, Step 1200] Loss: 13.1244
[Epoch 1, Step 1250] Loss: 13.06