In [9]:
# Install required packages for Excel handling
%pip install openpyxl pandas --quiet

Note: you may need to restart the kernel to use updated packages.


# Knowledge Distillation from Python Script

In [10]:
import torch
import torch.nn.functional as F
from datasets import load_dataset, Dataset
from transformers import (
    Trainer, TrainingArguments, DataCollatorForLanguageModeling,
    get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import pandas as pd
from torch.cuda.amp import autocast, GradScaler
import numpy as np
from tqdm.auto import tqdm

# Import your model loading functions
from llama_8b import load_llama_8b
from llama_1b import load_llama_1b

# Basic settings
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"Memory allocated: {torch.cuda.memory_allocated()/1e9:.2f} GB")

Using device: cuda
CUDA available: True
GPU: NVIDIA GeForce RTX 4090
Memory allocated: 8.84 GB


In [11]:
class OptimizedDistillationTrainer(Trainer):
    """
    Optimized Trainer for knowledge distillation with improved loss calculation
    and mixed precision training.
    """
    def __init__(self, teacher_model=None, temperature=2.0, alpha=0.5, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        self.teacher.to(self.model.device)
        self.teacher.eval()
        self.temperature = temperature
        self.alpha = alpha
        self.scaler = GradScaler()
        
    def compute_loss(self, model, inputs, return_outputs=False):
        """
        Improved loss computation with better numerical stability
        """
        with autocast(enabled=True):
            # Get student outputs
            student_outputs = model(**inputs)
            student_logits = student_outputs.logits
            
            # Get teacher outputs (in eval mode)
            with torch.no_grad():
                teacher_outputs = self.teacher(**inputs)
                teacher_logits = teacher_outputs.logits

            # Temperature scaling
            T = self.temperature
            
            # Compute soft targets (teacher probabilities)
            teacher_probs = F.softmax(teacher_logits / T, dim=-1)
            
            # Compute student log probabilities
            student_log_probs = F.log_softmax(student_logits / T, dim=-1)
            
            # Compute KL divergence loss (soft targets)
            distillation_loss = F.kl_div(
                student_log_probs,
                teacher_probs,
                reduction='batchmean',
                log_target=False
            ) * (T * T)
            
            # Hard targets loss (standard cross-entropy with actual labels)
            hard_loss = F.cross_entropy(
                student_logits.view(-1, student_logits.size(-1)),
                inputs['labels'].view(-1),
                ignore_index=self.tokenizer.pad_token_id
            )
            
            # Combine losses with alpha
            loss = (self.alpha * hard_loss) + ((1 - self.alpha) * distillation_loss)

        if return_outputs:
            return loss, student_outputs
        return loss

    def training_step(self, model, inputs, *args, **kwargs):
        """
        Overridden training step to use mixed precision
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        with autocast(enabled=True):
            loss = self.compute_loss(model, inputs)

        self.scaler.scale(loss).backward()
        
        return loss.detach()

In [12]:
def main():
    # --- Step 1: Load and Optimize Models ---
    print("\n--- Loading Models ---")
    student_model, tokenizer = load_llama_1b()
    teacher_model, _ = load_llama_8b()
    
    # Optimize student model
    student_model = prepare_model_for_kbit_training(student_model)
    student_model.to(device)
    teacher_model.to(device)
    
    # Enhanced LoRA config for better knowledge transfer
    lora_config = LoraConfig(
        r=16,  # Increased rank
        lora_alpha=32,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )
    student_model = get_peft_model(student_model, lora_config)
    print("Models prepared for training")

    # --- Step 2: Load Dataset from Excel ---
    print("\n--- Loading Dataset from Excel ---")
    excel_path = "D:/DeepmindForge/Dataset expansion/merged_dataset(JBB+Adv).csv.xlsx"
    df = pd.read_excel(excel_path)
    
    # Try to automatically identify question/answer columns
    possible_q_cols = [c for c in df.columns if any(x in c.lower() for x in ['prompt', 'question', 'input'])]
    possible_a_cols = [c for c in df.columns if any(x in c.lower() for x in ['response', 'answer', 'output'])]
    
    q_col = possible_q_cols[0] if possible_q_cols else df.columns[0]
    a_col = possible_a_cols[1] if len(possible_a_cols) > 1 else (possible_a_cols[0] if possible_a_cols else df.columns[1])
    
    print(f"Using columns -> Question: {q_col}, Answer: {a_col}")
    
    # Clean and prepare data
    df = df[[q_col, a_col]].dropna()
    
    # Convert to HuggingFace dataset format with improved prompt format
    dataset_dict = {
        'text': [f"Question: {q}\nAnswer: {a}" for q, a in zip(df[q_col], df[a_col])]
    }
    dataset = Dataset.from_dict(dataset_dict)
    
    # Split into train/validation with stratification
    split_dataset = dataset.train_test_split(test_size=0.1, shuffle=True, seed=42)
    train_dataset = split_dataset['train']
    val_dataset = split_dataset['test']
    
    print(f"Dataset size: {len(train_dataset)} train, {len(val_dataset)} validation")

    # --- Step 3: Enhanced Preprocessing ---
    def preprocess_function(examples):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=256,
            padding="max_length",
            return_tensors="pt"
        )

    # Process datasets
    train_dataset = train_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=["text"],
        desc="Tokenizing train data"
    )
    val_dataset = val_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=["text"],
        desc="Tokenizing validation data"
    )

    print("--- Dataset Ready ---\n")

    # --- Step 4: Training Setup ---
    training_args = TrainingArguments(
        output_dir="./distilled_llama_1b_results",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        weight_decay=0.01,
        logging_steps=10,
        save_steps=100,
        fp16=True
    )

    # Initialize trainer
    trainer = OptimizedDistillationTrainer(
        teacher_model=teacher_model,
        model=student_model,
        args=training_args,
        train_dataset=train_dataset,
        temperature=2.0,
        alpha=0.3
    )

    # --- Step 5: Training ---
    print("--- Starting Knowledge Distillation ---")
    trainer.train()
    print("--- Distillation Complete ---")

    # --- Step 6: Save Final Model ---
    final_model_path = "./distilled_llama_1b_final"
    trainer.save_model(final_model_path)
    tokenizer.save_pretrained(final_model_path)
    print(f"Distilled student model saved to {final_model_path}")

In [13]:
main()


--- Loading Models ---
--- Loading Student Model (Llama 1B) ---
--- Student Model Loaded ---
--- Loading Teacher Model (Llama 8B) ---
--- Student Model Loaded ---
--- Loading Teacher Model (Llama 8B) ---


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

--- Teacher Model Loaded ---
Models prepared for training

--- Loading Dataset from Excel ---
Using columns -> Question: prompt, Answer: prompt
Dataset size: 1 train, 1 validation
Using columns -> Question: prompt, Answer: prompt
Dataset size: 1 train, 1 validation


Tokenizing train data:   0%|          | 0/1 [00:00<?, ? examples/s]

Tokenizing validation data:   0%|          | 0/1 [00:00<?, ? examples/s]

--- Dataset Ready ---



No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
  self.scaler = GradScaler()
  self.scaler = GradScaler()


--- Starting Knowledge Distillation ---


TypeError: OptimizedDistillationTrainer.training_step() takes 3 positional arguments but 4 were given