In [1]:
# üì¶ Install Dependencies
import subprocess
import sys

def install_package(package):
    """Quietly pip-install ``package`` into the interpreter running this kernel.

    Raises ``subprocess.CalledProcessError`` when pip exits with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-q"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

# Packages installed for this notebook; several are pinned to exact versions.
packages = [
    "torch",
    "transformers==4.36.0",
    "peft==0.7.1",
    "datasets==2.16.0",
    "accelerate==0.25.0",
    "bitsandbytes==0.41.3",
    "tokenizers",
    "tqdm",
    "scikit-learn",
    "python-dotenv",  # provides `dotenv.load_dotenv`, imported by the system-check cell
]

print("üì¶ Installing packages...")
failed_packages = []
for pkg in packages:
    try:
        install_package(pkg)
    except Exception as e:
        # Best-effort: keep installing the remaining packages, but remember the
        # failure so it is summarised below instead of scrolling out of view.
        print(f"‚ö†Ô∏è  Warning: Failed to install {pkg}: {e}")
        failed_packages.append(pkg)

if failed_packages:
    print(f"Packages that could not be installed: {', '.join(failed_packages)}")
print("‚úÖ Installation complete!\n")

üì¶ Installing packages...
‚úÖ Installation complete!

‚úÖ Installation complete!



In [2]:
# üîç System Check
from dotenv import load_dotenv
import torch
import os

# Load variables from a .env file (if any) and read the Hub token from the environment.
load_dotenv()
HUGGING_FACE_TOKEN = os.environ.get("HUGGINGFACE_HUB_TOKEN")

print("=" * 70)
print("üñ•Ô∏è  SYSTEM INFORMATION")
print("=" * 70)
print(f"PyTorch: {torch.__version__}")
print(f"CUDA Available: {torch.cuda.is_available()}")

# Guard clause: stop here when no CUDA device is visible.
if not torch.cuda.is_available():
    print("‚ùå No GPU detected! This script requires CUDA.")
    raise RuntimeError("GPU is required for this training script")

# Details of device 0.
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
print(f"CUDA Version: {torch.version.cuda}")

print("=" * 70 + "\n")

üñ•Ô∏è  SYSTEM INFORMATION
PyTorch: 2.7.1+cu118
CUDA Available: True
GPU: NVIDIA RTX A1000 Laptop GPU
VRAM: 4.3 GB
CUDA Version: 11.8



In [3]:
# ‚öôÔ∏è Configuration
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import gc
from tqdm.auto import tqdm
import json
from huggingface_hub import login

# Disable unnecessary warnings
import warnings
# NOTE(review): with no category/module filter this silences every Python
# warning for the rest of the session, not only the noisy library ones.
warnings.filterwarnings('ignore')
# Sets the TOKENIZERS_PARALLELISM environment variable to 'false' for this process.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

class Config:
    """Training configuration.

    Plain namespace of constants read by the later cells. Attributes are
    documented inline with comments.
    """
    # Models
    # Set USE_LOCAL_MODELS = True after running the offline-download cell to
    # load both models from ./models/ (the `local_dir` targets used there)
    # instead of the Hugging Face Hub.
    USE_LOCAL_MODELS = False
    TEACHER_MODEL = "./models/Llama-2-13b-hf" if USE_LOCAL_MODELS else "meta-llama/Llama-2-13b-hf"
    STUDENT_MODEL = "./models/Mistral-7B-v0.1" if USE_LOCAL_MODELS else "mistralai/Mistral-7B-v0.1"

    # HuggingFace token: taken from the HUGGINGFACE_HUB_TOKEN environment
    # variable by the system-check cell (None when unset). Do not paste a
    # token here.
    HF_TOKEN = HUGGING_FACE_TOKEN

    # Dataset
    DATASET_NAME = "gsm8k"
    DATASET_CONFIG = "main"
    MAX_SAMPLES = 2000   # cap on training rows selected from the train split
    MAX_LENGTH = 512     # tokenizer max_length used for truncation and padding

    # Training
    BATCH_SIZE = 2       # per-device batch size (train and eval)
    GRADIENT_ACCUM = 8   # gradient accumulation steps
    LEARNING_RATE = 2e-4
    NUM_EPOCHS = 3
    WARMUP_STEPS = 100

    # Distillation
    ALPHA_OUTPUT = 0.5   # weight of the language-modelling loss
    BETA_LATENT = 0.5    # weight of the latent-matching loss
    TEMPERATURE = 2.0    # NOTE(review): not read anywhere in the visible cells
    LATENT_LAYERS = [8, 16, 24]  # indices into `hidden_states`, used for teacher and student alike

    # LoRA
    LORA_R = 16
    LORA_ALPHA = 32
    LORA_DROPOUT = 0.05
    LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]

    # Local paths
    OUTPUT_DIR = "./distill_output"
    LATENT_CACHE_DIR = "./latent_cache"

    DEVICE = "cuda"

config = Config()

# Create directories
os.makedirs(config.OUTPUT_DIR, exist_ok=True)
os.makedirs(config.LATENT_CACHE_DIR, exist_ok=True)

# Login to HuggingFace
if not config.HF_TOKEN:
    # login(token=None) falls back to an interactive prompt, which stalls or
    # clutters a Restart & Run All. Skip it and rely on any cached login.
    print("‚ö†Ô∏è  HF login warning: HUGGINGFACE_HUB_TOKEN is not set; skipping login\n")
else:
    try:
        login(token=config.HF_TOKEN, add_to_git_credential=False)
        print("‚úÖ HuggingFace authenticated\n")
    except Exception as e:
        # Best-effort: public models still load without authentication.
        print(f"‚ö†Ô∏è  HF login warning: {e}\n")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
W1206 14:31:47.301000 4644 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
W1206 14:31:47.301000 4644 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


‚úÖ HuggingFace authenticated



## 📥 Download Models for Offline Use

If you want to use models offline, run this cell **once** to download them:

```python
from huggingface_hub import snapshot_download

# Download Teacher Model (Llama-2-13B) - ~26GB
snapshot_download(
    repo_id="meta-llama/Llama-2-13b-hf",
    local_dir="./models/Llama-2-13b-hf",
    token="YOUR_HF_TOKEN"  # Required for Llama-2
)

# Download Student Model (Mistral-7B) - ~14GB
snapshot_download(
    repo_id="mistralai/Mistral-7B-v0.1",
    local_dir="./models/Mistral-7B-v0.1"
)
```

After downloading:
1. Set `USE_LOCAL_MODELS = True` in Config cell
2. Models will load from `./models/` directory

**Total disk space needed: ~40GB**

In [None]:
from huggingface_hub import snapshot_download

# One entry per model: the keyword arguments for snapshot_download and the
# message printed once that download returns. Only the teacher passes a token.
download_jobs = [
    (
        dict(
            repo_id="meta-llama/Llama-2-13b-hf",
            local_dir="./models/Llama-2-13b-hf",
            token=HUGGING_FACE_TOKEN,
            resume_download=True,
        ),
        "‚úÖ Teacher downloaded!\n",
    ),
    (
        dict(
            repo_id="mistralai/Mistral-7B-v0.1",
            local_dir="./models/Mistral-7B-v0.1",
            resume_download=True,
        ),
        "‚úÖ Student downloaded!\n",
    ),
]

for download_kwargs, done_message in download_jobs:
    snapshot_download(**download_kwargs)
    print(done_message)

Fetching 19 files:  21%|‚ñà‚ñà        | 4/19 [00:03<00:11,  1.36it/s]

In [5]:
# üìä Load Dataset
print("üì¶ Loading GSM8K dataset...")

dataset = load_dataset(config.DATASET_NAME, config.DATASET_CONFIG, trust_remote_code=True)

# Keep at most config.MAX_SAMPLES training rows and at most 500 test rows.
train_split = dataset['train']
test_split = dataset['test']
train_data = train_split.select(range(min(config.MAX_SAMPLES, len(train_split))))
test_data = test_split.select(range(min(500, len(test_split))))

print(f"‚úÖ Train samples: {len(train_data)}")
print(f"‚úÖ Test samples: {len(test_data)}\n")

# Peek at the first training row (first 80 characters of each field).
sample = train_data[0]
print("üìã Data sample:")
print(f"   Question: {sample['question'][:80]}...")
print(f"   Answer: {sample['answer'][:80]}...\n")

üì¶ Loading GSM8K dataset...
‚úÖ Train samples: 2000
‚úÖ Test samples: 500

üìã Data sample:
   Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many...
   Answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>...



In [None]:
# üß† Teacher Model Loading & Extraction

def prepare_prompt(question: str, answer: str = None) -> str:
    """Build the step-by-step math prompt, optionally followed by the worked answer.

    A falsy ``answer`` (``None`` or ``""``) yields the bare prompt.
    """
    header = f"Question: {question}\n\nLet's solve this step by step:\n"
    return header + answer if answer else header

def check_cache_exists(cache_dir, expected_count):
    """Count how many of the expected latent files are already cached.

    Only ``latent_0.pt`` ... ``latent_{expected_count - 1}.pt`` are counted, so
    stale files from a larger earlier run or unrelated ``latent_*.pt`` names
    cannot make an incomplete cache look complete.

    Args:
        cache_dir: Directory holding the cached ``latent_{idx}.pt`` files.
        expected_count: Number of samples the cache should cover.

    Returns:
        Number of expected files present, between 0 and ``expected_count``.
        Returns 0 when ``cache_dir`` is missing or is not a directory.
    """
    if not os.path.isdir(cache_dir):
        return 0
    present = set(os.listdir(cache_dir))
    return sum(1 for idx in range(expected_count) if f"latent_{idx}.pt" in present)

def load_teacher_model():
    """Load the teacher model (4-bit NF4 quantized) and its tokenizer.

    Reads ``config.TEACHER_MODEL`` and ``config.HF_TOKEN``. When the token is
    ``None`` the Hub client falls back to any locally cached login.

    Returns:
        ``(model, tokenizer)``. The tokenizer's pad token is set to its EOS token.
    """
    print("üîÑ Loading Teacher (Llama-2-13B, 4-bit)...")

    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # `token` replaces the deprecated `use_auth_token` argument.
    # trust_remote_code is intentionally left at its default: Llama is a
    # built-in transformers architecture, so no repository code needs to run.
    model = AutoModelForCausalLM.from_pretrained(
        config.TEACHER_MODEL,
        quantization_config=bnb_config,
        device_map="auto",
        token=config.HF_TOKEN
    )

    tokenizer = AutoTokenizer.from_pretrained(
        config.TEACHER_MODEL,
        token=config.HF_TOKEN
    )
    # Reuse EOS as the padding token so padded batches can be built.
    tokenizer.pad_token = tokenizer.eos_token

    print(f"‚úÖ Teacher loaded ({model.num_parameters() / 1e9:.1f}B params)\n")
    return model, tokenizer

def extract_latent_states(model, tokenizer, data, output_dir):
    """Run the teacher over ``data`` and cache pooled hidden states per sample.

    For every index without a ``latent_{idx}.pt`` file in ``output_dir``, the
    prompt is tokenized (padded/truncated to ``config.MAX_LENGTH``), passed
    through ``model``, and the hidden states of each layer in
    ``config.LATENT_LAYERS`` are mean-pooled over the non-padding tokens.
    The result is saved as a dict ``{'layer_<i>': tensor}`` with one
    ``(1, hidden)`` tensor per layer, in the dtype of the hidden states.

    Files are written to a temporary name and then renamed, so an interrupted
    run never leaves a truncated ``latent_{idx}.pt`` behind.

    NOTE(review): files cached by an earlier run are left untouched; clear the
    cache directory if they were produced with a different pooling.

    Args:
        model: Teacher model; called with ``output_hidden_states=True``.
        tokenizer: Tokenizer matching ``model``.
        data: Indexable collection of rows with a ``'question'`` key and an
            optional ``'answer'`` key.
        output_dir: Directory that receives the cache files.
    """
    print(f"üß† Extracting latents to: {output_dir}")

    # Check cache
    existing = check_cache_exists(output_dir, len(data))
    if existing == len(data):
        print(f"‚úÖ Cache complete ({existing} files)\n")
        return

    print(f"   Found: {existing}/{len(data)} cached")
    print(f"   Extracting: {max(len(data) - existing, 0)} samples\n")

    model.eval()

    with torch.no_grad():
        for idx in tqdm(range(len(data)), desc="Extracting", ncols=80):
            cache_path = os.path.join(output_dir, f"latent_{idx}.pt")

            if os.path.exists(cache_path):
                continue

            item = data[idx]
            prompt = prepare_prompt(item['question'], item.get('answer'))

            inputs = tokenizer(
                prompt,
                return_tensors='pt',
                truncation=True,
                max_length=config.MAX_LENGTH,
                padding='max_length'
            ).to(model.device)

            outputs = model(**inputs, output_hidden_states=True, return_dict=True)

            # 1 for real tokens, 0 for padding; shaped (batch, seq, 1) so it
            # broadcasts over the hidden dimension.
            token_mask = inputs['attention_mask'].unsqueeze(-1)
            # clamp avoids a division by zero for an all-padding row
            token_count = token_mask.sum(dim=1).clamp(min=1)

            # Extract specified layers
            latent_states = {}
            for layer_idx in config.LATENT_LAYERS:
                if layer_idx < len(outputs.hidden_states):
                    hidden = outputs.hidden_states[layer_idx]
                    weights = token_mask.to(hidden.device).float()
                    # Accumulate in float32, then return to the model's dtype.
                    summed = (hidden.float() * weights).sum(dim=1)
                    pooled = summed / token_count.to(hidden.device).float()
                    latent_states[f'layer_{layer_idx}'] = pooled.to(hidden.dtype).cpu()

            # Write-then-rename keeps the cache free of half-written files.
            tmp_path = cache_path + ".tmp"
            torch.save(latent_states, tmp_path)
            os.replace(tmp_path, cache_path)

            if idx % 100 == 0 and idx > 0:
                torch.cuda.empty_cache()

    print(f"‚úÖ Extraction complete!\n")

# Run the teacher only while some training latents are still missing from the cache.
existing_cache = check_cache_exists(config.LATENT_CACHE_DIR, len(train_data))

if existing_cache >= len(train_data):
    print(f"‚úÖ Cache found ({existing_cache} files)\n")
else:
    teacher_model, teacher_tokenizer = load_teacher_model()
    extract_latent_states(
        teacher_model,
        teacher_tokenizer,
        train_data,
        config.LATENT_CACHE_DIR,
    )

    # Drop the teacher references, then release Python and CUDA caches.
    del teacher_model, teacher_tokenizer
    gc.collect()
    torch.cuda.empty_cache()
    print("üóëÔ∏è  Teacher freed from memory\n")

üîÑ Loading Teacher (Llama-2-13B, 4-bit)...


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# üéì Student Model Setup

class ReasoningDataset(Dataset):
    """Tokenized question/answer prompts, optionally paired with cached teacher latents.

    Each item is a dict with:
      * ``input_ids`` / ``attention_mask``: the tokenizer output with the
        leading batch dimension removed,
      * ``idx``: the index that was requested,
      * ``teacher_latents``: present only when ``latent_dir`` is set and
        ``latent_{idx}.pt`` exists there; holds whatever that file stores.
    """
    def __init__(self, data, tokenizer, max_length=512, latent_dir=None):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.latent_dir = latent_dir

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        prompt = prepare_prompt(item['question'], item.get('answer'))

        encoding = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        result = {
            # squeeze(0) removes only the batch dimension; a bare squeeze()
            # would also collapse a length-1 sequence dimension.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'idx': idx
        }

        # Load cached latents
        if self.latent_dir:
            latent_path = os.path.join(self.latent_dir, f"latent_{idx}.pt")
            if os.path.exists(latent_path):
                # weights_only=True restricts unpickling to tensors and plain
                # containers, so a tampered cache file cannot run code.
                result['teacher_latents'] = torch.load(
                    latent_path, map_location='cpu', weights_only=True
                )

        return result

def setup_student_model():
    """Load the student model in fp16 and wrap it with LoRA adapters.

    Reads ``config.STUDENT_MODEL`` and the ``config.LORA_*`` settings.

    Returns:
        ``(model, tokenizer)``. ``model`` is the PEFT-wrapped student whose
        trainable parameters are kept in float32; the tokenizer's pad token is
        set to its EOS token.
    """
    print("üéì Loading Student (Mistral-7B + LoRA)...")

    # trust_remote_code is intentionally left at its default: Mistral is a
    # built-in transformers architecture, so no repository code needs to run.
    model = AutoModelForCausalLM.from_pretrained(
        config.STUDENT_MODEL,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    tokenizer = AutoTokenizer.from_pretrained(config.STUDENT_MODEL)
    tokenizer.pad_token = tokenizer.eos_token

    # Apply LoRA
    lora_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        r=config.LORA_R,
        lora_alpha=config.LORA_ALPHA,
        lora_dropout=config.LORA_DROPOUT,
        target_modules=config.LORA_TARGET_MODULES,
        bias="none"
    )

    model = get_peft_model(model, lora_config)

    # The training cell uses fp16=True, whose gradient scaler refuses to
    # unscale fp16 gradients ("Attempting to unscale FP16 gradients"). Keep the
    # trainable adapter weights in float32; the frozen base stays in fp16.
    for param in model.parameters():
        if param.requires_grad:
            param.data = param.data.float()

    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())

    print(f"‚úÖ Student loaded")
    print(f"   Total params: {total_params / 1e6:.1f}M")
    print(f"   Trainable: {trainable_params / 1e6:.1f}M ({100 * trainable_params / total_params:.2f}%)\n")

    return model, tokenizer

# Build the LoRA-wrapped student once; the training, evaluation and inference
# cells below read these two module-level names.
student_model, student_tokenizer = setup_student_model()

üéì Loading Student (Mistral-7B + LoRA)...


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# üî• Training Setup

class DistillationTrainer(Trainer):
    """Trainer whose loss mixes the language-modelling loss with latent matching.

    ``total = config.ALPHA_OUTPUT * lm_loss + config.BETA_LATENT * latent_loss``
    where ``latent_loss`` is the MSE between mean-pooled student hidden states
    and the cached teacher latents, averaged over the layers in
    ``config.LATENT_LAYERS`` that are available on both sides.
    """

    def __init__(self, *args, **kwargs):
        # Trainer's positional order is (model, args, data_collator, ...).
        # Install our collator only when the caller did not choose one.
        if len(args) < 3:
            kwargs.setdefault("data_collator", self.collate_with_latents)
        super().__init__(*args, **kwargs)

    @staticmethod
    def collate_with_latents(features):
        """Batch dataset items, including the nested ``teacher_latents`` dict.

        The default collator cannot turn a list of dicts-of-tensors into a
        tensor, so each latent layer is stacked into one ``(batch, hidden)``
        tensor here. Latents are attached only when every item in the batch
        carries them, and only for layer keys shared by all items.
        """
        first = features[0]
        batch = {}
        for key, value in first.items():
            if key == 'teacher_latents':
                continue
            if isinstance(value, torch.Tensor):
                batch[key] = torch.stack([f[key] for f in features])
            else:
                batch[key] = torch.tensor([f[key] for f in features])

        if all('teacher_latents' in f for f in features):
            shared_keys = [
                k for k in first['teacher_latents']
                if all(k in f['teacher_latents'] for f in features)
            ]
            batch['teacher_latents'] = {
                # reshape(1, -1) flattens each per-sample latent to one row
                k: torch.cat([f['teacher_latents'][k].reshape(1, -1) for f in features], dim=0)
                for k in shared_keys
            }
        return batch

    @staticmethod
    def _masked_mean(hidden, attention_mask):
        """Mean over the sequence axis, counting only positions where the mask is 1."""
        weights = attention_mask.unsqueeze(-1).to(hidden.device).float()
        # clamp avoids a division by zero for an all-padding row
        return (hidden.float() * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the combined distillation loss (and model outputs if requested).

        ``num_items_in_batch`` is accepted because newer transformers releases
        pass it to ``compute_loss``; it is not used here.
        """
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        # Padding positions get label -100 so the LM loss ignores them instead
        # of being dominated by trivially predictable pad tokens.
        labels = inputs.get('labels')
        if labels is None:
            labels = input_ids.masked_fill(attention_mask == 0, -100)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
            output_hidden_states=True,
            return_dict=True
        )

        loss_output = outputs.loss
        loss_latent = loss_output.new_zeros(())

        # Latent matching
        teacher_latents = inputs.get('teacher_latents')
        if teacher_latents:
            student_hidden = outputs.hidden_states

            matched_layers = 0
            for layer_idx in config.LATENT_LAYERS:
                layer_key = f'layer_{layer_idx}'
                if layer_key in teacher_latents and layer_idx < len(student_hidden):
                    student_h = self._masked_mean(student_hidden[layer_idx], attention_mask)
                    # Cached latents may arrive as (batch, 1, hidden); flatten
                    # to (batch, hidden) so MSE does not broadcast across samples.
                    teacher_h = (
                        teacher_latents[layer_key]
                        .to(student_h.device)
                        .float()
                        .reshape(student_h.shape[0], -1)
                    )
                    if teacher_h.shape[-1] != student_h.shape[-1]:
                        # Teacher and student can have different hidden widths.
                        # NOTE(review): a learned linear projector is the usual
                        # bridge; this fixed pooling only makes the shapes
                        # compatible. Confirm the intended alignment.
                        teacher_h = F.adaptive_avg_pool1d(
                            teacher_h.unsqueeze(1), student_h.shape[-1]
                        ).squeeze(1)

                    loss_latent = loss_latent + F.mse_loss(student_h, teacher_h)
                    matched_layers += 1

            if matched_layers > 0:
                loss_latent = loss_latent / matched_layers

        total_loss = config.ALPHA_OUTPUT * loss_output + config.BETA_LATENT * loss_latent

        return (total_loss, outputs) if return_outputs else total_loss

# Wrap both splits. Only the training split is paired with cached teacher latents.
test_dataset = ReasoningDataset(
    test_data,
    student_tokenizer,
    max_length=config.MAX_LENGTH,
    latent_dir=None,
)

train_dataset = ReasoningDataset(
    train_data,
    student_tokenizer,
    max_length=config.MAX_LENGTH,
    latent_dir=config.LATENT_CACHE_DIR,
)

# NOTE(review): with the Config defaults (2000 samples, effective batch 2 x 8,
# 3 epochs) a single-device run has about 375 optimizer steps, so the
# 500-step eval/save intervals below would never fire. Confirm this is intended.
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir=config.OUTPUT_DIR,
    logging_dir=f"{config.OUTPUT_DIR}/logs",
    report_to="none",
    disable_tqdm=False,
    # Optimisation schedule
    num_train_epochs=config.NUM_EPOCHS,
    learning_rate=config.LEARNING_RATE,
    warmup_steps=config.WARMUP_STEPS,
    fp16=True,
    # Batching
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUM,
    # Keep dataset keys such as 'teacher_latents' and 'idx' in each batch
    remove_unused_columns=False,
    # Logging, evaluation and checkpointing cadence
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = DistillationTrainer(
    model=student_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

print("‚úÖ Training configured\n")

In [None]:
# üöÄ Start Training

# Announce the run and its key hyper-parameters.
print("=" * 70)
print("üî• STARTING TRAINING")
print("=" * 70)
print(f"Epochs: {config.NUM_EPOCHS}")
print(f"Batch size: {config.BATCH_SIZE} √ó {config.GRADIENT_ACCUM} = {config.BATCH_SIZE * config.GRADIENT_ACCUM}")
print(f"Learning rate: {config.LEARNING_RATE}")
print("=" * 70 + "\n")

trainer.train()

# Persist the trained student and then its tokenizer into the same directory.
for save_to in (trainer.save_model, student_tokenizer.save_pretrained):
    save_to(f"{config.OUTPUT_DIR}/final_model")

print("\n" + "=" * 70)
print("‚úÖ TRAINING COMPLETE")
print("=" * 70 + "\n")

In [None]:
# üìä Evaluation

def _extract_final_number(text):
    """Pull the final numeric answer out of GSM8K-style text.

    Prefers the number that follows the first ``####`` marker and otherwise
    falls back to the last number in ``text``. Thousands separators are
    dropped and trailing zero decimals are stripped so ``"1,000"``, ``"1000"``
    and ``"1000.0"`` compare equal. Returns ``None`` when no number is found.
    """
    import re

    number = r"-?\d[\d,]*(?:\.\d+)?"
    marked = re.search(r"####\s*\$?(" + number + r")", text)
    if marked:
        raw = marked.group(1)
    else:
        # The lookbehind stops "48-24" from being read as 48 and -24.
        candidates = re.findall(r"(?<![\d.])" + number, text)
        if not candidates:
            return None
        raw = candidates[-1]

    raw = raw.replace(",", "")
    if "." in raw:
        raw = raw.rstrip("0").rstrip(".")
    return raw


def evaluate_reasoning(model, tokenizer, test_data, num_samples=50):
    """Greedy-decode answers for the first test items and score the final number.

    A sample counts as correct when the final number extracted from the
    generated continuation (prompt excluded) equals the final number of the
    reference answer, i.e. the value after ``####`` in GSM8K.

    Args:
        model: Model exposing ``eval()``, ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``.
        test_data: Indexable rows with ``'question'`` and ``'answer'`` keys.
        num_samples: Upper bound on the number of rows evaluated.

    Returns:
        Accuracy over the rows actually evaluated; ``0.0`` when there are none.
    """
    model.eval()
    total = min(num_samples, len(test_data))
    correct = 0

    print(f"üìä Evaluating on {total} samples...")

    with torch.no_grad():
        for idx in tqdm(range(total), desc="Evaluating", ncols=80):
            item = test_data[idx]
            prompt = prepare_prompt(item['question'])

            inputs = tokenizer(
                prompt,
                return_tensors='pt',
                truncation=True,
                max_length=256
            ).to(model.device)

            # Greedy decoding; temperature has no effect without sampling,
            # so it is not passed.
            outputs = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

            # Score only the continuation so numbers in the question cannot
            # be mistaken for the model's answer.
            prompt_len = inputs['input_ids'].shape[1]
            completion = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)

            expected = _extract_final_number(str(item['answer']))
            predicted = _extract_final_number(completion)

            if expected is not None and predicted == expected:
                correct += 1

    accuracy = correct / total if total else 0.0
    print(f"‚úÖ Accuracy: {accuracy:.2%} ({correct}/{total})\n")
    return accuracy

accuracy = evaluate_reasoning(student_model, student_tokenizer, test_data)

# Record the score together with the settings that produced it.
run_settings = {
    'teacher': config.TEACHER_MODEL,
    'student': config.STUDENT_MODEL,
    'lora_r': config.LORA_R,
    'alpha_output': config.ALPHA_OUTPUT,
    'beta_latent': config.BETA_LATENT,
    'epochs': config.NUM_EPOCHS
}
results = {'accuracy': float(accuracy), 'config': run_settings}

with open(f"{config.OUTPUT_DIR}/results.json", 'w') as f:
    f.write(json.dumps(results, indent=2))

print(f"üìÅ Results saved to: {config.OUTPUT_DIR}/results.json")

In [None]:
# üß™ Test Inference

def inference(question: str):
    """Generate a sampled answer for a single question with the student model.

    Uses the module-level ``student_model`` and ``student_tokenizer``.

    Returns:
        Only the newly generated text. The prompt is removed by slicing token
        ids rather than characters, so the result stays correct even when the
        decoded prompt differs from the original prompt string.
    """
    prompt = prepare_prompt(question)
    inputs = student_tokenizer(prompt, return_tensors='pt').to(student_model.device)
    prompt_len = inputs['input_ids'].shape[1]

    with torch.no_grad():
        outputs = student_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True,
            pad_token_id=student_tokenizer.eos_token_id
        )

    # For a causal LM, generate() returns the prompt ids followed by the
    # continuation; keep only the continuation.
    completion_ids = outputs[0][prompt_len:]
    return student_tokenizer.decode(completion_ids, skip_special_tokens=True)

# A few hand-written word problems to eyeball the student's answers.
test_questions = [
    "If John has 5 apples and gives 2 to Mary, how many does he have left?",
    "A train travels 60 miles in 1 hour. How far will it travel in 3.5 hours?",
    "Sarah has $50. She spends $15 on lunch and $12 on a book. How much money does she have left?"
]

print("=" * 70)
print("üß™ INFERENCE EXAMPLES")
print("=" * 70 + "\n")

# Number the questions from 1 and print each answer followed by a divider.
for i, q in zip(range(1, len(test_questions) + 1), test_questions):
    print(f"Q{i}: {q}")
    answer = inference(q)
    print(f"A{i}: {answer}\n", "-" * 70 + "\n", sep="\n")

print(f"‚úÖ Model saved to: {config.OUTPUT_DIR}/final_model")
print("=" * 70)