# Knowledge Distillation for Math Reasoning
Teacher: Llama3.2

Student: Qwen2

Loss function: KL divergence + cross entropy


In [3]:
! pip install pytorch-lightning transformers datasets wandb flash-attn vllm[triton]



In [4]:
import re
import os
from datetime import datetime
from typing import Optional, Union
from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

from torch.utils.data import DataLoader
from datasets import Dataset
from datasets import load_dataset

from transformers import AutoTokenizer, AutoModelForCausalLM
from vllm import LLM, SamplingParams

INFO 04-17 03:35:10 [__init__.py:239] Automatically detected platform cuda.


### Setup

In [None]:
# from dotenv import load_dotenv
# load_dotenv()  # Load environment variables from .env file

In [5]:
from google.colab import drive
drive.mount('/content/drive/')
save_dir = "/content/drive/My Drive/Columbia/Distillation/"  # TODO: serialization path, change to evaluate

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# HuggingFace login
import huggingface_hub
huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Weights & Bias login
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mtpzl0222[0m ([33mtptrix29[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["VLLM_USE_V1"] = "0"  # For A100 GPU

In [6]:
student_id, teacher_id = "Qwen/Qwen2.5-0.5B-Instruct", "meta-llama/Llama-3.2-1B-Instruct"

In [7]:
class FewShotEvaluator:
    """
    Few-shot evaluator for math reasoning tasks.
    """
    def __init__(self, dataset: Dataset, n_shots: int = 3, device: str = "cuda", batch_size: int = 16) -> None:
        self.dataset = dataset
        self.n_shots = n_shots
        self.device = device
        self.batch_size = batch_size
        self.fewshot_prompt = self.get_fewshot_prompt()

    def get_fewshot_prompt(self) -> str:
        prompt = "Solve these math problems:\n\n"
        for i in range(self.n_shots):
            example = self.dataset[i]
            prompt += f"Question: {example['question']}\nAnswer: {example['answer']}" + "\n\n"
        return prompt

    def preprocess_eval(self, examples: dict) -> dict:
        # Preprocess the example to include the few-shot prompt
        return {
            "prompt": [self.fewshot_prompt + f"Question: {question}\nAnswer:\n" for question in examples["question"]]
        }

    def parse_answer(self, answer: str) -> Optional[str]:
        # Extract the answer from the generated text
        try:
            predicted_answer = re.search(r"#### (\d+\.?\d*)", answer).group(1)
        except:
            predicted_answer = None
        return predicted_answer

    # def eval(self, model_path: str, tokenizer: AutoTokenizer, device: str = "cuda", temperature: float = 0.7, top_p: float = 0.95, max_tokens: int = 256) -> float:
    def eval(self, model_path: str, dtype: str = "auto", device: str = "cuda", temperature: float = 0.7, top_p: float = 0.95, max_tokens: int = 256) -> float:
        """
        Evaluate exact match accuracy
        """
        # Load dataset
        eval_dataset = self.dataset.select(range(self.n_shots, len(self.dataset)))
        eval_dataset = eval_dataset.map(self.preprocess_eval, batched=True)
        eval_dataloader = DataLoader(eval_dataset, batch_size=self.batch_size, shuffle=False)

        # Load model
        llm = LLM(model=model_path, dtype=dtype)
        # Shared or individual sampling settings
        sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens)

        correct = 0
        num_questions = 0

        answers = []

        # batch inference
        for _, batch in tqdm(enumerate(eval_dataloader), desc="Eval Inference: ", total=len(eval_dataloader)):
            # inputs = tokenizer(batch["prompt"], return_tensors="pt", max_length=256, padding="max_length", truncation=True).to(device)
            # outputs = model.generate(**inputs, max_new_tokens=256)
            # batch_answers = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            # answers.extend(batch_answers)
            prompts = batch["prompt"]
            outputs = llm.generate(prompts, sampling_params)
            batch_answers = [output.outputs[0].text.strip() for output in outputs]
            answers.extend(batch_answers)
            torch.cuda.empty_cache()

        # text parse for exact match
        for i, (correct_answer, generated_answer) in tqdm(enumerate(zip(eval_dataset['answer'], answers)), desc="Evaluating Exact Match Accuracy: ", total=len(eval_dataset)):
            # # Remove the input tokens from the output for transformers inference
            # generated_answer = generated_answer[len(eval_dataset['prompt'][i]):]

            # Extract final answer
            predicted_answer = self.parse_answer(generated_answer)
            ground_truth = self.parse_answer(correct_answer)

            # Check if the predicted answer matches the ground truth
            if ground_truth:
                num_questions += 1
                if predicted_answer and predicted_answer == ground_truth:
                    correct += 1

        return correct / num_questions if num_questions > 0 else 0

### Pre-Train Evaluation

In [8]:
eval_ds = load_dataset("openai/gsm8k", "main", split="test", num_proc=4)
evaluator = FewShotEvaluator(eval_ds, n_shots=3, device="cuda", batch_size=128)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
model = "distilled-llama4-0.4B-llama3.2-1B-20250416-191034"
model_path = f"{save_dir}{model}"
# qem_test = evaluator.eval(model_path, device="cuda")
# qem_test

In [9]:
llm = LLM(model=student_id, dtype="auto")

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

INFO 04-17 03:35:38 [config.py:689] This model supports multiple tasks: {'classify', 'reward', 'embed', 'score', 'generate'}. Defaulting to 'generate'.
INFO 04-17 03:35:38 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.


tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

INFO 04-17 03:35:47 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=Qwen/Qwen2.5-0.5B-Instruct, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=True, chunked_prefill_enabled=Tru

ValueError: Cannot cast <zmq.Socket(zmq.ROUTER) at 0x7b3d7ff9d8d0> to int

In [7]:
llm = LLM(model=model_path, dtype="auto")

INFO 04-17 03:33:11 [config.py:2832] Downcasting torch.float32 to torch.float16.
INFO 04-17 03:33:25 [config.py:689] This model supports multiple tasks: {'classify', 'reward', 'score', 'embed', 'generate'}. Defaulting to 'generate'.
INFO 04-17 03:33:25 [config.py:1901] Chunked prefill is enabled with max_num_batched_tokens=8192.


You have set `use_cache` to `False`, but cache_implementation is set to hybrid. cache_implementation will have no effect.


INFO 04-17 03:33:28 [core.py:61] Initializing a V1 LLM engine (v0.8.4) with config: model='/content/drive/My Drive/Columbia/Distillation/distilled-llama4-0.4B-llama3.2-1B-20250416-191034', speculative_config=None, tokenizer='/content/drive/My Drive/Columbia/Distillation/distilled-llama4-0.4B-llama3.2-1B-20250416-191034', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=131072, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=

ValueError: Cannot cast <zmq.Socket(zmq.ROUTER) at 0x7b3e3417f8c0> to int

In [None]:
# pretrain evaluation
qem1 = evaluator.eval(student_id, device="cuda")
qem1

INFO 04-15 03:22:28 [config.py:689] This model supports multiple tasks: {'classify', 'generate', 'reward', 'embed', 'score'}. Defaulting to 'generate'.
INFO 04-15 03:22:28 [llm_engine.py:243] Initializing a V0 LLM engine (v0.8.4) with config: model='Qwen/Qwen2.5-0.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-0.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=Non

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 03:22:35 [loader.py:458] Loading weights took 0.34 seconds
INFO 04-15 03:22:35 [model_runner.py:1146] Model loading took 0.9277 GiB and 3.365625 seconds
INFO 04-15 03:22:37 [worker.py:267] Memory profiling takes 1.01 seconds
INFO 04-15 03:22:37 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.90) = 35.60GiB
INFO 04-15 03:22:37 [worker.py:267] model weights take 0.93GiB; non_torch_memory takes 0.09GiB; PyTorch activation peak memory takes 1.44GiB; the rest of the memory reserved for KV Cache is 33.14GiB.
INFO 04-15 03:22:37 [executor_base.py:112] # cuda blocks: 180996, # CPU blocks: 21845
INFO 04-15 03:22:37 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 88.38x
INFO 04-15 03:22:40 [model_runner.py:1456] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the C

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 04-15 03:23:12 [model_runner.py:1598] Graph capturing finished in 32 secs, took 0.16 GiB
INFO 04-15 03:23:12 [llm_engine.py:449] init engine (profile, create kv cache, warmup model) took 37.30 seconds


Eval Inference:   0%|          | 0/11 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/36 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Evaluating Exact Match Accuracy:   0%|          | 0/1316 [00:00<?, ?it/s]

0.3394216133942161

## Student Model Training

In [None]:
# Load models and tokenizer
tokenizer = AutoTokenizer.from_pretrained(teacher_id)
tokenizer.pad_token = tokenizer.eos_token
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_id, device_map="auto")
student_model = AutoModelForCausalLM.from_pretrained(student_id, device_map="auto")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


In [None]:
teacher_model.config.vocab_size, student_model.config.vocab_size

(128256, 151936)

In [None]:
if teacher_model.config.vocab_size != student_model.config.vocab_size:
    student_model.resize_token_embeddings(teacher_model.config.vocab_size)
# subustitute last layer of student model with that of teacher model
# student_model.lm_head = nn.Linear(student_model.config.hidden_size, teacher_model.lm_head.weight.size(0), bias=False)

In [None]:
teacher_model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm): LlamaRMSNorm((2048,), eps=1e-05)
    (rotary_emb):

In [None]:
student_model.eval()

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(128256, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (rotary_emb): Qwen2RotaryEmbe

In [None]:
def eval_size(model):
    """
    Function to evaluate the size of the model in terms of number of parameters.
    """
    return sum(p.numel() for p in model.parameters())  / 10**9

eval_size(student_model), eval_size(teacher_model)  # Check the number of parameters in the models

(0.472815488, 1.2358144)

In [None]:
class GSM8KDataModule(pl.LightningDataModule):
    def __init__(self, tokenizer: AutoTokenizer, batch_size: int = 2) -> None:
        super().__init__()
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.batch_size = batch_size
        self.num_workers = os.cpu_count() - 1 if os.cpu_count() else 0


    def setup(self, stage=None) -> None:
        dataset = load_dataset("openai/gsm8k", "main", split="train", num_proc=4)

        def preprocess_training(examples: dict) -> dict:
            """
            Preprocess training corpus.
            Input: ids, attention_mask
            Output: labels
            """
            inputs = ["Question: " + q + "\nAnswer:" for q in examples["question"]]
            model_inputs = self.tokenizer(
                inputs,
                padding="max_length",
                truncation=True,
                max_length=256,
            )

            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(
                    examples["answer"],
                    padding="max_length",
                    truncation=True,
                    max_length=256,
                )

            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        self.train_dataset = dataset.map(preprocess_training, batched=True)
        self.train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


    def train_dataloader(self) -> DataLoader:
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    # def val_dataloader(self) -> DataLoader:
    #     pass


class DistillationLightningModule(pl.LightningModule):
    def __init__(self,
                 student_model: AutoModelForCausalLM,
                 teacher_model: AutoModelForCausalLM,
                 tokenizer: AutoTokenizer,
                 alpha: float = 0.3,
                 temperature: float = 2.0,
                 learning_rate: float = 5e-5
                 ) -> None:
        super().__init__()
        self.student = student_model
        self.teacher = teacher_model
        self.alpha = alpha
        self.temperature = temperature
        self.learning_rate = learning_rate
        self.student.train()
        self.teacher.eval()
        # for param in self.teacher.parameters():
        #     param.requires_grad = False

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        return self.student(input_ids=input_ids, attention_mask=attention_mask)

    def training_step(self, batch: dict, batch_idx: int) -> torch.Tensor:
        input_ids = batch["input_ids"].to(self.device)
        attention_mask = batch["attention_mask"].to(self.device)

        with torch.no_grad():
            teacher_logits = self.teacher(input_ids=input_ids, attention_mask=attention_mask).logits

        student_logits = self.student(input_ids=input_ids, attention_mask=attention_mask).logits

        student_log_probs = F.log_softmax(student_logits / self.temperature, dim=-1)
        teacher_probs = F.softmax(teacher_logits / self.temperature, dim=-1)
        loss_kl = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * (self.temperature ** 2)

        labels = batch["labels"].to(self.device)
        loss_ce = F.cross_entropy(student_logits.view(-1, student_logits.size(-1)), labels.view(-1))

        loss = (1 - self.alpha) * loss_kl + self.alpha * loss_ce

        self.log("loss/train_loss_kl", loss_kl, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("loss/train_loss_ce", loss_ce, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log("loss/train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    # def validation_step(self, batch: dict, batch_idx: int) -> torch.Tensor:
    #     pass

    def configure_optimizers(self) -> torch.optim.Optimizer:
        return torch.optim.AdamW(self.student.parameters(), lr=self.learning_rate)


In [None]:
gsm8k_dm = GSM8KDataModule(tokenizer=tokenizer, batch_size=16)
distill_model = DistillationLightningModule(student_model, teacher_model, tokenizer=tokenizer, alpha=0.3, temperature=2.0, learning_rate=5e-5)

In [None]:
# Trainer
timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
trainer = pl.Trainer(
    max_epochs=1,  # increase epoch for training from scratch
    precision="16-mixed",
    log_every_n_steps=10,
    logger=pl.loggers.WandbLogger(project="KD-COMS6998", name=f"qwen2.5-0.5B-llama3.2-1B-{timestamp}"),
    accelerator="gpu",
    devices=1,
    accumulate_grad_batches=1,
)

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:You are using the plain ModelCheckpoint callback. Consider using LitModelCheckpoint which with seamless uploading to Model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [None]:
trainer.fit(
    distill_model,
    datamodule=gsm8k_dm,
)

INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA A100-SXM4-40GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | student | Qwen2ForCausalLM | 472 M  | train
1 | teacher | LlamaForCausalLM | 1.2 B  | eval 
-----------------------------------------------------
1.7 B     Trainable params
0         Non-trainable params
1.7 B     Total params
6,834.520 Total estimated model params size (MB)
319       Modules in train mode
215       Modules in eval mode


Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=1` reached.


In [None]:
wandb.finish()

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_loss_ce_epoch,▁
loss/train_loss_ce_step,█▇▆▆▅▂▃▂▂▂▂▂▃▃▃▃▁▂▂▃▃▂▂▂▃▁▂▃▃▃▂▃▂▂▂▂▃▃▂▃
loss/train_loss_epoch,▁
loss/train_loss_kl_epoch,▁
loss/train_loss_kl_step,█▄▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss/train_loss_step,█▄▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
epoch,0.0
loss/train_loss_ce_epoch,8.72363
loss/train_loss_ce_step,8.67335
loss/train_loss_epoch,256.28586
loss/train_loss_kl_epoch,362.38412
loss/train_loss_kl_step,248.24403
loss/train_loss_step,176.37283
trainer/global_step,467.0


### Post-Training

In [None]:
save_model_name = f"distilled-qwen2.5-0.5B-llama3.2-1B-{timestamp}"
save_path = f"{save_dir}{save_model_name}"
distill_model.student.save_pretrained(save_path)
distill_model.student.config.save_pretrained(save_path)
gsm8k_dm.tokenizer.save_pretrained(save_path)
os.listdir(save_path)

['config.json',
 'generation_config.json',
 'model.safetensors',
 'tokenizer_config.json',
 'special_tokens_map.json',
 'tokenizer.json']

In [None]:
# post-training evaluation
qem2 = evaluator.eval(save_path, device="cuda")
qem2

INFO 04-15 03:45:36 [config.py:2832] Downcasting torch.float32 to torch.float16.
INFO 04-15 03:45:49 [config.py:689] This model supports multiple tasks: {'score', 'generate', 'embed', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 04-15 03:45:49 [llm_engine.py:243] Initializing a V0 LLM engine (v0.8.4) with config: model='/content/drive/My Drive/Columbia/distilled-qwen2.5-0.5B-llama3.2-1B-20250415-033948', speculative_config=None, tokenizer='/content/drive/My Drive/Columbia/distilled-qwen2.5-0.5B-llama3.2-1B-20250415-033948', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', rea

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-15 03:45:54 [loader.py:458] Loading weights took 1.29 seconds
INFO 04-15 03:45:54 [model_runner.py:1146] Model loading took 0.8886 GiB and 1.353209 seconds
INFO 04-15 03:45:55 [worker.py:267] Memory profiling takes 0.68 seconds
INFO 04-15 03:45:55 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.56GiB) x gpu_memory_utilization (0.90) = 35.60GiB
INFO 04-15 03:45:55 [worker.py:267] model weights take 0.89GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.23GiB; the rest of the memory reserved for KV Cache is 33.49GiB.
INFO 04-15 03:45:56 [executor_base.py:112] # cuda blocks: 182875, # CPU blocks: 21845
INFO 04-15 03:45:56 [executor_base.py:117] Maximum concurrency for 32768 tokens per request: 89.29x
INFO 04-15 03:45:59 [model_runner.py:1456] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the C

Capturing CUDA graph shapes:   0%|          | 0/35 [00:00<?, ?it/s]

INFO 04-15 03:46:33 [model_runner.py:1598] Graph capturing finished in 34 secs, took 0.16 GiB
INFO 04-15 03:46:33 [llm_engine.py:449] init engine (profile, create kv cache, warmup model) took 39.29 seconds


Eval Inference:   0%|          | 0/11 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/128 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Processed prompts:   0%|          | 0/36 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Evaluating Exact Match Accuracy:   0%|          | 0/1316 [00:00<?, ?it/s]

0.0