# 08: Finetune LLMs

## 1. Imports

In [1]:
import gc
import copy
import random
# import os
# CACHE_PATH = '<path_to_cache>'
# os.environ["TORCH_HOME"] = CACHE_PATH
# os.environ["HF_HOME"] = CACHE_PATH
# os.environ["HUGGINGFACE_HUB_CACHE"] = CACHE_PATH
# os.environ["HUGGINGFACE_ASSETS_CACHE"] = CACHE_PATH
# os.environ["TRANSFORMERS_CACHE"] = CACHE_PATH

import matplotlib.pyplot as plt
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
# from huggingface_hub import login; login(token="<hf_token>")

import pruna_pro
from pruna_pro import SmashConfig
from pruna_pro import smash
from pruna.data.pruna_datamodule import PrunaDataModule
from pruna.data.utils import split_train_into_train_val_test
from pruna.evaluation.evaluation_agent import EvaluationAgent
from pruna.evaluation.metrics.metric_elapsed_time import ElapsedTimeMetric
from pruna.evaluation.metrics.metric_torch import TorchMetricWrapper
from pruna.evaluation.metrics.metric_energy import EnergyMetric
from pruna.evaluation.metrics.metric_memory import GPUMemoryMetric
from pruna.evaluation.metrics.metric_model_architecture import ModelArchitectureMetric
from pruna.evaluation.task import Task

## 2. Utils

The utility functions help to:
- Load a model from a list of (small) models. Feel free to try other models as long as they fit in GPU memory!
- Make plots.
- Iterate over evaluation and model configurations.

In [3]:
# Candidate Hugging Face model ids for the exercises. Entries that are commented
# out are additional options that can be enabled to experiment with other models.
model_ids = [
    "facebook/opt-125m",
    "facebook/opt-350m", 
    "facebook/opt-1.3b",
    "facebook/opt-2.7b",
    "meta-llama/Llama-3.2-1B",
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "google/gemma-3-1b-it",
    "google/gemma-3-4b-it",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    # "microsoft/Phi-4-mini-instruct",
    # "HuggingFaceTB/SmolLM-135M",
    # "HuggingFaceTB/SmolLM-135M-instruct",
    # "HuggingFaceTB/SmolLM-360M",
    # "HuggingFaceTB/SmolLM-360M-Instruct",
    # "HuggingFaceTB/SmolLM-1.7B",
    # "HuggingFaceTB/SmolLM-1.7B-Instruct",
    # "HuggingFaceTB/SmolLM2-135M",
    # "HuggingFaceTB/SmolLM2-135M-Instruct",
    # "HuggingFaceTB/SmolLM2-360M",
    # "HuggingFaceTB/SmolLM2-360M-Instruct",
    # "HuggingFaceTB/SmolLM2-1.7B",
    # "HuggingFaceTB/SmolLM2-1.7B-Instruct",
    # "PleIAs/Pleias-350m-Preview",
    # "PleIAs/Pleias-Pico",
    # "PleIAs/Pleias-1.2b-Preview",
    # "PleIAs/Pleias-Nano",
    # "PleIAs/Pleias-3b-Preview",
]

# Load model and tokenizer for the selected entry. Index 1 is the SECOND model in
# the list ("facebook/opt-350m"); change the index to experiment with another one.
model_id = model_ids[1]
model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

## 3. Finetune LLM

We recommend checking out the [Pruna documentation](https://pruna.readthedocs.io/en/latest/index.html) for access to AI efficiency functions.

### 3.1 Evaluate the base model quality

**Implementation task:**
- Evaluate the base model quality and latency with the perplexity metric on the WikiText dataset.
- Repeat the experiment with other LLMs and/or datasets.

In [10]:
def smash_evaluate_perplexity_time(
    model,
    tokenizer,
    smash_config,
    dataset="WikiText",
    n_iterations=100,
    n_warmup_iterations=10,
):
    """Optionally smash a copy of ``model`` and measure its perplexity and latency.

    The caller's ``model`` is never modified: all work happens on a deep copy,
    which is released again before the function returns (or raises).

    Args:
        model: The model to evaluate. It is deep-copied before anything else happens.
        tokenizer: Tokenizer handed to ``PrunaDataModule.from_string`` to build the
            evaluation datamodule.
        smash_config: ``SmashConfig`` passed to ``smash``; pass ``None`` to evaluate
            the unmodified base model.
        dataset: Name of the evaluation dataset understood by
            ``PrunaDataModule.from_string``.
        n_iterations: Number of timed iterations for ``ElapsedTimeMetric``.
        n_warmup_iterations: Number of warm-up iterations for ``ElapsedTimeMetric``.

    Returns:
        Whatever ``EvaluationAgent.evaluate`` returns. In the runs recorded in this
        notebook that is a dict with the keys ``perplexity_y_gt``,
        ``inference_elapsed_time_ms_@1``, ``inference_latency_ms_@1`` and
        ``inference_throughput_batches_per_ms_@1``.
    """
    ### To Complete ###
    # Work on a copy so that smashing never touches the notebook-level model.
    model_copy = copy.deepcopy(model)

    try:
        # Explicit None check: do not rely on the truthiness of a SmashConfig object.
        if smash_config is not None:
            model_copy = smash(model_copy, smash_config)

        metrics = [
            ElapsedTimeMetric(
                n_iterations=n_iterations,
                n_warmup_iterations=n_warmup_iterations,
                device="cuda",
                timing_type="sync",
            ),
            TorchMetricWrapper(metric_name="perplexity", call_type="y_gt"),
        ]
        datamodule = PrunaDataModule.from_string(dataset, tokenizer=tokenizer)
        task = Task(metrics, datamodule=datamodule)
        eval_agent = EvaluationAgent(task)
        results = eval_agent.evaluate(model_copy)
    finally:
        # Release the copy even when smashing/evaluation fails; in a notebook the
        # stored traceback would otherwise keep the frame (and the model) alive.
        del model_copy
        # Collect first so that memory owned by unreachable objects goes back to
        # the caching allocator, and only then hand the cached blocks back.
        gc.collect()
        torch.cuda.empty_cache()
    ### End of To Complete ###

    return results

In [10]:
### To Complete ###
# Baseline run: no SmashConfig, so the unmodified model is evaluated on WikiText.
results = smash_evaluate_perplexity_time(
    model=model,
    tokenizer=tokenizer,
    smash_config=None,
    dataset="WikiText",
)
print(results)
### End of To Complete ###

INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f4955a23370>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
INFO - Using provided list of metric instances.
INFO - Evaluating a base model.
INFO - Detected transformers model. Using TransformerHandler.
- The first element of the batch is passed as input.
- The generated outputs are expected to have .logits attribute.
I

{'perplexity_y_gt': 39.13822937011719, 'inference_elapsed_time_ms_@1': 1270.105492591858, 'inference_latency_ms_@1': 12.701054925918578, 'inference_throughput_batches_per_ms_@1': 0.07873361746978486}


### 3.2 Finetune LLM quality with in-distribution data

**Implementation task:**
- Finetune a quantized LLM (quantized with Quanto), either in-place or by adding parameters, and evaluate its quality and latency on the WikiText dataset.

**Questions:**
- Is there a performance improvement after finetuning?
- Do you observe a latency difference? How could you explain it?

In [11]:
### To Complete ###
# Build the compression recipe: Quanto weight quantization followed by a
# recoverer step, both using the WikiText data registered on the config.
smash_config = SmashConfig()
smash_config.add_tokenizer(model_id)
smash_config.add_data("WikiText", tokenizer=tokenizer)
smash_config['quantizer'] = 'quanto'
smash_config['quanto_weight_bits'] = "qint4"
# Recoverer used for the recorded run. The commented line below is the
# alternative to try (presumably the "adding parameters" variant from the
# task description; confirm in the Pruna docs).
smash_config['recoverer'] = "text_to_text_inplace_perp"
# smash_config['recoverer'] = "text_to_text_perp"
# Moves the notebook-level model to the GPU; later cells reuse `model` in this state.
model = model.to('cuda')

# Smash a copy of the model with the config above and evaluate it on WikiText.
results = smash_evaluate_perplexity_time(model, tokenizer, smash_config, dataset="WikiText")
print(results)
### End of To Complete ###

INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f4955a23370>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
INFO - Verifying Pruna token.
INFO - You have used 345 hours this month.
INFO - Starting quantizer quanto...
INFO - quantizer quanto was applied successfully.
INFO - Star

Converting train dataset to ChatML:   0%|          | 0/17556 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/17556 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/17556 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/17556 [00:00<?, ? examples/s]

Step,Training Loss
2194,3.9552
4388,3.829
6582,3.8045
8776,3.7472
10970,3.7273
13164,3.7345
15358,3.711
17552,3.6917


INFO - recoverer text_to_text_inplace_perp was applied successfully.
INFO - You have used 348 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f4955a23370>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
INFO - Using provided list of metric instances.
INFO - Evaluating a smashed model.
INFO - Detected transformers model. Using TransformerHandler.
- T

{'perplexity_y_gt': 29.485769271850586, 'inference_elapsed_time_ms_@1': 5949.783557891846, 'inference_latency_ms_@1': 59.497835578918455, 'inference_throughput_batches_per_ms_@1': 0.01680733408652473}


### 3.3 Finetune LLM quality with more/less in-distribution data

**Implementation task:**
- Finetune a quantized LLM (quantized with Quanto), either in-place or by adding parameters, on a smaller or larger subset of the in-distribution data, and evaluate its quality and latency on the WikiText dataset.

**Questions:**
- Is there a performance improvement after finetuning?
- Do you observe a latency difference? How could you explain it?

In [12]:
### To Complete ###
# Load the three WikiText-2 splits and keep only the first 1000 training rows,
# so the recoverer sees less in-distribution data than in the previous section.
train_ds, val_ds, test_ds = load_dataset("mikasenghaas/wikitext-2", split=["train", "validation", "test"])
train_ds = train_ds.select(range(1000))

# Same recipe as before (Quanto quantization + recoverer), but with the custom
# (train, val, test) datasets registered instead of the named "WikiText" dataset.
# NOTE(review): unlike the neighbouring cells, this one neither moves `model` to
# the GPU nor sets smash_config['device']; it uses `model` as left by earlier cells.
smash_config = SmashConfig()
smash_config.add_tokenizer(model_id)
smash_config.add_data(
    (train_ds, val_ds, test_ds),
    collate_fn="text_generation_collate"
)
smash_config['quantizer'] = 'quanto'
smash_config['quanto_weight_bits'] = "qint4"
smash_config['recoverer'] = "text_to_text_inplace_perp"
# smash_config['recoverer'] = "text_to_text_perp"

# Evaluation still runs on the named WikiText dataset, independent of the data above.
results = smash_evaluate_perplexity_time(model, tokenizer, smash_config, dataset="WikiText")
print(results)
### End of To Complete ###

INFO - Using max_seq_len of tokenizer: None
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f4955a23370>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
INFO - Verifying Pruna token.
INFO - You have used 348 hours this month.
INFO - Starting quantizer quanto...
INFO - quantizer 

Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss
125,4.3019
250,4.2493
375,4.0035
500,3.9661
625,3.8754
750,3.9567
875,3.8156
1000,3.809


INFO - recoverer text_to_text_inplace_perp was applied successfully.
INFO - You have used 348 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f4955a23370>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
INFO - Using provided list of metric instances.
INFO - Evaluating a smashed model.
INFO - Detected transformers model. Using TransformerHandler.
- T

{'perplexity_y_gt': 35.11149215698242, 'inference_elapsed_time_ms_@1': 6988.028305053711, 'inference_latency_ms_@1': 69.88028305053712, 'inference_throughput_batches_per_ms_@1': 0.01431018817248929}


### 3.4 Finetune LLM quality with random data

**Implementation task:**
- Finetune a quantized LLM (quantized with Quanto), either in-place or by adding parameters, on randomly generated text, and evaluate its quality and latency on the WikiText dataset.

**Questions:**
- Is there a performance improvement after finetuning?
- Do you observe a latency difference? How could you explain it?

In [11]:
### To Complete ###
from datasets import Dataset

# Build a corpus of random lowercase strings. A private, seeded generator makes
# the corpus identical on every run (the split below is already seeded) and
# leaves the global `random` state untouched.
rng = random.Random(42)
n_samples = 1000  # number of random "documents"
n_chars = 100     # characters per document
dataset = Dataset.from_dict({
    "text": [
        # 97..122 are the code points of 'a'..'z'
        "".join(chr(rng.randint(97, 122)) for _ in range(n_chars))
        for _ in range(n_samples)
    ]
})
# Split into train/val/test (the recorded log reports an 80/10/10 split).
train_ds, val_ds, test_ds = split_train_into_train_val_test(dataset, seed=42)

# Moves the notebook-level model to the GPU before it is copied and smashed.
model = model.to("cuda")
# Same recipe as the previous sections (Quanto quantization + recoverer), but
# the recoverer is fed the random corpus instead of WikiText.
smash_config = SmashConfig()
smash_config.add_tokenizer(model_id)
smash_config.add_data(
    (train_ds, val_ds, test_ds),
    collate_fn="text_generation_collate"
)
smash_config['device'] = 'cuda'
smash_config['quantizer'] = 'quanto'
smash_config['quanto_weight_bits'] = "qint4"
smash_config['recoverer'] = "text_to_text_inplace_perp"
# smash_config['recoverer'] = "text_to_text_perp"

# Quality is still measured on WikiText, so this shows the effect of finetuning on noise.
results = smash_evaluate_perplexity_time(model, tokenizer, smash_config, dataset="WikiText")
print(results)
### End of To Complete ###

INFO - Loaded only training, splitting train 80/10/10 into train, validation and test...
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f8538f52dd0>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
INFO - Verifying Pruna token.
INFO - You have used 359 hours this month.
INFO - S

Converting train dataset to ChatML:   0%|          | 0/800 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/800 [00:00<?, ? examples/s]

Step,Training Loss
100,6.3768
200,6.0315
300,5.8231
400,5.8731
500,5.8169
600,5.8043
700,5.8126
800,5.8678


INFO - recoverer text_to_text_inplace_perp was applied successfully.
INFO - You have used 360 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Using max_seq_len of tokenizer: None
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f8538f52dd0>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maxim

{'perplexity_y_gt': 167.92860412597656, 'inference_elapsed_time_ms_@1': 3896.4342079162598, 'inference_latency_ms_@1': 38.9643420791626, 'inference_throughput_batches_per_ms_@1': 0.025664490830316914}


### 3.5 Finetune LLM quality with out-of-distribution data

**Implementation task:**
- Finetune a quantized LLM (quantized with Quanto), either in-place or by adding parameters, on out-of-distribution data (e.g. BookCorpus), and evaluate its quality and latency on the WikiText dataset.

**Questions:**
- Is there a performance improvement after finetuning?
- Do you observe a latency difference? How could you explain it?

In [13]:
### To Complete ###
# Load the BookCorpus "train" split, divide it into train/val/test (the recorded
# log reports an 80/10/10 split), then keep only the first 1000 training rows.
# NOTE(review): the whole corpus is split before the 1000 rows are selected, and
# val_ds/test_ds stay at their full split size.
train_ds = load_dataset("SamuelYang/bookcorpus")["train"]
train_ds, val_ds, test_ds = split_train_into_train_val_test(train_ds, seed=42)
train_ds = train_ds.select(range(1000))

# Moves the notebook-level model to the GPU before it is copied and smashed.
model = model.to("cuda")
# Same recipe as the previous sections (Quanto quantization + recoverer), but
# the recoverer is fed BookCorpus text instead of WikiText.
smash_config = SmashConfig()
smash_config.add_tokenizer(model_id)
smash_config.add_data(
    (train_ds, val_ds, test_ds),
    collate_fn="text_generation_collate"
)
smash_config['device'] = 'cuda'
smash_config['quantizer'] = 'quanto'
smash_config['quanto_weight_bits'] = "qint4"
smash_config['recoverer'] = "text_to_text_inplace_perp"
# smash_config['recoverer'] = "text_to_text_perp"

# Quality is still measured on WikiText, i.e. on a different distribution than the finetuning data.
results = smash_evaluate_perplexity_time(model, tokenizer, smash_config, dataset="WikiText")
print(results)
### End of To Complete ###

INFO - Loaded only training, splitting train 80/10/10 into train, validation and test...
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f8538f52dd0>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
INFO - Verifying Pruna token.
INFO - You have used 360 hours this month.
INFO - S

Converting train dataset to ChatML:   0%|          | 0/1000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

Step,Training Loss
125,5.2634
250,4.2522
375,4.0946
500,4.0582
625,4.063
750,3.8801
875,3.8654
1000,3.7615


INFO - recoverer text_to_text_inplace_perp was applied successfully.
INFO - You have used 360 hours this month.
INFO - Using call_type: y_gt for metric perplexity
INFO - Testing compatibility with functools.partial(<function text_generation_collate at 0x7f8538f52dd0>, tokenizer=GPT2TokenizerFast(name_or_path='facebook/opt-350m', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, max_seq_len=None)...
INFO - Using provided list of metric instances.
INFO - Evaluating a smashed model.
INFO - Detected transformers model. Using TransformerHandler.
- T

{'perplexity_y_gt': 70.56623840332031, 'inference_elapsed_time_ms_@1': 3814.258159637451, 'inference_latency_ms_@1': 38.14258159637451, 'inference_throughput_batches_per_ms_@1': 0.026217417860752535}
