### Practice: Parameter Efficient Fine-Tuning
In this notebook, you're gonna fine-tune large language models within limited GPU memory.

In [1]:
%pip install --quiet transformers accelerate sentencepiece optimum peft bitsandbytes

import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

import torch
import torch.nn as nn
import torch.nn.functional as F

import transformers
from tqdm.auto import tqdm, trange
import warnings
warnings.filterwarnings("ignore")
import gc
# The rest of the notebook loads models onto `device` and uses CUDA autocast, so fail fast without a GPU.
assert torch.cuda.is_available(), "you need cuda for this part"

# NOTE(review): because of the assert above, the 'cpu' fallback can only be taken when asserts are disabled.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m425.8/425.8 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m96.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m69.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m55.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# !pip install numba

# from numba import cuda
# device = cuda.get_current_device()
# device.reset()

In [3]:
model_name = "Qwen/Qwen3-4B"
# Load the linear-layer weights in 4-bit (bitsandbytes) so the model fits into limited GPU memory.
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True)

# loading Qwen tokenizer ...
# (a tokenizer has no device placement, so no device_map is passed to it)
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# The prompt-tuning cells below fill the reserved prompt slots with pad_token_id.
tokenizer.pad_token_id = tokenizer.eos_token_id

# ... and the model itself
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, device_map=device, low_cpu_mem_usage=True, offload_state_dict=True,
    quantization_config=quantization_config, torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
)
# Freeze every pre-trained parameter; only parameters added later (prompts, adapters) will be trained.
for param in model.parameters():
    param.requires_grad = False

model.gradient_checkpointing_enable()  # only store a small subset of activations, re-compute the rest.
model.enable_input_require_grads()     # override an implementation quirk in gradient checkpoints that disables backprop unless inputs require grad
# more on gradient checkpointing: https://pytorch.org/docs/stable/checkpoint.html https://arxiv.org/abs/1604.06174

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

2025-10-05 07:21:27.107182: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759648887.423982      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759648887.511892      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

### Prompt tuning: the story of a fox (2 pts)

![img](https://i.imgur.com/Ux3qQAu.png) (source: theodd1souts.fandom.com)

In [4]:
prompt = 'A quick brown fox'
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)

# Greedy decoding, one token per forward pass. This is inference only, so run it without
# recording an autograd graph: nothing here is ever back-propagated.
with torch.no_grad():
    for step in range(10):
        # logits[0, -1] are the scores for the token that follows the current sequence
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
        batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
        batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

print("\nOutput:", tokenizer.decode(batch['input_ids'][0].cpu().numpy().tolist()))


Output: A quick brown fox jumped over the lazy dog. That's the famous


What a blatant lie! This particular fox assures you that it didn't in fact jump over the lazy dog. No, sir! The fox was just minding its own business. __Your task is to train the model to tell the truth: no dog was jumped over today.__

In [5]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)
outputs = model(**batch)

# Next-token objective: the logits at position t are scored against the token at position t + 1,
# so drop the last logit and the first token before comparing.
next_word_logits = outputs.logits[:, :-1]
true_next_tokens = batch['input_ids'][:, 1:]
loss = F.cross_entropy(
    next_word_logits.flatten(0, 1),   # [batch * positions, vocab]
    true_next_tokens.flatten(0, 1),   # [batch * positions]
)

print("Loss:", loss)

Loss: tensor(4.3350, device='cuda:0', grad_fn=<NllLossBackward0>)


Except, we can't train the entire model - for a model with about 4B parameters, the float32 gradients alone would take roughly 16GB (4 bytes per parameter). Instead, let's run [prompt tuning](https://arxiv.org/abs/2104.08691).

![img](https://i.imgur.com/VwNNKnb.png)


In [6]:
class WordEmbeddingsWithLearnedPrompts(nn.Module):
    """
    To perform prompt tuning, you will need to replace model's original word embeddings with a layer - THIS layer
     - that inserts trainable prompts instead of the first N token embeddings.

    The first :num_prompts: positions of every input sequence are placeholders that must hold the PAD token id;
    their embeddings are replaced with the trainable prompt vectors. Every later position is embedded by the
    original embedding layer. The same prompts are shared by all sequences in a batch.

    :param word_embeddings: the original embedding layer; kept as a submodule and used for the real tokens
    :param num_prompts: how many trainable prompt vectors are placed at the start of each sequence
    :param pad_token_id: token id expected in the placeholder positions. When None (default), the notebook-level
        `tokenizer.pad_token_id` is looked up at call time.
    """

    def __init__(self, word_embeddings: nn.Embedding, num_prompts: int, pad_token_id=None):
        super().__init__()
        self.original_word_embeddings = word_embeddings
        self.num_prompts = num_prompts
        self.pad_token_id = pad_token_id
        self.learnable_prompts = nn.Parameter(
            torch.randn(1, num_prompts, word_embeddings.embedding_dim), requires_grad=True)

    def forward(self, input_ids: torch.LongTensor):
        """
        :param input_ids: int64 tensor of shape [batch_size, seq length] whose first :num_prompts: columns are PAD
        :returns: embeddings of shape [batch_size, seq length, embedding_dim]
        """
        # Fall back to the global tokenizer only when no explicit pad id was given at construction time.
        pad_token_id = tokenizer.pad_token_id if self.pad_token_id is None else self.pad_token_id
        assert input_ids.dtype == torch.int64
        assert input_ids.shape[1] > self.num_prompts
        assert torch.all(input_ids[:, :self.num_prompts] == pad_token_id).item(), \
            "don't forget to prepend several PAD tokens to input_ids"

        token_embeddings = self.original_word_embeddings(input_ids)
        # The prompts are stored with batch dimension 1; expand them (a view, no copy) to the actual batch size
        # so that concatenation also works for batches with more than one sequence.
        prompts = self.learnable_prompts.expand(input_ids.shape[0], -1, -1)
        # torch.cat (rather than item assignment) keeps the operation autograd-friendly for the trainable prompts.
        return torch.cat((prompts, token_embeddings[:, self.num_prompts:]), dim=1)

In [7]:
num_prompts = 16
test_emb_layer = WordEmbeddingsWithLearnedPrompts(model.model.embed_tokens, num_prompts=num_prompts).to(device)
test_input_ids = tokenizer("a cat say on a may", return_tensors='pt')['input_ids'].to(device)

# Reserve :num_prompts: PAD-filled positions in front of the real tokens; the layer swaps them for prompts.
space_for_prompts = torch.full([len(test_input_ids), num_prompts], fill_value=tokenizer.pad_token_id,
                               dtype=torch.int64, device=device)
test_inputs_with_prompts = torch.cat([space_for_prompts, test_input_ids], dim=1)

with torch.amp.autocast('cuda'):
    test_prompt_embeddings = test_emb_layer(test_inputs_with_prompts)

assert test_prompt_embeddings.shape[:2] == test_inputs_with_prompts.shape
assert test_prompt_embeddings.shape[-1] == model.config.hidden_size
assert torch.allclose(test_prompt_embeddings[:, :num_prompts], test_emb_layer.learnable_prompts.float())
assert torch.allclose(test_prompt_embeddings[:, num_prompts:], model.model.embed_tokens(test_input_ids).float())
print("Looks legit!")

# The test layer holds a reference to the model's embedding layer. Drop it (and its output) so that a later
# `del model` can actually release that memory instead of leaving the embedding matrix on the GPU.
del test_emb_layer, test_prompt_embeddings

Looks legit!


__Now that it works,__ let's inject learnable prompts into the main model and teach it about foxes.

In [8]:
# Guard against wrapping twice: once this cell has run, embed_tokens is no longer a plain nn.Embedding.
assert isinstance(model.model.embed_tokens, nn.Embedding), "you have already replaced the embedding layer. If the replacement is broken, please reload the model"

# Swap the input embedding layer for the prompt-injecting wrapper (the original layer is kept inside it).
model.model.embed_tokens = WordEmbeddingsWithLearnedPrompts(model.model.embed_tokens, num_prompts=num_prompts).to(device)

# Only the prompt vectors are handed to the optimizer.
opt = torch.optim.Adam([model.model.embed_tokens.learnable_prompts], lr=0.01)

In [9]:
the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
num_steps = 200

# The single training example never changes, so tokenize it and prepend the prompt placeholders once,
# outside of the optimization loop.
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)
# Size the placeholder block from THIS batch rather than from a tensor left over from the test cell.
space_for_prompts = torch.full([len(batch['input_ids']), num_prompts], fill_value=tokenizer.pad_token_id,
                               dtype=torch.int64, device=device)
batch['input_ids'] = torch.cat([space_for_prompts, batch['input_ids']], dim=1)
batch['attention_mask'] = torch.cat([torch.ones_like(space_for_prompts), batch['attention_mask']], dim=1)

for i in range(num_steps):
    outputs = model(**batch)
    # Ignore the prompt positions; logits at position t are scored against the token at position t + 1.
    next_word_logits = outputs.logits[:, num_prompts : -1, :]
    true_next_tokens = batch['input_ids'][:, num_prompts + 1:]
    loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
    loss.backward()

    opt.step()
    opt.zero_grad()
    if (i + 1) % 20 == 0:
        print(f"Step: {i + 1}, loss: {loss}.")

assert loss.item() <= 0.1
print("Good job!")

Step: 20, loss: 3.340280055999756.
Step: 40, loss: 2.1509807109832764.
Step: 60, loss: 0.8044292330741882.
Step: 80, loss: 0.0049959332682192326.
Step: 100, loss: 0.0010111827868968248.
Step: 120, loss: 0.0005855258787050843.
Step: 140, loss: 0.000455891655292362.
Step: 160, loss: 0.0003850536304526031.
Step: 180, loss: 0.0003361439739819616.
Step: 200, loss: 0.00029881467344239354.
Good job!


In [10]:
prompt = 'A quick brown fox'
batch = tokenizer(prompt, return_tensors='pt', return_token_type_ids=False).to(device)
# Build the PAD-filled prompt placeholders for this batch here, so the cell does not depend on a
# variable left behind by the training cell.
space_for_prompts = torch.full([len(batch['input_ids']), num_prompts], fill_value=tokenizer.pad_token_id,
                               dtype=torch.int64, device=device)
batch['input_ids'] = torch.cat([space_for_prompts, batch['input_ids']], dim=1)
batch['attention_mask'] = torch.cat([torch.ones_like(space_for_prompts), batch['attention_mask']], dim=1)

# Greedy decoding is inference only: run it without recording an autograd graph.
with torch.no_grad():
    for step in range(15):
        next_token = model(**batch).logits[0, -1].argmax(-1).reshape(1, 1)
        batch['input_ids'] = torch.cat([batch['input_ids'], next_token], dim=-1)
        batch['attention_mask'] = torch.cat([batch['attention_mask'], torch.ones_like(next_token)], dim=-1)

# Skip the placeholder positions when decoding: they only mark where the prompts were injected.
print("\nOutput:", tokenizer.decode(batch['input_ids'][0, num_prompts:].cpu().numpy().tolist()))

# if you did everything right, the model will deny that the fox jumped over the lazy dog


Output: A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway


In [11]:
# Release GPU memory before loading the next model. Order matters: drop the references, let the garbage
# collector destroy the objects, and only then ask the CUDA caching allocator to return its unused blocks.
# Calling empty_cache() before collection cannot release memory still held by not-yet-collected objects.
del model
del opt
gc.collect()
torch.cuda.empty_cache()

125

### Using HuggingFace PEFT (2 points)

[`peft`](https://huggingface.co/docs/peft/index) is a sister library of `transformers` that allows you to apply various __p__arameter __e__fficient __f__ine-__t__uning methods to pre-trained transformers. The library implements prompt tuning and prefix tuning, as well as several adapter-based techniques, under a common interface:



In [12]:
import peft

# reloading model
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
    device_map=device,
    low_cpu_mem_usage=True,
    offload_state_dict=True,
)
# Freeze all pre-trained weights; PEFT adds its own trainable parameters below.
model.requires_grad_(False)

assert isinstance(model.model.embed_tokens, nn.Embedding), "please reload the model"

peft_config = peft.PromptTuningConfig(task_type=peft.TaskType.CAUSAL_LM, num_virtual_tokens=16)
model = peft.get_peft_model(model, peft_config)  # note: for most peft methods, this line also modifies model in-place

# Report how many parameter elements are trainable vs. present in total.
trainable_count = 0
total_count = 0
for p in model.parameters():
    total_count += p.numel()
    if p.requires_grad:
        trainable_count += p.numel()
print("Trainable parameters:", trainable_count)
print("Total parameters (excluding quantization):", total_count)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Trainable parameters: 40960
Total parameters (excluding quantization): 2205851136


In [13]:
# Hand the optimizer only the trainable (prompt) parameters: the frozen base weights never receive
# gradients, so there is no reason to register them with Adam.
opt = torch.optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.01)

In [14]:
# Your task: optimize the PEFT-wrapped model to achieve next token prediction loss < 0.1, but this time using PEFT
# Please note: you no longer need to prepend PAD tokens, but you still need to skip :num_virtual_tokens: first logits.
# Finally, generate the sentence to make sure that the model learned the truth.

the_truth = "A quick brown fox did not jump over the lazy dog. Besides, that dog deserved it anyway!"
num_steps = 200
# Take the number of virtual tokens from the PEFT config that built this model, instead of relying on
# `num_prompts` from the manual prompt-tuning section happening to hold the same value.
num_virtual_tokens = peft_config.num_virtual_tokens

# The training example is constant, so tokenize it once outside of the loop.
batch = tokenizer(the_truth, return_tensors='pt', return_token_type_ids=False).to(device)
for i in range(num_steps):
    outputs = model(**batch)
    # The logits also cover the virtual-token positions; skip them, then score logits at position t
    # against the token at position t + 1.
    next_word_logits = outputs.logits[:, num_virtual_tokens : -1, :]
    true_next_tokens = batch['input_ids'][:, 1:]
    loss = F.cross_entropy(next_word_logits.flatten(0, 1), true_next_tokens.flatten(0, 1))
    loss.backward()

    opt.step()
    opt.zero_grad()
    if (i + 1) % 20 == 0:
        print(f"Step: {i + 1}, loss: {loss}.")

assert loss.item() <= 0.1
print("Good job!")

Step: 20, loss: 3.5473380088806152.
Step: 40, loss: 2.696080207824707.
Step: 60, loss: 1.0062546730041504.
Step: 80, loss: 0.6050422787666321.
Step: 100, loss: 0.35903388261795044.
Step: 120, loss: 0.3504926860332489.
Step: 140, loss: 0.007468266878277063.
Step: 160, loss: 0.0019193228799849749.
Step: 180, loss: 0.0011071553453803062.
Step: 200, loss: 0.0008447786094620824.
Good job!


In [15]:
# Release GPU memory before loading the next model: drop the references, collect garbage, and only
# then empty the CUDA allocator cache (the cache can only return blocks that are no longer in use).
del model
del opt
gc.collect()
torch.cuda.empty_cache()

5386

### Parameter-efficient finetuning with LoRA (2 points)

When training on more serious tasks, you can use low-rank adapters based on the [LoRA paper](https://arxiv.org/pdf/2106.09685.pdf).

The core idea is to add low-rank adapters __in parallel with existing linear layers,__ like this:
<center><img src="https://i.imgur.com/6bQLNiG.png" width=240px></center>

In the original LoRA paper, the adapters were only added to attention projection matrices. However, [subsequent works](https://arxiv.org/abs/2305.14314) show that it is useful to adapt FFNs as well. But before we do any training, we need to implement the basic LoRA layer.

In [16]:
# re-load the model to remove any previous PEFT tuners
model_name = "Qwen/Qwen3-4B"
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, device_map=device, low_cpu_mem_usage=True, offload_state_dict=True,
    quantization_config=quantization_config, torch_dtype=torch.float32,  # weights are 4-bit; layernorms and activations are fp32
)
for param in model.parameters():
    param.requires_grad=False
model.gradient_checkpointing_enable()
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
class LoRALayer(nn.Module):
    """
    Adds a trainable low-rank (LoRA-like) correction in parallel with an existing linear layer.

    output = module(inp) + inp @ adapter_A @ adapter_B
    where adapter_A is [in_features, rank] and adapter_B is [rank, out_features].
    adapter_B starts at zero, so a freshly wrapped layer returns exactly what the wrapped module returns.
    """
    def __init__(self, module: nn.Linear, rank: int):
        super().__init__()
        self.module = module  # the wrapped pre-trained linear layer
        adapter_device = module.weight.device  # keep the adapters next to the wrapped weights

        # A gets a random init, B is all zeros
        initial_A = torch.empty(module.in_features, rank, device=adapter_device)
        nn.init.kaiming_uniform_(initial_A, a=5 ** 0.5)
        self.adapter_A = nn.Parameter(initial_A)
        self.adapter_B = nn.Parameter(torch.zeros(rank, module.out_features, device=adapter_device))

    def forward(self, inp):
        """Return the wrapped layer's output plus the low-rank adapter output for the same input."""
        base_output = self.module(inp)
        # Project down to :rank: first, then back up
        low_rank_update = torch.matmul(torch.matmul(inp, self.adapter_A), self.adapter_B)
        return base_output + low_rank_update

In [3]:
# test your implementation
test_linear = nn.Linear(128, 128)
test_linear.weight.data[...] = torch.eye(128)
test_adapter = LoRALayer(test_linear, rank=8)

assert torch.allclose(test_adapter(torch.ones(1, 1, 128)), test_linear.bias + 1), "please check your forward pass"

test_adapter.adapter_A.data[...] = torch.linspace(0.1, -0.5, 128 * 8).view(128, 8)
test_adapter.adapter_B.data[...] = torch.linspace(0.5, -0.1, 128 * 8).view(8, 128)
test_linear.bias.data[...] = torch.linspace(1., -1., 128)

dummy_loss = F.mse_loss(test_adapter(torch.ones(1, 128) / 128).squeeze(), torch.linspace(-1, 1, 128))
assert torch.allclose(dummy_loss, torch.tensor(1.3711389), rtol=0, atol=1e-4)
dummy_loss.backward()
assert all(w.grad is not None for w in [test_adapter.adapter_A, test_adapter.adapter_B]), "some adapter weights have no grad"
assert torch.allclose(test_adapter.adapter_A.grad.sum(), torch.tensor(-0.60158), rtol=0, atol=1e-4), "bad grad w.r.t. A"
assert torch.allclose(test_adapter.adapter_B.grad.sum(), torch.tensor(0.9931), rtol=0, atol=1e-4), "bad grad w.r.t. B"
# note: bad grad means that your code is different from LoRA paper OR that your code is not autograd-friendly (e.g. no_grad)
del dummy_loss, test_linear, test_adapter
print("All tests passed!")

All tests passed!


### Apply LoRA to the model

The code below applies LoRA adapters on top of Q/K/V linear layers in Qwen attention. You may also choose to modify other layers:
* self_attn.o_proj - attention output projection
* mlp.up_proj, mlp.gate_proj, mlp.down_proj - transformer feedforward layers
* lm_head - output LM head

__Note:__ please scroll down for the homework task

In [19]:
# Display the module tree: the attribute names shown here (self_attn.q_proj, mlp.up_proj, ...) are
# the ones the LoRA cell below replaces.
model

Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 2560)
    (layers): ModuleList(
      (0-35): 36 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear4bit(in_features=2560, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=2560, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=2560, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (up_proj): Linear4bit(in_features=2560, out_features=9728, bias=False)
          (down_proj): Linear4bit(in_features=9728, out_features=2560, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen3RMSNorm((2560,), eps=1e-06)
        (po

In [20]:
lora_rank = 8
# Iterate over the decoder layers directly instead of walking named_modules() while replacing
# submodules, and skip projections that are already wrapped, so that re-running this cell does
# not stack a second adapter on top of the first one.
for layer in model.model.layers:
    attention = layer.self_attn
    for proj_name in ('q_proj', 'k_proj', 'v_proj'):
        proj = getattr(attention, proj_name)
        if not isinstance(proj, LoRALayer):
            setattr(attention, proj_name, LoRALayer(proj, rank=lora_rank).to(device))

# 3 adapted projections (q, k, v) in each of the 36 decoder layers of Qwen3-4B
assert sum(isinstance(module, LoRALayer) for module in model.modules()) == 108

In [21]:
batch = tokenizer("This model wants to share its greatest secret:", return_tensors='pt', return_token_type_ids=False).to(device)

# Run one forward/backward pass on an arbitrary scalar (the scaled norm of the logits) to make sure
# that gradients reach the adapters.
with torch.amp.autocast('cuda', dtype=torch.float32):
    out = model.forward(**batch)
    (out.logits.norm() / 100).backward()

# adapter_B of every LoRA layer must have received a non-zero gradient
for lora_layer in (m for m in model.modules() if isinstance(m, LoRALayer)):
    assert lora_layer.adapter_B.grad is not None
    assert lora_layer.adapter_B.grad.norm().item() > 0

# discard the gradients of this dry run
model.zero_grad(set_to_none=True)
print("Grad check successful, well done!")

Grad check successful, well done!


In [22]:
def count_trainable_parameters(model):
    """Return the total number of elements in the parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

trainable_params = count_trainable_parameters(model)
# NOTE(review): for 4-bit quantized weights numel() presumably counts packed storage elements rather
# than logical weights, so this total (and the percentage below) may not match the nominal model size - confirm.
all_params = sum(p.numel() for p in model.parameters())
print(f"Trainable parameters: {trainable_params}, in percent: {trainable_params / all_params * 100:.3f}.") 

Trainable parameters: 3981312, in percent: 0.180.


### (example) How to train your model

The example below shows how to train the LoRA adapters on a dummy dataset. You will need to run a _similar_ training task later.

__Note:__ please scroll down for the homework task

In [None]:
# checking if the model can learn. Change max_steps for proper training
import datasets
data = datasets.load_dataset("Abirate/english_quotes", split="train[:64]") # 64 lines
# truncation=True is needed for max_length to take effect: without it the tokenizer only pads to the
# longest sample and never cuts a sample down to 256 tokens.
data = data.map(lambda samples: tokenizer(samples['quote'], max_length=256, padding=True, truncation=True), batched=True)

model._hf_peft_config_loaded = True  # silence a warning from HF trainer
model.config.use_cache = False
model.hf_device_map[''] = torch.device("cuda:0")   # fix accelerate error

# NOTE: warmup_steps (50) is larger than max_steps (20), so in this short demo the learning rate is
# still warming up when training stops. Raise max_steps for a real run.
trainer = transformers.Trainer(
    model=model, train_dataset=data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=4,
        warmup_steps=50, max_steps=20, learning_rate=5e-5, fp16=True,
        logging_steps=5, output_dir='outputs', report_to="none"),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

# NOTE: this is just an example! you do not have to wait for this progressbar to finish :)

### Final task: *actually* train the model (4 points)

Your task is to fine-tune the model to _generate python code_. Please use the above examples for inspiration. More specifically,

* __dataset:__ use [codeparrot-clean](https://huggingface.co/datasets/codeparrot/codeparrot-clean) or any other data containing python code. Since you do not need much data for this exercise, it is enough to use just the shorter validation subset of `codeparrot`
* __preprocessing:__ select python code based on file extensions (.py)  (may skip in case of codeparrot - it is 100% python)
* __short lines:__ please take the first 512 characters of each line
* __adapter type:__ please use LoRA as defined above __plus at least one of:__
   - extra adapter on lm_head
   - extra adapter on MLP components (mlp.*)
   - trainable input embeddings (requires tweaking memory usage)

* __training:__ you do not have to train to convergence. If all goes well, your model should `.generate` code after 500 steps. Please use batch size of at least 4 (4 x 1 x 512 tokens) using `gradient_accumulation_steps=4`.


Note: the peft library also has LoRA implementation. However, we ask that for this assignment you show at least one complete training run with your own LoRA code.

__Alternative assignment:__ Instead of doing python code, feel free to substitute the task with any other dataset, e.g. your favorite artist or podcast, as long as it's ethical. If you choose your own task, please show examples of what your model learned - or did not learn, akin to the code examples below.

In [4]:
!wget https://huggingface.co/datasets/codeparrot/codeparrot-clean/resolve/main/file-000000000001.json.gz
!gzip -d file-000000000001.json.gz
!rm file-000000000001.json.gz

--2025-10-05 10:00:13--  https://huggingface.co/datasets/codeparrot/codeparrot-clean/resolve/main/file-000000000001.json.gz
Resolving huggingface.co (huggingface.co)... 13.226.251.112, 13.226.251.20, 13.226.251.81, ...
Connecting to huggingface.co (huggingface.co)|13.226.251.112|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cas-bridge.xethub.hf.co/xet-bridge-us/621ffdd236468d709f183925/50e79501f1fc8cd1dd00644073f04a6b7f07e1ca7d78021c84945a0f03629d4e?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=cas%2F20251005%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20251005T100013Z&X-Amz-Expires=3600&X-Amz-Signature=2f4da7f37c98508e34cde63c7502a085533eff31dd8b6ab30670f740ca9ce591&X-Amz-SignedHeaders=host&X-Xet-Cas-Uid=public&response-content-disposition=inline%3B+filename*%3DUTF-8%27%27file-000000000001.json.gz%3B+filename%3D%22file-000000000001.json.gz%22%3B&response-content-type=application%2Fgzip&x-id=GetObject&Exp

In [5]:
import datasets

# Only the first shard is loaded; further shards ('file-000000000002.json', 'file-000000000003.json')
# can be appended to this list.
data_files = ['file-000000000001.json']

# Keep only the source text ('content'); the remaining columns are dropped.
codeparrot_data = (
    datasets.load_dataset('json', data_files=data_files)
    .remove_columns([
        'repo_name', 'path', 'copies', 'size', 'license', 'hash',
        'line_mean', 'line_max', 'alpha_frac', 'autogenerated',
    ])
)
codeparrot_data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 100000
    })
})

In [16]:
model_name = 'unsloth/Llama-3.2-3B'
# A tokenizer has no device placement, so no device_map is passed to it.
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
# Reuse EOS as the padding token for batching.
tokenizer.pad_token_id = tokenizer.eos_token_id
# Load the linear-layer weights in 4-bit (bitsandbytes) to fit into limited GPU memory.
quantization_config = transformers.BitsAndBytesConfig(load_in_4bit=True)

model = transformers.AutoModelForCausalLM.from_pretrained(
    model_name, device_map=device, low_cpu_mem_usage=True, offload_state_dict=True,
    quantization_config=quantization_config, torch_dtype=torch.float32,
)
# Freeze every pre-trained parameter; only the LoRA adapters added later are trained.
for param in model.parameters():
    param.requires_grad = False

model.gradient_checkpointing_enable()  # only store a small subset of activations, re-compute the rest.
model.enable_input_require_grads()     # lets gradients flow through checkpointed blocks even though the weights are frozen

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
MAX_LENGTH = 512


def prepare_code(example):
    """Tokenize the 'content' field, truncating or padding every sample to exactly MAX_LENGTH tokens."""
    return tokenizer(
        example['content'],
        truncation=True,
        max_length=MAX_LENGTH,
        padding='max_length',
    )


# batched=True hands prepare_code a batch of samples per call
codeparrot_data = codeparrot_data.map(prepare_code, batched=True)

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

In [8]:
prompts = ['', 'import', 'from', 'while', 'try', 'if', 'for', 'torch']
# Batched generation with a decoder-only model needs LEFT padding: with right padding, the shorter
# prompts would be continued after their pad tokens. Request it explicitly for this call instead of
# relying on the tokenizer's default padding side.
inputs = tokenizer(prompts, return_tensors='pt', padding=True, padding_side='left').to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
texts_before_finetuning = tokenizer.batch_decode(outputs, skip_special_tokens=True)
for p, text in zip(prompts, texts_before_finetuning):
    print("PROMPT:", p)
    print(text)

PROMPT: 
Question:
Let h = -2.9 - 0.1. Let z = 0.5 + 1.5. Let q = 2.3 - z. Which is the closest to q?  (a) 
PROMPT: import
import React, { useEffect } from'react';
import { connect } from'react-redux';
import { Button, Col, Form, Row } from'react-bootstrap';
import { addPost, editPost, getPosts, getPostsByCategory, get
PROMPT: from
from django.shortcuts import render, redirect, get_object_or_404
from.models import Article
from.forms import ArticleForm
from django.contrib.auth.decorators import login_required
from django.contrib.auth import get_user_model
from django.contrib import messages

PROMPT: while
while True:
    try:
        a, b = map(int, input().split())
        if a > b:
            a, b = b, a
        print(a * b)
    except:
        break

PROMPT: try
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

config = {
    'description': 'POC',
    'author': 'Yash Khandelwal',
   'version': '0.1',

PROMPT: if
if you are looking for th

In [11]:
# Display the module tree to look up the attribute names (self_attn.*, mlp.*) that the LoRA cell below wraps.
# NOTE(review): the saved output of this cell may come from an earlier run with a different checkpoint
# (its execution count precedes the model-loading cell) - re-run to refresh it.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm

In [17]:
lora_rank = 4

# Projections to adapt in every decoder layer: attention Q/K/V plus all three MLP projections.
ATTENTION_PROJECTIONS = ('q_proj', 'k_proj', 'v_proj')
MLP_PROJECTIONS = ('gate_proj', 'up_proj', 'down_proj')

# Iterate over the decoder layers directly instead of walking named_modules() while replacing
# submodules, and skip projections that are already wrapped, so that re-running this cell does
# not stack a second adapter on top of the first one.
for layer in model.model.layers:
    for parent, proj_names in ((layer.self_attn, ATTENTION_PROJECTIONS), (layer.mlp, MLP_PROJECTIONS)):
        for proj_name in proj_names:
            proj = getattr(parent, proj_name)
            if not isinstance(proj, LoRALayer):
                setattr(parent, proj_name, LoRALayer(proj, rank=lora_rank).to(device))

In [18]:
def count_trainable_parameters(model):
    """Return the total number of elements in the parameters of `model` that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


# Share of trainable (adapter) parameter elements among all parameter elements of the model
all_params = sum(p.numel() for p in model.parameters())
trainable_params = count_trainable_parameters(model)
print(f"Trainable parameters: {trainable_params}, in percent: {trainable_params / all_params * 100:.3f}.")

Trainable parameters: 5390336, in percent: 0.298.


In [19]:
model._hf_peft_config_loaded = True  # silence a warning from HF trainer
model.config.use_cache = False
model.hf_device_map[''] = torch.device("cuda:0")   # fix accelerate error

# Effective batch size = per_device_train_batch_size * gradient_accumulation_steps.
# note: if you want larger batch size, increase gradient_accumulation_steps
training_args = transformers.TrainingArguments(
    output_dir='outputs',
    report_to="none",
    logging_steps=10,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    max_steps=150,
    warmup_steps=50,
    learning_rate=5e-4,
    weight_decay=0.01,
    fp16=True,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=codeparrot_data['train'],
    # mlm=False selects the causal (next-token) language-modeling objective
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
# if you see cache warnings, set `model.config.use_cache = False` to silence them. Please re-enable for inference!

trainer.train()

Step,Training Loss
10,0.6569
20,0.6642
30,0.621
40,0.5956
50,0.6176
60,0.6078
70,0.6197
80,0.6475
90,0.659
100,0.6172


In [22]:
model.config.use_cache = True  # the cache was disabled for training; re-enable it for generation
prompts = ['', 'import', 'from', 'while', 'try', 'if', 'for', 'torch']
# Batched generation with a decoder-only model needs LEFT padding: with right padding, the shorter
# prompts would be continued after their pad tokens. Request it explicitly for this call instead of
# relying on the tokenizer's default padding side.
inputs = tokenizer(prompts, return_tensors='pt', padding=True, padding_side='left').to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
texts_after_finetuning = tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [23]:
# This template helps to compare generated code samples in pretty table form
# feel free to present your work in other forms

import html  # stdlib; used to escape generated text before it is placed into HTML

from IPython.display import HTML, display
table_template = """<table style="border:1px solid black" >
  <tr>
    <th style="text-align: center; border:1px solid black">PROMPT</th>
    <th style="text-align: center; border:1px solid black">BEFORE</th>
    <th style="text-align: center; border:1px solid black">AFTER</th>
  </tr>
{}
</table>"""

row_template = '''  <tr>
    <td style="width:20%; border:1px solid black"><pre align="left">`{}`</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
    <td style="width:40%; border:1px solid black"><pre align="left">{}</pre></td>
  </tr>'''

rows = []

for p, b, a in zip(prompts, texts_before_finetuning, texts_after_finetuning):
    # Generated text can contain '<', '>' and '&' (comparisons, HTML/JSX snippets, ...). Escape it so
    # the browser shows it verbatim instead of interpreting it as markup and breaking the table.
    rows.append(row_template.format(html.escape(p), html.escape(b), html.escape(a)))

display(HTML(table_template.format('\n'.join(rows))))

PROMPT,BEFORE,AFTER
``,Question: Let h = -2.9 - 0.1. Let z = 0.5 + 1.5. Let q = 2.3 - z. Which is the closest to q? (a),from __future__ import unicode_literals import json import logging import re import time import requests import simplejson from django.conf import settings from django.core.exceptions import ImproperlyConfigured from django.core.urlresolvers
`import`,"import React, { useEffect } from'react'; import { connect } from'react-redux'; import { Button, Col, Form, Row } from'react-bootstrap'; import { addPost, editPost, getPosts, getPostsByCategory, get","import numpy as np import matplotlib.pyplot as plt def read_data(file_name):  data = np.loadtxt(file_name)  return data def plot_data(data, file_name):  plt.figure()  plt.plot(data[:, 0], data[:,"
`from`,"from django.shortcuts import render, redirect, get_object_or_404 from.models import Article from.forms import ArticleForm from django.contrib.auth.decorators import login_required from django.contrib.auth import get_user_model from django.contrib import messages","from __future__ import absolute_import, division, print_function import os import sys import unittest import numpy as np from scipy import sparse from scipy import stats from scipy import linalg from scipy import optimize from scipy import"
`while`,"while True:  try:  a, b = map(int, input().split())  if a > b:  a, b = b, a  print(a * b)  except:  break",while True:  n = int(input())  if n == 0:  break  else:  count = 0  for i in range(n):  for j in range(n):  if i == j:  count +=
`try`,"try:  from setuptools import setup except ImportError:  from distutils.core import setup config = {  'description': 'POC',  'author': 'Yash Khandelwal',  'version': '0.1',",try:  import Tkinter except:  import tkinter as Tkinter from tkinter import ttk import os import os.path import sys import time import random import threading import re import shutil import threading import socket
`if`,"if you are looking for the best and most comfortable bed, you should consider the benefits of the adjustable bed. Adjustable beds are an excellent way to provide the best possible rest for your family members, but they are also an excellent way to provide the best possible",if __name__ == '__main__':  # This file is executed only when the program is run as a script  # (e.g. by typing `python -m mymodule` at the command line)  from mymodule import *
`for`,"for _ in range(int(input())):  n = int(input())  a = list(map(int, input().split()))  a.sort()  if n % 2 == 0:  print(a[(n // 2) - 1","for _ in range(int(input())):  n, k = list(map(int, input().split()))  if k == 0:  print(n)  continue  if n % k == 0:  print(n)  continue"
`torch`,"torch.nn.functional.conv2d (x, weight, bias=None, stride=1, padding=0, dilation=1, groups=1, bias_attr=None, out=None, data_format=None, *args, **kwargs) ¶ Performs","torch.nn.functional.conv2d ¶ torch.nn.functional. conv2d ( input, weight, bias=None, stride=1, padding=0, dilation=1, transposed=False, output_padding=0, groups=1, bias_shape=None"


**All generations in python**