In [1]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the base causal LM (placed on the GPU) and its tokenizer from the same
# local checkpoint directory.
base_model_dir = "/data/mfx/huggingface/meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(
    base_model_dir,
    device_map='cuda',
)
tokenizer = AutoTokenizer.from_pretrained(base_model_dir)

In [3]:
# Tokenize a single prompt into PyTorch tensors and move them to the GPU.
prompt = "Here I'll write a poem about the sea."
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to("cuda")

In [4]:
# Baseline: deterministic decoding (do_sample=False) of up to 128 new tokens
# with the unmodified model, printed with special tokens left in.
output = model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=False,
)
decoded_texts = tokenizer.batch_decode(output)
print(decoded_texts[0])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


<|begin_of_text|>Here I'll write a poem about the sea. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of


In [5]:
# Display the query-projection weight of the first decoder layer before any
# adapter is attached; the same tensor is displayed again after merging so the
# two printouts can be compared.
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0183,  0.0071,  0.0219,  ..., -0.0070, -0.0089,  0.0149],
        [ 0.0112,  0.0593,  0.0630,  ..., -0.0334, -0.0148,  0.0058],
        [ 0.0182,  0.0141,  0.0361,  ..., -0.0432, -0.0388, -0.0233],
        ...,
        [ 0.0305,  0.0289,  0.0801,  ..., -0.0767, -0.0311, -0.0334],
        [ 0.0242, -0.0325,  0.0369,  ..., -0.0123, -0.0269, -0.0151],
        [-0.0264, -0.0498, -0.0210,  ...,  0.0601,  0.0130, -0.0007]],
       device='cuda:0', requires_grad=True)

In [6]:
# Rank-32 adapter on the `q_proj` modules, initialized with the 'PiSSAQuant' scheme.
# NOTE(review): `pissaquant_config` / init_lora_weights='PiSSAQuant' appear to come
# from a PEFT fork rather than upstream PEFT — confirm the installed package.
pissaquant_settings = {'pissaquant_bits': 4, 'apply_quantization': False}
lora_config = LoraConfig(
    r=32,
    target_modules=['q_proj'],
    pissaquant_config=pissaquant_settings,
    init_lora_weights='PiSSAQuant',
)

In [7]:
# Echo the config so the fields that were not passed explicitly (e.g. lora_alpha,
# lora_dropout) are visible with the values LoraConfig filled in.
lora_config

LoraConfig(task_type=None, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, inference_mode=False, r=32, target_modules={'q_proj'}, exclude_modules=None, lora_alpha=8, lora_dropout=0.0, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights='PiSSAQuant', layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, pissaquant_config={'pissaquant_bits': 4, 'apply_quantization': False}, eva_config=None, corda_config=None, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False)

In [8]:
# Wrap the base model with the adapter described by `lora_config`.
# NOTE(review): upstream PEFT injects the adapter layers into `model` in place, so
# `model` and `peft_model` would share modules afterwards — confirm for this fork.
peft_model = get_peft_model(model, lora_config)

In [9]:
# Inspect the `lora_S` parameter of the 'default' adapter on the first layer's
# q_proj. The displayed tensor has 32 entries, matching r=32.
# NOTE(review): presumably these are the leading singular values kept by the
# PiSSAQuant initializer — confirm against the fork's implementation.
peft_model.base_model.model.model.layers[0].self_attn.q_proj.lora_S.default

Parameter containing:
tensor([210.2891,  17.0796,  11.6353,   9.5772,   8.9909,   8.3452,   8.1227,
          8.0238,   7.9361,   7.7079,   7.5841,   7.1897,   7.1564,   7.0057,
          6.7993,   6.7189,   6.6833,   6.4882,   6.4230,   6.3611,   6.3275,
          6.1197,   6.0687,   6.0429,   5.9617,   5.8813,   5.8114,   5.7684,
          5.6735,   5.5499,   5.5069,   5.3306], device='cuda:0',
       requires_grad=True)

In [10]:
# Same prompt and decoding settings as the baseline run, now through the
# adapter-wrapped model, so the two printed continuations can be compared.
output = peft_model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=False,
)
decoded_texts = tokenizer.batch_decode(output)
print(decoded_texts[0])

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


<|begin_of_text|>Here I'll write a poem about the sea. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of a haiku. I'll write it in the form of


In [11]:
# Merge the adapter into a *copy* of the PEFT model, leaving `peft_model` intact.
#
# Why the copy: in upstream PEFT, merge_and_unload() replaces the adapter layers
# of the wrapped model in place. Calling it directly on `peft_model` would leave
# no LoRA layers behind, so the later cell that calls
# `peft_model.save_pretrained(...)` and `peft_model.unload()` would save an empty
# adapter and fully merged weights instead of the adapter plus its residual base.
#
# NOTE: the deep copy holds a second set of weights on the GPU until it is freed.
import copy

model = copy.deepcopy(peft_model).merge_and_unload()

In [12]:
# Display the first layer's q_proj weight of the merged model; compare with the
# printout taken before the adapter was attached.
model.model.layers[0].self_attn.q_proj.weight

Parameter containing:
tensor([[-0.0183,  0.0071,  0.0219,  ..., -0.0070, -0.0089,  0.0149],
        [ 0.0112,  0.0593,  0.0630,  ..., -0.0334, -0.0148,  0.0058],
        [ 0.0182,  0.0141,  0.0361,  ..., -0.0432, -0.0388, -0.0233],
        ...,
        [ 0.0305,  0.0289,  0.0801,  ..., -0.0767, -0.0311, -0.0334],
        [ 0.0242, -0.0325,  0.0369,  ..., -0.0123, -0.0269, -0.0151],
        [-0.0264, -0.0498, -0.0210,  ...,  0.0601,  0.0130, -0.0007]],
       device='cuda:0')

In [None]:
# Persist three artifacts: the adapter, the model left after removing the adapter
# layers, and the tokenizer.
# NOTE(review): this cell expects `peft_model` to still carry its LoRA layers.
# Upstream PEFT's merge_and_unload() removes them in place, so it must not have
# been called on this object beforehand — verify before running.
base_dir = "PiSSAQuant-Llama-3.2-1B"
adapter_dir = "PiSSAQuant-Llama-3.2-1B/pissaquant_init"

# Switch the init mode recorded in the saved adapter config before writing it.
adapter_config = peft_model.peft_config['default']
adapter_config.init_lora_weights = "PiSSAQuant_load"
peft_model.save_pretrained(adapter_dir)

# Strip the adapter layers without merging and save what remains with the tokenizer.
model = peft_model.unload()
model.save_pretrained(base_dir)
tokenizer.save_pretrained(base_dir)

In [None]:
# Round-trip check: reload the saved base weights and tokenizer from disk, then
# attach the saved adapter on top of them.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

saved_model_dir = "PiSSAQuant-Llama-3.2-1B"
saved_adapter_dir = "PiSSAQuant-Llama-3.2-1B/pissaquant_init"

model = AutoModelForCausalLM.from_pretrained(saved_model_dir, device_map='cuda')
tokenizer = AutoTokenizer.from_pretrained(saved_model_dir)
peft_model = PeftModel.from_pretrained(model, saved_adapter_dir)

In [None]:
# Generate with the reloaded model + adapter using the same prompt and decoding
# settings as the earlier runs, so the continuation can be compared with them.
output = peft_model.generate(
    **inputs,
    max_new_tokens=128,
    do_sample=False,
)
decoded_texts = tokenizer.batch_decode(output)
print(decoded_texts[0])