In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import SFTTrainer, SFTConfig
import torch

In [None]:
# Hub identifier of the base checkpoint that gets fine-tuned.
model_name = "meta-llama/Llama-3.2-3B-Instruct"

# Names of the modules that receive LoRA adapters (passed to LoraConfig).
target_modules = [
    "q_proj",
    "v_proj",
]

In [None]:
# Quantization settings used when loading the base model:
# 4-bit loading, "nf4" quant type, double quantization on, float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

In [None]:
# Device placement forwarded to from_pretrained.
# NOTE(review): {"": 0} presumably pins the whole model to device 0 — confirm.
device_map = {"": 0}

# Load the base checkpoint with the quantization settings from bnb_config.
foundation_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    quantization_config=bnb_config,
)

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from datasets import load_dataset

# Raw training split of WikiText-2; the "text" column is what gets tokenized.
# Note: `data` stays un-tokenized here — only the sampled rows are tokenized.
data = load_dataset("wikitext", "wikitext-2-raw-v1", split="train")

# Pick the first 1000 rows *before* tokenizing so the tokenizer only runs on
# the rows that are actually used for training, instead of on the whole split.
# To train on everything, drop the `.select(...)` call.
train_sample = data.select(range(1000)).map(
    lambda samples: tokenizer(samples["text"]),
    batched=True,
)

display(train_sample)

In [None]:
import peft
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Modules that receive adapters.
    target_modules=target_modules,
    # Rank: the larger r is, the more parameters are trained.
    r=8,
    # Scaling factor that adjusts the magnitude of the adapter weights;
    # higher values appear to give the new training more weight.
    lora_alpha=8,
    # Dropout on the adapter path; helps to avoid overfitting.
    lora_dropout=0.05,
    # Specifies whether the bias parameters should be trained.
    bias="none",
)

In [None]:
import os

# Build the path of the folder that will hold the training outputs.
# (Only the path is computed here; nothing is created on disk.)
working_dir = './'
output_directory = os.path.join(
    working_dir,
    "peft_lab_outputs",
)

In [None]:
import transformers

# Settings for the supervised fine-tuning run: one epoch at lr 2e-4, reading
# the "text" column, with the batch size searched for automatically.
training_args = SFTConfig(
    dataset_text_field="text",
    output_dir=output_directory,
    num_train_epochs=1,
    learning_rate=2e-4,
    auto_find_batch_size=True,
)

In [None]:
# The collator pads batches, so the tokenizer needs a pad token (EOS is reused).
tokenizer.pad_token = tokenizer.eos_token

# Collator for causal language modelling (mlm=False disables masked-LM mode).
causal_lm_collator = transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)

# Wire up model, data, LoRA settings and training arguments, then train.
trainer = SFTTrainer(
    model=foundation_model,
    train_dataset=train_sample,
    peft_config=lora_config,
    data_collator=causal_lm_collator,
    args=training_args,
)
trainer.train()

In [None]:
# Path, inside the training output directory, used for the LoRA adapter.
peft_model_path = os.path.join(output_directory, "lora_model")


In [None]:
# Write the trained model held by the trainer to peft_model_path.
trainer.model.save_pretrained(peft_model_path)

In [None]:
# Import only the name that is needed; a wildcard import would dump every
# public peft symbol into the notebook namespace and hide where names come from.
from peft import PeftModel

# Reload the base checkpoint without a quantization config, move it to the GPU
# and attach the adapter saved at peft_model_path so the two can be merged.
model_to_merge = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained(model_name).to("cuda"),
    peft_model_path,
)

In [None]:
# Directory that receives the merged model together with its tokenizer.
merged_dir = 'qlora'

# Fold the adapter into the base model and drop the adapter wrappers.
merged_model = model_to_merge.merge_and_unload()

merged_model.save_pretrained(merged_dir)
tokenizer.save_pretrained(merged_dir)

In [None]:
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, AwqConfig

# Not referenced in this cell; kept in case other cells rely on it.
device = 'cuda'
# Directory holding the merged (adapter + base) model saved earlier.
model_id = "qlora"
# Output directory for the AWQ-quantized model.
quant_path = 'awq_wbit4_gs128'

tokenizer = AutoTokenizer.from_pretrained(model_id)

# AutoAWQ quantization settings (AutoAWQ's own key names).
quant_config = {"zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM"}
model = AutoAWQForCausalLM.from_pretrained(model_id)
model.quantize(tokenizer, quant_config=quant_config)

# Attach a transformers-compatible quantization config to the wrapped model.
# AwqConfig's bit-width parameter is called `bits` (AutoAWQ calls it `w_bit`);
# passing `w_bit=` would not set it. Following the transformers AWQ guide, the
# version string is lower-cased and the config is stored as a plain dict.
model.model.config.quantization_config = AwqConfig(
    bits=quant_config["w_bit"],
    group_size=quant_config["q_group_size"],
    zero_point=quant_config["zero_point"],
    version=quant_config["version"].lower(),
).to_dict()

# Save the quantized model and its tokenizer side by side.
model.save_quantized(quant_path)
tokenizer.save_pretrained(quant_path)