In [4]:
import json
import os
from pprint import pprint # pretty print
import bitsandbytes as bnb # custom module for quantization and optimization
import torch
import torch.nn as nn

from huggingface_hub import login
from peft import (
    LoraConfig,  # Configuration for LoRA (Low-Rank Adaptation)
    PeftConfig,  # Base configuration class for PEFT (Parameter-Efficient Fine-Tuning)
    PeftModel,   # Base model class for PEFT
    get_peft_model,  # Function to get a PEFT model
    prepare_model_for_kbit_training  # Function to prepare a model for k-bit training
)
from transformers import (
    AutoConfig,  # Auto configuration class
    AutoModelForCausalLM,  # Auto model class for causal language modeling
    AutoTokenizer,  # Auto tokenizer class
    BitsAndBytesConfig  # Configuration class for bitsandbytes
)


os.environ["CUDA_VISIBLE_DEVICES"] = "0"
MODEL_NAME = "mistralai/Mistral-7B-v0.1" 

def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainables%: {100 * trainable_params / all_param}"
    )

if __name__ == "__main__":
    # Log in to Hugging Face Hub
    login()

    # Configure bitsandbytes for 4-bit quantization
    # bnb_config = BitsAndBytesConfig(
    #     load_in_4bit=True,
    #     bnb_4bit_use_double_quant=True,
    #     bnb_4bit_quant_type="nf4",
    #     bnb_4bit_compute_dtype=torch.bfloat16
    # )

    # Load the pre-trained model with the specified configuration
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        device_map="auto",
        trust_remote_code=True,
        # quantization_config=bnb_config
        # low_cpu_mem_usage=True
    )

    # Load the tokenizer for the specified model
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    # Enable gradient checkpointing for the model
    model.gradient_checkpointing_enable()

    # Prepare the model for k-bit training
    model = prepare_model_for_kbit_training(model)

    # Configure LoRA for the model
    config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["q_proj", "k_proj", "v_proj", "out_proj", "fc_in", "fc_out", "wte"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM"
    )

    model.gradient_checkpointing_enable()
    model = prepare_model_for_kbit_training(model)

    model = get_peft_model(model, config)
    print_trainable_parameters(model)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.33s/it]
Some parameters are on the meta device because they were offloaded to the disk.


trainable params: 9437184 || all params: 7251169280 || trainables%: 0.13014706505376192


In [5]:
prompt = """
<human>: midjourney prompt for a girl sit on the mountain
<assistant>:
""".strip()

In [6]:
generation_config = model.generation_config
generation_config.max_new_tokens = 200
generation_config.temperature = 0.7
generation_config.top_p = 0.7
generation_config.num_return_sequences = 1
generation_config.pad_token_id = tokenizer.eos_token_id
generation_config.eos_token_id = tokenizer.eos_token_id

In [9]:
%%time
# device = "cuda:0"

encoding = tokenizer(prompt, return_tensors="pt").to(device)
with torch.inference_mode():
  outputs = model.generate(
      input_ids = encoding.input_ids,
      attention_mask = encoding.attention_mask,
      generation_config = generation_config
  )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

AssertionError: Torch not compiled with CUDA enabled