# LLM File.

# Install Requirements if this is your first time.

In [1]:
# If this is your first run, uncomment the lines below to (re)install the dependencies.
#!pip uninstall -y transformers
#!pip uninstall -y accelerate
#!pip uninstall -y peft
#!pip uninstall -y bitsandbytes
#!pip uninstall -y torch

#!pip install torch==1.13.0
#!pip install transformers 
#!pip install peft
#!pip install bitsandbytes
#!pip install accelerate

# Import dependencies.

In [2]:
import os, transformers, peft, torch

  from .autonotebook import tqdm as notebook_tqdm


# Some Global Variables.

In [3]:
# Runtime target: "colab" selects the CUDA GPU, anything else falls back to Apple MPS.
platform = "colab" #"mac"
# Hugging Face model id of the base LLM.
llmname = "meta-llama/Llama-2-7b-chat-hf"
device = "cuda:0" if platform == "colab" else "mps:0"
# Local directory used as the Hugging Face cache for weights and tokenizer files.
modelstore = "./models"
# Maximum sequence length handed to the tokenizer.
max_seq_len = 4096
# LoRA hyper-parameters (scaling factor and adapter rank).
alpha = 16
rank = 8
# exist_ok makes the cell idempotent and avoids the check-then-create race.
os.makedirs(modelstore, exist_ok = True)

# Some Useful Functions.

In [40]:
def get_token():
    """
    Desc:
        Return the Hugging Face access token used to download gated models.
        The token is read from the environment instead of being stored in the
        notebook, so the credential never ends up in version control.
        NOTE(review): a real token was previously committed here; it should be
        revoked on the Hugging Face account.
    Returns:
        The value of HF_TOKEN (preferred) or HUGGING_FACE_HUB_TOKEN with
        surrounding whitespace stripped, or None when neither is set so the
        caller falls back to the library's own stored login.
    """
    for var_name in ("HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"):
        value = os.environ.get(var_name, "").strip()
        if value:
            return value
    return None

def get_tokenizer(name: str = llmname, model_max_length: int = max_seq_len):
    """
    Desc:
        Load the tokenizer for `name`, caching its files under `modelstore`.
    Args:
        1. name: Hugging Face model id or local path.
        2. model_max_length: maximum sequence length the tokenizer should report.
    Returns:
        The tokenizer, configured to pad on the right.
    """
    tok = transformers.AutoTokenizer.from_pretrained(
        name,
        cache_dir = modelstore,
        model_max_length = model_max_length,
        token = get_token()
    )
    tok.padding_side = 'right'
    # Honour the caller's argument; this used to be overwritten with the global max_seq_len.
    tok.model_max_length = model_max_length
    return tok

def get_model(name: str = llmname, quantize: bool | str = "qlora"):
    """
    Desc:
        Load a causal LM, caching its weights under `modelstore`.
    Args:
        1. name: Hugging Face model id or local path.
        2. quantize: selects how the model is loaded:
            - False: load the checkpoint as-is.
            - True: load the checkpoint, then cast it to float16.
            - "qlora": load with 4-bit NF4 quantization (double quantization,
              float16 compute), wrap it with LoRA adapters on q_proj / v_proj
              using the global `rank` / `alpha`, and move it to `device`.
    Returns:
        The loaded model. Only the "qlora" variant is moved to `device`; the
        bool variants are returned wherever `from_pretrained` placed them.
    Raises:
        ValueError: if `quantize` is not a bool or "qlora" (previously this
        surfaced as an UnboundLocalError).
    """
    if isinstance(quantize, bool):
        model = transformers.AutoModelForCausalLM.from_pretrained(
            name,
            cache_dir = modelstore,
            token = get_token()
        )
        if quantize:
            model = model.to(torch.float16)
        return model

    if quantize == "qlora":
        nf4_config = transformers.BitsAndBytesConfig(
            load_in_4bit = True,
            bnb_4bit_quant_type = "nf4",
            bnb_4bit_use_double_quant = True,
            bnb_4bit_compute_dtype = torch.float16,
        )
        lora_config = peft.LoraConfig(
            r = rank,
            lora_alpha = alpha,
            target_modules = ["q_proj", "v_proj"],
            bias = "none",
            task_type = "CAUSAL_LM",
        )
        model = transformers.AutoModelForCausalLM.from_pretrained(
            name,
            cache_dir = modelstore,
            quantization_config = nf4_config,
            token = get_token(),
        )
        model = peft.get_peft_model(model, lora_config)
        model = model.to(device)
        return model

    raise ValueError(
        f"Unsupported quantize value {quantize!r}; expected True, False or 'qlora'."
    )

def get_model_output(model, tok, prompt):
    """
    Desc:
        Wrap the user's message in the chat template, sample a completion
        from the LLM and return only the newly generated text.
    Args:
        1. model: llm (must expose `.device` and `.generate`).
        2. tok: tokenizer.
        3. prompt: the user's message to reply to.
    Returns:
        The decoded reply, with special tokens skipped and the prompt removed.
    """
    full_prompt = f"""You are a rockstar. Chat with the user bruv. User: '{prompt}'. Pls don't make your reply too long. Reply:"""
    inputs = tok(full_prompt, return_tensors = "pt").to(model.device)
    input_ids = inputs["input_ids"]
    prompt_len = input_ids.shape[-1]
    output_ids = model.generate(
        input_ids,
        attention_mask = inputs.get("attention_mask"),
        do_sample = True,
    )
    # Drop the prompt by token count rather than by character count: decoding
    # does not always reproduce the prompt text exactly, so slicing the decoded
    # string could clip the reply or leave part of the prompt in it.
    reply_ids = output_ids[0, prompt_len:]
    return tok.decode(reply_ids, skip_special_tokens = True)
    

In [34]:
# Get model and tokenizer.
# Both helpers are called without arguments, so their defaults apply
# (for get_model that is quantize="qlora").
tok = get_tokenizer()
model = get_model()

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████| 2/2 [00:02<00:00,  1.18s/it]


In [41]:
# Smoke test: send a short greeting through the chat helper and print the reply.
print(get_model_output(model, tok, "Hi! Wassup mate?"))

 'Hey! Not much, just chillin' like a villain. What's up with you? 🤘'
