## Login to HugginFace

In [1]:
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
import torch

hf_auth = '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'

login(hf_auth)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/eng/s/ssm220008/.cache/huggingface/token
Login successful


## LLM Definition and properties


We have to initialize a text-generation pipeline with Hugging Face transformers. The pipeline requires the following three things that you must initialize:

* A LLM, in this case it will be `meta-llama/Llama-2-7b-chat-hf`.
* The respective tokenizer for the model.
* A stopping criteria object.

You have to initialize the model and move it to `CUDA-enabled` GPU.

In [2]:
compute_dtype = getattr(torch, "float16")

quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
base_model = "meta-llama/Llama-2-7b-chat-hf"



tokenizer = AutoTokenizer.from_pretrained("./Llama-2-7b-chat-hf-Property-Classification", trust_remote_code=True)
#tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct", trust_remote_code=True)
#model = AutoModelForCausalLM.from_pretrained("deepseek-ai/deepseek-coder-6.7b-instruct", trust_remote_code=True).to(device)

model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map= 'auto' # device
)

adapter_name = model.load_adapter("./Llama-2-7b-chat-hf-Property-Classification")
model.active_adapters = adapter_name



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Set the stopping token by editing the tokenizer.json

In [None]:
starting_part = """<s>[INST] Determine whether the following sentence defines a property or non-property type sentence for a design documentation:\n"""
main_text = """The debug system follows the execution-based debug approach described in the RISC-V Debug Specification 0.13.2 and provides the following features."""
end_part = """\n [\/INST]:<\\s>"""


input_text = starting_part + main_text + end_part

In [32]:
from transformers import StoppingCriteria, StoppingCriteriaList
import torch

class StoppingCriteriaSub(StoppingCriteria):

    def __init__(self, stops = [], encounters=1):
        super().__init__()
        self.stops = [stop.to("cuda:0") for stop in stops]

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        last_token = input_ids[0][-1] # the generated token
        for stop in self.stops: # stop words
            if tokenizer.decode(stop[-1]) == tokenizer.decode(last_token): # if the generated token is in the stop words
                # print ("HIT")
                # print(tokenizer.decode(stop[-1]))
                return True # stop the generation
        return False


stop_words = ["<\s>", "T]","OT","[EOT]"]
stop_words_ids = [tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
print(stop_words_ids)
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])

[tensor([    1,   529, 29905, 29879, 29958]), tensor([    1,   323, 29962]), tensor([    1,   438, 29911]), tensor([    1,   518, 29923,  2891, 29962])]


In [39]:
inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_length=500,stopping_criteria=stopping_criteria, eos_token_id=tokenizer.eos_token_id,early_stopping=True, pad_token_id=tokenizer.pad_token_id)
print(tokenizer.decode(outputs[0][:-1],  # remove the token the LLM stops at
                       skip_special_tokens=True)[len(input_text):])

/INST]: 
 <\s
