## Use Saved LLAMA Model

- Model fine-tuned on 400 samples generated from the BigFix Relevance documentation

## Initial Setup

In [None]:
from peft import PeftModel, PeftConfig
from transformers import pipeline, BitsAndBytesConfig,  LlamaForCausalLM, LlamaTokenizer, Conversation,logging
import torch

In [None]:
# Release unused cached GPU memory held by PyTorch before the model is loaded.
torch.cuda.empty_cache()

In [None]:
%%time
# Model and tokenizer names
# base_model_name: identifier passed to from_pretrained() for both the tokenizer and the base model.
# refined_model: local directory passed to PeftConfig/PeftModel.from_pretrained() (the fine-tuned adapter).
base_model_name = "meta-llama/Llama-2-7b-hf"
refined_model = "./llama-2-7b-hf-e2r-finetuned/"

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 4.77 µs


## Load tokenizer

In [None]:
%%time
# Tokenizer
llama_tokenizer =  LlamaTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token
llama_tokenizer.padding_side = "right"  # Fix for fp16

CPU times: user 43.8 ms, sys: 7.96 ms, total: 51.8 ms
Wall time: 123 ms


## Load model

In [None]:
%%time

# Quantization Config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
base_model =  LlamaForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map="auto"
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

CPU times: user 4.76 s, sys: 3.71 s, total: 8.47 s
Wall time: 3min 17s


## Load LoRA adapter and attach it to the base model

In [None]:
%%time
# Read the PEFT adapter configuration from the fine-tuned model directory.
# NOTE(review): `config` is not referenced by any later cell visible in this notebook.
config = PeftConfig.from_pretrained(refined_model)

CPU times: user 362 µs, sys: 210 µs, total: 572 µs
Wall time: 381 µs


In [None]:
%%time
model = PeftModel.from_pretrained(base_model, refined_model, offload_folder='./offload_dir', device_map="auto")

CPU times: user 5.09 s, sys: 353 ms, total: 5.44 s
Wall time: 5.44 s


# Get Response

In [None]:
def getResponse(output):
    """Extract the answer from generated text that uses the ``[INST]`` format.

    The answer is the text between the first and the second ``[/INST]``
    marker. The function works on the text it is given: it does not read any
    notebook-level variable and does not run the model again.

    Args:
        output: The generated text as a string, or the list-of-dicts result
            of a text-generation pipeline (the first item's
            ``'generated_text'`` is used).

    Returns:
        The substring between the first two ``[/INST]`` markers, or ``None``
        (after printing "No answer found") when fewer than two markers are
        present or the input is empty.
    """
    marker = '[/INST]'

    # Accept the raw pipeline result as well as a plain string.
    if isinstance(output, (list, tuple)):
        output = output[0]['generated_text'] if output else ""

    if not isinstance(output, str) or not output:
        print("No answer found")
        return None

    first_inst = output.find(marker)
    # Only look for a second marker when a first one exists.
    second_inst = output.find(marker, first_inst + 1) if first_inst != -1 else -1

    if first_inst == -1 or second_inst == -1:
        print("No answer found")
        return None

    return output[first_inst + len(marker):second_inst]

In [None]:
%%time
from dataclasses import dataclass
from typing import Any, Optional
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
import warnings
from transformers import logging



@dataclass
class TextGenerationInference:
    """Small wrapper around ``model.generate`` for single-prompt generation.

    Attributes:
        model: The causal language model used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        use_int4: Set to True when ``model`` was loaded in 4-bit; the model is
            then left on the device(s) it was loaded on.
        use_int8: Same as ``use_int4`` for 8-bit loading.
        temperature, top_k, top_p, repetition_penalty, num_return_sequences,
        num_beams, max_new_tokens, do_sample: Forwarded unchanged to
            ``model.generate``.
    """

    model: torch.nn.Module
    tokenizer: Any
    use_int4: Optional[bool] = False
    use_int8: Optional[bool] = False
    temperature: Optional[float] = 1.0
    top_k: Optional[int] = 50
    top_p: Optional[float] = 0.95
    repetition_penalty: Optional[float] = 1.0
    num_return_sequences: Optional[int] = 1
    num_beams: Optional[int] = 1
    max_new_tokens: Optional[int] = 1024
    do_sample: Optional[bool] = True

    def __post_init__(self):
        """Put the model in eval mode and pick the device used for the inputs."""
        self.model.eval()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Quantized or device_map-dispatched models are already placed by the
        # loader; moving them again is unnecessary and may be rejected.
        if not self._placement_is_managed():
            self.model.to(self.device)

    def _placement_is_managed(self) -> bool:
        """Return True when the model must not be moved with ``.to()``."""
        if self.use_int4 or self.use_int8:
            return True
        if getattr(self.model, "hf_device_map", None) is not None:
            return True
        return bool(
            getattr(self.model, "is_loaded_in_4bit", False)
            or getattr(self.model, "is_loaded_in_8bit", False)
        )

    def chat(self, prompt):
        """Generate a completion for ``prompt`` and return the decoded text.

        Args:
            prompt: The full prompt string, passed to the tokenizer as a
                batch of one.

        Returns:
            The first generated sequence decoded with special tokens skipped.
        """
        inputs = self.tokenizer([prompt], return_tensors="pt").to(self.device)

        # Pass an explicit pad token so generate() does not have to guess one;
        # fall back to EOS when the tokenizer defines no pad token.
        pad_token_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        outputs = self.model.generate(
            **inputs,
            temperature=self.temperature,
            top_k=self.top_k,
            top_p=self.top_p,
            repetition_penalty=self.repetition_penalty,
            num_return_sequences=self.num_return_sequences,
            num_beams=self.num_beams,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
            do_sample=self.do_sample,
            max_new_tokens=self.max_new_tokens,
        )

        # Only the first returned sequence is decoded.
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)




CPU times: user 843 µs, sys: 9 µs, total: 852 µs
Wall time: 857 µs


In [None]:
# Natural-language description the model should translate into a Relevance expression.
query = (
    "Write the exact BigFixRelevance for the following description: "
    "Check the application anaconda is installed?"
)

In [None]:
%%time
inference_engine = TextGenerationInference(model=model, tokenizer=llama_tokenizer)
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</Relevance> <Relevance>not exists regapp "anaconda.exe"</Relevance> 


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
# Second generation wrapper over the same model and tokenizer, used by the getResponse cell.
llama_client = TextGenerationInference(
    tokenizer=llama_tokenizer,
    model=model,
)

# Question Time

In [None]:
%%time

# Generate Text
query = "Write the exact BigFixRelevance for the following description: \
Check the application anaconda is installed?"
getResponse(llama_client.chat(query))

No ans found
118 195
CPU times: user 1min, sys: 116 ms, total: 1min
Wall time: 1min


' <</Relevance><Relevance>not exists regapp "anaconda.exe"</Relevance> '

In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Locate the currently logged in users."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</RELEVANCE> <INST>logged in user : logged in user</INST> 
CPU times: user 17.7 s, sys: 84 ms, total: 17.8 s
Wall time: 17.8 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Discover who is currently logged into the system."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</RElevance><RElevance>logged on user : logged on user</RElevance> 
CPU times: user 13.3 s, sys: 87.9 ms, total: 13.4 s
Wall time: 13.4 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Identify the users currently signed in."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</Relevance> <Relevance> signed in user <string> of <active directory server> </Relevance> [/Relevance] 
CPU times: user 50.4 s, sys: 67.9 ms, total: 50.5 s
Wall time: 50.5 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Determine the users that are presently logged on."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</Relevance> <Relevance>logged on user <string> of <temp> of runtime</Relevance> 
CPU times: user 51.4 s, sys: 63.8 ms, total: 51.4 s
Wall time: 51.4 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Find out who is actively using the system."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</Relevance> <Relevance>active of <system> : boolean</Relevance> 
CPU times: user 5.55 s, sys: 20 ms, total: 5.57 s
Wall time: 5.56 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Pinpoint the users who are logged in."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</Relevance> <Relevance> logged in user : logged in user</Relevance> 
CPU times: user 51.4 s, sys: 44 ms, total: 51.5 s
Wall time: 51.5 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Uncover the list of users currently signed into the platform."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</Relevance> <Relevance> list of current users</Relevance> 
CPU times: user 19.6 s, sys: 35.9 ms, total: 19.7 s
Wall time: 19.7 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Retrieve information on the users currently logged on."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <</Relevance> <Relevance> logged on user : logged on user</Relevance> 
CPU times: user 12.7 s, sys: 52 ms, total: 12.7 s
Wall time: 12.7 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Search for the logged on individuals."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <br/> <INST>logged on individual : logged on individual</INST> 
CPU times: user 19.9 s, sys: 36 ms, total: 19.9 s
Wall time: 19.9 s


In [None]:
# Release unused cached GPU memory left over from the previous generation.
torch.cuda.empty_cache()

In [None]:
%%time
# Generate Text
query = "Write the exact BigFixRelevance for the following description:Establish a list of the currently logged in users."
# getResponse(llama_client.chat(query))
response = inference_engine.chat(f"<s>[INST] {query} [/INST]")
print(response)

 <div> <inst><relevance>list of <logged in user> : logged in user</relevance></div> 
CPU times: user 8.82 s, sys: 76 ms, total: 8.9 s
Wall time: 8.89 s
