In [2]:
from google.cloud import pubsub_v1
import torch

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from huggingface_hub import login
from hf_token import hf_token
from utils.lora_ckpt import load_lora
from peft import LoraConfig, get_peft_model

# Authenticate with the Hugging Face Hub using the token from the local
# `hf_token` module.
access_token = hf_token
login(token=access_token)

# The tokenizer and the base model come from the same checkpoint.
model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token=access_token)

# LoRA hyper-parameters.
lora_r = 16
lora_alpha = lora_r  # alpha is tied to the rank
lora_dropout = 0.05
lora_modules = ['q_proj', 'v_proj']

# Checkpoint handed to `load_lora` below.
weights_path = './weights/lora_llm.ckpt'

lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=lora_modules,
    lora_dropout=lora_dropout,
    bias="none",
)

# Wrap the base model with the LoRA configuration, then hand the wrapped
# model and the checkpoint path to the project helper.
# NOTE(review): `load_lora` is a project helper; presumably it loads the
# adapter weights in place -- confirm against utils/lora_ckpt.
lora_llm = get_peft_model(model=model, peft_config=lora_config)
load_lora(lora_llm, weights_path, end_of_epoch=None)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/pkk2125/.cache/huggingface/token
Login successful




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LoRA loaded from  ./weights/lora_llm.ckpt


In [2]:
def read_prompt(llm, tokenizer, prompt, device='cuda'):
    """Embed `prompt` as the last-layer hidden state of its final token.

    Args:
        llm: Callable model that accepts token ids plus `attention_mask` and
            `output_hidden_states` keyword arguments and returns an object
            with a `hidden_states` sequence.
        tokenizer: Callable tokenizer returning a mapping with `input_ids`
            and, optionally, `attention_mask`.
        prompt: A prompt string or a list of prompt strings.
        device: Device the token tensors are moved to before the forward pass.

    Returns:
        One embedding row per prompt, taken from the last hidden layer at the
        position of the last non-padding token.
    """
    # Tokenize
    encoded = tokenizer(prompt, padding=True, return_tensors='pt')
    tokens = encoded['input_ids'].to(device)
    attention_mask = encoded.get('attention_mask')
    if attention_mask is not None:
        attention_mask = attention_mask.to(device)

    # Encode. The mask is forwarded so padding added for batched prompts is
    # not attended to.
    words_embed = llm(
        tokens, attention_mask=attention_mask, output_hidden_states=True
    ).hidden_states[-1]  # last layer

    if attention_mask is None:
        return words_embed[:, -1, :]  # no mask available: plain last position

    # Pick the last attended position of every row. This works for left and
    # right padding; for unpadded input it is the last position.
    # Positions are weighted 1..seq_len so a row whose only real token sits
    # at index 0 still resolves to index 0.
    seq_len = attention_mask.shape[1]
    weights = torch.arange(1, seq_len + 1, device=attention_mask.device)
    last_idx = (attention_mask.long() * weights).argmax(dim=1)
    last_idx = last_idx.to(words_embed.device)
    rows = torch.arange(words_embed.shape[0], device=words_embed.device)
    return words_embed[rows, last_idx, :]

# `read_prompt` tokenizes with `padding=True`, so the tokenizer needs a pad
# token to be set.
# NOTE(review): confirm that '[PAD]' maps to a valid id in this tokenizer's
# vocabulary; if not, padded positions may resolve to an unexpected token id.
tokenizer.pad_token = '[PAD]'

In [8]:
from google.cloud import pubsub_v1

In [9]:
# Subscriber config

project_id = "eecse6992-yolov4-tiny-pkk2125"

subscription_name = "LCCE-inference-sub"
subscriber = pubsub_v1.SubscriberClient()
subscription_path = subscriber.subscription_path(project_id, subscription_name)

prompt_buffer = None  # most recent prompt payload received; None until one arrives


def sub_callback(message):
    """Store the payload of a received message in `prompt_buffer` and ack it.

    Args:
        message: Pub/Sub message handed to the callback by the subscriber;
            its `data` attribute is the raw payload.
    """
    # Without `global`, the assignment would only create a local variable and
    # the module-level buffer would stay None forever.
    global prompt_buffer
    prompt_buffer = message.data
    message.ack()


# Left as the cell's last expression so the returned streaming-pull future is
# displayed as the cell output.
subscriber.subscribe(subscription_path, callback=sub_callback)

<StreamingPullFuture at 0x7fa7cc264f10 state=pending>

In [10]:
# Sample prompt for manual experiments; not referenced by the other cells
# visible in this notebook.
prompt_test = "Yes this is a test"

In [11]:
def get_embedding(prompt, llm=model, tokenizer=tokenizer, device='cpu'):
    """Return the embedding of `prompt` computed by `read_prompt`.

    Args:
        prompt: Prompt text. Raw `bytes` (such as the payload stored by the
            subscriber callback) are decoded as UTF-8 first, since the
            tokenizer works on text.
        llm: Model passed through to `read_prompt`; defaults to the notebook's
            `model`.
        tokenizer: Tokenizer passed through to `read_prompt`.
        device: Device passed through to `read_prompt`.

    Returns:
        Whatever `read_prompt` returns for the prompt.

    Raises:
        UnicodeDecodeError: If a bytes prompt is not valid UTF-8.
    """
    if isinstance(prompt, (bytes, bytearray)):
        prompt = bytes(prompt).decode('utf-8')
    return read_prompt(llm, tokenizer, prompt, device)


# Embed only once a prompt has actually been received.
if prompt_buffer is not None:
    prompt_embedding = get_embedding(prompt_buffer)

In [16]:
def tobytes(prompt_embedding):
    """Serialize an embedding tensor to raw bytes.

    Args:
        prompt_embedding: A torch tensor; it may require grad and may live on
            an accelerator device.

    Returns:
        The tensor's values as a `bytes` object, produced by NumPy's
        `tobytes()`.
    """
    # `.numpy()` only works on CPU tensors that are detached from autograd,
    # so detach and copy to host memory first. `.cpu()` is a no-op for
    # tensors that are already on the CPU.
    return prompt_embedding.detach().cpu().numpy().tobytes()



In [28]:
# Flatten the embedding to 1-D. This rebinds `prompt_embedding`, which must
# already have been defined by an earlier cell (it is only assigned when a
# prompt has been received).
prompt_embedding = prompt_embedding.reshape(-1)

In [30]:
import time

In [42]:
# Publish ten numbered test messages, one every five seconds.
# To send a real embedding instead, publish `tobytes(prompt_embedding)` as
# the payload.

# Build the client and topic path once instead of on every iteration.
# NOTE(review): the topic is spelled "LCEE" while the subscription above is
# "LCCE" -- confirm the topic name is intended.
pub_topic_name = "LCEE_prompt_publish"
publisher = pubsub_v1.PublisherClient()
pub_topic_path = publisher.topic_path(project_id, pub_topic_name)

prompt_data = "testing_prompt"
for i in range(10):
    time.sleep(5)
    # Pub/Sub payloads are bytes; keep the str template untouched and encode
    # a fresh payload for each message.
    payload = f'{prompt_data}_{i}'.encode('utf-8')

    print(f'Sending {payload}')
    future = publisher.publish(pub_topic_path, payload)
    future.result()  # block until the publish call has completed

Sending b'testing_prompt_0'
Sending b'testing_prompt_1'
Sending b'testing_prompt_2'
Sending b'testing_prompt_3'
Sending b'testing_prompt_4'
Sending b'testing_prompt_5'
Sending b'testing_prompt_6'
Sending b'testing_prompt_7'
Sending b'testing_prompt_8'


KeyboardInterrupt: 