In [1]:
import torch
from sentence_transformers import util
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer
from safetensors.torch import load_file
from sklearn.preprocessing import normalize

import faiss
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader, PyPDFium2Loader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.prompts.chat import ChatPromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


# Custom settings you need to define here
1. `model_name`: model repo on Hugging Face
2. `query_prompt_name`: optional, only needed for the Stella model
3. `model_path`: path to the saved fine-tuned model
4. `dense_path`: path to the saved weights of the adapter layers in the fine-tuned model
5. `faiss_db_path`: location where the vector database is stored
6. `output_json_fp`: file path of the JSON file that receives the LLM response

In [2]:
# --- Model identifiers ---
# Hugging Face repo of the base embedding model.
model_name = "dunzhang/stella_en_400M_v5"
# Hugging Face repo of the LLM used to generate the answer.
llm_model_name = "tiiuae/falcon-7b-instruct"
# Prompt name for the Stella model (optional for other embedding models).
query_prompt_name = "s2p_query"

# --- Local artefacts ---
# Directory holding the fine-tuned model, and the dense-layer weights inside it.
model_path = "../models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08"
dense_path = "../models/stella_en_400M_v5/finetune_pair_2025-01-02_22-18-08/2_Dense/model.safetensors"
# Folder of the saved FAISS vector database.
faiss_db_path = "../vector_database/faiss_stella_triplet"

# --- Compute device ---
# Previously hard-coded to "mps", which only exists on Apple Silicon. Prefer
# MPS when present (same result as before on such machines), otherwise fall
# back to CUDA and finally CPU so the notebook runs elsewhere too.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# --- Output ---
# JSON file that collects question/response pairs.
output_json_fp = '../results/llm_result.json'


In [3]:
# The user question: used for the similarity search, placed in the LLM prompt,
# and stored as the "question" field of the saved result.
human_query = "Could you explain the key differences in performance and handling across Porsche’s models, such as the Macan, Cayenne, 911, and Taycan? I’d like to understand how they cater to different driving styles and purposes."


# Load Embedding Model and Vector Database

In [4]:
# Base embedding model from the Hugging Face repo, placed on `device`.
# The two extra flags are forwarded to the repo's custom model code
# (trust_remote_code=True) and switch off its memory-efficient attention and
# input unpadding.
model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map=device,
    use_memory_efficient_attention=False,
    unpad_inputs=False,
)

# Wrap the base model with the fine-tuned adapter stored under `model_path`.
lora_model = PeftModel.from_pretrained(model, model_path)

# Tokenizer saved with the fine-tuned model.
# Fix: `device_map`, `use_memory_efficient_attention` and `unpad_inputs` are
# model options, not tokenizer options, so they are no longer passed here.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Projection layer applied after pooling: hidden_size -> 1024 dimensions.
vector_linear = torch.nn.Linear(in_features=lora_model.config.hidden_size, out_features=1024)
# The saved tensors carry a "linear." prefix in their keys; strip it so the
# keys match the parameter names of a plain nn.Linear.
vector_linear_dict = {
    key.replace("linear.", ""): tensor
    for key, tensor in load_file(dense_path).items()
}
vector_linear.load_state_dict(vector_linear_dict)
# Last expression of the cell: moves the layer to `device` and displays it.
vector_linear.to(device)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Linear(in_features=1024, out_features=1024, bias=True)

In [5]:
def get_embedding(text, iTokenizer, iModel, iVector):
    """Embed `text` and return the vector of the first input.

    The text is tokenized (padded to the longest item, truncated to 512
    tokens), run through `iModel`, mean-pooled over the non-padding tokens,
    projected with `iVector` and finally passed through `normalize`.

    Args:
        text: Input accepted by `iTokenizer` (a string or a batch of strings).
        iTokenizer: Tokenizer called with `return_tensors="pt"`.
        iModel: Encoder whose first output is the per-token hidden states.
        iVector: Projection module applied to the pooled vector.

    Returns:
        Row 0 of the normalized embedding matrix (a NumPy array). When a
        batch is passed, only the first item's embedding is returned.

    Note:
        Tensors are moved to the notebook-level `device` variable.
    """
    with torch.no_grad():
        encoded = iTokenizer(text, padding="longest", truncation=True, max_length=512, return_tensors="pt")
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        mask = encoded["attention_mask"]

        token_states = iModel(**encoded)[0]

        # Zero the padding positions so they do not contribute to the sum.
        is_padding = ~mask[..., None].bool()
        summed = token_states.masked_fill(is_padding, 0.0).sum(dim=1)

        # Mean pooling: divide by the number of real tokens per sequence.
        token_counts = mask.sum(dim=1)[..., None]
        pooled = summed / token_counts

        projected = iVector(pooled).cpu().numpy()
        return normalize(projected)[0]

In [6]:
def prompt_template(context, query):
    """Build the full LLM prompt from retrieved context and a user question.

    Args:
        context: Text inserted after "Context:" in the system section.
        query: The user's question, inserted after "Human:".

    Returns:
        A single string: the system section (instructions plus context),
        the human question, and a trailing "Answer:" cue on its own line.
    """
    # The leading newline and the eight-space indentation are part of the
    # prompt text itself and are kept exactly.
    system_part = (
        "\n"
        "        System: Here is some important context which can help inform the questions the Human asks.\n"
        "        Make sure to not make anything up to answer the question if it is not provided in the context.\n"
        "\n"
        f"        Context: {context}\n"
        "\n"
        "        "
    )
    human_part = f"Human: {query}"
    return f"{system_part}{human_part}\nAnswer:"

In [7]:
def _embed_for_store(texts):
    """Embed `texts` with the fine-tuned model; used by the vector store."""
    return get_embedding(texts, tokenizer, lora_model, vector_linear)


# Load the saved FAISS database from disk, with the helper above as its
# embedding function.
# NOTE(review): as its name indicates, allow_dangerous_deserialization=True
# opts in to unsafe deserialization of the stored index. Only load database
# folders you created yourself or otherwise trust.
vector_store = FAISS.load_local(
    faiss_db_path,
    _embed_for_store,
    allow_dangerous_deserialization=True,
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [8]:
# Fetch the 3 stored chunks most similar to the question.
search_results = vector_store.similarity_search(human_query, k=3)

# Number the chunks ("Document 1: ...", "Document 2: ...") and separate them
# with a blank line to form the context section of the prompt.
context_string = '\n\n'.join(
    f'Document {rank}: ' + doc.page_content
    for rank, doc in enumerate(search_results, start=1)
)



In [9]:
# Combine the retrieved context and the user question into the final prompt
# and print it for inspection.
# The variable name (sic: "promt") is kept because later cells reference it.
promt_text = prompt_template(context_string, human_query)
print(promt_text)


        System: Here is some important context which can help inform the questions the Human asks.
        Make sure to not make anything up to answer the question if it is not provided in the context.

        Context: Document 1: 32 Drive and chassis
Porsche Stability Management (PSM). 
PSM is an automatic control system for maintaining 
stability at the limits of dynamic driving performance. 
Sensors continuously monitor the direction, speed, 
yaw velocity, and lateral acceleration of the car. 
Using this information, PSM calculates the actual 
direction of travel at any given moment and applies 
selective braking on individual wheels to help restore 
stability. When accelerating on road surfaces with 
varying grip, PSM improves traction using the 
Automatic Brake Dierential (ABD) system and 
Anti-Slip Regulation (ASR), providing a high level 
of driving stability and safety—and extraordinary 
agility at the same time.
Porsche Active Suspension Management (PASM). 
Standard on all 

# Load LLM

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

# Tokenizer for the answer-generating LLM; it is passed to the generation
# pipeline and supplies `eos_token_id` for the generation call.
llm_tokenizer = AutoTokenizer.from_pretrained(llm_model_name)


In [11]:
# Build the text-generation pipeline for `llm_model_name`, reusing the
# tokenizer loaded above, with bfloat16 weights placed on `device`.
# NOTE(review): trust_remote_code=True permits running custom code shipped
# with the model repo; confirm the repo is trusted.
pipeline = transformers.pipeline(
    "text-generation",
    model=llm_model_name,
    tokenizer=llm_tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=device,
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading checkpoint shards: 100%|██████████| 2/2 [00:46<00:00, 23.02s/it]


In [12]:
# Generate one answer for the prompt, sampling with top_k=3.
# NOTE(review): `max_length` presumably counts the prompt tokens as well as
# the generated ones, so a long retrieved context leaves less room for the
# answer; consider `max_new_tokens`. TODO confirm against the transformers docs.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# output is likely to differ between runs.
sequences = pipeline(
    promt_text,
    max_length=1200,
    do_sample=True,
    top_k=3,
    num_return_sequences=1,
    eos_token_id=llm_tokenizer.eos_token_id,
)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


### Save result to output file

In [13]:
# Show every generated sequence in full.
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

import json
from pathlib import Path

output_path = Path(output_json_fp)

# Load earlier results so this run is appended to them.
# Fixes relative to the previous version:
#   * json.load(f, content) raised TypeError (json.load accepts only keyword
#     arguments after the file object) and the bare `except: pass` hid it,
#     so existing results were overwritten on every run.
#   * Opening a missing file for reading raised FileNotFoundError on the
#     first run; a missing file now simply means "no earlier results".
content = []
if output_path.exists():
    with output_path.open('r', encoding='utf-8') as f:
        try:
            previous = json.load(f)
        except json.JSONDecodeError:
            # Empty or corrupt file: keep the best-effort behaviour and start over.
            print(f"Warning: could not parse {output_path}; starting a new result list.")
            previous = []
    if isinstance(previous, list):
        content = previous
    else:
        print(f"Warning: {output_path} does not contain a JSON list; starting a new result list.")

content.append(
    {
        "question": human_query,
        # The generated text includes the prompt; store only the answer part.
        "response": sequences[0]['generated_text'].replace(promt_text, '')
    }
)

# Create the results directory if needed, then write the updated list.
output_path.parent.mkdir(parents=True, exist_ok=True)
with output_path.open('w', encoding='utf-8') as f:
    json.dump(content, f, indent=4)


Result: 
        System: Here is some important context which can help inform the questions the Human asks.
        Make sure to not make anything up to answer the question if it is not provided in the context.

        Context: Document 1: 32 Drive and chassis
Porsche Stability Management (PSM). 
PSM is an automatic control system for maintaining 
stability at the limits of dynamic driving performance. 
Sensors continuously monitor the direction, speed, 
yaw velocity, and lateral acceleration of the car. 
Using this information, PSM calculates the actual 
direction of travel at any given moment and applies 
selective braking on individual wheels to help restore 
stability. When accelerating on road surfaces with 
varying grip, PSM improves traction using the 
Automatic Brake Dierential (ABD) system and 
Anti-Slip Regulation (ASR), providing a high level 
of driving stability and safety—and extraordinary 
agility at the same time.
Porsche Active Suspension Management (PASM). 
Standard