# Load Embedding Model and Vector Database

In [1]:
import torch
from sentence_transformers import util
from peft import PeftModel
from transformers import AutoModel, AutoTokenizer
from safetensors.torch import load_file
from sklearn.preprocessing import normalize

import faiss
from langchain_community.document_loaders import PyMuPDFLoader, PyPDFLoader, PyPDFium2Loader
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain.prompts.chat import ChatPromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Locations of the fine-tuned Stella checkpoint consumed by the loading cell.
# NOTE(review): query_prompt_name is not referenced by any cell visible here;
# presumably it names the Stella query prompt - confirm before removing it.
query_prompt_name = "s2p_query"
model_path = "models/stella_en_400M_v5/finetune_triplets_2025-01-02_18-06-49"
# The dense projection weights sit inside the same checkpoint directory.
dense_path = f"{model_path}/2_Dense/model.safetensors"

In [3]:
# Single place that decides where the embedding stack lives; change this one
# value to move the encoder and the projection head to another backend.
embedding_device = "mps"

# Base Stella encoder. trust_remote_code=True executes modelling code shipped
# with the model repository, so only use it with a repository you trust.
# NOTE(review): use_memory_efficient_attention / unpad_inputs are forwarded to
# the remote-code model; presumably they switch off CUDA-only optimisations -
# confirm against the model card.
model = AutoModel.from_pretrained(
    "dunzhang/stella_en_400M_v5",
    trust_remote_code=True,
    device_map=embedding_device,
    use_memory_efficient_attention=False,
    unpad_inputs=False,
)

# Attach the fine-tuned LoRA adapter stored under model_path to the encoder.
lora_model = PeftModel.from_pretrained(model, model_path)

# The tokenizer only needs the checkpoint location. The model-specific
# arguments (device_map, attention/unpadding flags) that were previously
# copied into this call do not apply to a tokenizer and have been dropped.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

# Projection head applied to the pooled encoder output (hidden_size -> 1024).
# The checkpoint stores its tensors under a "linear." prefix, which is removed
# so the keys match torch.nn.Linear ("weight" / "bias").
vector_linear = torch.nn.Linear(in_features=lora_model.config.hidden_size, out_features=1024)
vector_linear_dict = {
    k.replace("linear.", ""): v for k, v in
    load_file(dense_path).items()
}
vector_linear.load_state_dict(vector_linear_dict)
vector_linear.to(embedding_device)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Linear(in_features=1024, out_features=1024, bias=True)

In [4]:
def get_embedding(text, iTokenizer, iModel, iVector, device=None):
    """Embed ``text`` with masked mean pooling and a projection head.

    Args:
        text: Input passed straight to ``iTokenizer``. Only the embedding of
            the first sequence in the tokenized batch is returned.
        iTokenizer: Callable tokenizer; must return a mapping of tensors that
            contains ``"attention_mask"``.
        iModel: Encoder called as ``iModel(**inputs)``; element ``[0]`` of its
            output is used as the per-token hidden states.
        iVector: Module applied to the pooled vector before normalisation.
        device: Device the tokenized inputs are moved to. When ``None`` it is
            taken from the encoder's parameters (falling back to ``"cpu"`` if
            the encoder has none), so the function no longer assumes "mps".

    Returns:
        The first row of the array produced by ``normalize`` for the batch.
    """
    if device is None:
        # Follow the encoder's weights rather than assuming a specific backend.
        first_param = next(iModel.parameters(), None)
        device = first_param.device if first_param is not None else "cpu"

    with torch.no_grad():
        input_data = iTokenizer(text, padding="longest", truncation=True, max_length=512, return_tensors="pt")
        input_data = {k: v.to(device) for k, v in input_data.items()}
        attention_mask = input_data["attention_mask"]
        last_hidden_state = iModel(**input_data)[0]
        # Zero out padded positions so they do not contribute to the mean.
        last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
        # Mean over real tokens only: sum of hidden states / number of unmasked tokens.
        query_vectors = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
        query_vectors = normalize(iVector(query_vectors).cpu().numpy())
        return query_vectors[0]

In [5]:
def prompt_template(context, query):
    """Build the plain-text prompt handed to the LLM.

    The result is a "System:" preamble that embeds ``context`` followed by a
    "Human:" turn holding ``query``. Every line of the preamble carries an
    eight-space indent, and the "Human:" turn starts on an indented line too.

    Args:
        context: Text inserted after "Context: ".
        query: Text inserted after "Human: ".

    Returns:
        The assembled prompt string.
    """
    indent = " " * 8
    system_lines = [
        "",
        indent + "System: Here is some important context which can help inform the questions the Human asks.",
        indent + "Make sure to not make anything up to answer the question if it is not provided in the context.",
        "",
        indent + f"Context: {context}",
        "",
        indent,  # the human turn continues on this indented line
    ]
    return "\n".join(system_lines) + f"Human: {query}"

In [6]:
def _embed_text(text):
    """Embed ``text`` with the fine-tuned encoder and projection head loaded above."""
    return get_embedding(text, tokenizer, lora_model, vector_linear)


# Load the persisted FAISS index and use the adapter above to embed queries.
# SECURITY: allow_dangerous_deserialization=True lets LangChain unpickle the
# stored docstore; only load index folders that were produced by trusted code.
vector_store = FAISS.load_local(
    "vector_database/faiss_stella",
    _embed_text,
    allow_dangerous_deserialization=True,
)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


In [7]:
# Example user question; the cells below use it both as the retrieval query
# and as the "Human:" turn of the prompt.
human_query: str = "I am new to Porsche. Which model should I have?"

In [8]:
# Fetch the three stored chunks closest to the question, then label them
# "Document 1..3" and separate them with blank lines to form the context.
search_results = vector_store.similarity_search(human_query, k=3)
context_string = "\n\n".join(
    f"Document {rank}: " + doc.page_content
    for rank, doc in enumerate(search_results, start=1)
)



In [9]:
# Combine the retrieved context and the question into the final prompt and
# show it. The (misspelled) name `promt_text` is kept because the generation
# cell further down reads it.
promt_text = prompt_template(context=context_string, query=human_query)
print(promt_text)


        System: Here is some important context which can help inform the questions the Human asks.
        Make sure to not make anything up to answer the question if it is not provided in the context.

        Context: Document 1: 4 The dream of the sports car 6 Porsche Codes
10 718 Cayman & 718 Boxster
16 718 Cayman GT4
22 718 Cayman GT4 RS
28 718 Spyder
34 911 Carrera & 911 Targa
40 911 Turbo
46 911 GT3
52 Taycan
58 Panamera
64 Macan
70 Cayenne
78 Porsche Exclusive Manufaktur
82 Porsche Tequipment & 
Porsche Car Configurator
86 Specifications

Document 2: 30 Models
| 718 31
Six-cylinder naturally aspirated boxer engine
Porsche Doppelkupplung (PDK) 
or manual transmission
Rear-wheel drive
Lightweight convertible top
Two seats
Two luggage 
compartments
Adaptive GT sports suspension
GT brake system
Mid-engine concept
For efficiency class, fuel consumption and CO₂ emissions, please refer to page 86 onwards.

Document 3: 48 Models
| 911 49
Six-cylinder high-revving naturally aspirated e

# Load LLM

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

# Hugging Face model id of the instruction-tuned LLM used for generation.
# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# embedding encoder object; `lora_model` keeps its own reference, but re-running
# earlier cells out of order after this one can be confusing. The pipeline cell
# below reads `model`, so a rename has to change both cells together.
model = "tiiuae/falcon-7b-instruct"

# Tokenizer matching the LLM; it is also passed to the generation pipeline.
llm_tokenizer = AutoTokenizer.from_pretrained(model)


In [11]:
# Text-generation pipeline for the LLM named in `model`, loaded in bfloat16 on
# the "mps" device. trust_remote_code=True allows code shipped with the model
# repository to run locally, so only use it with a repository you trust.
pipeline = transformers.pipeline(
    task="text-generation",
    model=model,
    tokenizer=llm_tokenizer,
    device_map="mps",
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Loading checkpoint shards: 100%|██████████| 2/2 [00:34<00:00, 17.48s/it]


In [18]:
# Budget for the generated answer only. The previous `max_length=500` also
# counted the prompt tokens, and the retrieved context already consumes most
# of that budget, leaving little or no room for an answer.
MAX_NEW_TOKENS = 256

# Greedy decoding (do_sample=False) keeps the answer deterministic. `top_k`
# was removed because sampling parameters have no effect without sampling.
# pad_token_id is set explicitly so generate() does not have to fall back to
# eos_token_id on its own (and warn about it).
sequences = pipeline(
    promt_text,
    max_new_tokens=MAX_NEW_TOKENS,
    do_sample=False,
    num_return_sequences=1,
    eos_token_id=llm_tokenizer.eos_token_id,
    pad_token_id=llm_tokenizer.eos_token_id,
)


Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


In [19]:
# Print every returned generation; with the pipeline defaults the generated
# text starts with the prompt itself, followed by the model's continuation.
for generation in sequences:
    generated_text = generation["generated_text"]
    print(f"Result: {generated_text}")

Result: 
        System: Here is some important context which can help inform the questions the Human asks.
        Make sure to not make anything up to answer the question if it is not provided in the context.

        Context: Document 1: 4 The dream of the sports car 6 Porsche Codes
10 718 Cayman & 718 Boxster
16 718 Cayman GT4
22 718 Cayman GT4 RS
28 718 Spyder
34 911 Carrera & 911 Targa
40 911 Turbo
46 911 GT3
52 Taycan
58 Panamera
64 Macan
70 Cayenne
78 Porsche Exclusive Manufaktur
82 Porsche Tequipment & 
Porsche Car Configurator
86 Specifications

Document 2: 30 Models
| 718 31
Six-cylinder naturally aspirated boxer engine
Porsche Doppelkupplung (PDK) 
or manual transmission
Rear-wheel drive
Lightweight convertible top
Two seats
Two luggage 
compartments
Adaptive GT sports suspension
GT brake system
Mid-engine concept
For efficiency class, fuel consumption and CO₂ emissions, please refer to page 86 onwards.

Document 3: 48 Models
| 911 49
Six-cylinder high-revving naturally asp