In [1]:
 # Embedding model initialization
 
from sentence_transformers import SentenceTransformer, models
import torch

local_model_path = "./models/sentence_transformer_all_mpnet_base_v2"
transformer_model = models.Transformer(
        model_name_or_path=local_model_path,
        tokenizer_args={"local_files_only": True}
    )
pooling_model = models.Pooling(transformer_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[transformer_model, pooling_model])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
print(f"Embedding model loaded on: {device}")


  from .autonotebook import tqdm as notebook_tqdm


Embedding model loaded on: cuda


In [2]:
# custome embeddings class for SentenceTransformer

import numpy as np
class CustomSentenceTransformerEmbeddings:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        
        if isinstance(texts, str):
            texts = [texts]
        embeddings = self.model.encode(texts, convert_to_numpy=True)
        
        
        if embeddings.ndim == 1:
            return [embeddings.tolist()]
        
        # If it's a 2D array, return the list of lists directly. no need to wrap it in list as output will be a list of lists.
        elif embeddings.ndim == 2:
            return embeddings.tolist()
        else:
            # Fallback, though typically not needed.
            return [emb.tolist() for emb in embeddings]

    def embed_query(self, text):
        embedding = self.model.encode(text, convert_to_numpy=True)
        # If the embedding comes back as a 2D array (e.g., shape [1, d]), get the first element.
      
        if isinstance(embedding, np.ndarray):
            if embedding.ndim == 2:
                embedding = embedding[0]
            return embedding.tolist()
        return embedding
  
 # Vector store initialization
from langchain.vectorstores import Chroma        
persist_directory = r"D:\ML\Thesis_chatbot\Data\out\chroma_db"
vectorstore = Chroma(
        persist_directory=persist_directory,
        collection_name="my_collection",
        embedding_function=CustomSentenceTransformerEmbeddings(model)
    )
print("Vector store loaded.")

  vectorstore = Chroma(


Vector store loaded.


In [3]:
    
from langchain_community.llms import LlamaCpp


# Load the model
llm = LlamaCpp(
    model_path=r"D:\ML\Thesis_chatbot\Models\TheBloke_OpenHermes-2.5-Mistral-7B-GGUF\openhermes-2.5-mistral-7b.Q4_K_M.gguf",
    n_gpu_layers=-1,
    n_ctx=32768,      # Context length (4K tokens)
    n_batch=1024,     # Adjust batch size for better performance
    n_threads=8          # Set based on your CPU cores

    )

# Check if the model is loaded on the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"llm is loaded on: {device}")

llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from D:\ML\Thesis_chatbot\Models\TheBloke_OpenHermes-2.5-Mistral-7B-GGUF\openhermes-2.5-mistral-7b.Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = teknium_openhermes-2.5-mistral-7b
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u3

llm is loaded on: cuda


CPU : SSE3 = 1 | SSSE3 = 1 | AVX = 1 | AVX2 = 1 | F16C = 1 | FMA = 1 | AVX512 = 1 | LLAMAFILE = 1 | OPENMP = 1 | AARCH64_REPACK = 1 | 
Model metadata: {'general.name': 'teknium_openhermes-2.5-mistral-7b', 'general.architecture': 'llama', 'llama.context_length': '32768', 'llama.rope.dimension_count': '128', 'llama.embedding_length': '4096', 'llama.block_count': '32', 'llama.feed_forward_length': '14336', 'llama.attention.head_count': '32', 'tokenizer.ggml.eos_token_id': '32000', 'general.file_type': '15', 'llama.attention.head_count_kv': '8', 'llama.attention.layer_norm_rms_epsilon': '0.000010', 'llama.rope.freq_base': '10000.000000', 'tokenizer.ggml.model': 'llama', 'general.quantization_version': '2', 'tokenizer.ggml.bos_token_id': '1', 'tokenizer.ggml.padding_token_id': '0'}
Using fallback chat format: llama-2


In [4]:
# Step 1: Define the question
question = "how design office gives the estimation of work?"

In [None]:
# Step 2: Retrieve context from your vector store using LangChain.
results = vectorstore.similarity_search(question, k=3)
context = "\n".join([doc.page_content for doc in results])
print("Retrieved Context:")
print(context)

In [None]:
# prompt template creation using LangChain
from langchain import PromptTemplate

template = (
    "Using the following relevant context, please answer the question in a detailed manner.\n\n"
    "Context:\n"
    "{context}\n\n"
    "Question:\n"
    "{question}\n\n"
    "Answer:"
)

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# You can then format your prompt like:
formatted_prompt = prompt_template.format(context=context, question=question)
print(formatted_prompt)

In [7]:
# Step 4: Generate an answer with the Llama model.
response = llm(
    formatted_prompt,
    max_tokens=256,      # Increase this as needed for longer responses
    temperature=0.7      # Adjust for response variability if desired
)

print("\nLLM Response:")
print(response)

  response = llm(
llama_perf_context_print:        load time =  104937.72 ms
llama_perf_context_print: prompt eval time =  104936.73 ms /  3708 tokens (   28.30 ms per token,    35.34 tokens per second)
llama_perf_context_print:        eval time =   12000.47 ms /    74 runs   (  162.17 ms per token,     6.17 tokens per second)
llama_perf_context_print:       total time =  116985.67 ms /  3782 tokens



LLM Response:

In the Design Office, an expert plans the resource loading (per month and cost for Architect, Lead Engineer, and Design Engineer) as planned in Figure [4.1] for Hannover office. For example, for complete seat architecture in the month of January, 6.8 resources are planned, and according to that hours and cost are calculated.
