In [30]:
# Assemble a SentenceTransformer by hand from a model stored on local disk.
from sentence_transformers import SentenceTransformer, models

# Directory that holds the locally saved model files.
local_model_path = "./models/sentence_transformer_all_mpnet_base_v2"

# Stage 1 - transformer module: produces token-level embeddings for every
# word of each input sentence. The tokenizer is restricted to local files.
transformer_model = models.Transformer(
    model_name_or_path=local_model_path,
    tokenizer_args={"local_files_only": True},
)

# Stage 2 - pooling module: aggregates the token-level embeddings coming out
# of `transformer_model` into a single vector per input sentence. Its input
# width is read from the transformer module.
pooling_model = models.Pooling(
    transformer_model.get_word_embedding_dimension()
)

# Chain both stages into one SentenceTransformer instance. This `model` is
# used to encode sentences into fixed-size embeddings (768 dimensions
# according to the original author's note).
model = SentenceTransformer(modules=[transformer_model, pooling_model])

print("SentenceTransformer model constructed successfully.")


SentenceTransformer model constructed successfully.


In [31]:
import torch
# Choose the compute device: CUDA when torch reports a usable GPU, CPU
# otherwise. The embedding model is then moved onto that device.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)

print(f"Embedding model is loaded on: {device}")


Embedding model is loaded on: cuda


In [None]:
# Smoke test: encode a one-sentence batch and inspect what comes back.
sample_text = [
    "This is a sample sentence to test the embedding model.",
]
embedding = model.encode(sample_text)

# Show the shape of the result first, then the raw values.
print("Embedding shape:", embedding.shape)
print("Embedding vector:", embedding)


In [32]:
'''The purpose of this cell is to obtain the vector representation of each sentence as plain Python lists.
* For input documents (a single sentence or multiple sentences), the output is a list of lists, where each inner
   list is the vector representation of the corresponding input sentence.
* For an input query, the output is a single list representing the vector of the query.
  This is useful for various NLP tasks such as semantic search, clustering, and classification.
'''

import numpy as np

class CustomSentenceTransformerEmbeddings:
    """Adapter that exposes ``embed_documents`` / ``embed_query`` on top of an encoder.

    The wrapped ``model`` only needs an ``encode(texts, convert_to_numpy=True)``
    method. Results are converted to plain Python lists:

    * ``embed_documents`` returns a list of lists (one inner list per input text).
    * ``embed_query`` returns a single flat list for the query.
    """

    def __init__(self, model):
        """Store the encoder whose ``encode`` method produces the embeddings."""
        self.model = model

    def embed_documents(self, texts):
        """Embed one or more texts and return one vector per text.

        Args:
            texts: A list of strings. A single string passed by mistake is
                wrapped into a one-element list so it is handled uniformly.

        Returns:
            A list of lists of floats, one inner list per input text.
            An empty input yields an empty list.
        """
        if isinstance(texts, str):
            texts = [texts]

        # Nothing to embed: return no vectors without calling the encoder.
        # Without this guard, an encoder that answers an empty batch with an
        # empty 1-D array would fall into the ``ndim == 1`` branch below and
        # be reported as one (empty) embedding: ``[[]]``.
        if hasattr(texts, "__len__") and len(texts) == 0:
            return []

        embeddings = self.model.encode(texts, convert_to_numpy=True)

        # A 1-D result (not expected for a list of texts) is a single vector:
        # wrap it so the output is still a list of lists,
        # e.g. [0.1, 0.2, ...] -> [[0.1, 0.2, ...]].
        if embeddings.ndim == 1:
            return [embeddings.tolist()]

        # A 2-D result already has one row per text; tolist() gives the
        # list of lists directly.
        if embeddings.ndim == 2:
            return embeddings.tolist()

        # Fallback for any other rank (typically not needed).
        return [emb.tolist() for emb in embeddings]

    def embed_query(self, text):
        """Embed a single query and return it as one flat list.

        Args:
            text: The query to embed; passed to the encoder unchanged.

        Returns:
            A list of floats when the encoder returns a NumPy array;
            otherwise the encoder's result is returned as-is.
        """
        embedding = self.model.encode(text, convert_to_numpy=True)
        if isinstance(embedding, np.ndarray):
            # A 2-D result (e.g. shape [1, d]) is reduced to its first row,
            # e.g. [[0.1, 0.2, ...]] -> [0.1, 0.2, ...].
            if embedding.ndim == 2:
                embedding = embedding[0]
            return embedding.tolist()
        return embedding


In [47]:
# Wrap the SentenceTransformer so it offers embed_documents / embed_query.
embedding_function = CustomSentenceTransformerEmbeddings(model)

In [None]:
import json
from langchain.docstore.document import Document

# Read the chunk records from the JSON file on disk.
with open(r"D:\ML\Thesis_chatbot\Data\out\metadata_chunks.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Turn every record into a Document: the chunk text becomes the page content
# and the record's metadata is nested under a single "metadata" key.
# NOTE(review): if item["metadata"] is a dict, the vector store may reject a
# nested metadata value - confirm against the JSON schema.
documents = [
    Document(
        page_content=item["chunk_text"],
        metadata={"metadata": item["metadata"]},
    )
    for item in data
]


In [None]:
from langchain.vectorstores import Chroma

# Directory in which the Chroma database is stored.
persist_directory = r"D:\ML\Thesis_chatbot\Data\out\chroma_db"

# Build the vector store: every document is embedded with the custom
# embedding wrapper and added to the named collection.
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_function,
    collection_name="my_collection",
    persist_directory=persist_directory,
)

# Explicitly persist the collection after it has been built.
vectorstore.persist()



In [None]:
# Example question to run against the vector store.
query = "What OEM names are mentioned?"

# Ask for the 3 most similar documents.
results = vectorstore.similarity_search(query, k=3)

# Print the first 500 characters and the metadata of each hit, followed by
# a separator line.
for doc in results:
    print("Text snippet:", doc.page_content[:500])
    print("Metadata:", doc.metadata)
    print("-" * 50)
