In [4]:

from dotenv import load_dotenv
from langchain_openai import  AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain,create_history_aware_retriever
from  langchain_community.embeddings import OllamaEmbeddings,HuggingFaceEmbeddings
import hashlib
import os
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import AutoTokenizer,AutoModel

load_dotenv()

os.environ['OPENAI_API_TYPE']=os.getenv("AL_OPENAI_API_TYPE")
os.environ['OPENAI_API_VERSION']=os.getenv("AL_OPENAI_API_VERSION")
os.environ['AZURE_OPENAI_ENDPOINT']=os.getenv("AL_AZURE_OPENAI_ENDPOINT")
os.environ['OPENAI_API_KEY']=os.getenv("AL_OPENAI_API_KEY")
os.environ['DEPLOYMENT_NAME']=os.getenv("AL_DEPLOYMENT_NAME")

os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_PROJECT"]=os.getenv("AL_LANGCHAIN_PROJECT")

os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

In [5]:
import torch
import torch.nn.functional as F
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import os
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings

class CustomLangChainEmbedding(Embeddings):
    def __init__(self, model_name="bert-base-uncased", use_gpu=False):
        """
        Initialize the embedding class with a specific transformer model.
        
        Args:
            model_name (str): Name of the pre-trained transformer model.
            use_gpu (bool): If True, use GPU (CUDA) for inference; otherwise, use CPU.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,clean_up_tokenization_spaces=True)
        self.model = AutoModel.from_pretrained(model_name)

        # Use GPU if available and requested
        self.device = torch.device("cuda" if torch.cuda.is_available() and use_gpu else "cpu")
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")


    def mean_pooling(self, model_output, attention_mask):
        """
        Mean pooling to compute sentence embeddings from token embeddings.
        """
        token_embeddings = model_output[0]  # First element is token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    
        
    def encode_data(self, sentences):
        """
        Encode the input sentences into sentence embeddings.
        
        Args:
            sentences (list of str): List of sentences to encode.
        
        Returns:
            np.ndarray: Sentence embeddings as a numpy array.
        """
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)  # L2-normalize embeddings
        return sentence_embeddings.cpu().numpy()  # Convert to numpy for FAISS or other downstream tasks

    def embed_documents(self, texts):
        """
        LangChain-compatible method to create embeddings for documents.
        
        Args:
            texts (list of str): List of documents (text) to create embeddings for.
        
        Returns:
            np.ndarray: Document embeddings as numpy arrays.
        """
        return self.encode_data(texts)

    def embed_query(self, text):
        """
        LangChain-compatible method to create embedding for a single query.
        
        Args:
            text (str): Query to create embedding for.
        
        Returns:
            np.ndarray: Query embedding as a numpy array.
        """
        return self.encode_data(text)


# Saving and Loading FAISS Index Locally

    # Function to save the FAISS index to disk
def save_faiss_index(vector_store, index_path):
    os.makedirs(index_path, exist_ok=True)
    vector_store.save_local(index_path)
    print(f"FAISS index and metadata saved to {index_path}")


def load_faiss_index(embedding,index_path):
     index_file = os.path.join(index_path, "index.faiss")
     pkl_file = os.path.join(index_path, "index.pkl")
     
     if os.path.exists(index_file) and os.path.exists(pkl_file):
         print(f"Loading FAISS index and metadata from {index_path}")
         return FAISS.load_local(index_path, embedding,allow_dangerous_deserialization=True)
     else:
         print(f"No FAISS index found at {index_path}, creating a new one.")
         return None





In [6]:
embedding = CustomLangChainEmbedding(model_name="./Models/all-MiniLM-L6-v2", use_gpu=False)

Model loaded on cpu


In [7]:
llm=AzureChatOpenAI()
# Initialize the embedding model
#embedding=OllamaEmbeddings(model="mxbai-embed-large")

In [8]:
# Function to split text into manageable chunks using a Recursive Text Splitter
def split_document_into_chunks(document, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.create_documents([document.page_content])
    return chunks


In [9]:
# Function to generate a unique document ID based on the hash of the content
def generate_doc_id(content):
    normalized_content = content.strip().lower()
    return hashlib.sha256(normalized_content.encode('utf-8')).hexdigest()

In [10]:
# Function to add PDF document to FAISS store with consistent doc_id generation
def add_pdf_to_faiss(pdf_path, vector_store=None, index_path="faiss_index"):
    if vector_store is None:
        # Load or create a new FAISS index
        vector_store = load_faiss_index(embedding, index_path)

    pdf_loader = PyPDFLoader(pdf_path)
    documents = pdf_loader.load()

    new_documents = []
    embeddings_list = []

    # Check for existing documents in vector store
    existing_ids = set(
        generate_doc_id(doc.page_content)
        for doc_id, doc in vector_store.docstore._dict.items()
    ) if vector_store is not None else set()

    for document in documents:
        chunks = split_document_into_chunks(document)

        for chunk in chunks:
            doc_id = generate_doc_id(chunk.page_content)
            if doc_id not in existing_ids:
                new_documents.append(Document(page_content=chunk.page_content, metadata={"id": doc_id}))
                print(f"Embedding new document chunk with doc_id: {doc_id}")

    if new_documents:
        if vector_store is None:
            # Initialize FAISS index manually, passing in precomputed embeddings
            vector_store = FAISS.from_documents(new_documents, embedding)
            print(f"Created new FAISS index for {pdf_path}.")
        else:
            # Add the new documents and embeddings to the existing FAISS index
            vector_store.add_documents(new_documents, embeddings=embeddings_list)
            for idx, doc in enumerate(new_documents):
                vector_store.index_to_docstore_id[vector_store.index.ntotal - len(new_documents) + idx] = doc.metadata["id"]
            print(f"Added {len(new_documents)} new chunks to FAISS index.")
    else:
        print("No new chunks to add to FAISS.")

    # Save the updated FAISS index
    save_faiss_index(vector_store, index_path)
    return vector_store


In [11]:
# Example usage with a PDF file
pdf_path = "./Requirements/SET MSA Schedule 23_updated.pdf"  # Path to the PDF file
# Specify where the FAISS index should be stored
index_path = "faiss_index"

In [12]:
vector_store = add_pdf_to_faiss(pdf_path, vector_store=None, index_path=index_path)

No FAISS index found at faiss_index, creating a new one.
Embedding new document chunk with doc_id: 9210a3f803bbce8a9de86aa23fe0259034da5f3295b67c1e43df146319ba73f9
Embedding new document chunk with doc_id: 9fea60f03bbd1cd49949fa0d19a0ab599eb240433500364a13139ef90b99f854
Embedding new document chunk with doc_id: b0f8b8bd3f42e49cb309d763d97b9b543e50bcb7638883b2d57ec07cff5c1c4c
Embedding new document chunk with doc_id: 94c4d33bff81a106758ebf7b7581bb9f431a3d74e6ae8dff635dc158e7d12edc
Embedding new document chunk with doc_id: 81119c46edc025a2e334e347defc2dee98eb3a607db6569e47bb310dfbe7ebd3
Embedding new document chunk with doc_id: 917f707ba9989c6ad59f6cc5f39107f1241ae22ae0db117af1aeb6e276c779f4
Embedding new document chunk with doc_id: 6e84932888188d8d5fa39a1d237f364c250b3cb9059a1af6fa06fd146a8494cb
Embedding new document chunk with doc_id: 7d6aa675be3a1f273d3dbd1eaedca26658d23ffa584d078ea4a7039809563487
Embedding new document chunk with doc_id: 19892e3d9a30d21299914d13d8a0b19d4c5e538814bb7

In [13]:
def inspect_faiss_store(vector_store):
    # Check number of vectors stored
    num_vectors = vector_store.index.ntotal
    print(f"Number of vectors stored: {num_vectors}")
    
    # Check stored documents and metadata
    print("Stored documents:")
    for doc_id, document in vector_store.docstore._dict.items():
        print(f"Document ID: {doc_id}")
        print(f"Content: {document.page_content[:200]}")  # Print first 200 characters of content
        print(f"Metadata: {document.metadata}")
    
    # Retrieve and check stored embeddings
    if num_vectors > 0:
        for i in range(min(50, num_vectors)):  # Print embeddings of first 5 documents
            vector = vector_store.index.reconstruct(i)
            print(f"Embedding {i}: {vector[:10]}...")  # Print first 10 dimensions of the embedding
            print(vector.shape)
    else:
        print("No embeddings stored.")

In [14]:

inspect_faiss_store(vector_store)

Number of vectors stored: 29
Stored documents:
Document ID: 5213ca4e-f68c-4d41-9e80-9bf993f9ba2b
Content: Manufacture and Supply Agreement 
Schedule 23 
Cyber Security  
AC_182190997_9 244 SCHEDULE 23 
Cyber Security 
1. DEFINITIONS AND INTERPRETATION 
1.1 In this Schedule, in addition to the words and ex
Metadata: {'id': '9210a3f803bbce8a9de86aa23fe0259034da5f3295b67c1e43df146319ba73f9'}
Document ID: 7969e74d-1b3d-4d26-a4e8-408ff48238ac
Content: optical or tangible media), databases, and information and/or 
communication technology systems (whether or not instal led on or 
forming part of the Purchased Equipment) delivered or made 
available 
Metadata: {'id': '9fea60f03bbd1cd49949fa0d19a0ab599eb240433500364a13139ef90b99f854'}
Document ID: fac6960d-1d21-46e3-9d11-98923f28b99f
Content: negligently or without knowledge of its existence, including any 
viruses, worms, trojan horses, adware, spyware, logic bombs or 
other similar things or devices; 
NIS Regulations means The Network an
Met

In [15]:
retriever=vector_store.as_retriever(search_type="similarity",search_kwargs={"k":5})

In [16]:

retriever.invoke("Security Policy")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: too many values to unpack (expected 2)

In [143]:

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [625]:
history_aware_retriever=create_history_aware_retriever(llm,retriever,contextualize_q_prompt)
question_answer_chain=create_stuff_documents_chain(llm,qa_prompt)
rag_chain=create_retrieval_chain(history_aware_retriever,question_answer_chain)

In [626]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store={}
def get_session_history(session_id:str)->BaseChatMessageHistory:
    if  session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]
        
with_message_history=RunnableWithMessageHistory(llm,get_session_history)

In [627]:
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [628]:
from langchain_core.messages import AIMessage,HumanMessage,SystemMessage
chat_history=[]


question="tell about Security Policy in this document "
response=conversational_rag_chain.invoke(
    {"input": question},
    config={ "configurable": {"session_id": "session-1"} },  # constructs a key "abc123" in `store`.
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)

print(response['answer'])


ValueError: too many values to unpack (expected 2)

In [55]:
question="The Manufacturer shall:"
response=conversational_rag_chain.invoke(
    {"input": question},
    config={"configurable": {"session_id": "session-1"}},
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)
print(response['answer'])

ValueError: too many values to unpack (expected 2)

In [25]:
chat_history.clear()