In [14]:

from dotenv import load_dotenv
from langchain_openai import  AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain,create_history_aware_retriever
from  langchain_community.embeddings import OllamaEmbeddings
import hashlib
import os
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
import torch
import torch.nn.functional as F
load_dotenv()


def _export_required_env(mapping):
    """Copy environment variables to new names, failing loudly if any is unset.

    Assigning ``None`` to ``os.environ[...]`` raises a bare
    ``TypeError: str expected, not NoneType`` that does not say which variable
    is missing, so every source is validated first and all missing names are
    reported together.

    Args:
        mapping: dict of ``target_name -> source_name``.

    Raises:
        RuntimeError: listing every source variable that is not set.
    """
    missing = sorted({source for source in mapping.values() if os.getenv(source) is None})
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): " + ", ".join(missing)
            + ". Set them in the environment or in the file read by load_dotenv()."
        )
    for target, source in mapping.items():
        os.environ[target] = os.environ[source]


# The project keeps its settings under AL_-prefixed names; re-export them under
# the names assigned below. HF_TOKEN maps onto itself, so that entry only
# verifies that the token is present.
_export_required_env({
    'OPENAI_API_TYPE': "AL_OPENAI_API_TYPE",
    'OPENAI_API_VERSION': "AL_OPENAI_API_VERSION",
    'AZURE_OPENAI_ENDPOINT': "AL_AZURE_OPENAI_ENDPOINT",
    'OPENAI_API_KEY': "AL_OPENAI_API_KEY",
    'DEPLOYMENT_NAME': "AL_DEPLOYMENT_NAME",
    "LANGCHAIN_PROJECT": "AL_LANGCHAIN_PROJECT",
    "HF_TOKEN": "HF_TOKEN",
})

os.environ["LANGCHAIN_TRACING_V2"] = "true"

In [46]:
def encode_data(sentences, tokenizer, model):
    """Embed sentences by attention-masked mean pooling followed by L2 normalisation.

    Args:
        sentences: input passed straight to ``tokenizer`` (with padding and
            truncation enabled, PyTorch tensors requested).
        tokenizer: callable returning a mapping that contains ``'attention_mask'``
            and can be splatted into ``model``.
        model: callable whose output's first element holds the per-token
            embeddings.

    Returns:
        A NumPy array of the normalised embeddings with all size-1 dimensions
        squeezed out, or ``None`` if anything in the pipeline raised.
    """

    def mean_pooling(model_output, attention_mask):
        token_embeddings = model_output[0]  # first element holds all token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        # Sum only the unmasked tokens; clamp the divisor so an all-zero mask
        # cannot cause a division by zero.
        summed = torch.sum(token_embeddings * input_mask_expanded, 1)
        counts = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
        return summed / counts

    try:
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():  # inference only, no gradients needed
            model_output = model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # p=2, dim=1 are F.normalize's defaults, spelled out for the reader.
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
        return torch.squeeze(sentence_embeddings).numpy()
    except Exception as e:
        # Best-effort: keep going, but report what failed and why instead of
        # printing only the offending input.
        print(f"encode_data failed for {sentences!r}: {type(e).__name__}: {e}")
        return None

In [15]:
# Chat model, constructed with no explicit arguments
# (presumably configured through the environment variables set earlier — verify).
llm = AzureChatOpenAI()

# Embedding model shared by index creation, loading and retrieval.
embedding = OllamaEmbeddings(model="mxbai-embed-large")

In [16]:
# Split one loaded document into overlapping text chunks.
def split_document_into_chunks(document, chunk_size=1000, chunk_overlap=200):
    """Chunk ``document.page_content`` with a recursive character splitter.

    Args:
        document: object exposing a ``page_content`` string.
        chunk_size: size limit handed to the splitter.
        chunk_overlap: overlap handed to the splitter.

    Returns:
        The documents produced by ``create_documents``. Only the text is
        passed to the splitter; the input document's metadata is not.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        # Tried in order: paragraph, line, word, then single character.
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.create_documents([document.page_content])


In [17]:
# Derive a stable identifier for a chunk from its text.
def generate_doc_id(content):
    """Return the SHA-256 hex digest of ``content`` after trimming and lower-casing.

    Surrounding whitespace and letter case are normalised first, so texts that
    differ only in those respects share the same id.
    """
    digest = hashlib.sha256()
    digest.update(content.strip().lower().encode('utf-8'))
    return digest.hexdigest()

In [18]:
# Load a previously persisted FAISS store, if one is present on disk.
def load_faiss_index(index_path, embedding):
    """Return the FAISS store saved under ``index_path``, or ``None`` if absent.

    The store counts as present only when both ``index.faiss`` and
    ``index.pkl`` exist in the directory.

    Args:
        index_path: directory that should hold the two index files.
        embedding: embedding object forwarded to ``FAISS.load_local``.
    """
    required_files = ("index.faiss", "index.pkl")
    have_all = all(
        os.path.exists(os.path.join(index_path, file_name))
        for file_name in required_files
    )
    if not have_all:
        print(f"No FAISS index found at {index_path}, creating a new one.")
        return None

    print(f"Loading FAISS index and metadata from {index_path}")
    # NOTE(review): allow_dangerous_deserialization=True unpickles index.pkl;
    # only point this at index directories you created yourself.
    return FAISS.load_local(index_path, embedding, allow_dangerous_deserialization=True)


In [19]:
# Persist a FAISS store to a directory on disk.
def save_faiss_index(vector_store, index_path):
    """Write ``vector_store`` into ``index_path``, creating the directory if needed.

    Args:
        vector_store: object exposing ``save_local(path)``.
        index_path: target directory; missing parents are created.
    """
    os.makedirs(index_path, exist_ok=True)  # no error if it already exists
    vector_store.save_local(index_path)
    print(f"FAISS index and metadata saved to {index_path}")

In [26]:

# Function to add PDF document to FAISS store with consistent doc_id generation
def add_pdf_to_faiss(pdf_path, vector_store=None, index_path="faiss_index"):
    """Index the chunks of a PDF that are not in the FAISS store yet, then persist it.

    Each chunk is identified by ``generate_doc_id`` (a hash of its normalised
    text). Chunks whose hash matches a document already in the store, or an
    earlier chunk of the same PDF, are skipped. New chunks are embedded once,
    in a single batch, and stored under their hash id so that the docstore key
    and the index-to-docstore mapping always agree.

    Args:
        pdf_path: path of the PDF to ingest.
        vector_store: existing FAISS store to extend; when ``None`` the store
            saved under ``index_path`` is loaded if there is one.
        index_path: directory the store is loaded from and saved to.

    Returns:
        The updated (or newly created) FAISS store, or ``None`` when no store
        exists and the PDF produced no chunks (nothing is saved in that case).
    """
    if vector_store is None:
        vector_store = load_faiss_index(index_path, embedding)

    # Load the PDF document
    pages = PyPDFLoader(pdf_path).load()

    # Hash the text of every stored document so that content indexed under a
    # different key (e.g. an auto-generated id) is still recognised.
    seen_ids = set()
    if vector_store is not None:
        seen_ids = {
            generate_doc_id(stored.page_content)
            for stored in vector_store.docstore._dict.values()
        }

    # Collect the chunks that are genuinely new.
    new_texts = []
    new_ids = []
    for page in pages:
        for chunk in split_document_into_chunks(page):
            doc_id = generate_doc_id(chunk.page_content)
            if doc_id in seen_ids:
                print(f"Document chunk {doc_id} already exists in FAISS, skipping.")
                continue
            # Remember the id immediately: a chunk repeated inside the same PDF
            # must not be added twice (duplicate ids are rejected by the docstore).
            seen_ids.add(doc_id)
            new_texts.append(chunk.page_content)
            new_ids.append(doc_id)
            print(f"Embedding new document chunk with doc_id: {doc_id}")

    if new_texts:
        # One batched embedding call; the vectors are handed to FAISS directly
        # so the store does not embed the same texts a second time.
        vectors = embedding.embed_documents(new_texts)
        text_embeddings = list(zip(new_texts, vectors))
        metadatas = [{"id": doc_id} for doc_id in new_ids]

        if vector_store is None:
            vector_store = FAISS.from_embeddings(
                text_embeddings, embedding, metadatas=metadatas, ids=new_ids
            )
            print(f"Created new FAISS index for {pdf_path}.")
        else:
            # Passing ids= lets FAISS register the hash as the docstore key
            # itself; patching index_to_docstore_id afterwards would point at
            # a key that is missing from the docstore.
            vector_store.add_embeddings(text_embeddings, metadatas=metadatas, ids=new_ids)
            print(f"Added {len(new_texts)} new chunks to FAISS index.")
    else:
        print("No new chunks to add to FAISS.")

    if vector_store is None:
        # No saved index and nothing to index: there is no store to persist.
        print(f"No FAISS index exists and {pdf_path} produced no chunks; nothing saved.")
        return None

    save_faiss_index(vector_store, index_path)

    return vector_store

In [27]:
# Example input: the PDF to ingest.
pdf_path = "./Requirements/SET MSA Schedule 23_updated.pdf"

# Directory in which the FAISS index is persisted.
index_path = "faiss_index"

In [28]:
# No store is passed in, so the function falls back to whatever is saved under index_path.
vector_store = add_pdf_to_faiss(pdf_path, index_path=index_path)

Loading FAISS index and metadata from faiss_index
Document chunk 9210a3f803bbce8a9de86aa23fe0259034da5f3295b67c1e43df146319ba73f9 already exists in FAISS, skipping.
Document chunk 9fea60f03bbd1cd49949fa0d19a0ab599eb240433500364a13139ef90b99f854 already exists in FAISS, skipping.
Document chunk b0f8b8bd3f42e49cb309d763d97b9b543e50bcb7638883b2d57ec07cff5c1c4c already exists in FAISS, skipping.
Document chunk 94c4d33bff81a106758ebf7b7581bb9f431a3d74e6ae8dff635dc158e7d12edc already exists in FAISS, skipping.
Document chunk 81119c46edc025a2e334e347defc2dee98eb3a607db6569e47bb310dfbe7ebd3 already exists in FAISS, skipping.
Document chunk 917f707ba9989c6ad59f6cc5f39107f1241ae22ae0db117af1aeb6e276c779f4 already exists in FAISS, skipping.
Document chunk 6e84932888188d8d5fa39a1d237f364c250b3cb9059a1af6fa06fd146a8494cb already exists in FAISS, skipping.
Document chunk 7d6aa675be3a1f273d3dbd1eaedca26658d23ffa584d078ea4a7039809563487 already exists in FAISS, skipping.
Document chunk 19892e3d9a30d21

In [38]:

# Similarity search over the store, five results per query.
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

In [39]:
# #embeddings=OllamaEmbeddings(model="mxbai-embed-large")
# embeddings=AzureOpenAIEmbeddings()
# 
# loader=PyPDFDirectoryLoader("Requirements")
# docs=loader.load()
# text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
# chunksDocuments=text_splitter.split_documents(docs)
# ##result = [chunksDocument.dict()['page_content'] for chunksDocument in chunksDocuments]
# vector_store_db=FAISS.from_documents(chunksDocuments,embeddings)
# ##retriever=vector_store_db.as_retriever(search_type="similarity",search_kwargs={"k":1})
# ##retriever_tool=create_retriever_tool(retriever,"PhaseFinder","Search phases in the document")

In [40]:

# System instruction for the question-rewriting step: turn a follow-up into a
# standalone question without answering it.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might "
    "reference context in the chat history, formulate a standalone "
    "question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed and "
    "otherwise return it as is."
)

# Layout: system instruction, prior turns, then the latest user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

# System instruction for the answering step; {context} is the slot for the
# retrieved text.
system_prompt = (
    "You are an assistant for question-answering tasks. Use the following "
    "pieces of retrieved context to answer the question. If you don't know "
    "the answer, say that you don't know. Use three sentences maximum and "
    "keep the answer concise."
    "\n\n{context}"
)

# Same three-part layout as the rewriting prompt.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human", "{input}"),
])

In [41]:
# Retriever wrapped with the LLM and the question-rewriting prompt.
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

# Chain built from the LLM and the answering prompt.
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)

# Full pipeline: history-aware retrieval followed by answering.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [42]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# In-memory registry of chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the history registered for ``session_id``, creating it on first use."""
    if session_id in store:
        return store[session_id]

    store[session_id] = ChatMessageHistory()
    return store[session_id]
        
# Bare LLM wrapped with per-session message history.
with_message_history = RunnableWithMessageHistory(llm, get_session_history)

In [43]:
# RAG chain wrapped with per-session history. The three *_key arguments name
# the dict fields used for the user input, the injected history and the answer.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain, get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [44]:
from langchain_core.messages import AIMessage,HumanMessage,SystemMessage
# Local transcript of the conversation, kept alongside the per-session history.
chat_history = []

question = "tell about Security Policy in this document "

# The session id selects the history; this call uses the key "session-1" in `store`.
response = conversational_rag_chain.invoke(
    {"input": question},
    config={"configurable": {"session_id": "session-1"}},
)

# Record the turn: the question followed by the model's answer.
chat_history += [
    HumanMessage(content=question),
    AIMessage(content=response["answer"]),
]

print(response['answer'])


The Security Policy in this document requires the Manufacturer to ensure that all IT Systems, OT Systems, and associated systems, hardware, and firmware comply with its requirements. The Manufacturer must limit access to these systems to authorized personnel who need access solely for performing obligations under the MSA, with access rights reviewed regularly and personnel appropriately screened in accordance with best industry practice. Additionally, the Manufacturer must comply with the Security Policy and any other cyber security policies and procedures notified by the Operator.


In [45]:
# Follow-up question sent on the same session id ("session-1").
question = "The Manufacturer shall:"

response = conversational_rag_chain.invoke(
    {"input": question},
    config={"configurable": {"session_id": "session-1"}},
)

# Record the turn: the question followed by the model's answer.
chat_history += [
    HumanMessage(content=question),
    AIMessage(content=response["answer"]),
]

print(response['answer'])

The Manufacturer shall:

1. Comply with legal and regulatory requirements, best industry practice, latest technological developments, threat intelligence, sections [A35 to A40] of the Functional Specification, security requirements set out in this Schedule, and any reasonable security guidelines or instructions provided by the Operator.
2. Continually measure, review, provide evidence of, and document compliance with all security requirements, reporting such compliance to the Operator upon request.
3. Allow the Operator and/or the Owner to carry out necessary audits or inspections.
4. Provide full details of any Security Incident to the Operator within five working days of its resolution, including root cause analysis.
5. Assist the Operator with information provision in connection with incident notifications to authorities and contribute to any subsequent investigations or inspections.
6. Certify annually that it has complied with the requirements of this Schedule. 
7. Take all reason

In [25]:
# Empty the local transcript list in place; this statement does not touch `store`.
del chat_history[:]