In [1]:

from dotenv import load_dotenv
from langchain_openai import  AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain,create_history_aware_retriever
from  langchain_community.embeddings import OllamaEmbeddings,HuggingFaceEmbeddings
import hashlib
import os
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import AutoTokenizer,AutoModel

load_dotenv()

os.environ['OPENAI_API_TYPE']=os.getenv("AL_OPENAI_API_TYPE")
os.environ['OPENAI_API_VERSION']=os.getenv("AL_OPENAI_API_VERSION")
os.environ['AZURE_OPENAI_ENDPOINT']=os.getenv("AL_AZURE_OPENAI_ENDPOINT")
os.environ['OPENAI_API_KEY']=os.getenv("AL_OPENAI_API_KEY")
os.environ['DEPLOYMENT_NAME']=os.getenv("AL_DEPLOYMENT_NAME")

os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_PROJECT"]=os.getenv("AL_LANGCHAIN_PROJECT")

os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch
import torch.nn.functional as F
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import os
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings

class CustomLangChainEmbedding(Embeddings):
    def __init__(self, model_name="bert-base-uncased", use_gpu=False):
        """
        Initialize the embedding class with a specific transformer model.
        
        Args:
            model_name (str): Name of the pre-trained transformer model.
            use_gpu (bool): If True, use GPU (CUDA) for inference; otherwise, use CPU.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,clean_up_tokenization_spaces=True)
        self.model = AutoModel.from_pretrained(model_name)

        # Use GPU if available and requested
        self.device = torch.device("cuda" if torch.cuda.is_available() and use_gpu else "cpu")
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")


    def mean_pooling(self, model_output, attention_mask):
        """
        Mean pooling to compute sentence embeddings from token embeddings.
        """
        token_embeddings = model_output[0]  # First element is token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    
        
    def encode_data(self, sentences):
        """
        Encode the input sentences into sentence embeddings.
        
        Args:
            sentences (list of str): List of sentences to encode.
        
        Returns:
            np.ndarray: Sentence embeddings as a numpy array.
        """
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
                                         
        
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings)
        print(sentence_embeddings.shape)
        return torch.squeeze(sentence_embeddings).numpy() # Convert to numpy for FAISS or other downstream tasks

    def embed_documents(self, texts):
        """
        LangChain-compatible method to create embeddings for documents.
        
        Args:
            texts (list of str): List of documents (text) to create embeddings for.
        
        Returns:
            np.ndarray: Document embeddings as numpy arrays.
        """
        return self.encode_data(texts)

    def embed_query(self, text):
        """
        LangChain-compatible method to create embedding for a single query.
        
        Args:
            text (str): Query to create embedding for.
        
        Returns:
            np.ndarray: Query embedding as a numpy array.
        """
        return self.encode_data(text)


# Saving and Loading FAISS Index Locally

    # Function to save the FAISS index to disk
def save_faiss_index(vector_store, index_path):
    os.makedirs(index_path, exist_ok=True)
    vector_store.save_local(index_path)
    print(f"FAISS index and metadata saved to {index_path}")


def load_faiss_index(embedding,index_path):
     index_file = os.path.join(index_path, "index.faiss")
     pkl_file = os.path.join(index_path, "index.pkl")
     
     if os.path.exists(index_file) and os.path.exists(pkl_file):
         print(f"Loading FAISS index and metadata from {index_path}")
         return FAISS.load_local(index_path, embedding,allow_dangerous_deserialization=True)
     else:
         print(f"No FAISS index found at {index_path}, creating a new one.")
         return None





In [3]:
embedding = CustomLangChainEmbedding(model_name="./Models/all-MiniLM-L6-v2", use_gpu=False)

Model loaded on cpu


In [4]:
llm=AzureChatOpenAI()
# Initialize the embedding model
#embedding=OllamaEmbeddings(model="mxbai-embed-large")

In [5]:
# Function to split text into manageable chunks using a Recursive Text Splitter
def split_document_into_chunks(document, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.create_documents([document.page_content])
    return chunks


In [6]:
# Function to generate a unique document ID based on the hash of the content
def generate_doc_id(content):
    normalized_content = content.strip().lower()
    return hashlib.sha256(normalized_content.encode('utf-8')).hexdigest()

In [7]:
# Function to add PDF document to FAISS store with consistent doc_id generation
def add_pdf_to_faiss(pdf_path, vector_store=None, index_path="faiss_index"):
    if vector_store is None:
        # Load or create a new FAISS index
        vector_store = load_faiss_index(embedding, index_path)

    pdf_loader = PyPDFLoader(pdf_path)
    documents = pdf_loader.load()

    new_documents = []
    embeddings_list = []

    # Check for existing documents in vector store
    existing_ids = set(
        generate_doc_id(doc.page_content)
        for doc_id, doc in vector_store.docstore._dict.items()
    ) if vector_store is not None else set()

    for document in documents:
        chunks = split_document_into_chunks(document)

        for chunk in chunks:
            doc_id = generate_doc_id(chunk.page_content)
            if doc_id not in existing_ids:
                new_embedding = embedding.encode_data(chunk.page_content).reshape(1,-1)
                new_documents.append(Document(page_content=chunk.page_content, metadata={"id": doc_id}))
                print(f"Embedding new document chunk with doc_id: {doc_id}")
                embeddings_list.append(new_embedding)

     # Debugging information
    print(f"Total new documents: {len(new_documents)}")
    print(f"Total embeddings created: {len(embeddings_list)}")
    
    if new_documents:
        if vector_store is None:
            # Initialize FAISS index manually, passing in precomputed embeddings
            vector_store = FAISS.from_documents(new_documents, embedding)
            print(f"Created new FAISS index for {pdf_path}.")
        else:
            # Add the new documents and embeddings to the existing FAISS index
            vector_store.add_documents(new_documents, embeddings=embeddings_list)
            for idx, doc in enumerate(new_documents):
                vector_store.index_to_docstore_id[vector_store.index.ntotal - len(new_documents) + idx] = doc.metadata["id"]
            print(f"Added {len(new_documents)} new chunks to FAISS index.")
    else:
        print("No new chunks to add to FAISS.")

    # Save the updated FAISS index
    save_faiss_index(vector_store, index_path)
    return vector_store


In [8]:
# Example usage with a PDF file
pdf_path = "./Requirements/SET MSA Schedule 23_updated.pdf"  # Path to the PDF file
# Specify where the FAISS index should be stored
index_path = "faiss_index"

In [9]:
vector_store = add_pdf_to_faiss(pdf_path, vector_store=None, index_path=index_path)

Loading FAISS index and metadata from faiss_index
Total new documents: 0
Total embeddings created: 0
No new chunks to add to FAISS.
FAISS index and metadata saved to faiss_index


In [10]:
def inspect_faiss_store(vector_store):
    # Check number of vectors stored
    num_vectors = vector_store.index.ntotal
    print(f"Number of vectors stored: {num_vectors}")
    
    # Check stored documents and metadata
    print("Stored documents:")
    for doc_id, document in vector_store.docstore._dict.items():
        print(f"Document ID: {doc_id}")
        print(f"Content: {document.page_content[:200]}")  # Print first 200 characters of content
        print(f"Metadata: {document.metadata}")
    
    # Retrieve and check stored embeddings
    if num_vectors > 0:
        for i in range(min(5, num_vectors)):  # Print embeddings of first 5 documents
            vector = vector_store.index.reconstruct(i)
            print(f"Vector Shape: {vector.shape}...")
            print(f"Embedding {i}: {vector[:10]}...")  # Print first 10 dimensions of the embedding
            
    else:
        print("No embeddings stored.")

In [11]:

inspect_faiss_store(vector_store)

Number of vectors stored: 29
Stored documents:
Document ID: b287f7a1-0875-46b3-bd2a-d205ba83f7f8
Content: Manufacture and Supply Agreement 
Schedule 23 
Cyber Security  
AC_182190997_9 244 SCHEDULE 23 
Cyber Security 
1. DEFINITIONS AND INTERPRETATION 
1.1 In this Schedule, in addition to the words and ex
Metadata: {'id': '9210a3f803bbce8a9de86aa23fe0259034da5f3295b67c1e43df146319ba73f9'}
Document ID: 13009598-e897-4d3b-b928-7d2179495b05
Content: optical or tangible media), databases, and information and/or 
communication technology systems (whether or not instal led on or 
forming part of the Purchased Equipment) delivered or made 
available 
Metadata: {'id': '9fea60f03bbd1cd49949fa0d19a0ab599eb240433500364a13139ef90b99f854'}
Document ID: 41afc128-8b2c-4466-af1b-1a357c84a0d6
Content: negligently or without knowledge of its existence, including any 
viruses, worms, trojan horses, adware, spyware, logic bombs or 
other similar things or devices; 
NIS Regulations means The Network an
Met

In [12]:
retriever=vector_store.as_retriever(search_type="similarity",search_kwargs={"k":5})

In [15]:

retriever.invoke("Security Policy")
#em1="Security Policy"
#em=embedding.embed_query(em1)
#vector_store.similarity_search("Security Policy")
#print(em.shape)
#em=em.flatten()
#print(em.shape)
#vector_store.similarity_search_by_vector(em)

  

torch.Size([1, 384])


[Document(metadata={'id': '94c4d33bff81a106758ebf7b7581bb9f431a3d74e6ae8dff635dc158e7d12edc'}, page_content="(b) the occurrence of one or more events which, either \nindividually or collectively, have an adverse effect on the \nconfidentiality, integrity, availability or security of the railway, \nany data and/or the IT Systems and/or the OT Systems; \nSecurity Policy means the Operator's IT/Information Security Policy as may be \nnotified to the Manufacturer from time to time; and \nSecurity, Continuity & has the meaning given in paragraph 3.1 and as updated from time"),
 Document(metadata={'id': '6e84932888188d8d5fa39a1d237f364c250b3cb9059a1af6fa06fd146a8494cb'}, page_content='Operator, and shall comply with the following in respect thereof:  \n2.3.1 legal and regulatory requirements;  \n2.3.2 best industry practice; \n2.3.3 latest technological developments;  \n2.3.4 threat intelligence (e.g. from National Cyber Security Centre alerts); \n2.3.5 sections [A35 to A40]53 of the Functio

In [16]:
 
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [17]:
history_aware_retriever=create_history_aware_retriever(llm,retriever,contextualize_q_prompt)
question_answer_chain=create_stuff_documents_chain(llm,qa_prompt)
rag_chain=create_retrieval_chain(history_aware_retriever,question_answer_chain)

In [18]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store={}
def get_session_history(session_id:str)->BaseChatMessageHistory:
    if  session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]
        
with_message_history=RunnableWithMessageHistory(llm,get_session_history)

In [19]:
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [20]:
from langchain_core.messages import AIMessage,HumanMessage,SystemMessage
chat_history=[]


question="tell about Security Policy in this document "
response=conversational_rag_chain.invoke(
    {"input": question},
    config={ "configurable": {"session_id": "session-1"} },  # constructs a key "abc123" in `store`.
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)

print(response['answer'])


torch.Size([1, 384])
The Security Policy in this document refers to the Operator's IT/Information Security Policy, which is communicated to the Manufacturer as needed. The Manufacturer must comply with this policy, ensuring that all IT and OT Systems, along with related systems, hardware, and firmware, adhere to its requirements. Compliance involves implementing and maintaining appropriate security measures that meet or exceed industry standards and legal requirements.


In [21]:
question="The Manufacturer shall:"
response=conversational_rag_chain.invoke(
    {"input": question},
    config={"configurable": {"session_id": "session-1"}},
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)
print(response['answer'])

torch.Size([1, 384])


Failed to batch ingest runs: langsmith.utils.LangSmithConnectionError: Connection error caused failure to POST https://api.smith.langchain.com/runs/batch in LangSmith API. Please confirm your internet connection. SSLError(MaxRetryError("HTTPSConnectionPool(host='api.smith.langchain.com', port=443): Max retries exceeded with url: /runs/batch (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1006)')))"))
Content-Length: 55909
API Key: lsv2_********************************************21
post: trace=486f5be8-81fe-4821-9d50-3fd1602eb990,id=10bae9a2-58cc-4057-92db-3a305fea3870; patch: trace=486f5be8-81fe-4821-9d50-3fd1602eb990,id=486f5be8-81fe-4821-9d50-3fd1602eb990; trace=486f5be8-81fe-4821-9d50-3fd1602eb990,id=de62bc69-8c71-47f3-b7b1-83ef398ae979; trace=486f5be8-81fe-4821-9d50-3fd1602eb990,id=f3065379-034a-460b-adee-969c5d08c7de; trace=486f5be8-81fe-4821-9d50-3fd1602eb990,id=d033795b-2

The Manufacturer shall:

1. Use all reasonable endeavors to ensure Sub-contractors are subject to cyber security obligations that provide no less protection than those set out in the Schedule.
2. Fully indemnify the Operator against costs, liabilities, and losses from breaches of the Schedule or security incidents caused by the Manufacturer or its Sub-contractors.
3. Certify annually that it has complied with the Schedule's requirements.
4. Allow the Operator to carry out audits and inspections as deemed necessary, providing full access and cooperation.
5. Measure, review, and document compliance with all security requirements, reporting to the Operator upon request.
6. Provide details and root cause analysis of any Security Incident within five working days of resolution.
7. Assist the Operator with information provision for any incident notifications to competent authorities and contribute to investigations as required.


In [25]:
chat_history.clear()