In [66]:

from dotenv import load_dotenv
from langchain_openai import  AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain,create_history_aware_retriever
from  langchain_community.embeddings import OllamaEmbeddings,HuggingFaceEmbeddings
import hashlib
import os
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import AutoTokenizer,AutoModel

load_dotenv()

os.environ['OPENAI_API_TYPE']=os.getenv("AL_OPENAI_API_TYPE")
os.environ['OPENAI_API_VERSION']=os.getenv("AL_OPENAI_API_VERSION")
os.environ['AZURE_OPENAI_ENDPOINT']=os.getenv("AL_AZURE_OPENAI_ENDPOINT")
os.environ['OPENAI_API_KEY']=os.getenv("AL_OPENAI_API_KEY")
os.environ['DEPLOYMENT_NAME']=os.getenv("AL_DEPLOYMENT_NAME")

os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_PROJECT"]=os.getenv("AL_LANGCHAIN_PROJECT")

os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"


In [89]:
import torch
import torch.nn.functional as F
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import os
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings

class CustomLangChainEmbedding(Embeddings):
    def __init__(self, model_name="all-MiniLM-L6-v2", use_gpu=False):
        """
        Initialize the embedding class with a specific transformer model.
        
        Args:
            model_name (str): Name of the pre-trained transformer model.
            use_gpu (bool): If True, use GPU (CUDA) for inference; otherwise, use CPU.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,clean_up_tokenization_spaces=True)
        self.model = AutoModel.from_pretrained(model_name)

        # Use GPU if available and requested
        self.device = torch.device("cuda" if torch.cuda.is_available() and use_gpu else "cpu")
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")


    def mean_pooling(self, model_output, attention_mask):
        """
        Mean pooling to compute sentence embeddings from token embeddings.
        """
        token_embeddings = model_output[0]  # First element is token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    
        
    def encode_data(self, sentences):
        """
        Encode the input sentences into sentence embeddings.
        
        Args:
            sentences (list of str): List of sentences to encode.
        
        Returns:
            np.ndarray: Sentence embeddings as a numpy array.
        """
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
        with torch.no_grad():
            model_output = self.model(**encoded_input)
                                         
        
        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings)
        print(sentence_embeddings.shape)
        return torch.squeeze(sentence_embeddings).numpy() # Convert to numpy for FAISS or other downstream tasks

    def embed_documents(self, texts):
        """
        LangChain-compatible method to create embeddings for documents.
        
        Args:
            texts (list of str): List of documents (text) to create embeddings for.
        
        Returns:
            np.ndarray: Document embeddings as numpy arrays.
        """
        return self.encode_data(texts)

    def embed_query(self, text):
        """
        LangChain-compatible method to create embedding for a single query.
        
        Args:
            text (str): Query to create embedding for.
        
        Returns:
            np.ndarray: Query embedding as a numpy array.
        """
        return self.encode_data(text)



In [90]:
import os
import hashlib
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

class FaissIndexManager:
    def __init__(self, embedding, index_path="faiss_index"):
        self.embedding = embedding
        self.index_path = index_path
        self.vector_store = self.load_faiss_index()
    
    # Function to save the FAISS index to disk
    def save_faiss_index(self):
        os.makedirs(self.index_path, exist_ok=True)
        self.vector_store.save_local(self.index_path)
        print(f"FAISS index and metadata saved to {self.index_path}")
    
    # Function to load FAISS index from disk
    def load_faiss_index(self):
        index_file = os.path.join(self.index_path, "index.faiss")
        pkl_file = os.path.join(self.index_path, "index.pkl")
        
        if os.path.exists(index_file) and os.path.exists(pkl_file):
            print(f"Loading FAISS index and metadata from {self.index_path}")
            return FAISS.load_local(self.index_path, self.embedding, allow_dangerous_deserialization=True)
        else:
            print(f"No FAISS index found at {self.index_path}, creating a new one.")
            return None
    
    # Function to split a document into chunks
    @staticmethod
    def split_document_into_chunks(document, chunk_size=1000, chunk_overlap=200):
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size, 
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )
        chunks = text_splitter.create_documents([document.page_content])
        return chunks
    
    # Function to generate a consistent document ID using a hash
    @staticmethod
    def generate_doc_id(content):
        normalized_content = content.strip().lower()
        return hashlib.sha256(normalized_content.encode('utf-8')).hexdigest()
    
    # Function to add a PDF document to the FAISS store
    def add_pdf_to_faiss(self, pdf_path):
        if self.vector_store is None:
            # Load or create a new FAISS index
            self.vector_store = self.load_faiss_index()

        pdf_loader = PyPDFLoader(pdf_path)
        documents = pdf_loader.load()

        new_documents = []
        embeddings_list = []

        # Check for existing documents in vector store
        existing_ids = set(
            self.generate_doc_id(doc.page_content)
            for doc_id, doc in self.vector_store.docstore._dict.items()
        ) if self.vector_store is not None else set()

        for document in documents:
            chunks = self.split_document_into_chunks(document)

            for chunk in chunks:
                doc_id = self.generate_doc_id(chunk.page_content)
                if doc_id not in existing_ids:
                    new_embedding = self.embedding.encode_data(chunk.page_content).reshape(1, -1)
                    new_documents.append(Document(page_content=chunk.page_content, metadata={"id": doc_id}))
                    print(f"Embedding new document chunk with doc_id: {doc_id}")
                    embeddings_list.append(new_embedding)

        # Debugging information
        print(f"Total new documents: {len(new_documents)}")
        print(f"Total embeddings created: {len(embeddings_list)}")

        if new_documents:
            if self.vector_store is None:
                # Initialize FAISS index manually, passing in precomputed embeddings
                self.vector_store = FAISS.from_documents(new_documents, self.embedding)
                print(f"Created new FAISS index for {pdf_path}.")
            else:
                # Add the new documents and embeddings to the existing FAISS index
                self.vector_store.add_documents(new_documents, embeddings=embeddings_list)
                for idx, doc in enumerate(new_documents):
                    self.vector_store.index_to_docstore_id[self.vector_store.index.ntotal - len(new_documents) + idx] = doc.metadata["id"]
                print(f"Added {len(new_documents)} new chunks to FAISS index.")
        else:
            print("No new chunks to add to FAISS.")

        # Save the updated FAISS index
        self.save_faiss_index()
        return self.vector_store
    
    # Function to inspect the FAISS store
    def inspect_faiss_store(self):
        if self.vector_store is None:
            print("FAISS store is empty or not loaded.")
            return
        
        # Check number of vectors stored
        num_vectors = self.vector_store.index.ntotal
        print(f"Number of vectors stored: {num_vectors}")
        
        # Check stored documents and metadata
        print("Stored documents:")
        for doc_id, document in self.vector_store.docstore._dict.items():
            print(f"Document ID: {doc_id}")
            print(f"Content: {document.page_content[:200]}")  # Print first 200 characters of content
            print(f"Metadata: {document.metadata}")
        
        # Retrieve and check stored embeddings
        if num_vectors > 0:
            for i in range(min(5, num_vectors)):  # Print embeddings of first 5 documents
                vector = self.vector_store.index.reconstruct(i)
                print(f"Vector Shape: {vector.shape}...")
                print(f"Embedding {i}: {vector[:10]}...")  # Print first 10 dimensions of the embedding
        else:
            print("No embeddings stored.")


In [91]:
# Initialize the embedding model
embedding = CustomLangChainEmbedding(model_name="./Models/all-MiniLM-L6-v2", use_gpu=False)
#embedding=OllamaEmbeddings(model="mxbai-embed-large")
llm=AzureChatOpenAI()
faiss_manager = FaissIndexManager(embedding)

Model loaded on cpu
Loading FAISS index and metadata from faiss_index


In [92]:
vector_store=faiss_manager.add_pdf_to_faiss("./Requirements/SET MSA Schedule 23_updated.pdf")

Total new documents: 0
Total embeddings created: 0
No new chunks to add to FAISS.
FAISS index and metadata saved to faiss_index


In [93]:
retriever=vector_store.as_retriever(search_type="similarity",search_kwargs={"k":5})

In [94]:

retriever.invoke("Security Policy")

torch.Size([1, 384])


[Document(metadata={'id': '94c4d33bff81a106758ebf7b7581bb9f431a3d74e6ae8dff635dc158e7d12edc'}, page_content="(b) the occurrence of one or more events which, either \nindividually or collectively, have an adverse effect on the \nconfidentiality, integrity, availability or security of the railway, \nany data and/or the IT Systems and/or the OT Systems; \nSecurity Policy means the Operator's IT/Information Security Policy as may be \nnotified to the Manufacturer from time to time; and \nSecurity, Continuity & has the meaning given in paragraph 3.1 and as updated from time"),
 Document(metadata={'id': '6e84932888188d8d5fa39a1d237f364c250b3cb9059a1af6fa06fd146a8494cb'}, page_content='Operator, and shall comply with the following in respect thereof:  \n2.3.1 legal and regulatory requirements;  \n2.3.2 best industry practice; \n2.3.3 latest technological developments;  \n2.3.4 threat intelligence (e.g. from National Cyber Security Centre alerts); \n2.3.5 sections [A35 to A40]53 of the Functio

In [84]:
 
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [85]:
history_aware_retriever=create_history_aware_retriever(llm,retriever,contextualize_q_prompt)
question_answer_chain=create_stuff_documents_chain(llm,qa_prompt)
rag_chain=create_retrieval_chain(history_aware_retriever,question_answer_chain)

In [86]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store={}
def get_session_history(session_id:str)->BaseChatMessageHistory:
    if  session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]
        
with_message_history=RunnableWithMessageHistory(llm,get_session_history)

In [87]:
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [88]:
from langchain_core.messages import AIMessage,HumanMessage,SystemMessage
chat_history=[]


question="tell about Security Policy in this document "
response=conversational_rag_chain.invoke(
    {"input": question},
    config={ "configurable": {"session_id": "session-1"} },  # constructs a key "abc123" in `store`.
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)

print(response['answer'])


torch.Size([1, 384])
The Security Policy refers to the Operator's IT/Information Security Policy, which may be notified to the Manufacturer from time to time. The Manufacturer must comply with this Security Policy and ensure that all IT Systems, OT Systems, and associated systems, hardware, and firmware adhere to its requirements. Compliance includes implementing and maintaining appropriate security measures, limiting access to authorized personnel, and adhering to industry standards and legal requirements.


In [54]:
question="The Manufacturer shall:"
response=conversational_rag_chain.invoke(
    {"input": question},
    config={"configurable": {"session_id": "session-1"}},
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)
print(response['answer'])

torch.Size([1, 384])
The Manufacturer shall:

1. Comply with legal and regulatory requirements, best industry practice, latest technological developments, threat intelligence, sections [A35 to A40] of the Functional Specification, the security requirements set out in the Schedule, and any reasonable security guidelines/instructions provided by the Operator.
2. Continually measure, review, provide evidence of, and document its compliance with all security requirements, and report such compliance to the Operator on request.
3. Ensure that all personnel with access to data, network, information systems, IT Systems, and OT Systems are subject to a contractual duty of confidence and comply with the Manufacturer's obligations.
4. Comply with the Rail Cyber Security Guidance to Industry and ensure network segmentation between safety integrity levels (SILs).
5. Comply at all times with the Security, Continuity & Recovery Plan.
6. Notify and assist the Operator in the event of any security inci

In [55]:
chat_history.clear()