In [100]:

from dotenv import load_dotenv
from langchain_openai import  AzureChatOpenAI
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain,create_history_aware_retriever
from  langchain_community.embeddings import OllamaEmbeddings,HuggingFaceEmbeddings
import hashlib
import os
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document
from transformers import AutoTokenizer,AutoModel

load_dotenv()

os.environ['OPENAI_API_TYPE']=os.getenv("AL_OPENAI_API_TYPE")
os.environ['OPENAI_API_VERSION']=os.getenv("AL_OPENAI_API_VERSION")
os.environ['AZURE_OPENAI_ENDPOINT']=os.getenv("AL_AZURE_OPENAI_ENDPOINT")
os.environ['OPENAI_API_KEY']=os.getenv("AL_OPENAI_API_KEY")
os.environ['DEPLOYMENT_NAME']=os.getenv("AL_DEPLOYMENT_NAME")

os.environ["LANGCHAIN_TRACING_V2"]="true"
os.environ["LANGCHAIN_PROJECT"]=os.getenv("AL_LANGCHAIN_PROJECT")

os.environ["HF_TOKEN"]=os.getenv("HF_TOKEN")

In [101]:
import torch
import torch.nn.functional as F
import faiss
import numpy as np
from transformers import AutoTokenizer, AutoModel
import os
from langchain.vectorstores import FAISS
from langchain.embeddings.base import Embeddings

class CustomLangChainEmbedding(Embeddings):
    def __init__(self, model_name="bert-base-uncased", use_gpu=False):
        """
        Initialize the embedding class with a specific transformer model.
        
        Args:
            model_name (str): Name of the pre-trained transformer model.
            use_gpu (bool): If True, use GPU (CUDA) for inference; otherwise, use CPU.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name,clean_up_tokenization_spaces=True)
        self.model = AutoModel.from_pretrained(model_name)

        # Use GPU if available and requested
        self.device = torch.device("cuda" if torch.cuda.is_available() and use_gpu else "cpu")
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    def mean_pooling(self, model_output, attention_mask):
        """
        Mean pooling to compute sentence embeddings from token embeddings.
        """
        token_embeddings = model_output[0]  # First element is token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def encode_data(self, sentences):
        """
        Encode the input sentences into sentence embeddings.
        
        Args:
            sentences (list of str): List of sentences to encode.
        
        Returns:
            np.ndarray: Sentence embeddings as a numpy array.
        """
        encoded_input = self.tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to(self.device)
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)  # L2-normalize embeddings

        return sentence_embeddings.cpu().numpy()  # Convert to numpy for FAISS or other downstream tasks

    def embed_documents(self, texts):
        """
        LangChain-compatible method to create embeddings for documents.
        
        Args:
            texts (list of str): List of documents (text) to create embeddings for.
        
        Returns:
            np.ndarray: Document embeddings as numpy arrays.
        """
        return self.encode_data(texts)

    def embed_query(self, text):
        """
        LangChain-compatible method to create embedding for a single query.
        
        Args:
            text (str): Query to create embedding for.
        
        Returns:
            np.ndarray: Query embedding as a numpy array.
        """
        return self.encode_data([text])


# Saving and Loading FAISS Index Locally

def save_faiss_index(embedder, sentences, index_path):
    """
    Create and save a FAISS index locally.

    Args:
        embedder (CustomLangChainEmbedding): The embedding class instance.
        sentences (list of str): Sentences to add to the FAISS index.
        index_path (str): Path to save the FAISS index.
    """
    # Step 1: Create FAISS index with the correct embedding dimension
    dimension = embedder.encode_data([sentences[0]]).shape[1]  # Get embedding dimension
    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine similarity) index

    # Step 2: Add embeddings to the FAISS index
    embeddings = embedder.embed_documents(sentences)
    index.add(np.array(embeddings, dtype=np.float32))  # Add embeddings

    # Step 3: Save the FAISS index locally
    faiss.write_index(index, os.path.join(index_path, "index.faiss"))
    print(f"FAISS index saved at {index_path}")


def load_faiss_index(embedder, index_path):
    """
    Load a FAISS index from disk and return the FAISS object from LangChain.
    
    Args:
        embedder (CustomLangChainEmbedding): The embedding class instance.
        index_path (str): Path to the saved FAISS index.
    
    Returns:
        FAISS: LangChain FAISS object with the loaded index.
    """
    index_file = os.path.join(index_path, "index.faiss")
    if not os.path.exists(index_file):
        raise FileNotFoundError(f"FAISS index not found at {index_file}")

    # Step 1: Load the FAISS index from file
    index = faiss.read_index(index_file)

    # Step 2: Load the FAISS index in LangChain
    vector_store = FAISS(embedding=embedder, index=index)
    print(f"FAISS index loaded from {index_path}")
    return vector_store




In [102]:
def encode_data(sentences, tokenizer, model):
    try:
       import torch
       import torch.nn.functional as F
       
       def mean_pooling(model_output, attention_mask):
           token_embeddings = model_output[0] # First element of model_output contains all token embeddings
           input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
           return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1),min=1e-9)

       encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
       with torch.no_grad():
           model_output = model(**encoded_input)
       sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
       sentence_embeddings = F.normalize(sentence_embeddings)
       return torch.squeeze(sentence_embeddings) .numpy()

    except Exception as e:
        print (sentences)

In [103]:
minilmTokenizer = AutoTokenizer.from_pretrained("./Models/all-MiniLM-L6-v2",clean_up_tokenization_spaces=True)
minilmModel = AutoModel.from_pretrained("./Models/all-MiniLM-L6-v2")

In [104]:
embedding = CustomLangChainEmbedding(model_name="./Models/all-MiniLM-L6-v2", use_gpu=False)


Model loaded on cpu


In [105]:
llm=AzureChatOpenAI()
# Initialize the embedding model
#embedding=OllamaEmbeddings(model="mxbai-embed-large")

In [106]:
# Function to split text into manageable chunks using a Recursive Text Splitter
def split_document_into_chunks(document, chunk_size=1000, chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, 
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""]
    )
    chunks = text_splitter.create_documents([document.page_content])
    return chunks


In [107]:
# Function to generate a unique document ID based on the hash of the content
def generate_doc_id(content):
    normalized_content = content.strip().lower()
    return hashlib.sha256(normalized_content.encode('utf-8')).hexdigest()

In [108]:
# Function to load the FAISS index from disk if it exists
def load_faiss_index(index_path, embedding):
    index_file = os.path.join(index_path, "index.faiss")
    pkl_file = os.path.join(index_path, "index.pkl")
    
    if os.path.exists(index_file) and os.path.exists(pkl_file):
        print(f"Loading FAISS index and metadata from {index_path}")
        return FAISS.load_local(index_path, embedding,allow_dangerous_deserialization=True)
    else:
        print(f"No FAISS index found at {index_path}, creating a new one.")
        return None


In [109]:
# Function to save the FAISS index to disk
def save_faiss_index(vector_store, index_path):
    os.makedirs(index_path, exist_ok=True)
    vector_store.save_local(index_path)
    print(f"FAISS index and metadata saved to {index_path}")

In [113]:

# Function to add PDF document to FAISS store with consistent doc_id generation
def add_pdf_to_faiss(pdf_path, vector_store=None, index_path="faiss_index"):
    if vector_store is None:
        vector_store = embedding.load_faiss_index(embedding,index_path)

    # Load the PDF document
    pdf_loader = PyPDFLoader(pdf_path)
    documents = pdf_loader.load()
    
    new_documents = []
    new_embeddings = []

    # Retrieve existing document IDs from FAISS and normalize them
    existing_ids = set()
    if vector_store is not None:
        existing_ids = set(
            generate_doc_id(doc.page_content)  # Re-generate hash-based doc_ids for existing documents
            for doc_id, doc in vector_store.docstore._dict.items()
        )
        

    # Iterate through the documents, chunk them, and check if they are already embedded
    for document in documents:
        chunks = split_document_into_chunks(document)
        
        for chunk in chunks:
            doc_id = generate_doc_id(chunk.page_content)
            
            #encoded_data=encode_data(chunk.page_content,minilmTokenizer,minilmModel)
            #print(encoded_data)
        
            if doc_id not in existing_ids:
                new_documents.append(Document(page_content=chunk.page_content, metadata={"id": doc_id}))
                new_embeddings.append(embedding.embed_documents([chunk.page_content])[0])
                print(f"Embedding new document chunk with doc_id: {doc_id}")
            else:
                print(f"Document chunk {doc_id} already exists in FAISS, skipping.")

    if new_documents:
        if vector_store is None:
            vector_store = FAISS.from_documents(new_documents, embedding)
            print(f"Created new FAISS index for {pdf_path}.")
        else:
            for i, document in enumerate(new_documents):
                vector_store.add_documents([document], embeddings=[new_embeddings[i]])
                # Map the generated doc_id to the FAISS index's internal mapping
                vector_store.index_to_docstore_id[vector_store.index.ntotal - 1] = document.metadata["id"]
            print(f"Added {len(new_documents)} new chunks to FAISS index.")
    else:
        print("No new chunks to add to FAISS.")

    embedding.save_faiss_index(vector_store, index_path)

    return vector_store

In [114]:
# Example usage with a PDF file
pdf_path = "./Requirements/SET MSA Schedule 23_updated.pdf"  # Path to the PDF file
# Specify where the FAISS index should be stored
index_path = "faiss_index"

In [115]:
vector_store = add_pdf_to_faiss(pdf_path, vector_store=None, index_path=index_path)

AttributeError: 'CustomLangChainEmbedding' object has no attribute 'load_faiss_index'

In [94]:

retriever=vector_store.as_retriever(search_type="similarity",search_kwargs={"k":1})

In [95]:
# #embeddings=OllamaEmbeddings(model="mxbai-embed-large")
# embeddings=AzureOpenAIEmbeddings()
# 
# loader=PyPDFDirectoryLoader("Requirements")
# docs=loader.load()
# text_splitter=RecursiveCharacterTextSplitter(chunk_size=1000,chunk_overlap=200)
# chunksDocuments=text_splitter.split_documents(docs)
# ##result = [chunksDocument.dict()['page_content'] for chunksDocument in chunksDocuments]
# vector_store_db=FAISS.from_documents(chunksDocuments,embeddings)
# ##retriever=vector_store_db.as_retriever(search_type="similarity",search_kwargs={"k":1})
# ##retriever_tool=create_retriever_tool(retriever,"PhaseFinder","Search phases in the document")

In [96]:
search_results=vector_store.similarity_search("Security Policy",k=1)
#retriever.invoke("Security Policy")
for result in search_results:
    print(f"Text: {result.page_content}")

ValueError: too many values to unpack (expected 2)

In [71]:

contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)
contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

In [72]:
history_aware_retriever=create_history_aware_retriever(llm,retriever,contextualize_q_prompt)
question_answer_chain=create_stuff_documents_chain(llm,qa_prompt)
rag_chain=create_retrieval_chain(history_aware_retriever,question_answer_chain)

In [73]:
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

store={}
def get_session_history(session_id:str)->BaseChatMessageHistory:
    if  session_id not in store:
        store[session_id]=ChatMessageHistory()
    return store[session_id]
        
with_message_history=RunnableWithMessageHistory(llm,get_session_history)

In [74]:
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [75]:
from langchain_core.messages import AIMessage,HumanMessage,SystemMessage
chat_history=[]


question="tell about Security Policy in this document "
response=conversational_rag_chain.invoke(
    {"input": question},
    config={ "configurable": {"session_id": "session-1"} },  # constructs a key "abc123" in `store`.
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)

print(response['answer'])


ValueError: too many values to unpack (expected 2)

In [55]:
question="The Manufacturer shall:"
response=conversational_rag_chain.invoke(
    {"input": question},
    config={"configurable": {"session_id": "session-1"}},
)

chat_history.extend(
    [
        HumanMessage(content=question),
        AIMessage(content=response["answer"]),
    ]
)
print(response['answer'])

ValueError: too many values to unpack (expected 2)

In [25]:
chat_history.clear()