In [286]:

from dotenv import load_dotenv
import json
import torch
import torch.nn.functional as F
import numpy as np
from transformers import AutoTokenizer, AutoModel
from langchain.embeddings.base import Embeddings
import pathlib
import os
import pandas as pd
import hashlib
from langchain.document_loaders import PyPDFLoader
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document

from openai import AzureOpenAI
import jinja2
load_dotenv()

True

In [288]:
class CustomLangChainEmbedding(Embeddings):
    """LangChain embedding backend built on a Hugging Face transformer model.

    Texts are tokenized, passed through the model, mean-pooled over their
    non-padding tokens and L2-normalized.
    """

    def __init__(self, model_name="all-MiniLM-L6-v2", use_gpu=False):
        """
        Initialize the embedding class with a specific transformer model.

        Args:
            model_name (str): Name or local path of the pre-trained transformer model.
            use_gpu (bool): If True, use GPU (CUDA) for inference when available; otherwise, use CPU.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, clean_up_tokenization_spaces=True)
        self.model = AutoModel.from_pretrained(model_name)

        # Use GPU only when it is both requested and actually available.
        self.device = torch.device("cuda" if torch.cuda.is_available() and use_gpu else "cpu")
        self.model.to(self.device)
        print(f"Model loaded on {self.device}")

    def mean_pooling(self, model_output, attention_mask):
        """
        Mean pooling to compute sentence embeddings from token embeddings.

        Padding positions are zeroed out through the attention mask, and the
        divisor is clamped so an all-padding row cannot divide by zero.
        """
        token_embeddings = model_output[0]  # First element is token embeddings
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

    def encode_data(self, sentences):
        """
        Encode the input into L2-normalized sentence embeddings.

        Args:
            sentences (str | list of str): One sentence, or a list of sentences.

        Returns:
            np.ndarray: A 1-D vector for a single string; otherwise a 2-D array
            with one row per sentence (also for a one-element list).
        """
        single_input = isinstance(sentences, str)
        batch = [sentences] if single_input else list(sentences)
        if not batch:
            # Nothing to encode: return an empty 2-D array instead of failing in the tokenizer.
            return np.empty((0, getattr(self.model.config, "hidden_size", 0)), dtype=np.float32)

        encoded_input = self.tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        # The inputs must live on the same device as the model, otherwise GPU inference fails.
        encoded_input = {name: tensor.to(self.device) for name, tensor in encoded_input.items()}
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        sentence_embeddings = self.mean_pooling(model_output, encoded_input['attention_mask'])
        sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

        # Copy to host memory before converting; .numpy() is not available on CUDA tensors.
        embeddings = sentence_embeddings.cpu().numpy()
        # Only a bare string collapses to 1-D. Squeezing unconditionally would also
        # flatten a one-element list and break callers that expect one row per text.
        return embeddings[0] if single_input else embeddings

    def embed_documents(self, texts):
        """
        LangChain-compatible method to create embeddings for documents.

        Args:
            texts (list of str): Documents to embed. A bare string is treated
                as a single-document list.

        Returns:
            list of list of float: One embedding per document, in input order.
        """
        if isinstance(texts, str):
            texts = [texts]
        return self.encode_data(list(texts)).tolist()

    def embed_query(self, text):
        """
        LangChain-compatible method to create embedding for a single query.

        Args:
            text (str): Query to create embedding for.

        Returns:
            list of float: The query embedding.
        """
        return self.encode_data(text).tolist()

In [289]:
class FaissIndexManager:
    """Maintain a LangChain FAISS vector store of de-duplicated text chunks on disk.

    Each chunk is identified by the SHA-256 hash of its normalized text. That
    hash is used as the docstore id and is also stored in the chunk metadata
    under "id", so re-indexing the same content is a no-op.
    """

    # Number of chunks embedded per model call; keeps peak memory bounded on large files.
    _EMBED_BATCH_SIZE = 32

    def __init__(self, embedding, index_path="faiss_index"):
        """
        Args:
            embedding: LangChain-compatible embedding object (must provide embed_documents).
            index_path (str): Folder the index is loaded from and saved to.
        """
        self.embedding = embedding
        self.index_path = index_path
        self.vector_store = self.load_faiss_index()

    # Function to save the FAISS index to disk
    def save_faiss_index(self):
        """Persist the vector store to index_path; does nothing when no index exists yet."""
        if self.vector_store is None:
            # Nothing was ever indexed; calling save_local on None would raise AttributeError.
            print("No FAISS index to save.")
            return
        os.makedirs(self.index_path, exist_ok=True)
        self.vector_store.save_local(self.index_path)
        print(f"FAISS index and metadata saved to {self.index_path}")

    # Function to load FAISS index from disk
    def load_faiss_index(self):
        """Return the vector store saved under index_path, or None when no saved index is found."""
        index_file = os.path.join(self.index_path, "index.faiss")
        pkl_file = os.path.join(self.index_path, "index.pkl")

        if os.path.exists(index_file) and os.path.exists(pkl_file):
            print(f"Loading FAISS index and metadata from {self.index_path}")
            # SECURITY: index.pkl is unpickled here, which can execute arbitrary code.
            # Only load index folders that this project wrote itself.
            return FAISS.load_local(self.index_path, self.embedding, allow_dangerous_deserialization=True)
        else:
            print(f"No FAISS index found at {self.index_path}, creating a new one.")
            return None

    @staticmethod
    def _make_splitter(chunk_size, chunk_overlap):
        """Build the recursive character splitter shared by all chunking helpers."""
        return RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )

    @staticmethod
    def _split_text(text, chunk_size=1000, chunk_overlap=200):
        """Split raw text into a list of chunk strings."""
        return FaissIndexManager._make_splitter(chunk_size, chunk_overlap).split_text(text)

    # Function to split a document into chunks
    @staticmethod
    def split_document_into_chunks(document, chunk_size=1000, chunk_overlap=200):
        """Split document.page_content into chunk Documents (the source metadata is not carried over)."""
        text_splitter = FaissIndexManager._make_splitter(chunk_size, chunk_overlap)
        chunks = text_splitter.create_documents([document.page_content])
        return chunks

    # Function to generate a consistent document ID using a hash
    @staticmethod
    def generate_doc_id(content):
        """Return the SHA-256 hex digest of the stripped, lower-cased content."""
        normalized_content = content.strip().lower()
        return hashlib.sha256(normalized_content.encode('utf-8')).hexdigest()

    def _existing_ids(self):
        """Return the content hashes of every chunk currently held in the vector store."""
        if self.vector_store is None:
            return set()
        # Hash the stored text rather than trusting the docstore keys, so indexes
        # whose keys are not content hashes are still de-duplicated correctly.
        return {
            self.generate_doc_id(doc.page_content)
            for doc in self.vector_store.docstore._dict.values()
        }

    def _index_chunks(self, chunk_texts, source_label, seen_ids=None):
        """Embed and store every chunk that is not indexed yet.

        Args:
            chunk_texts (iterable of str): Candidate chunk texts.
            source_label: Name of the source, used only in the log message.
            seen_ids (set | None): Content hashes to treat as already indexed;
                computed from the vector store when omitted. The set is updated in place.

        Returns:
            int: Number of chunks added.
        """
        if seen_ids is None:
            seen_ids = self._existing_ids()

        new_texts, new_ids = [], []
        for text in chunk_texts:
            doc_id = self.generate_doc_id(text)
            if doc_id in seen_ids:
                continue
            # Remember the id immediately so a chunk repeated inside the same file is added once.
            seen_ids.add(doc_id)
            new_texts.append(text)
            new_ids.append(doc_id)
            print(f"Embedding new document chunk with doc_id: {doc_id}")

        vectors = []
        for start in range(0, len(new_texts), self._EMBED_BATCH_SIZE):
            batch = new_texts[start:start + self._EMBED_BATCH_SIZE]
            batch_vectors = np.asarray(self.embedding.embed_documents(batch), dtype=np.float32)
            # reshape guards against backends that flatten a one-item batch to 1-D.
            vectors.extend(batch_vectors.reshape(len(batch), -1).tolist())

        # Debugging information
        print(f"Total new documents: {len(new_texts)}")
        print(f"Total embeddings created: {len(vectors)}")

        if not new_texts:
            print("No new chunks to add to FAISS.")
            return 0

        text_embeddings = list(zip(new_texts, vectors))
        metadatas = [{"id": doc_id} for doc_id in new_ids]
        if self.vector_store is None:
            # Build the index from the vectors computed above so nothing is embedded twice.
            self.vector_store = FAISS.from_embeddings(
                text_embeddings, self.embedding, metadatas=metadatas, ids=new_ids
            )
            print(f"Created new FAISS index for {source_label}.")
        else:
            # Passing ids keeps the docstore and index_to_docstore_id in sync; patching
            # index_to_docstore_id by hand would leave ids the docstore cannot resolve.
            self.vector_store.add_embeddings(text_embeddings, metadatas=metadatas, ids=new_ids)
            print(f"Added {len(new_texts)} new chunks to FAISS index.")
        return len(new_texts)

    # Function to add a PDF document to the FAISS store
    def add_pdf_to_faiss(self, pdf_path):
        """Chunk a PDF, index the chunks that are new, save the index and return the vector store.

        Args:
            pdf_path (str): Path of the PDF file to index.

        Returns:
            The vector store, or None when nothing has ever been indexed.
        """
        if self.vector_store is None:
            # Load or create a new FAISS index
            self.vector_store = self.load_faiss_index()

        pages = PyPDFLoader(pdf_path).load()
        chunk_texts = [
            chunk_text
            for page in pages
            for chunk_text in self._split_text(page.page_content)
        ]
        self._index_chunks(chunk_texts, pdf_path)

        # Save the updated FAISS index
        self.save_faiss_index()
        return self.vector_store

    # Function to add an Excel document to the FAISS store, using content from the 'requirements' column
    def add_excel_to_faiss(self, excel_path, sheet_name=0):
        """Index every non-empty cell of the 'requirements' column of an Excel sheet.

        Args:
            excel_path (str): Path of the workbook.
            sheet_name: Sheet name or position, passed to pandas.read_excel.

        Returns:
            The vector store, or None when the 'requirements' column is missing
            or nothing has ever been indexed.
        """
        if self.vector_store is None:
            # Load or create a new FAISS index
            self.vector_store = self.load_faiss_index()

        # Load the Excel file
        df = pd.read_excel(excel_path, sheet_name=sheet_name)

        # Make sure the 'requirements' column exists
        if 'requirements' not in df.columns:
            print(f"The column 'requirements' was not found in the Excel file.")
            return

        seen_ids = self._existing_ids()
        chunk_texts = []

        # Treat each cell of the 'requirements' column as one document
        for value in df['requirements']:
            # Test for NaN before str(): str(nan) is the non-empty text "nan".
            if pd.isna(value):
                continue
            content = str(value)
            if not content.strip():
                continue  # Skip blank entries

            # Cheap skip: the whole cell is already stored as a single chunk.
            if self.generate_doc_id(content) in seen_ids:
                continue
            chunk_texts.extend(self._split_text(content))

        self._index_chunks(chunk_texts, excel_path, seen_ids=seen_ids)

        # Save the updated FAISS index
        self.save_faiss_index()
        return self.vector_store

    # Function to inspect the FAISS store
    def inspect_faiss_store(self):
        """Print the vector count, every stored chunk and the first few stored vectors."""
        if self.vector_store is None:
            print("FAISS store is empty or not loaded.")
            return

        # Check number of vectors stored
        num_vectors = self.vector_store.index.ntotal
        print(f"Number of vectors stored: {num_vectors}")

        # Check stored documents and metadata
        print("Stored documents:")
        for doc_id, document in self.vector_store.docstore._dict.items():
            print(f"Document ID: {doc_id}")
            print(f"Content: {document.page_content[:200]}")  # Print first 200 characters of content
            print(f"Metadata: {document.metadata}")

        # Retrieve and check stored embeddings
        if num_vectors > 0:
            for i in range(min(5, num_vectors)):  # Print embeddings of first 5 documents
                vector = self.vector_store.index.reconstruct(i)
                print(f"Vector Shape: {vector.shape}...")
                print(f"Embedding {i}: {vector[:10]}...")  # Print first 10 dimensions of the embedding
        else:
            print("No embeddings stored.")

In [307]:
# Embedding backend loaded from the local MiniLM checkpoint, running on CPU.
embedding = CustomLangChainEmbedding(
    use_gpu=False,
    model_name="./Models/all-MiniLM-L6-v2",
)

# Manager for the on-disk FAISS index (default index folder).
faiss_manager = FaissIndexManager(embedding=embedding)

# Index the requirement rows of the phases workbook that are not stored yet.
vector_store = faiss_manager.add_excel_to_faiss(
    excel_path=f"{os.getcwd()}/Requirements/Phases.xlsx",
)

# Similarity retriever that returns the five closest chunks per query.
retriever = vector_store.as_retriever(
    search_kwargs={"k": 5},
    search_type="similarity",
)

Model loaded on cpu
Loading FAISS index and metadata from faiss_index
Total new documents: 0
Total embeddings created: 0
No new chunks to add to FAISS.
FAISS index and metadata saved to faiss_index


In [404]:

 
 #print(prompt_template.render(keyword="phases"))   

In [406]:
# Azure OpenAI client; endpoint, key and API version are read from environment variables.
client = AzureOpenAI(
    azure_endpoint=os.getenv("AL_AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AL_OPENAI_API_KEY"),
    api_version=os.getenv("AL_OPENAI_API_VERSION"))

# Model name sent with both chat requests below.
CHAT_MODEL = "gpt-4o"


def get_context_for_phase_detection_with_keyword(keyword):
    """Return the documents the retriever ranks closest to the query "Find the <keyword>"."""
    return retriever.invoke(f"Find the {keyword}")


# Function schema advertised to the model.
# NOTE(review): `functions` / `function_call` are the legacy function-calling
# parameters of the Chat Completions API; `tools` / `tool_choice` supersede them.
# Migrate once the configured API version is confirmed to support them.
functions=[{"name": "get_context_for_phase_detection_with_keyword",
                "description":"get correct context from list of documents related to keyword for metro line", 
                "parameters": 
                {
                    "type": "object",
                    "properties": 
                        {
                        "keyword":{"type":"string","description":"get the keyword"}
                        },
                    "required": ["keyword"],
                },
            }
]
available_functions={"get_context_for_phase_detection_with_keyword": get_context_for_phase_detection_with_keyword}

# Render the user prompt from the Jinja2 template. The encoding is explicit so
# the template decodes the same way on every platform.
path = pathlib.Path(f"{os.getcwd()}/Templates/PhaseDetectionTemplate.jinja2")
prompt_template = jinja2.Template(path.read_text(encoding="utf-8"))
prompt = prompt_template.render(keyword="phases")

messages=[
        {"role": "system", "content": "You are a helpful assistant to find out relevant information from given context."},
        {"role": "user", "content": prompt}
    ]

# First round trip: force the model to call the retrieval function so it supplies the keyword.
response = client.chat.completions.create(
    model=CHAT_MODEL,
    temperature=0,
    messages=messages,
    functions=functions,
    function_call={'name': 'get_context_for_phase_detection_with_keyword'}
)

response_message=response.choices[0].message
if response_message.function_call is None:
    raise RuntimeError("The model response did not contain a function call.")

function_name=response_message.function_call.name
function_to_call=available_functions.get(function_name)
if function_to_call is None:
    # Fail with a clear message instead of "'NoneType' object is not callable".
    raise ValueError(f"The model requested an unknown function: {function_name!r}")

function_args=json.loads(response_message.function_call.arguments)
function_response=function_to_call(function_args.get("keyword"))

# Send the model the retrieved text itself rather than the repr of the Document
# objects, which would add metadata noise and escaped newlines to the context.
function_context="\n\n".join(doc.page_content for doc in function_response)

messages.append(response_message)
messages.append({"role": "function", "name" : function_name, "content": function_context})

# Second round trip: answer the prompt using the retrieved context.
second_response = client.chat.completions.create(model=CHAT_MODEL,temperature=0,messages=messages)
print(second_response.choices[0].message.content)

Based on the provided context, the relevant phases mentioned are:

1. **Phases of the Bhopal Metro**:
   - From AIIMS to Karond Circle Station
   - Including the Depot
   - From Bhadbhada Square to Ratnagiri Tiraha

2. **Phases of the Indore Metro**:
   - All lines

Additionally, the location of the common OCC (Operational Control Center) for the two phases of Bhopal Metro is at the Depot, and similarly, the location of the OCC for the Indore Metro is at its Depot.
