In [3]:
import os 
from dotenv import load_dotenv 
load_dotenv()

from langchain_pinecone import PineconeVectorStore
from langchain_experimental.text_splitter import SemanticChunker
from langchain_community.document_loaders import S3FileLoader
from langchain_core.documents import Document

from pinecone import Pinecone
from langchain_pinecone import PineconeVectorStore

def get_pinecone_vector_store(embeddings_model,namespace="default",index_name: str = os.getenv("PINECONE_INDEX_NAME")) -> "PineconeVectorStore | None":
    """
    Connect to an existing Pinecone index and wrap it in a LangChain vector store.

    Args:
        embeddings_model: Embedding model handed to `PineconeVectorStore` as `embedding`.
        namespace (str): Namespace handed to `PineconeVectorStore`. Defaults to "default".
        index_name (str): Name of an already-created Pinecone index. The signature default
            is read from the PINECONE_INDEX_NAME environment variable once, when the
            function is defined; if no name is available, the variable is read again
            at call time.

    Returns:
        PineconeVectorStore | None: The vector store on success. None when the API key
        or index name is missing, when the index does not exist, or when any call
        raises; the reason is printed in each of these cases.
    """
    try:
        pinecone_api_key = os.getenv("PINECONE_API_KEY")
        if not pinecone_api_key:
            raise ValueError("Pinecone API key is not set. Please set it in the environment variables.")

        # The default argument was evaluated at definition time, possibly before the
        # environment was populated, so give the variable a second chance here.
        if not index_name:
            index_name = os.getenv("PINECONE_INDEX_NAME")
        if not index_name:
            raise ValueError("Pinecone index name is not set. Pass index_name or set PINECONE_INDEX_NAME.")

        pc = Pinecone(api_key=pinecone_api_key)

        existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
        if index_name not in existing_indexes:
            # Stop here: there is no index to wrap. Falling through would reference an
            # unbound `index` and report a misleading secondary error.
            print(f"Index '{index_name}' does not exist. Please create one")
            return None

        print(f"Index '{index_name}' already exists. Retrieving it...")
        index = pc.Index(index_name)

        return PineconeVectorStore(index=index, embedding=embeddings_model,namespace=namespace)

    except Exception as e:
        # Best-effort contract: callers check for None instead of catching exceptions.
        print(f"An error occurred: {e}")
        return None

In [5]:
import logging 

def chunk_docs(documents, embed_model):
    """
    Split documents into chunks with LangChain's `SemanticChunker`.

    A chunker is built around `embed_model` and applied to `documents`; progress
    is reported through the root `logging` logger.

    Args:
        documents: The documents to split, passed unchanged to
            `SemanticChunker.transform_documents`.
        embed_model: Embedding model given to `SemanticChunker` as `embeddings`.

    Returns:
        The chunk sequence produced by `transform_documents`.

    Raises:
        Exception: Any error raised while building the chunker or splitting is
            logged and then re-raised unchanged.
    """
    try:
        logging.info("Initializing SemanticChunker...")
        semantic_splitter = SemanticChunker(embeddings=embed_model)

        logging.info("Performing document chunking...")
        pieces = semantic_splitter.transform_documents(documents=documents)

        piece_count = len(pieces)
        logging.info(f"Created {piece_count} chunks from the documents.")
    except Exception as err:
        # Log for the notebook reader, then let the caller see the original error.
        logging.error(f"Failed to chunk documents: {str(err)}")
        raise
    return pieces

In [6]:
def load_to_vectordb(
    docs,
    index_name: str,
    embeddings,
    namespace: str = "default"
):
    """
    Write document chunks into an existing Pinecone index.

    The vector store is obtained from `get_pinecone_vector_store` and the
    documents are written with its `add_documents` method. Failures are logged
    and reported through the return value; nothing is raised to the caller.

    Args:
        docs: Document chunks to store; must support `len()`.
        index_name (str): Name of the Pinecone index to write to.
        embeddings: Embedding model forwarded to `get_pinecone_vector_store`.
        namespace (str): Namespace forwarded to `get_pinecone_vector_store`.
            Defaults to "default".

    Returns:
        tuple: `(vector_store, True)` when the documents were added;
        `(None, False)` when no vector store could be obtained or any
        exception occurred.
    """
    try:
        store = get_pinecone_vector_store(embeddings, namespace=namespace, index_name=index_name)

        if store is not None:
            logging.info(f"Loading {len(docs)} documents into Pinecone vector store '{index_name}' under namespace '{namespace}'...")

            # TODO: add cache
            store.add_documents(documents=docs)

            logging.info("Documents successfully loaded into the vector database.")
            return store, True

        # The helper signals failure by returning None instead of raising.
        logging.error("Failed to create or retrieve the Pinecone vector store.")
    except Exception as err:
        logging.error(f"Failed to load documents into vector database: {str(err)}")

    # Shared failure result for both the "no store" and the exception path.
    return None, False

In [None]:
import nest_asyncio

# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because MongodbLoader.load() runs async code
# while Jupyter's own loop is active — confirm.
nest_asyncio.apply()
from langchain_community.document_loaders.mongodb import MongodbLoader

def load_document(db_name: str = "streamlit-documents", collection_name: str = "user-documents"):
    """
    Load documents from a MongoDB collection through LangChain's `MongodbLoader`.

    The connection string is read from the MONGODB_CONNECTION_STRING environment
    variable each time the function is called.

    Args:
        db_name (str): Database to read from. Defaults to "streamlit-documents".
        collection_name (str): Collection to read from. Defaults to "user-documents".

    Returns:
        The value returned by `MongodbLoader.load()`.
    """
    loader = MongodbLoader(
        connection_string=os.getenv("MONGODB_CONNECTION_STRING"),
        db_name=db_name,
        collection_name=collection_name,
    )
    return loader.load()

In [7]:
from langchain_community.document_loaders import Docx2txtLoader

# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
loader = Docx2txtLoader("/home/asmaa/work/BV_Lab/POCs/vessel_property_mang/assets/VPM Problems_Solutions, FAQ, AI Training Tool.docx")

# `data` is the input of the chunking cell further down.
data = loader.load()

# Bare last expression so the notebook renders the loaded documents as the cell output.
data

[Document(metadata={'source': '/home/asmaa/work/BV_Lab/POCs/vessel_property_mang/assets/VPM Problems_Solutions, FAQ, AI Training Tool.docx'}, page_content="Below is a list of problems that our property management company occasionally encounters. \n\n\n\nA unit owner reached out about lights that are either out, not working properly, or need replacing. What do we do?\n\nIf the light in question is in the common area of the building, reach out to Larry our Handyman to come and fix the issue. Larry can be reached at (201) 496-9438. If the problem is within an owners unit, than refer them to Larry. We only solve problems in common areas, not within an owners unit.\n\n\n\nA unit owner reached out asking if we will need access to their unit during a fire inspection. Do they?\n\nNo. During Cintas inspections (fire alarm, smoke detector, sprinkler) we do not need access to units.\n\n\n\nA unit owner reached out asking if we will need access to their unit during a DCA. Do they?\n\nYES. During D

In [8]:
from langchain_openai import OpenAIEmbeddings

# Split the loaded documents; the OpenAI key is read from the OPENAI_API_KEY environment variable.
# NOTE(review): no `model` is passed here, whereas the load_to_vectordb call below uses
# model="text-embedding-3-large" — confirm that chunking with a different embedding model is intended.
chunks = chunk_docs(data,OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY")))

In [11]:
# Store the chunks in the "poc-vessel-manag" index; `namespace` is not passed, so
# load_to_vectordb falls back to its default "default".
# The returned (vector_store, success_flag) tuple is shown as the cell output.
load_to_vectordb(chunks,"poc-vessel-manag",OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"),model="text-embedding-3-large"))

Index 'poc-vessel-manag' already exists. Retrieving it...


(<langchain_pinecone.vectorstores.PineconeVectorStore at 0x7f67b0424280>, True)