In [5]:
import os
from dotenv import load_dotenv, find_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_core.documents import Document
from pinecone import Pinecone
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from app.graph.consts import INDEX_NAME


# Load environment variables from the .env file that find_dotenv() locates.
# The return value is bound to `_`, presumably so the cell does not echo it.
_ = load_dotenv(find_dotenv())

In [6]:
# Create the Pinecone client for the given environment. No api_key is passed here.
# NOTE(review): presumably the API key comes from the environment loaded by
# load_dotenv() above — confirm.
# NOTE(review): `pc` is not referenced by any other cell visible in this notebook,
# and `environment` looks like a legacy (pod-based) setting — confirm both against
# the installed pinecone client version.
pc = Pinecone(environment="northamerica-northeast1-gcp")

In [7]:
# The docs folder is expected to sit next to the notebook's working directory:
# <cwd>/../langchain_docs (deliberately left un-normalised, as before).
current_dir = os.getcwd()
docs_path = os.path.join(current_dir, os.pardir, "langchain_docs")
docs_path

'/home/voldemort/Downloads/Code/Eden/C_RAG/app/../langchain_docs'

In [8]:
# Partition every Markdown file under docs_path and split it into title-based chunks.
MARKDOWN_SUFFIXES = (".md", ".mdx")

# os.walk() silently yields nothing for a missing directory, so fail loudly up
# front the way os.listdir() would have.
if not os.path.isdir(docs_path):
    raise FileNotFoundError(f"Docs directory not found: {docs_path}")

md_elements = []  # raw partitioned elements from all files, kept for inspection
elements = []     # chunked elements consumed by the Document-building cell

# Walk recursively: os.listdir() only saw the top level of docs_path, so Markdown
# files stored in sub-folders were never picked up.
for root, dirnames, filenames in os.walk(docs_path):
    dirnames.sort()  # in-place sort makes the traversal order deterministic
    for filename in sorted(filenames):
        if not filename.endswith(MARKDOWN_SUFFIXES):
            continue
        file_path = os.path.join(root, filename)
        file_elements = partition_md(filename=file_path)
        md_elements.extend(file_elements)
        # Chunk one file at a time so a chunk can never combine text from two
        # different files (which would leave it with a misleading filename).
        elements.extend(chunk_by_title(file_elements))

# Show a small sample instead of dumping every chunk into the notebook output.
elements[:5]

[]

In [9]:
# Convert each chunk into a LangChain Document whose metadata carries a `source`
# field copied from the chunk's `filename`.
documents = []
for element in elements:
    metadata = element.metadata.to_dict()
    # Drop `languages` if present; pop() with a default avoids a KeyError for
    # elements that never had the key.
    metadata.pop("languages", None)
    # Only set `source` when a filename is known, rather than raising KeyError.
    # NOTE(review): a None value is deliberately not written — Pinecone metadata
    # is believed to reject nulls; confirm.
    filename = metadata.get("filename")
    if filename is not None:
        metadata["source"] = filename
    documents.append(Document(page_content=element.text, metadata=metadata))

# Show a small sample instead of dumping every Document into the notebook output.
documents[:3]

[]

In [None]:
# Embed all prepared Documents with OpenAI and upsert them into the Pinecone index.

# Guard against an empty corpus: without it the cell would still call the
# embedding/Pinecone APIs and print a success banner although nothing was indexed.
if not documents:
    raise ValueError(
        "No documents to index; check that the docs directory contains "
        "Markdown files and re-run the loading cells."
    )

print(f"Going to insert {len(documents)} Documents to Pinecone index {INDEX_NAME}")
# disallowed_special=set() is passed through to OpenAIEmbeddings unchanged.
# NOTE(review): presumably this lets special-token text in the docs be embedded
# instead of raising — confirm.
embeddings = OpenAIEmbeddings(disallowed_special=set())
docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=INDEX_NAME,
)
print("****** All Embeddings Added to Pinecone Vectorstore ******")