In [1]:
import os
from dotenv import load_dotenv
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.md import partition_md
from unstructured.staging.base import dict_to_elements
from code_wizard.consts import INDEX_NAME

  from tqdm.autonotebook import tqdm


In [2]:
load_dotenv()
pinecone_api_key = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=pinecone_api_key, environment="northamerica-northeast1-gcp")

In [3]:
current_dir = os.getcwd()
docs_path = os.path.join(
    current_dir,
    "..",
    "langchain_docs",
)
docs_path

'/home/voldemort/Downloads/Code/code_wizard/code_wizard/../langchain_docs'

In [4]:
md_elements = []

for filename in os.listdir(docs_path):
    if filename.endswith(".md") or filename.endswith(".mdx"):
        file_path = os.path.join(docs_path, filename)
        md_elements.extend(partition_md(filename=file_path))

elements = chunk_by_title(md_elements)
elements

[<unstructured.documents.elements.CompositeElement at 0x7f85e5662490>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662610>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e56627d0>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662990>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662b50>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662ed0>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e56630d0>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662d50>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662d10>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5663590>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662c10>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5663890>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e5662f50>,
 <unstructured.documents.elements.CompositeElement at 0x7f85e56639d0>,
 <unst

In [5]:
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings

In [6]:
documents = []
for element in elements:
    metadata = element.metadata.to_dict()
    del metadata["languages"]
    metadata["source"] = metadata["filename"]
    documents.append(Document(page_content=element.text, metadata=metadata))

documents

[Document(page_content='Multiple callback handlers\n\nIn the previous examples, we passed in callback handlers upon creation of an object by using callbacks=. In this case, the callbacks will be scoped to that particular object.', metadata={'file_directory': '/home/voldemort/Downloads/Code/code_wizard/code_wizard/../langchain_docs', 'filename': 'multiple_callbacks.md', 'filetype': 'text/markdown', 'last_modified': '2024-04-25T07:29:47', 'page_number': 1, 'orig_elements': 'eJzlUj1v2zAQ/SsEZ0eyZUqyDHRqlw7t5C0IhCN5stnwQyApO26Q/96TGgdpOnXuQhze3XvvPnj/zNGiQ597o/me8abdtWK7HWoQnarVpqpRikoMspMomh3yFeMOM2jIQPXPfDAWe20iqhzidZYoT8FheQ5Wowsxl1/CxdsAOpWfg8ZS0dNfzE+I+o+4KEoL/qhOYHyvg0qz1azuweGs6yabzUhuCqyVoB5T4fStKF/HpSjjUy4dxEdNrnNy1pzgiImy9xz9kT8saMq9C9oMBpe5q3Ul7tbirqoP63ZfdXvRzuyRmL2fnMRIVZuX1W+HmfHttR12a4edwGuLcWn81s/BZIuceB8XvRkEtM1GViC30ICqddeorm7kbl3VWtT/96IXJP7Dt3x/ma+e5ROyMeLZhCkxfAJH46QVuxAKKaFmxv99NzaNgeCIkA0FYWBAr/xBG2fyyqZk/PGNlT4VbDEyibCEq8XzLcsuxlomkSUVRvLLgfKQyT5moyYL8VW5eP9bvkOMZH7

In [7]:
openai_api_key = os.environ.get("OPENAI_API_KEY")
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key, disallowed_special=set())

In [8]:
print(f"Going to insert {len(documents)} Documents to Pinecone index {INDEX_NAME}")
docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,
    index_name=INDEX_NAME,
)
print("****** All Embeddings Added to Pinecone Vectorstore ******")

Going to insert 9833 Documents to Pinecone index langchain-doc-index-unstructured
****** All Embeddings Added to Pinecone Vectorstore ******
