# Open Context Documentation Loader

In [None]:
# Install the notebook's dependencies quietly via the %pip magic.
# LangChain packages plus `unstructured`, which backs the Unstructured markdown loader.
%pip install --quiet langchain-community langchain unstructured
# Data handling and embedding/similarity utilities imported in the next cell.
%pip install --quiet pandas sentence-transformers scikit-learn numpy
# MongoDB driver, the LangChain OpenAI integration, and the tiktoken tokenizer.
%pip install --quiet pymongo langchain-openai tiktoken

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.document_loaders import TextLoader

from langchain.document_loaders import TextLoader
from langchain_community.document_loaders import UnstructuredMarkdownLoader

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np
import time
import os

from langchain.chains import RetrievalQA
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import (
    DirectoryLoader,
    UnstructuredMarkdownLoader,
)

In [None]:
!pip install google-cloud-secret-manager
!pip install --upgrade google-auth

from google.cloud import secretmanager
from google.colab import auth
from google.colab import drive



In [None]:
def load_secrets(secrets_name, project_id):
  """Read the latest version of a secret from Google Cloud Secret Manager.

  The Colab user is authenticated first, then the secret payload is fetched.

  :param secrets_name: Name of the secret to read.
  :param project_id: Google Cloud project that owns the secret.
  :return: The secret payload decoded as a UTF-8 string.
  """
  # Authenticate the current user before talking to Secret Manager
  auth.authenticate_user()
  sm_client = secretmanager.SecretManagerServiceClient()
  # Resource path that always points at the newest version of the secret
  version_path = f"projects/{project_id}/secrets/{secrets_name}/versions/latest"
  payload = sm_client.access_secret_version(request={"name": version_path}).payload
  return payload.data.decode('UTF-8')

In [None]:
# Google Cloud project that holds the secrets used by this notebook
project_id = 'botchagalupep1'

# OpenAI key: fetched from Secret Manager and exported through the environment
openai_api_key = load_secrets("openai_api_key", project_id)
os.environ['OPENAI_API_KEY'] = openai_api_key

# MongoDB Atlas connection string (an "mdb_uri" secret was used here previously)
MONGODB_ATLAS_CLUSTER_URI = load_secrets("MDB_CLUSTER0_URI", project_id)

# LangSmith key
langsmith_api_key = load_secrets("langsmith_api_key", project_id)

# Do not print any of the values above: cell outputs are saved with the notebook.

In [None]:
# Mount Google Drive; the markdown folders used below live under /content/gdrive/MyDrive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
from pymongo import MongoClient

# Names of the Atlas database, collection and vector search index used by this notebook
DB_NAME = "Cluster0"
COLLECTION_NAME = "OpenContext0"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "vector_index"

# Connect to MongoDB Atlas and select the collection that stores the embedded chunks
client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)
embedded_items_collection = client[DB_NAME][COLLECTION_NAME]

In [None]:
# Report the current size of the collection, then remove every document in it
# so the load below starts from an empty collection. This is destructive.
doc_count = embedded_items_collection.count_documents({})
print(f"Document count before delete : {doc_count:,}")

result = embedded_items_collection.delete_many({})
print(f"Deleted docs : {result.deleted_count}")


Document count before delete : 0
Deleted docs : 0


# Load Example 1
## Load Unstructured Markdown (Directory Loader)

In [None]:
# Load the *.md files of the catalog folder through the Unstructured markdown loader
loader = DirectoryLoader(
    "/content/gdrive/MyDrive/GAI/catalog-yaml-format",
    glob="*.md",
    loader_cls=UnstructuredMarkdownLoader,
)
docs = loader.load()
docs

# Load Example 2
## Load Unstructured Markdown (Directory Loader - With Chunking)


In [None]:
# Split the loaded documents into token-based snippets:
# chunk size 400 with an overlap of 50 between consecutive chunks.
text_splitter = TokenTextSplitter(chunk_overlap=50, chunk_size=400)
split_docs = text_splitter.split_documents(docs)
docs = split_docs

# Load Example 3
## Load Unstructured Markdown (File Loader)


In [None]:
# Load every regular file of the folder, one UnstructuredMarkdownLoader per file
directory_path = '/content/gdrive/MyDrive/GAI/catalog-yaml-format'
markdown_contents = []

for file in os.listdir(directory_path):
    full_path = os.path.join(directory_path, file)
    # Skip sub-directories and anything else that is not a regular file
    if not os.path.isfile(full_path):
        continue
    loader = UnstructuredMarkdownLoader(full_path)
    md = loader.load()
    markdown_contents.extend(md)

docs = markdown_contents
print(docs)




# Load Example 4
## Load Unstructured Markdown (File Loader - elements mode)


In [None]:
# Load every regular file of the folder with UnstructuredMarkdownLoader in
# "elements" mode (the loader is asked for per-element documents instead of the
# default output).
directory_path = '/content/gdrive/MyDrive/GAI/catalog-yaml-format'

# Start from an empty list. Without this reset the cell appends to whatever an
# earlier cell left in `markdown_contents` (mixing in previously loaded documents
# and duplicating results on every re-run) and fails with NameError on a fresh
# kernel if no earlier cell defined the list.
markdown_contents = []

for file in os.listdir(directory_path):
    full_path = os.path.join(directory_path, file)
    # Skip sub-directories and anything else that is not a regular file
    if not os.path.isfile(full_path):
        continue
    loader = UnstructuredMarkdownLoader(full_path, mode="elements")
    md = loader.load()
    markdown_contents.extend(md)

docs = markdown_contents
print(docs)




# Load Example 5
## Load MarkdownHeaderTextSplitter



In [None]:
def read_markdown_files(directory_path, headers=None):
    """
    Split every Markdown file of a directory into header-delimited chunks.

    Each ``.md`` file found directly in ``directory_path`` is read as UTF-8 and
    passed through ``MarkdownHeaderTextSplitter`` with ``strip_headers=False``.
    The file name is stored under ``metadata['source']`` of every chunk so the
    originating document can be identified later.

    :param directory_path: Path to the directory whose Markdown files are to be read.
    :param headers: Optional list of ``(separator, name)`` tuples handed to the
        splitter as ``headers_to_split_on``. When omitted, the notebook-level
        ``headers_to_split_on`` variable is used, so existing calls keep working.
    :return: A list of the chunk objects returned by the splitter, ordered by
        file name. The list is empty when the path is not a directory or the
        directory contains no Markdown files.
    """
    # Check if the given path is a directory
    if not os.path.isdir(directory_path):
        print(f"The path {directory_path} is not a valid directory.")
        return []

    # Sort the file names: os.listdir() returns them in arbitrary order, which
    # would make the order of the resulting chunks differ between runs.
    markdown_files = sorted(
        name
        for name in os.listdir(directory_path)
        if name.endswith('.md') and os.path.isfile(os.path.join(directory_path, name))
    )
    if not markdown_files:
        return []

    # Fall back to the notebook-level configuration only when it is really needed,
    # so the function does not depend on a later cell unless the caller relies on it.
    if headers is None:
        headers = headers_to_split_on

    # The splitter configuration does not depend on the file: build it once.
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers, strip_headers=False)

    markdown_contents = []
    for file in markdown_files:
        file_path = os.path.join(directory_path, file)
        # Read the whole file, then release the handle before splitting
        with open(file_path, 'r', encoding='utf-8') as md_file:
            content = md_file.read()

        for chunk in markdown_splitter.split_text(content):
            # Record which file the chunk came from
            chunk.metadata['source'] = file
            markdown_contents.append(chunk)

    return markdown_contents

Note: the source document needs to be added to each chunk's metadata.

In [None]:
# Example usage


# Separators handed to MarkdownHeaderTextSplitter as (separator, metadata name) pairs.
# The list was taken from packages/mongodb-rag-ingest/src/embed/chunkMd.ts.
# NOTE(review): several entries are not line-leading markdown headers (for example
# the ones containing "\n"); they may never match in a header splitter - confirm.
headers_to_split_on = [
    # Markdown headings
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
    ("#####", "Header 5"),
    # Tab components
    ('\n\n<Tab name="', "Tab Name"),
    ("\n\n<Tabs>\n\n", "Tabs Container"),
    # HTML tables
    ("<table>\n", "Table"),
    ("<tr>\n", "Table Row"),
    ("<th>\n", "Header Cell"),
    ("<td>\n", "Data Cell"),
    # ("\n```", "Code Block"),  # disabled
    # Horizontal rules
    ("\n\n***\n\n", "Horizontal Rule"),
    ("\n\n---\n\n", "Horizontal Rule"),
    # Whitespace and single characters
    ("\n\n", "Whitespace"),
    ("\n\n\n", "Whitespace"),
    ("\n", "Newline"),
    (" ", "Space"),
    # ("", "Empty String"),  # disabled
    ("`", "Backtick"),
]

# Split every markdown file of the catalog folder and show what was produced
directory_path = '/content/gdrive/MyDrive/GAI/catalog-yaml-format'
docs = read_markdown_files(directory_path)

print("Number of docs:", len(docs))
for doc in docs:
    # Chunk length in characters, followed by the chunk itself
    print(len(doc.page_content))
    print(doc)

In [None]:
from langchain_community.vectorstores import MongoDBAtlasVectorSearch
from langchain_openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used for the documents
embedding = "text-embedding-3-small"

# Embed the documents and insert them, together with their embeddings,
# into the MongoDB Atlas collection.
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=docs,
    embedding=OpenAIEmbeddings(
        model=embedding,
        dimensions=1536,
        disallowed_special=(),
    ),
    collection=embedded_items_collection,
    index_name=ATLAS_VECTOR_SEARCH_INDEX_NAME,
)