## Read the files and extract topics for metadata

In [None]:
from pathlib import Path


def get_md_files(directory):
    """Collect every markdown file under *directory* together with its topics.

    The topics of a file are the names of the folders between *directory*
    and the file itself, e.g. ``agile/ceremonies/readme.md`` yields
    ``["agile", "ceremonies"]``. A file located directly in *directory*
    has an empty topic list.

    Args:
        directory: ``pathlib.Path`` of the root folder, searched recursively
            for ``*.md`` files.

    Returns:
        A list of ``[path, topics]`` pairs sorted by path, where ``path`` is
        the path yielded by ``rglob`` and ``topics`` is a list of folder names.
    """
    files = []
    for path in sorted(directory.rglob("*.md")):
        relative_path = path.relative_to(directory)
        # Use the parsed path components instead of splitting the string on
        # "\\": the backslash split only works on Windows and yields no
        # topics at all on POSIX systems.
        topics = list(relative_path.parts[:-1])
        files.append([path, topics])
    return files


# Root folder of the markdown docs, given relative to the current working directory.
doc_dir = Path("../data/docs")
# List of [path, topics] pairs for every *.md file found under doc_dir.
files = get_md_files(doc_dir)

# Preview the first five entries as a quick sanity check.
for file, topic in files[:5]:
    print(f"{file} -> {topic}")

## Split files into chunks / langchain docs

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Markdown header markers to split on, each paired with the metadata key
# under which the matching header text is stored (the "Title" key is read
# back when the search results are displayed).
headers_to_split_on = [
    ("#", "Title"),
    ("##", "Subheader")
]
# NOTE(review): strip_headers=False presumably keeps the header lines inside
# each chunk's content instead of removing them - confirm against the
# langchain_text_splitters documentation.
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)

def get_chunks(path, topics):
    """Split one markdown file into chunks and tag each chunk with its origin.

    The file is read as UTF-8 and passed to the module-level
    ``markdown_splitter``. Each resulting chunk gets two extra metadata
    entries: ``"topics"`` (the topics joined with commas) and ``"path"``
    (the file path as a string).

    Args:
        path: Path of the markdown file to read.
        topics: Iterable of topic strings describing the file.

    Returns:
        The list of chunks produced by the splitter, or an empty list when
        reading or splitting the file raised an exception.
    """
    pieces = []

    with open(path, "r", encoding="utf8") as handle:
        try:
            pieces = markdown_splitter.split_text(handle.read())
        except Exception as ex:
            # Best effort: report the failing file and carry on with no chunks.
            print(path, ex)

    for piece in pieces:
        piece.metadata.update({"topics": ','.join(topics), "path": str(path)})

    return pieces

# Accumulate the chunks of every markdown file into one flat list.
docs = []

for path, topic in files:
    docs.extend(get_chunks(path, topic))

# Preview the first five chunks, each followed by its metadata.
for doc in docs[:5]:
    print(doc, doc.metadata, sep="\n")

## Embed the docs and add them to a vector DB

In [None]:
# load environment variables
import os
from dotenv import load_dotenv, find_dotenv


# Load the variables from the .env file located by find_dotenv() into the
# process environment. The os.getenv() lookups for the Azure Search endpoint
# and key later in this notebook presumably depend on it.
load_dotenv(find_dotenv())

In [None]:
# load the embedding model
from langchain_openai import AzureOpenAIEmbeddings

# Embedding client bound to the "text-embedding-ada-002" Azure OpenAI deployment.
# NOTE(review): no endpoint or API key is passed here - presumably they are
# picked up from environment variables loaded via dotenv; confirm.
embedding_model = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-ada-002",
    api_version="2023-05-15",
)

In [None]:
# Embed the sample text "Hello, world!" as a smoke test, then report the
# vector length and its first five values.
test_text = "Hello, world!"
embedding = embedding_model.embed_query(test_text)

print(
    f"# features: {len(embedding)}",
    f"first 5 features: {embedding[:5]}",
    sep="\n",
)

In [None]:
# initialize the vector store and the index (I picked the index name ef_docs for engineering fundamentals docs)
from langchain_community.vectorstores.azuresearch import AzureSearch

# Name of the search index that will hold the document chunks.
index_name = "ef_docs"

# The endpoint and key are read from the environment; the embedding model's
# embed_query method is handed over as the store's embedding function.
vector_store = AzureSearch(
    azure_search_endpoint=os.environ.get("AZURE_SEARCH_ENDPOINT"),
    azure_search_key=os.environ.get("AZURE_SEARCH_KEY"),
    index_name=index_name,
    embedding_function=embedding_model.embed_query,
)

In [None]:
# Index the docs in the vector store - this will embed them as well.
# NOTE(review): the embedding presumably goes through the embedding_function
# passed to AzureSearch - confirm.
vector_store.add_documents(docs)

## Test it out with a query

In [None]:
# Natural-language question used to probe the index.
query = "What are the agile ceremonies"
# Retrieve the indexed chunks most similar to the query.
# NOTE(review): the number of results is whatever similarity_search defaults to - confirm.
result_docs = vector_store.similarity_search(query)

In [None]:
from IPython.display import Markdown, display


# Show every hit: a separator, its metadata, then the chunk rendered as markdown.
for doc in result_docs:
    print("------------------------------------")
    # "Title" is only present when the chunk sits under a "#" header, so a
    # chunk from a file without one (or from text before the first header)
    # would raise KeyError on a plain lookup; fall back to a placeholder.
    print("TITLE:", doc.metadata.get("Title", "(untitled)"))
    # "topics" and "path" are set on every chunk by get_chunks.
    print("TOPICS:", doc.metadata["topics"])
    print("PATH:", doc.metadata["path"])
    display(Markdown(doc.page_content))